In [1]:
import pandas as pd
import numpy as np

In [2]:
#Read Data
uni = pd.read_csv("../../data/universities.csv")
prof = pd.read_csv("../../data/professors.csv", index_col=0)
article = pd.read_pickle("../../data/clean_articles.pkl")

In [2]:
# Read data for Parsa
article = pd.read_pickle(r"C:\Users\Parsa\Downloads\clean_articles.pkl")

In [3]:
article

Unnamed: 0,title,GS_link,year,cite,main_authors,more_info,link_ids
0,Observation of a new boson at a mass of 125 Ge...,/citations?view_op=view_citation&hl=en&user=_C...,2012,22285,"S Chatrchyan, V Khachatryan, AM Sirunyan, A Tu...","Physics Letters B 716 (1), 30-61, 2012","0, 5600, 21725, 21726, 33843, 34234"
1,Observation of a new boson with mass near 125 ...,/citations?view_op=view_citation&hl=en&user=_C...,2013,6018,"S Chatrchyan, V Khachatryan, AM Sirunyan, A Tu...","Journal of High Energy Physics 2013 (6), 1-127...","0, 0, 5600, 21725, 21726, 21726, 21726, 21726,..."
3,Precise determination of the mass of the Higgs...,/citations?view_op=view_citation&hl=en&user=_C...,2015,4801,CMS Collaboration,"Eur. Phys. J. C 75 (5), 212, 2015","0, 5600, 21725, 33843, 34234"
4,Evidence for the 125 GeV Higgs boson decaying ...,/citations?view_op=view_citation&hl=en&user=_C...,2014,4605,"S Chatrchyan, V Khachatryan, AM Sirunyan, A Tu...","Journal of High Energy Physics 2014 (5), 1-72,...","0, 0, 5600, 21725, 33843, 34234"
5,Event generator tunes obtained from underlying...,/citations?view_op=view_citation&hl=en&user=_C...,2016,4536,"V Khachatryan, AM Sirunyan, A Tumasyan, W Adam...","The European Physical Journal C 76, 1-52, 2016","0, 5600, 21725, 21726, 33843, 34234"
...,...,...,...,...,...,...,...
2099821,A Non-permutation Flow shop Manufacturing Cell...,/citations?view_op=view_citation&hl=en&user=0p...,2014,6,"B Nikjo, Y zarook",International Journal of Applied Metaheuristic...,49387
2099822,Decision making in best player selection: An i...,/citations?view_op=view_citation&hl=en&user=0p...,2015,5,"B Nikjo, J Rezaeian, N Javadian",International Journal of Research in Industria...,"49367, 49368, 49387"
2099823,New Application of WeFA Framework and Fuzzy De...,/citations?view_op=view_citation&hl=en&user=0p...,2012,3,"BN Iman Radfar, Sarfaraz Hashemkhani Zolfani","American Journal of Scientific Research, 108-1...",49387
2099832,Improving Link Prediction in Social Network wi...,/citations?view_op=view_citation&hl=en&user=Q2...,2014,2,AB Tarnaz chamani1,"International Journal of Mechatronics, Electri...",49392


#### Step 0: Calculate h-index and i10-index for professors

In [3]:
prof_citations = {row['id']: [] for idx, row in prof.iterrows()}

# Populate the citation counts for each professor
for idx, art in article.iterrows():
    cite_count = art['cite']
    authors = str(art['link_ids']).split(',')
    for author_id in authors:
        try:
            id = int(author_id)
            if id in prof_citations:
                prof_citations[id].append(cite_count)
        except:
            pass


In [4]:
def calculate_h_index(citations):
    citations.sort(reverse=True)
    h_index = 0
    for i, cite in enumerate(citations):
        if cite >= i + 1:
            h_index = i + 1
        else:
            break
    return h_index

def calculate_i10_index(citations):
    return sum(1 for cite in citations if cite >= 10)

# Calculate h-index and i10-index for each professor
results = []
for prof_id, citations in prof_citations.items():
    h_index = calculate_h_index(citations)
    i10_index = calculate_i10_index(citations)
    results.append({
        'professor_id': prof_id,
        'h_index': h_index,
        'i10_index': i10_index
    })

results_df = pd.DataFrame(results)

In [5]:
prof = prof.merge(results_df, left_on='id', right_on='professor_id', how='left')
prof.drop(columns=["professor_id"], inplace=True)

In [6]:
uni.is_governmental.fillna(0, inplace=True)
uni.is_governmental = uni.is_governmental.astype('b')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  uni.is_governmental.fillna(0, inplace=True)


#### Step 1: filter universities [keep govermental and non-medical universities]

In [7]:
uni = uni[uni.is_governmental == True]
uni_list = uni.University.to_list()
uni_list[:5]

['University of Tehran',
 'Sharif University of Technology',
 'Ferdowsi University of Mashhad',
 'Amirkabir University of Technology',
 'Shahid Beheshti University']

#### Step 2: filter professors [keep those who are in filtered universities - keep profs with citation more than 100]

In [8]:
del_prof = prof[prof["cited_by"] < 100]
del_prof_ids_1 = set(del_prof.id.to_list())

In [9]:
del_prof = prof[~ prof["university"].isin(uni_list)]
prof = prof[prof["university"].isin(uni_list)]


prof = prof[prof["cited_by"] >= 100]
prof

Unnamed: 0,id,name,university,user_id,affiliation,v_email_at,cited_by,intrests,department,h_index,i10_index
0,0,Ali Fahim,University of Tehran,_C4Iif8AAAAJ,"Assistant Professor of Physics, University of ...",ut.ac.ir,121695,"['Elementary Particle Physics', 'Physics of Co...",Physics,165,348
1,1,Mohammad Reza Ganjali,University of Tehran,IJ3XCecAAAAJ,"Professor of Analytical Chemistry, University ...",khayam.ut.ac.ir,49589,"['Sensor and biosensor', 'Elctroanalytical Che...",Chemistry,106,994
2,2,Norouzi Parviz,University of Tehran,usWI6hMAAAAJ,"Professor of Electrochemistry, Collage of Scie...",ut.ac.ir,24867,"['Electrochemistry', 'Electrochemical Sensors'...",Biology,80,520
3,3,Reza Tavakkoli-Moghaddam,University of Tehran,gqVUx4cAAAAJ,Distinguished Professor of Industrial Engineer...,ut.ac.ir,23651,"['Facilities design', 'Supply chain', 'Schedul...",History,76,479
4,4,Ali Akbar Moosavi-Movahedi,University of Tehran,sZysKwIAAAAJ,University of Tehran,ut.ac.ir,16918,['Biophysical Chemistry'],Chemistry,63,416
...,...,...,...,...,...,...,...,...,...,...,...
49372,49372,azizollah ardeshir-behrestaghi,Mazandaran University of Science and Technology,J0HyouoAAAAJ,Mazandaran university of science and technology,ustmb.ac.ir,400,[],unknown,9,9
49373,49373,Ali ‌Tajdin,Mazandaran University of Science and Technology,aTLGjt8AAAAJ,"Associate Professor of Industrial Engineering,...",ustmb.ac.ir,319,"['Operation Research', 'Fuzzy Logic', 'Statist...",Mathematics,8,6
49374,49374,Mahbod Armin,Mazandaran University of Science and Technology,2GTMJNoAAAAJ,Researcher at Mazandaran University of Science...,ustmb.ac.ir,244,"['Combustion Simulation & Engine\xa0…', 'Heat ...",Physics,6,5
49375,49375,Aydin Aghajani,Mazandaran University of Science and Technology,QhAn1zUAAAAJ,Mazandaran university of Science and technology,ustmb.ac.ir,201,"['Manufacturing systems', 'Reliability', 'Larg...",Mechanical Engineering,5,4


#### Step 3: filter article [keep (2020 to 2022 articles - filterd professors articles)]

In [10]:
article = article[(2020 <= article.year) & (article.year <= 2022)]

In [11]:
del_prof_ids = set(del_prof.id.to_list()).union(del_prof_ids_1)
del_prof_ids = set(map(str, del_prof_ids))

article = article.dropna(subset='link_ids')

for index, row in article.iterrows():
    link_ids = set(row["link_ids"].split(','))
    new_link_ids = link_ids - del_prof_ids
    article.at[index, "link_ids"] = ','.join(new_link_ids)

article = article.dropna(subset='link_ids')

#### Step 5: calculate Deapth and Breadth in feature extraction

In [14]:
article = pd.read_csv("../../build/articles2.csv")

article = article.dropna(subset='link_ids_x')

for index, row in article.iterrows():
    link_ids = set(row["link_ids_x"].split(','))
    new_link_ids = link_ids - del_prof_ids
    article.at[index, "link_ids_x"] = ','.join(new_link_ids)

article = article[article["link_ids_x"] != '']

In [18]:
article.drop(columns=["Unnamed: 0"]).to_csv("../../build/articles2.csv", index=False)