In [10]:
import numpy as np
import pandas as pd

In [2]:
# Load the article, professor and university tables from the shared build
# directory. `index_col=0` uses the first CSV column as the index of the
# article and professor frames; the universities file keeps the default index.
# NOTE(review): the next cell re-reads all three names from other paths, so on
# a top-to-bottom run these frames are replaced immediately.
# NOTE(review): "final_artilce.csv" looks like a misspelling of "article" -
# presumably it matches the file on disk; confirm before renaming.
articles = pd.read_csv("../../build/final_artilce.csv", index_col=0)
profs = pd.read_csv("../../build/professors.csv", index_col=0)
unis = pd.read_csv("../../build/universities.csv")

In [11]:
# Parsa's custom file paths
# Re-reads the three tables from files one directory up, replacing any frames
# loaded by the previous cell. Later cells read these columns:
#   articles: 'main_subject', 'link_ids_x'
#   profs:    'id', 'university'
#   unis:     'University', 'Unnamed: 0'
# `index_col=0` uses the first CSV column as the index for articles and profs.
articles = pd.read_csv("../articles_with_main_subject.csv", index_col=0)
profs = pd.read_csv("../professorsV2.csv", index_col=0)
unis = pd.read_csv("../universitiesV2.csv")

#### Step 1: Calculate Depth and Breadth

In [12]:
from collections import defaultdict

# Lookup table: professor id -> university of that professor.
# If an id appears more than once, the last row wins.
prof_to_univ = {
    prof_id: university
    for prof_id, university in zip(profs['id'], profs['university'])
}

# Nested counter: univ_subjects[university][subject] -> number of counted
# articles. Missing universities and subjects start at 0 on first access.
univ_subjects = defaultdict(lambda: defaultdict(int))

In [13]:
# Count article/professor links per (university, main subject).
#
# Every article adds +1 to its main subject for each linked professor whose id
# is present in `prof_to_univ`. An article with several professors from the
# same university is therefore counted once per professor.
#
# The counter is rebuilt here so that re-running this cell starts from zero
# instead of stacking a second pass on top of the previous counts.
univ_subjects = defaultdict(lambda: defaultdict(int))

# Iterating the two needed columns directly avoids building a Series per row.
for subject, raw_link_ids in zip(articles['main_subject'], articles['link_ids_x']):
    # Rows without a subject or without linked professors cannot be attributed.
    # A NaN subject would also end up as a key that the per-subject article
    # counts (built with value_counts, which drops NaN by default) do not have.
    if pd.isna(subject) or pd.isna(raw_link_ids):
        continue

    # `link_ids_x` holds a comma-separated list of professor ids.
    for token in str(raw_link_ids).split(','):
        token = token.strip()
        if not token:
            continue  # tolerate stray or trailing commas

        university = prof_to_univ.get(int(token))
        if university:  # skip ids with no (or an empty) university
            univ_subjects[university][subject] += 1

In [14]:
# Calculate breadth and depth for each university.
#
# From the per-university subject counts in `univ_subjects`, four features are
# derived (each is a number of subjects):
#   touch_with_10           - count is above BREADTH_MIN_COUNT
#   depth_with_200          - count is above DEPTH_MIN_COUNT
#   depth_with_uni_mean     - count is above sqrt(total count of the university)
#   depth_with_subject_mean - count is above sqrt(number of articles that have
#                             this main subject in the whole `articles` table)
# NOTE: despite "mean" in two column names, both relative thresholds are
# square roots, not averages. The names are kept for downstream compatibility.
BREADTH_MIN_COUNT = 10
DEPTH_MIN_COUNT = 200
FEATURE_COLUMNS = [
    'university',
    'subjects',
    'touch_with_10',
    'depth_with_uni_mean',
    'depth_with_subject_mean',
    'depth_with_200',
]

breadth_depth = []
subject_numbers = articles['main_subject'].value_counts()
# Loop-invariant: per-subject threshold, computed once for all universities.
subject_thresholds = np.sqrt(subject_numbers)

for university, subjects in univ_subjects.items():
    counts = list(subjects.values())
    uni_threshold = np.sqrt(sum(counts))

    breadth_depth.append({
        'university': university,
        'subjects': subjects,
        'touch_with_10': sum(1 for count in counts if count > BREADTH_MIN_COUNT),
        'depth_with_uni_mean': sum(1 for count in counts if count > uni_threshold),
        'depth_with_subject_mean': sum(
            1 for subject, count in subjects.items()
            if count > subject_thresholds[subject]
        ),
        'depth_with_200': sum(1 for count in counts if count > DEPTH_MIN_COUNT),
    })

# Convert the collected records to a DataFrame. Passing the column list keeps
# the expected columns even when no university was counted, so the following
# drop/merge steps do not fail on an empty frame.
breadth_depth_df = pd.DataFrame(breadth_depth, columns=FEATURE_COLUMNS)

In [15]:
# Keep only the scalar features; the raw per-subject counters are not exported.
df = breadth_depth_df.drop(columns='subjects')

# Attach the features to the universities table by university name.
# A left join keeps every row of `unis`; rows without a match get NaN features.
# The duplicate join key and the "Unnamed: 0" column are removed afterwards.
results = (
    unis
    .merge(df, how='left', left_on='University', right_on="university")
    .drop(columns=["university", "Unnamed: 0"])
)

In [16]:
# Show the merged per-university table as the cell output for a visual check.
results

Unnamed: 0,#Rank,University,Town,org_id,is_governmental,touch_with_10,depth_with_uni_mean,depth_with_subject_mean,depth_with_200
0,1,University of Tehran,Tehran,3127243484376623607,1,24.0,16.0,24.0,16.0
1,2,Sharif University of Technology,Tehran,14542101698899415237,1,18.0,11.0,13.0,9.0
2,5,Ferdowsi University of Mashhad,Mashhad,12493016978831566375,1,23.0,16.0,22.0,15.0
3,7,Amirkabir University of Technology,Tehran,6418904117290021063,1,20.0,14.0,14.0,10.0
4,8,Shahid Beheshti University,Tehran,15042406923477825387,1,21.0,16.0,17.0,12.0
...,...,...,...,...,...,...,...,...,...
61,140,Ardakan University,Ardakan,9161180858282046579,1,10.0,6.0,1.0,1.0
62,141,Qom University of Technology,Qom,693714768659580315,1,9.0,7.0,0.0,0.0
63,156,Kermanshah University of Technology,Kermanshah,13528429044870125586,1,9.0,8.0,1.0,0.0
64,179,Graduate University of Advanced Technology,Kerman,13516240938106875172,1,11.0,9.0,0.0,0.0


### Last step: Saving universities with new features to universitiesV4 csv file

In [17]:
# Write the universities table, including the new breadth/depth columns, to disk.
# No `index=` argument is passed, so the row index is written as an extra
# unnamed first column.
# NOTE(review): that is presumably the origin of the "Unnamed: 0" column that
# has to be dropped after the merge; confirm what downstream readers of this
# file expect before switching to index=False.
results.to_csv("../universitiesV4.csv")