In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the three build artifacts this notebook works from.
# The article and professor tables take their first CSV column as the index;
# the university table is read with the default index.
articles = pd.read_csv(
    "../../build/final_artilce.csv",
    index_col=0,
)
profs = pd.read_csv(
    "../../build/professors.csv",
    index_col=0,
)
unis = pd.read_csv(
    "../../build/universities.csv",
)

#### Step 1: Calculate Depth and Breadth

In [3]:
from collections import defaultdict

# Lookup tables:
#   prof_to_univ  : professor id -> university, one entry per row of `profs`
#                   (if an id repeats, the last row wins)
#   univ_subjects : university -> {subject -> count}; unseen keys start at 0
prof_to_univ = {
    prof_id: university
    for prof_id, university in zip(profs['id'], profs['university'])
}
univ_subjects = defaultdict(lambda: defaultdict(int))

In [4]:
# Count, per university, how often each main subject occurs.
# An article adds one to a university's count for every listed professor id
# that maps to that university, so several co-authors from the same
# university each contribute.

# Start from empty counts so re-running this cell does not double-count.
univ_subjects.clear()

for subject, raw_link_ids in zip(articles['main_subject'], articles['link_ids_x']):
    # Rows with no subject or no linked professors cannot be attributed.
    if pd.isna(subject) or pd.isna(raw_link_ids):
        continue

    # link_ids_x is parsed as a comma-separated string of integer professor ids.
    for prof_id in map(int, raw_link_ids.split(',')):
        university = prof_to_univ.get(prof_id)
        # Skip unknown professors and missing universities. NaN is truthy,
        # so a plain truthiness check alone would let it through.
        if university is None or pd.isna(university) or not university:
            continue
        univ_subjects[university][subject] += 1

In [5]:
# Calculate breadth and depth for each university.
#
# For every university in univ_subjects (subject -> count) this derives:
#   touch_with_10           : subjects with more than BREADTH_MIN_COUNT entries
#   depth_with_200          : subjects with more than DEPTH_MIN_COUNT entries
#   depth_with_uni_mean     : subjects whose count exceeds the square root of
#                             the university's total count
#   depth_with_subject_mean : subjects whose count exceeds the square root of
#                             the number of articles with that main subject
# Note: the two "*_mean" columns are named after a mean, but the thresholds
# actually applied are square roots.
BREADTH_MIN_COUNT = 10
DEPTH_MIN_COUNT = 200
BREADTH_DEPTH_COLUMNS = [
    'university',
    'subjects',
    'touch_with_10',
    'depth_with_uni_mean',
    'depth_with_subject_mean',
    'depth_with_200',
]

# Number of articles per main subject across the whole corpus.
subject_numbers = articles['main_subject'].value_counts()

breadth_depth = []
for university, subjects in univ_subjects.items():
    counts = list(subjects.values())
    uni_threshold = np.sqrt(sum(counts))

    breadth_depth.append({
        'university': university,
        'subjects': subjects,
        'touch_with_10': sum(1 for count in counts if count > BREADTH_MIN_COUNT),
        'depth_with_uni_mean': sum(1 for count in counts if count > uni_threshold),
        'depth_with_subject_mean': sum(
            1
            for subject, count in subjects.items()
            if count > np.sqrt(subject_numbers[subject])
        ),
        'depth_with_200': sum(1 for count in counts if count > DEPTH_MIN_COUNT),
    })

# Convert to a DataFrame; passing the column list keeps the schema stable
# even when no university was collected, so later cells can still drop/merge.
breadth_depth_df = pd.DataFrame(breadth_depth, columns=BREADTH_DEPTH_COLUMNS)

In [6]:
# Attach the per-university metrics to the universities table.
df = breadth_depth_df.drop(columns=['subjects'])

# This notebook later writes its result back to the same universities.csv it
# read, so on a re-run `unis` may already contain the metric columns and may
# no longer contain the stray "Unnamed: 0" index column. Strip both up front
# (ignoring whichever is absent) so the merge neither raises KeyError nor
# produces "_x"/"_y" duplicates.
metric_cols = [col for col in df.columns if col != 'university']
unis_base = unis.drop(columns=["Unnamed: 0", *metric_cols], errors="ignore")

# Left join keeps every university, including those without any counted article.
results = pd.merge(
    unis_base,
    df,
    left_on='University',
    right_on="university",
    how='left',
).drop(columns=["university"])

In [8]:
# Write the merged table without the index column.
# Note: this is the same path the universities table was loaded from, so the input file is overwritten.
results.to_csv(path_or_buf="../../build/universities.csv", index=False)