In [20]:
import pandas as pd
import os
from rapidfuzz import fuzz, process
import numpy as np
from tqdm import tqdm

In [21]:
# Root folder holding every input and output of this notebook. Change this one
# value to relocate the whole project; the paths below are derived from it.
data_path = r'C:\Projects\connecteddatahub\data'

# Cleaned grant records (one row per grant).
grants_path = os.path.join(data_path, 'cleaned_dataframes', 'grants', 'cleaned_grants_2010_2018.csv')

# Institution-by-year regression table that the grant totals get joined onto.
# Built from data_path (rather than repeating the root as a second literal) so
# the two input paths cannot drift apart.
regression_path = os.path.join(data_path, 'statistics', 'regression_university_board_statistics.csv')

In [22]:
# Load the two inputs in a fixed order: grant records first, then the
# institution-by-year regression table.
grants_df, regression_df = (
    pd.read_csv(csv_path) for csv_path in (grants_path, regression_path)
)

In [23]:
# List the regression table's columns ('Year' and 'Institution' are the ones
# used for matching further down).
regression_columns = regression_df.columns
print(f"Regression DF Columns:\n {regression_columns}")

Regression DF Columns:
 Index(['Year', 'Institution', 'AffiliationId', 'female_president',
       'PrimarySample', 'total_members', 'total_ethnicity', 'board_turnover',
       'carnegie_id', 'state', 'control', 'StateSystem', 'region',
       'num_billionaires', 'student.women', 'faculty.women',
       'faculty.race_ethnicity.white', 'student.size',
       'cost.tuition.out_of_state', 'school.faculty_salary', 'RD_expenditure',
       'female_proportion', 'poc_proportion', 'billionaire_proportion',
       'eigenvector', 'betweenness', 'degree', 'strength', 'clustering',
       'betweenness_unweighted', 'clustering_unweighted', 'Rank'],
      dtype='object')


In [24]:
# List the grant table's columns ('year', 'recip_name' and 'amount' are the
# ones used for matching further down).
grant_columns = grants_df.columns
print(f"Grant DF Columns:\n {grant_columns}")

Grant DF Columns:
 Index(['year', 'recip_ein', 'recip_name', 'recip_city', 'recip_state',
       'recip_zip', 'amount', 'recip_status', 'text', 'type'],
      dtype='object')


In [25]:
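# NOTE: superseded draft, kept commented out for reference only. This is the
# fuzzy-only version of the matching step; the next cell is the live version
# and adds a substring pass before the fuzzy pass.
# # 1. Aggregate grants_df by recip_name and year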
# grants_grouped = (
#     grants_df.groupby(['recip_name', 'year'])
#     .agg(num_grants=('recip_name', 'size'),
#          grant_amount=('amount', 'sum'))
#     .reset_index()
# )

# # Pre-group regression by year
# regression_by_year = {yr: subdf for yr, subdf in regression_df.groupby("Year")}

# # Prepare output columns
# regression_df['num_grants'] = pd.NA
# regression_df['grant_amount'] = pd.NA

# # Progress bar over unique years
# for year, grants_sub in tqdm(grants_grouped.groupby("year"), total=grants_grouped['year'].nunique(), desc="Matching by year"):
#     if year not in regression_by_year:
#         continue
    
#     reg_sub = regression_by_year[year]
#     queries = grants_sub['recip_name'].str.lower().tolist()
#     candidates = reg_sub['Institution'].str.lower().tolist()
    
#     # Compute similarity matrix (len(queries) x len(candidates))
#     scores = process.cdist(queries, candidates, scorer=fuzz.token_sort_ratio)
    
#     best_idx = np.argmax(scores, axis=1)
#     best_score = scores[np.arange(len(queries)), best_idx]
    
#     # Apply threshold
#     for i, (idx, score) in enumerate(zip(best_idx, best_score)):
#         if score >= 80:
#             reg_idx = reg_sub.index[idx]
#             regression_df.at[reg_idx, 'num_grants'] = grants_sub.iloc[i]['num_grants']
#             regression_df.at[reg_idx, 'grant_amount'] = grants_sub.iloc[i]['grant_amount']

In [26]:
# Attach per-year grant totals to regression_df.
#
# For every year present in both tables, each grant recipient name is matched
# to at most one institution of that year:
#   1. substring pass - the institution name occurs inside the recipient name;
#   2. fuzzy pass     - for recipients still unmatched, the best
#                       token_sort_ratio score, accepted at or above the
#                       threshold below.
# Several recipient names can resolve to the same institution-year (case
# variants, "... foundation", etc.), so matched counts and amounts are SUMMED
# per institution row instead of letting the last match overwrite the rest.

# Minimum token_sort_ratio (0-100 scale) for a fuzzy match to be accepted.
FUZZY_MATCH_THRESHOLD = 80

# 1. Aggregate grants_df by recip_name and year
grants_grouped = (
    grants_df.groupby(['recip_name', 'year'])
    .agg(num_grants=('recip_name', 'size'),
         grant_amount=('amount', 'sum'))
    .reset_index()
)

# Pre-group regression by year
regression_by_year = {yr: subdf for yr, subdf in regression_df.groupby("Year")}

# Prepare output columns; rows that receive no match keep pd.NA.
# Resetting them here also makes the cell safe to re-run.
regression_df['num_grants'] = pd.NA
regression_df['grant_amount'] = pd.NA

# Running totals keyed by regression_df index label -> (num_grants, grant_amount).
matched_totals = {}


def _add_match(reg_label, n_grants, amount):
    """Add one matched recipient's grant count and amount to an institution row's totals."""
    prev_n, prev_amount = matched_totals.get(reg_label, (0, 0))
    matched_totals[reg_label] = (prev_n + n_grants, prev_amount + amount)


# Progress bar over unique years
for year, grants_sub in tqdm(grants_grouped.groupby("year"), total=grants_grouped['year'].nunique(), desc="Matching by year"):
    if year not in regression_by_year:
        continue

    reg_sub = regression_by_year[year]

    # Drop institutions without a usable name: a missing name cannot take part
    # in the `in` test, and an empty name is a substring of every recipient.
    inst_names = reg_sub['Institution'].str.lower()
    usable = (inst_names.notna() & (inst_names.str.strip() != '')).to_numpy()
    reg_labels = reg_sub.index[usable]
    candidates = inst_names[usable].tolist()
    if not candidates:
        continue

    queries = grants_sub['recip_name'].str.lower().tolist()
    # Pull the value columns out once instead of building a row Series per lookup.
    counts = grants_sub['num_grants'].to_numpy()
    amounts = grants_sub['grant_amount'].to_numpy()

    # --- Substring check first ---
    unmatched = []  # positions in `queries` left for the fuzzy pass
    for i, recip in enumerate(queries):
        if not isinstance(recip, str):
            # Non-text recipient names cannot be matched by either pass.
            continue
        hits = [j for j, inst in enumerate(candidates) if inst in recip]
        if hits:
            # Prefer the longest (most specific) institution name so that a
            # short name does not capture a longer one that contains it.
            # max() keeps the first of equally long hits.
            best = max(hits, key=lambda j: len(candidates[j]))
            _add_match(reg_labels[best], counts[i], amounts[i])
        else:
            unmatched.append(i)

    if not unmatched:
        continue

    # Fuzzy matching only for unmatched queries
    rem_queries = [queries[i] for i in unmatched]
    scores = process.cdist(rem_queries, candidates, scorer=fuzz.token_sort_ratio)
    best_idx = np.argmax(scores, axis=1)
    best_score = scores[np.arange(len(rem_queries)), best_idx]

    for i, idx, score in zip(unmatched, best_idx, best_score):
        if score >= FUZZY_MATCH_THRESHOLD:
            _add_match(reg_labels[idx], counts[i], amounts[i])

# Write the accumulated totals back in one pass.
for reg_label, (n_grants, amount) in matched_totals.items():
    regression_df.at[reg_label, 'num_grants'] = n_grants
    regression_df.at[reg_label, 'grant_amount'] = amount


Matching by year: 100%|██████████| 10/10 [00:58<00:00,  5.85s/it]


In [27]:
# Write the regression table, including the two grant columns, next to the
# other statistics files; the DataFrame index is not written.
output_path = os.path.join(data_path, 'statistics', 'regression_data_with_grants.csv')
regression_df.to_csv(output_path, index=False)