In [83]:
import os
import pandas as pd
import numpy as np

import re
from rapidfuzz import fuzz, process
from tqdm import tqdm

from IPython.display import display

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [84]:
grants_path = r'C:\Projects\connecteddatahub\data\grants\cleaned_grants_2010_2018.csv'
affiliation_path = r'C:\Projects\connecteddatahub\data\maps\cleaned_affiliation.csv'
diversity_path = r"C:\Projects\connecteddatahub\data\external\university_enrollment_race.csv"

In [85]:
grants_df = pd.read_csv(grants_path)
grants_df['Year'] = grants_df['year']
grants_df['Year'] = grants_df['Year'].astype(str)
print(grants_df.shape)

(9262582, 11)


In [86]:
affiliation_df = pd.read_csv(affiliation_path)
print(affiliation_df.head(4))
print(grants_df.columns)

   AffiliationId  carnegie_id  PrimarySample                  FullName  \
0       71965598     188429.0           True        Adelphi University   
1      181401687     131159.0           True       American University   
2      102298084     168740.0           True        Andrews University   
3           1000          NaN           True  Arizona Board of Regents   

   SystemId  
0       NaN  
1       NaN  
2       NaN  
3    1000.0  
Index(['year', 'recip_ein', 'recip_name', 'recip_city', 'recip_state',
       'recip_zip', 'amount', 'recip_status', 'text', 'type', 'Year'],
      dtype='object')


In [87]:
diversity_df = pd.read_csv(diversity_path)
diversity_df = diversity_df.rename(columns={'year': 'Year'})
diversity_df['Year'] = diversity_df['Year'].astype(str)
print(diversity_df.columns)
#merge the number of students into the df

Index(['carnegie_id', 'AffiliationId', 'Year', 'student.size',
       'student.enrollment.all', 'student.demographics.race_ethnicity.white',
       'student.demographics.race_ethnicity.black',
       'student.demographics.race_ethnicity.hispanic',
       'student.demographics.race_ethnicity.asian',
       'student.demographics.race_ethnicity.aian',
       'student.demographics.race_ethnicity.nhpi',
       'student.demographics.race_ethnicity.two_or_more',
       'student.demographics.race_ethnicity.non_resident_alien',
       'student.demographics.race_ethnicity.unknown',
       'student.demographics.race_ethnicity.white_non_hispanic',
       'student.demographics.race_ethnicity.black_non_hispanic',
       'student.demographics.race_ethnicity.asian_pacific_islander',
       'student.demographics.race_ethnicity.aian_prior_2009',
       'student.demographics.race_ethnicity.hispanic_prior_2009',
       'student.demographics.race_ethnicity.unknown_2000',
       'student.demographics.race_e

In [88]:
years = ["1999", "2000", "2002", "2005", "2007", "2008", "2009", "2010", "2011", "2013", "2018"]

year_df = pd.DataFrame({'Year': years})
affiliation_years = affiliation_df.merge(year_df, how = 'cross')
print(affiliation_years.columns)

Index(['AffiliationId', 'carnegie_id', 'PrimarySample', 'FullName', 'SystemId',
       'Year'],
      dtype='object')


In [89]:
#merge the student sizes into the year df

merged_df = affiliation_years.merge(
    diversity_df[['AffiliationId', 'Year', 'student.size']],
    how = "left",
    on = ["AffiliationId", 'Year']
)

print(merged_df.head(10))
affiliation_years = merged_df

   AffiliationId  carnegie_id  PrimarySample            FullName  SystemId  \
0       71965598     188429.0           True  Adelphi University       NaN   
1       71965598     188429.0           True  Adelphi University       NaN   
2       71965598     188429.0           True  Adelphi University       NaN   
3       71965598     188429.0           True  Adelphi University       NaN   
4       71965598     188429.0           True  Adelphi University       NaN   
5       71965598     188429.0           True  Adelphi University       NaN   
6       71965598     188429.0           True  Adelphi University       NaN   
7       71965598     188429.0           True  Adelphi University       NaN   
8       71965598     188429.0           True  Adelphi University       NaN   
9       71965598     188429.0           True  Adelphi University       NaN   

   Year  student.size  
0  1999           NaN  
1  2000           NaN  
2  2002        3726.0  
3  2005        4718.0  
4  2007        4973.0

In [None]:
def normalize_institution(name: str):
    """Lowercase, strip punctuation, preserve institutional terms."""
    if pd.isna(name):
        return ""
    name = name.lower()
    name = re.sub(r"[^\w\s]", "", name)
    name = re.sub(r"\s+", " ", name).strip()
    return name

grants_df["recip_name_norm"] = grants_df["recip_name"].apply(normalize_institution)
affiliation_years["fullname_norm"] = affiliation_years["FullName"].apply(normalize_institution)

#TF-IDF + cosine similarity
def match_names_discriminative(sub_aff, grants_sub, threshold=0.55):
    """
    Performs institutional name matching using TF-IDF + cosine similarity,    """
    if grants_sub.empty or sub_aff.empty:
        sub_aff["matched_name"] = None
        sub_aff["match_score"] = np.nan
        return sub_aff

    # Build vector space model
    vectorizer = TfidfVectorizer(analyzer="char_wb", ngram_range=(3,6))
    X = vectorizer.fit_transform(grants_sub["recip_name_norm"])
    Y = vectorizer.transform(sub_aff["fullname_norm"])

    # Compute cosine similarities
    sims = cosine_similarity(Y, X)
    best_idx = sims.argmax(axis=1)
    best_score = sims.max(axis=1)

    matched = []
    for i, row in enumerate(sub_aff["fullname_norm"]):
        candidate = grants_sub.iloc[best_idx[i]]["recip_name_norm"]
        score = best_score[i]

        if score < threshold:
            matched.append(None)
            continue

        tokens_row = row.split()
        tokens_cand = candidate.split()
        if not tokens_row or not tokens_cand:
            matched.append(None)
            continue

        # Longest word overlap requirement
        longest_token = max(tokens_row, key=len)
        if longest_token not in tokens_cand:
            matched.append(None)
            continue

        # First token overlap requirement
        first_token = tokens_row[0]
        if first_token not in tokens_cand:
            matched.append(None)
            continue

        matched.append(candidate)

    sub_aff["matched_name"] = matched
    sub_aff["match_score"] = best_score
    return sub_aff


matched_chunks = []
years_iter = sorted(affiliation_years["Year"].dropna().unique())

for yr in tqdm(years_iter, desc="Matching by year"):
    grants_subset = grants_df[grants_df["Year"] == yr]
    sub_aff = affiliation_years[affiliation_years["Year"] == yr].copy()
    matched_chunks.append(match_names_discriminative(sub_aff, grants_subset, threshold=0.55))

affiliation_years = pd.concat(matched_chunks, ignore_index=True)


merged = affiliation_years.merge(
    grants_df[["recip_name_norm", "Year", "amount"]],
    how="left",
    left_on=["matched_name", "Year"],
    right_on=["recip_name_norm", "Year"],
)

grant_stats = (
    merged.groupby(["FullName", "Year"], as_index=False)
    .agg(
        num_grants=("amount", "count"),
        total_funding=("amount", "sum"),
        student_size = ("student.size", "first")
    )
)

affiliation_years = affiliation_years.merge(grant_stats, on=["FullName", "Year"], how="left")
affiliation_years["num_grants"] = affiliation_years["num_grants"].fillna(0).astype(int)
affiliation_years["total_funding"] = affiliation_years["total_funding"].fillna(0.0)

Matching by year: 100%|██████████| 11/11 [07:14<00:00, 39.51s/it] 


In [99]:
print(affiliation_years.columns)

Index(['AffiliationId', 'carnegie_id', 'PrimarySample', 'FullName', 'SystemId',
       'Year', 'student.size', 'fullname_norm', 'matched_name', 'match_score',
       'num_grants', 'total_funding', 'student_size'],
      dtype='object')


In [101]:
grant_stats_filtered = affiliation_years[affiliation_years["Year"].astype(int) >= 2010]

grant_stats_filtered_interpolate = grant_stats_filtered.sort_values(["FullName", "Year"])

# Interpolate within each AffiliationId across years
grant_stats_filtered_interpolate["student.size"] = (
    grant_stats_filtered_interpolate.groupby("AffiliationId")["student.size"]
    .transform(lambda s: s.interpolate(method="linear"))
    .fillna(
        grant_stats_filtered_interpolate.groupby("AffiliationId")["student.size"].transform("mean")
    )
)

grant_stats_filtered_interpolate = grant_stats_filtered_interpolate.drop('student_size', axis = 1, errors = "ignore")
# grant_stats_filtered_interpolate = grant_stats.sort_values(by = ['FullName', 'Year'])
grant_stats_filtered_interpolate.to_csv(r'C:\Projects\connecteddatahub\data\grants\university_grants.csv', index = False)

In [None]:
def preview_matches(affiliation_years, grants_df, top_n=20):
    """Show a sample of institution matches by year."""
    # Keep only rows with successful matches
    matched = affiliation_years[affiliation_years["matched_name"].notna()].copy()

    # Merge to bring grant data and similarity if you stored it
    matched = matched.merge(
        grants_df[["recip_name_norm", "Year", "recip_name"]],
        how="left",
        left_on=["matched_name", "Year"],
        right_on=["recip_name_norm", "Year"],
    )

    # Show counts
    print(f"✅ Total matches found: {len(matched):,}")
    print(f"✅ Years matched: {matched['Year'].nunique()}\n")

    # Print a few random or top examples per year
    for yr in sorted(matched['Year'].unique()):
        print(f"\n===== Year {yr} =====")
        sub = matched[matched["Year"] == yr].sample(
            n=min(top_n, len(matched[matched["Year"] == yr])), random_state=42
        )
        display(
            sub[
                [
                    "FullName",
                    "matched_name",
                    "recip_name",   # original name from grants_df
                    "num_grants",
                    "total_funding",
                ]
            ].head(top_n)
        )

# Example usage
preview_matches(affiliation_years, grants_df, top_n = 50)


✅ Total matches found: 90,600
✅ Years matched: 4


===== Year 2010 =====


Unnamed: 0,FullName,matched_name,recip_name,num_grants,total_funding
11833,University of Southern California,university of southern california,UNIVERSITY OF SOUTHERN CALIFORNIA,125,44180971.0
7039,Temple University,temple university,TEMPLE UNIVERSITY,117,6855908.0
8614,University of Chicago,university of chicago,UNIVERSITY OF CHICAGO,198,34395832.0
3798,Lehigh University,lehigh university,LEHIGH UNIVERSITY,70,3689057.0
9303,University of Illinois System,university of illinois,UNIVERSITY OF ILLINOIS,67,50509661.0
1562,Columbia University,columbia university,COLUMBIA UNIVERSITY,153,23178445.0
5683,Pennsylvania State University,pennsylvania state university,PENNSYLVANIA STATE UNIVERSITY,116,48353279.0
9549,University of Kentucky,university of kentucky,UNIVERSITY OF KENTUCKY,62,5397535.0
9313,University of Illinois System,university of illinois,UNIVERSITY OF ILLINOIS,67,50509661.0
3781,Lehigh University,lehigh university,LEHIGH UNIVERSITY,70,3689057.0



===== Year 2011 =====


Unnamed: 0,FullName,matched_name,recip_name,num_grants,total_funding
21483,Pace University,pace university,PACE UNIVERSITY,35,650830.0
27575,University of New Mexico,university of new mexico,UNIVERSITY OF NEW MEXICO,55,13567318.0
28344,University of Pennsylvania,university of pennsylvania,UNIVERSITY OF PENNSYLVANIA,269,104813508.0
26813,University of Miami,university of miami,UNIVERSITY OF MIAMI,128,41175600.0
16733,Cornell University,cornell university,CORNELL UNIVERSITY,252,30318594.0
22108,Rensselaer Polytechnic Institute,rensselaer polytechnic institute,RENSSELAER POLYTECHNIC INSTITUTE,44,3731328.0
32154,Yeshiva University,yeshiva university,YESHIVA UNIVERSITY,57,13409027.0
26647,University of Massachusetts Amherst,university of massachusetts amherst,UNIVERSITY OF MASSACHUSETTS AMHERST,25,2478503.0
31119,Villanova University,villanova university,VILLANOVA UNIVERSITY,59,1462702.0
19934,Mississippi State University,mississippi state university,MISSISSIPPI STATE UNIVERSITY,36,31094060.0



===== Year 2013 =====


Unnamed: 0,FullName,matched_name,recip_name,num_grants,total_funding
48527,University of Louisville,university of louisville,UNIVERSITY OF LOUISVILLE,71,12013358.0
55011,Washington University,washington university,WASHINGTON UNIVERSITY,229,27920201.0
46341,"University of California, San Francisco",university of california san francisco,UNIVERSITY OF CALIFORNIA SAN FRANCISCO,86,17798036.0
46774,University of Cincinnati,university of cincinnati,UNIVERSITY OF CINCINNATI,119,104455410.0
36137,Drexel University,drexel university,DREXEL UNIVERSITY,98,7021199.0
50816,University of Pennsylvania,university of pennsylvania,UNIVERSITY OF PENNSYLVANIA,364,41394533.0
52908,University of Utah,university of utah,UNIVERSITY OF UTAH,200,29285963.0
32815,Baylor College of Medicine,baylor college of medicine,BAYLOR COLLEGE OF MEDICINE,168,41734549.0
36003,Dartmouth College,dartmouth college,DARTMOUTH COLLEGE,167,13607985.0
48818,University of Memphis,university of memphis,UNIVERSITY OF MEMPHIS,26,5242303.0



===== Year 2018 =====


Unnamed: 0,FullName,matched_name,recip_name,num_grants,total_funding
76832,"University of California, San Francisco",university of california san francisco,UNIVERSITY OF CALIFORNIA SAN FRANCISCO,123,49445660.0
58357,California State University East Bay,california state university east bay,CALIFORNIA STATE UNIVERSITY EAST BAY,11,2034723.0
56960,Baylor University,baylor university,BAYLOR UNIVERSITY,141,42833080.0
60784,Cornell University,cornell university,CORNELL UNIVERSITY,549,93443160.0
73403,Florida State University,florida state university,FLORIDA STATE UNIVERSITY,186,107288100.0
90170,Yale University,yale university,YALE UNIVERSITY,710,148696100.0
67639,Montana State University,montana state university,MONTANA STATE UNIVERSITY,80,39622730.0
70904,Princeton University,princeton university,PRINCETON UNIVERSITY,344,28827920.0
60854,Cornell University,cornell university,CORNELL UNIVERSITY,549,93443160.0
90450,Yeshiva University,yeshiva university,YESHIVA UNIVERSITY,97,10979210.0
