In [1]:
import re
import pandas as pd
import numpy as np
from itertools import combinations
from nameparser import HumanName
import os
from nameparser.config import CONSTANTS

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse.csgraph import connected_components
from sparse_dot_topn import sp_matmul_topn

import recordlinkage
from recordlinkage.preprocessing import clean
import networkx as nx

from cleanleader import *
from position_word_banks import *
from tqdm import tqdm

import warnings
import logging 

In [2]:
data_path = r'C:\Projects\connecteddatahub\data'

In [3]:
warnings.filterwarnings("ignore", category=UserWarning, module="recordlinkage")
logging.getLogger("recordlinkage").setLevel(logging.ERROR)


In [4]:
# add some extra terms into nameparser
titles2add = ['trustee', 'chairman', 'very', 'msgr', 'radm']
for t in titles2add:
    CONSTANTS.titles.add(t)
    
suffix_acronyms2add = [
    'mn', 'op', 'facfas', 'ret', 'ri', 'osa', 'sj', 'cssi', 'svd', 'sm',
    'std', 'sc', 'ofm', 'kg', 'cssp', 'rsm', 'chfm', 'bsn', 'cec', 'mbbch'
]
for s in suffix_acronyms2add:
    CONSTANTS.suffix_acronyms.add(s)

suffix_acronyms2remove = ['caro', 'mai', 'pla', 'chi']
for s in suffix_acronyms2remove:
    CONSTANTS.suffix_acronyms.remove(s)

# Adjust suffix rules
CONSTANTS.suffix_not_acronyms.remove('junior')
CONSTANTS.suffix_not_acronyms.add('facsm')


SetManager({'snr', 'iii', 'iv', '2', 'esq', 'v', 'jnr', 'sr', 'i', 'ii', 'jr', 'facsm', 'dr', 'esquire'})

In [5]:
def split_names(df):
    hnames = [HumanName(name, constants=CONSTANTS) for name in df['Name'].values]

    df['FirstName'] = [h.first.strip() for h in hnames]
    df['MiddleName'] = [h.middle.strip() for h in hnames]
    df['LastName'] = [h.last.strip() for h in hnames]
    df['NickName'] = [h.nickname.strip() for h in hnames]
    df['SuffixName'] = [h.suffix.strip() for h in hnames]
    df['PrefixName'] = [h.title.strip() for h in hnames]
    df['MiddleInitials'] = [
        "".join([a[0] for a in h.middle.split(' ') if len(a) > 0])
        for h in hnames
    ]

    # Normalize last names (remove stray periods)
    df['LastName'] = df['LastName'].str.replace('.', '', regex=False)

    return df

def make_raw_name(prow, use_nickname=False):
    if use_nickname:
        fname_col = 'NickName'
    else:
        fname_col = 'FirstName'
    if len(prow[fname_col].replace('.', '')) <= 1:
        rawname = prow[fname_col] + " " + prow['MiddleName']
    else:
        rawname = prow[fname_col] + " " + prow['MiddleInitials']
    
    rawname += " " + prow['LastName']
    if bool(re.fullmatch(r'[Ii\s]+', prow['SuffixName'])):
        rawname += " " + prow['SuffixName']
    
    return rawname


def expand_name_df(df):
    # expand with nicknames
    expanded_name_list = df.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])
    expanded_name_list['RawName4Match'] = [make_raw_name(row, use_nickname=True) for i,row in expanded_name_list.iterrows()]
    expanded_df = pd.concat([df, expanded_name_list], ignore_index=True)
    
    # expand with first initials if middle name is long
    expanded_name_list2 = df[df['MiddleName'].str.replace('.', '').str.len() > 1].reset_index(names=['Oidx'])
    expanded_name_list2['FirstName'] = [n[0] for n in expanded_name_list2['FirstName']]
    expanded_name_list2['RawName4Match'] = [make_raw_name(row, use_nickname=False) for i,row in expanded_name_list2.iterrows()]
    expanded_df = pd.concat([expanded_df, expanded_name_list2], ignore_index=True)
    
    return expanded_df

In [6]:
def assign_leader_ids(df):
    df['RawName4Match'] = [make_raw_name(row) for i,row in df.iterrows()]
    df['LeaderId'] = np.nan
    
    lid = 0
    disambig_leader = []

    for affid in tqdm(df['AffiliationId'].unique(), desc="Assigning LeaderIds"):
        affsub = df[df['AffiliationId'] == affid]
        iname = affsub['Institution'].values[0]
        affsub = affsub[affsub['Name'] != 'Vacant'].reset_index(names=['Lindex'])
        
        exp_affsub = expand_name_df(affsub)
        exp_affsub['RawName4Match_clean'] = clean(exp_affsub['RawName4Match'])

        # Build candidate pairs with recordlinkage
        indexer = recordlinkage.Index()
        indexer.full()
        pairs = indexer.index(exp_affsub)

        compare = recordlinkage.Compare()
        compare.string('RawName4Match_clean', 'RawName4Match_clean',
                       method='jaro', threshold=0.85,
                       label='name_match')
        features = compare.compute(pairs, exp_affsub)

        matches = features[features['name_match'] == 1].index.tolist()

        # Graph clustering
        G = nx.Graph()
        G.add_nodes_from(exp_affsub.index)
        G.add_edges_from(matches)

        # link expansions back
        for i in range(exp_affsub.shape[0] - affsub.shape[0]):
            G.add_edge(i+affsub.shape[0], exp_affsub['Oidx'].values[i+affsub.shape[0]])

        for component in nx.connected_components(G):
            candidate_idx = np.sort(list(component))
            canonical = identify_true_name(exp_affsub, candidate_idx)  # picks the most representative version

            pname = [affid, iname, lid]
            pname.extend(canonical)
            syear = exp_affsub.loc[candidate_idx, 'Year'].min()
            eyear = exp_affsub.loc[candidate_idx, 'Year'].max()
            pname.extend([syear, eyear])

            disambig_leader.append(pname)
            df.loc[exp_affsub['Lindex'].loc[candidate_idx], 'LeaderId'] = lid
            lid += 1
    
    disambig_leader = pd.DataFrame(disambig_leader, columns=[
        'AffiliationId','Institution','LeaderId',
        'FirstName','MiddleName','MiddleInitials','LastName',
        'NickName','SuffixName','PrefixName',
        'StartYear','EndYear'
    ])
    return df, disambig_leader


In [7]:
def clean_institution_name(name):
    for s2space in [' – ',' - ', '–', '-','   ', '  ']:
        name = name.replace(s2space, ' ')
    for s2remove in ['.', ',', 'The ', "'", '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']:
        name= name.replace(s2remove, '')
    for miss, corr in [('Universite', 'University'), ('Colege', 'College')]:
        name = name.replace(miss, corr)
    return name.strip().title()

clean_aff = pd.read_csv(os.path.join(data_path, 'maps', 'cleaned_affiliation.csv'))
clean_aff['FullName'] = clean_aff['FullName'].transform(clean_institution_name)
clean_aff.drop_duplicates(inplace=True)


In [8]:
# leader_data = []

# for y in [1999, 2000, 2002, 2005, 2007, 2008, 2009, 2010, 2011, 2013, 2018]:
#     ydf = pd.read_csv(os.path.join(data_path, 'cleaned_dataframes', f'{str(y)}_cleanedDataframe.csv').format(y))
#     for c in ['PrimarySample', 'AffiliationId', 'carnegie_id']:
#         del ydf[c]
#     ydf = ydf.merge(clean_aff.rename(columns={'FullName':'Institution'}), how='left', on='Institution')
#     ydf.dropna(subset=['PrimarySample'], inplace=True)
#     leader_data.append(ydf[ydf['PrimarySample']])

# leader_data = pd.concat(leader_data, ignore_index=True)
# del leader_data['PrimarySample']

# leader_data['Year'] = leader_data['Year'].astype(int)
# leader_data['AffiliationId'] = leader_data['AffiliationId'].astype(int)

# for c in ['Name', 'Institution', 'Position']:
#     leader_data['Raw'+c] = leader_data[c]

# leader_data

In [9]:
leader_data = []

for y in [1999, 2000, 2002, 2005, 2007, 2008, 2009, 2010, 2011, 2013, 2018]:
    ydf = pd.read_csv(os.path.join(data_path, 'cleaned_dataframes', f'{str(y)}_cleanedDataframe.csv').format(y))
    for c in ['PrimarySample', 'AffiliationId', 'carnegie_id']:
        del ydf[c]
    ydf = ydf.merge(
        clean_aff.rename(columns={'FullName': 'Institution'}),
        how='left',
        on='Institution'
    )

    if 'SystemId_y' in ydf.columns:
        ydf['SystemId'] = ydf['SystemId_y']
        ydf = ydf.drop(columns=['SystemId_x', 'SystemId_y'])
    ydf.dropna(subset=['PrimarySample'], inplace=True)
    leader_data.append(ydf[ydf['PrimarySample']])

leader_data = pd.concat(leader_data, ignore_index=True)
del leader_data['PrimarySample']

leader_data['Year'] = leader_data['Year'].astype(int)
leader_data['AffiliationId'] = leader_data['AffiliationId'].astype(int)

for c in ['Name', 'Institution', 'Position']:
    leader_data['Raw'+c] = leader_data[c]

leader_data

Unnamed: 0,Year,Name,Position,Institution,SubInstitution,Education,FixedPosition,Seniority,Designation,AffiliationId,carnegie_id,SystemId,RawName,RawInstitution,RawPosition
0,1999,Matthew Goldstein,President,Adelphi University,,,President,,,71965598,188429.0,,Matthew Goldstein,Adelphi University,President
1,1999,Robert Ptachik,Deputy to the President,Adelphi University,,,,,,71965598,188429.0,,Robert Ptachik,Adelphi University,Deputy to the President
2,1999,Carol Sabino,Assistant Secretary To The Board Of Trustees,Adelphi University,,,,,,71965598,188429.0,,Carol Sabino,Adelphi University,Assistant Secretary To The Board Of Trustees
3,1999,Carol Sabino,Special Assistant To The President,Adelphi University,,,,,,71965598,188429.0,,Carol Sabino,Adelphi University,Special Assistant To The President
4,1999,Armstrong S. Starkey,Provost,Adelphi University,,,Provost,Default,Head Provost,71965598,188429.0,,Armstrong S. Starkey,Adelphi University,Provost
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207272,2018,Moshael J. Straus,Chairman,Yeshiva University,,,Board Member,,,19772626,197708.0,,Moshael J. Straus,Yeshiva University,Chairman
207273,2018,Morry J. Weiss,Chairman Emeritus,Yeshiva University,,,Board Member,,,19772626,197708.0,,Morry J. Weiss,Yeshiva University,Chairman Emeritus
207274,2018,Mark Wilf,Trustee,Yeshiva University,,,Board Member,,,19772626,197708.0,,Mark Wilf,Yeshiva University,Trustee
207275,2018,"Zygmunt ""Zygi"" Wilf",Trustee,Yeshiva University,,,Board Member,,,19772626,197708.0,,"Zygmunt ""Zygi"" Wilf",Yeshiva University,Trustee


In [10]:
leader_data = split_names(leader_data)
leader_data, disambig_leader = assign_leader_ids(leader_data)
leader_data

  expanded_name_list = df.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])
  expanded_name_list = df.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])
  expanded_name_list = df.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])
  expanded_name_list = df.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])
Assigning LeaderIds: 100%|██████████| 341/341 [07:07<00:00,  1.25s/it]


Unnamed: 0,Year,Name,Position,Institution,SubInstitution,Education,FixedPosition,Seniority,Designation,AffiliationId,...,RawPosition,FirstName,MiddleName,LastName,NickName,SuffixName,PrefixName,MiddleInitials,RawName4Match,LeaderId
0,1999,Matthew Goldstein,President,Adelphi University,,,President,,,71965598,...,President,Matthew,,Goldstein,,,,,Matthew Goldstein,0.0
1,1999,Robert Ptachik,Deputy to the President,Adelphi University,,,,,,71965598,...,Deputy to the President,Robert,,Ptachik,,,,,Robert Ptachik,1.0
2,1999,Carol Sabino,Assistant Secretary To The Board Of Trustees,Adelphi University,,,,,,71965598,...,Assistant Secretary To The Board Of Trustees,Carol,,Sabino,,,,,Carol Sabino,2.0
3,1999,Carol Sabino,Special Assistant To The President,Adelphi University,,,,,,71965598,...,Special Assistant To The President,Carol,,Sabino,,,,,Carol Sabino,2.0
4,1999,Armstrong S. Starkey,Provost,Adelphi University,,,Provost,Default,Head Provost,71965598,...,Provost,Armstrong,S.,Starkey,,,,S,Armstrong S Starkey,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207272,2018,Moshael J. Straus,Chairman,Yeshiva University,,,Board Member,,,19772626,...,Chairman,Moshael,J.,Straus,,,,J,Moshael J Straus,53572.0
207273,2018,Morry J. Weiss,Chairman Emeritus,Yeshiva University,,,Board Member,,,19772626,...,Chairman Emeritus,Morry,J.,Weiss,,,,J,Morry J Weiss,53573.0
207274,2018,Mark Wilf,Trustee,Yeshiva University,,,Board Member,,,19772626,...,Trustee,Mark,,Wilf,,,,,Mark Wilf,53870.0
207275,2018,"Zygmunt ""Zygi"" Wilf",Trustee,Yeshiva University,,,Board Member,,,19772626,...,Trustee,Zygmunt,,Wilf,Zygi,,,,Zygmunt Wilf,53708.0


In [11]:
leader_data.to_csv(os.path.join(data_path, 'cleaned_dataframes', 'master_leadershipdata.csv'), index = False)