In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import json
import pandas as pd
import numpy as np
from itertools import combinations

import re
import unicodedata
from unidecode import  unidecode
from collections import defaultdict

import pyscisci.all as pyscisci
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sparse_dot_topn import awesome_cossim_topn, sp_matmul_topn
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.csgraph import connected_components

import nomquamgender as nqg
from nameparser import HumanName

from cleanleader import *
from position_word_banks import *

path2leadership = '/Users/hgt6rn/Documents/DataSets/LeadershipScans'
path2leadership = '/Users/hgt6rn/Dropbox/Data/LeadershipScans'

path2leadership = r'C:\Projects\connecteddatahub\data'


# Combine over years

In [None]:
def clean_institution_name(name):
    for s2space in [' – ',' - ', '–', '-','   ', '  ']:
        name = name.replace(s2space, ' ')
    for s2remove in ['.', ',', 'The ', "'", '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']:
        name= name.replace(s2remove, '')
    for miss, corr in [('Universite', 'University'), ('Colege', 'College')]:
        name = name.replace(miss, corr)
    return name.strip().title()

clean_aff = pd.read_csv(os.path.join(path2leadership, 'maps' 'cleaned_affiliation.csv'))
clean_aff['FullName'] = clean_aff['FullName'].transform(clean_institution_name)
clean_aff.drop_duplicates(inplace=True)


In [None]:
leader_data = []
for y in [1999, 2000, 2002, 2005, 2007, 2008, 2009, 2010, 2011, 2013, 2018]:
    ydf = pd.read_csv(os.path.join(path2leadership, 'cleaned_dataframes', str(y), 'cleaned_leadership{}.csv').format(y))
    for c in ['PrimarySample', 'AffiliationId', 'carnegie_id']:
        del ydf[c]
    ydf = ydf.merge(clean_aff.rename(columns={'FullName':'Institution'}), how='left', on='Institution')
    ydf.dropna(subset=['PrimarySample'], inplace=True)
    leader_data.append(ydf[ydf['PrimarySample']])

leader_data = pd.concat(leader_data, ignore_index=True)
del leader_data['PrimarySample']

leader_data['Year'] = leader_data['Year'].astype(int)
leader_data['AffiliationId'] = leader_data['AffiliationId'].astype(int)

for c in ['Name', 'Institution', 'Position']:
    leader_data['Raw'+c] = leader_data[c]

leader_data

Unnamed: 0,Year,Name,Position,Institution,SubInstitution,Education,AffiliationId,carnegie_id,SystemId,RawName,RawInstitution,RawPosition
0,1999,Matthew Goldstein,President,Adelphi University,,,71965598,188429.0,,Matthew Goldstein,Adelphi University,President
1,1999,Robert Ptachik,Deputy to the President,Adelphi University,,,71965598,188429.0,,Robert Ptachik,Adelphi University,Deputy to the President
2,1999,Carol Sabino,Assistant Secretary to the Board of Trustees a...,Adelphi University,,,71965598,188429.0,,Carol Sabino,Adelphi University,Assistant Secretary to the Board of Trustees a...
3,1999,Armstrong S. Starkey,Provost,Adelphi University,,,71965598,188429.0,,Armstrong S. Starkey,Adelphi University,Provost
4,1999,Devin Thornburg,Associate Provost,Adelphi University,,,71965598,188429.0,,Devin Thornburg,Adelphi University,Associate Provost
...,...,...,...,...,...,...,...,...,...,...,...,...
189248,2018,Moshael J. Straus,Chairman,Yeshiva University,,,19772626,197708.0,,Moshael J. Straus,Yeshiva University,Chairman
189249,2018,Morry J. Weiss,Chairman Emeritus,Yeshiva University,,,19772626,197708.0,,Morry J. Weiss,Yeshiva University,Chairman Emeritus
189250,2018,Mark Wilf,Trustee,Yeshiva University,,,19772626,197708.0,,Mark Wilf,Yeshiva University,Trustee
189251,2018,"Zygmunt ""Zygi"" Wilf",Trustee,Yeshiva University,,,19772626,197708.0,,"Zygmunt ""Zygi"" Wilf",Yeshiva University,Trustee


# Clean Positions

In [5]:
# split multiple positions in the same line

print(leader_data.shape)
leader_data = apply_splitting_pipeline(leader_data)
print(leader_data.shape)

leader_data = leader_data.drop_duplicates().reset_index(drop=True)
print(leader_data.shape)

# create a processed position
leader_data['WorkingPosition'] = leader_data['Position'].str.lower()
# clean weird controls
leader_data['WorkingPosition'] = leader_data['WorkingPosition'].apply(normalize_text)
leader_data['WorkingPosition'] = leader_data['WorkingPosition'].str.replace(r'[\x00-\x1F\x7F]', '', regex=True)
# remove lots of periods
leader_data['WorkingPosition'] = leader_data['WorkingPosition'].str.replace(r'(?<![a-zA-Z])\.|\.{2,}(?![a-zA-Z])', '', regex=True)
# remove rouge punction
leader_data['WorkingPosition'] = leader_data['WorkingPosition'].str.replace(r'[*_!?:]', '', regex=True)
# remove lots of space
leader_data['WorkingPosition'] = leader_data['WorkingPosition'].str.replace(r'(\S)(?=")', r'\1 ', regex=True)
leader_data['WorkingPosition'] = leader_data['WorkingPosition'].str.replace(r'\s+', ' ', regex=True)
leader_data['WorkingPosition'] = leader_data['WorkingPosition'].str.replace('.', ',')
leader_data['WorkingPosition'] = leader_data['WorkingPosition'].str.strip()

# leader_data['Name'] = leader_data['Name'].str.replace(r'\([^)]*\)', '', regex=True)

leader_data['Position'] = ""
leader_data['Seniority'] = ""


(189253, 12)
(207313, 12)
(206932, 12)


In [6]:
# common spelling mistakes
words2replace = [('trusted', 'trustee')]

In [7]:
# see if interm or acting

leader_data['IsActing'] = leader_data['WorkingPosition'].str.contains(r'\b(acting|interim)\b', regex=True, case=False)

# now remove anything in ( )
leader_data['WorkingPosition'] = leader_data['WorkingPosition'].str.replace(r'\([^)]*\)', '', regex=True)
# and remove the acting or interm
leader_data['WorkingPosition'] = leader_data['WorkingPosition'].str.replace(r'\b(acting|interim)\b', '', regex=True, case=False).str.replace(r'\s{2,}', ' ').str.strip()



  leader_data['IsActing'] = leader_data['WorkingPosition'].str.contains(r'\b(acting|interim)\b', regex=True, case=False)


In [8]:
# focus on cleaning the presidents

def is_true_president(pos):
    return any(p in pos for p in PRESIDENT_WORDS) and "vice" not in pos

ispres = leader_data.drop_duplicates(subset=['AffiliationId', 'Year'], keep='first')['WorkingPosition'].apply(is_true_president)
leader_data.loc[ispres[ispres].index.values, "Position"] = 'President'


In [9]:
# focus on provosts


provost_pattern = r'^(?:' + '|'.join(re.escape(p) for p in PROVOST_WORDS) + r')$'

leader_data.loc[leader_data['WorkingPosition'].str.contains(provost_pattern, regex=True), "Position"] = 'Provost'
leader_data.loc[leader_data['WorkingPosition'].str.split(',').str.get(0).str.contains(provost_pattern, regex=True), "Position"] = 'Provost'
leader_data.loc[leader_data['WorkingPosition'].str.split('.').str.get(0).str.contains(provost_pattern, regex=True), "Position"] = 'Provost'

VICE_PROVOST_WORDS = ["associate provost", "vice provost", 'deputy provost', "assistant provost"]
vice_provost_pattern = r'\b(?:' + '|'.join(re.escape(p) for p in VICE_PROVOST_WORDS) + r')\b'

leader_data.loc[leader_data['WorkingPosition'].str.contains(vice_provost_pattern, regex=True), "Position"] = 'Vice Provost'


In [10]:
# Designations

from position_word_banks import *

leader_data['DesignationCategory'] = ""


print(len(DESIGNATION_SET))
desig_pattern_dict ={desig:r'\b(?:' + '|'.join(re.escape(p) for p in keywords) + r')\b' for desig, keywords in DESIGNATION_SET.items() }

for desig in DESIG_ORDER:
    desig_idx = leader_data['WorkingPosition'].str.contains(desig_pattern_dict[desig], regex=True, case=False)
    print(desig, sum(desig_idx))
    leader_data.loc[desig_idx, 'DesignationCategory'] += desig



21
SubInstitution 1037
Administration 8125
Finance 9330
Satellite Campus 348
Student Affairs 16355
Academic Affairs 7424
Religion 291
Athletics 1417
External Relations 10612
Continued Studies 1584
Faculty Affairs 1516
Board 39520
Operations 2803
Health Affairs 3438
Human Resources 2415
Information Systems 5021
Library 3362
Advancement Foundation 8719
Research 6788
DEI 1526
Graduate 1954


In [11]:
leader_data['DesignationCategory'].value_counts().iloc[20:50]

DesignationCategory
Faculty Affairs                                   800
External RelationsAdvancement Foundation          694
SubInstitution                                    693
BoardAdvancement Foundation                       670
ResearchGraduate                                  548
FinanceStudent Affairs                            517
Student AffairsBoard                              517
Student AffairsAcademic Affairs                   413
Satellite Campus                                  279
Religion                                          279
Advancement FoundationResearch                    278
AdministrationResearch                            250
Faculty AffairsBoard                              172
OperationsInformation Systems                     151
Academic AffairsResearch                          143
Information SystemsLibrary                        130
Student AffairsHuman Resources                    123
Student AffairsExternal Relations                 122
Informat

In [12]:
# AdministrationResearch
leader_data[leader_data['DesignationCategory']=='Student AffairsAdvancement Foundation']
#leader_data[leader_data['DesignationCategory']=='Academic AffairsPlanning']['RawPosition'].values[:3]

Unnamed: 0,Year,Name,Position,Institution,SubInstitution,Education,AffiliationId,carnegie_id,SystemId,RawName,RawInstitution,RawPosition,WorkingPosition,Seniority,IsActing,DesignationCategory
324,1999,Jimmy McCluskey,,Baylor University,,,157394403,223232.0,,Jimmy McCluskey,Baylor University,"Dean, Student Services and Development","dean, student services and development",,False,Student AffairsAdvancement Foundation
454,1999,Elizabeth Corman,,Boston College,Wallace E Carroll School Of Management,,103531236,164924.0,,Elizabeth Corman,Boston College,"Director, Career Services and Alumni Relations","director, career services and alumni relations",,False,Student AffairsAdvancement Foundation
757,1999,Kathy D. Pullins,,Brigham Young University,J Reuben Clark Law School,,100005738,230038.0,,Kathy D. Pullins,Brigham Young University,"Assistant Dean, Student and Alumni Relations","assistant dean, student and alumni relations",,False,Student AffairsAdvancement Foundation
1187,1999,Fallaw Sowell,,Carnegie Mellon University,Graduate School Of Industrial Administration,Mercer 1982 BA; Duke 1983 MA; North Carolina 1...,74973139,211440.0,,Fallaw Sowell,Carnegie Mellon University,"Deputy Dean, Student and Alumni Affairs","deputy dean, student and alumni affairs",,False,Student AffairsAdvancement Foundation
1300,1999,Susan S. Jaros,,Case Western Reserve University,,,58956616,201645.0,,Susan S. Jaros,Case Western Reserve University,"Associate Vice President, Development and Stud...","associate vice president, development and stud...",,False,Student AffairsAdvancement Foundation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187179,2013,Courtney Gonser,,University Of Akron,,,110152177,200800.0,2.802097e+09,Courtney Gonser,University Of Akron,Director. Alumni Relations and Student Engagement,"director, alumni relations and student engagement",,False,Student AffairsAdvancement Foundation
196824,2018,Dean A. Micciche,,Rowan University,,,44265643,184782.0,,Dean A. Micciche,Rowan University,"Director, Alumni and Student Affairs","director, alumni and student affairs",,False,Student AffairsAdvancement Foundation
198736,2018,Dr. Lauretta Flynn Byars,,Prairie View A&M University,,"Morehead State 1970 BA; Kentucky 1972 MSW, 198...",250520410,227526.0,,Dr. Lauretta Flynn Byars,Prairie View A&M University,"Vice President, Student Affairs and Institutio...","vice president, student affairs and institutio...",,False,Student AffairsAdvancement Foundation
204118,2018,"J. Wesley ""Wes"" Norred",,University Of Texas Southwestern Medical Cente...,,,4210094379,228635.0,1.645283e+07,"J. Wesley ""Wes"" Norred",University Of Texas Southwestern Medical Cente...,"Vice President, Student and Alumni Affairs","vice president, student and alumni affairs",,False,Student AffairsAdvancement Foundation


In [13]:
for desig, desigwords in DESIGNATION_SET.items():
    for desig2, desigwords2 in DESIGNATION_SET.items():
        if desig != desig2 and len(desigwords.intersection(desigwords2)) > 0:
            print(desig, desig2, desigwords.intersection(desigwords2))

In [14]:
# vice presidents
def is_vice_president(pos):
    return any(p in pos for p in PRESIDENT_WORDS) and "vice" in pos and not "assistant to the" in pos

ispres = leader_data['WorkingPosition'].apply(is_vice_president)
leader_data.loc[ispres[ispres].index.values, "Position"] = 'Vice President'


In [15]:
vcsub = leader_data[leader_data['Position'] == 'Vice President']
vcsub = vcsub[vcsub['DesignationCategory'] == ""]
print(vcsub['WorkingPosition'].value_counts().shape)
vcsub['WorkingPosition'].value_counts().tail(50)

(119,)


WorkingPosition
associate vice chancellor, constituent development programs                2
vice presidentuniversity outreach                                          2
executive senior vice president                                            2
associate vice chancellor, contract administration                         2
vice president, constituent engagement                                     2
regional vice chancellor                                                   2
assistant vice president, enrollment planning                              2
executive vice president, global centers and global development            1
vice president, global programs                                            1
associate vice president, public and government                            1
vice president, capital planning                                           1
vice president, global strategies                                          1
assistant vice president, enrollment                        

In [16]:
leader_data[leader_data['WorkingPosition'] == 'genior vice president']



Unnamed: 0,Year,Name,Position,Institution,SubInstitution,Education,AffiliationId,carnegie_id,SystemId,RawName,RawInstitution,RawPosition,WorkingPosition,Seniority,IsActing,DesignationCategory


In [17]:
for desig, desigwords in SCHOOL_KEYWORDS.items():
    for desig2, desigwords2 in SCHOOL_KEYWORDS.items():
        if desig != desig2 and len(set(desigwords).intersection(desigwords2)) > 0:
            print(desig, desig2, set(desigwords).intersection(desigwords2))

In [18]:
# Clean Schools

all_desig_keywords = set([])
for desig, desigwords in DESIGNATION_SET.items():
    if not desig in set(['SubInstitution', "Library", "Satellite Campus"]):
        all_desig_keywords.update(desigwords)
print(len(all_desig_keywords))

# create a processed position
leader_data['RawSubInstitution'] = leader_data['SubInstitution'].copy()

leader_data['WorkingSubInstitution'] = leader_data['SubInstitution'].str.lower()
# clean weird controls
leader_data['WorkingSubInstitution'] = leader_data['WorkingSubInstitution'].apply(normalize_text)
leader_data['WorkingSubInstitution'] = leader_data['WorkingSubInstitution'].str.replace(r'[\x00-\x1F\x7F]', '', regex=True)
# remove lots of periods
leader_data['WorkingSubInstitution'] = leader_data['WorkingSubInstitution'].str.replace(r'(?<![a-zA-Z])\.|\.{2,}(?![a-zA-Z])', '', regex=True)
# remove rouge punction
leader_data['WorkingSubInstitution'] = leader_data['WorkingSubInstitution'].str.replace(r'[*_!?:]', '', regex=True)
# remove lots of space
leader_data['WorkingSubInstitution'] = leader_data['WorkingSubInstitution'].str.replace(r'(\S)(?=")', r'\1 ', regex=True)
leader_data['WorkingSubInstitution'] = leader_data['WorkingSubInstitution'].str.replace(r'\s+', ' ', regex=True)
leader_data['WorkingSubInstitution'] = leader_data['WorkingSubInstitution'].str.strip()


subinst2clear = [('students', 'dean of students')]
for wsi, wp in subinst2clear:
    clear_idx = np.logical_and(leader_data['WorkingSubInstitution'] == wsi, leader_data['WorkingPosition'] == wp)
    leader_data.loc[clear_idx, 'WorkingSubInstitution'] = None


for wsi in all_desig_keywords.union(set(["undergraduates"])):
    clear_idx = np.logical_and(leader_data['WorkingSubInstitution'] == wsi, leader_data['DesignationCategory'] == "")
    
    leader_data.loc[clear_idx, 'WorkingPosition'] += ", " + wsi
    leader_data.loc[leader_data['WorkingSubInstitution'] == wsi, 'WorkingSubInstitution'] = None


subinst2clear = ['dean of students', "vacant", "harvard corporation", "executive committee", "council",
"pivot point capital inc", "blackrock inc"]
for wsi in subinst2clear:
    leader_data.loc[leader_data['WorkingSubInstitution'] == wsi, 'WorkingSubInstitution'] = None



1427


In [19]:
# Schools, Colleges, Subinstitions

from position_word_banks import SCHOOL_KEYWORDS



school_pattern_dict ={school:r'\b(?:' + '|'.join(re.escape(p) for p in keywords) + r')\b' for school, keywords in SCHOOL_KEYWORDS.items() }

#
for school, school_pattern in school_pattern_dict.items():
    school_idx = leader_data['WorkingSubInstitution'].str.contains(school_pattern, regex=True, case=False, na=False)
    #print(school, sum(school_idx))
    leader_data.loc[school_idx, 'SubInstitutionCategory'] = school


exact_match_words = ['school of art', 'school for arts', 'tyler school of art', "college and graduate school of art"]
exact_match_pattern = r'^(?:' + '|'.join(re.escape(p) for p in exact_match_words) + r')$'
gradschool_idx = leader_data['WorkingSubInstitution'].str.contains(exact_match_pattern, regex=True, case=False, na=False)
leader_data.loc[gradschool_idx, 'SubInstitutionCategory'] = "Fine Arts"


exact_match_words = ["library", "cunningham memorial library", "library and learning assistance",
"mary couts burnett library", "raymond hfogler library"]
exact_match_pattern = r'^(?:' + '|'.join(re.escape(p) for p in exact_match_words) + r')$'
gradschool_idx = leader_data['WorkingSubInstitution'].str.contains(exact_match_pattern, regex=True, case=False, na=False)
leader_data.loc[gradschool_idx, 'SubInstitutionCategory'] = "library"


GRADUATESCHOOL_WORDS = ["graduate school", "graduate studies", "graduate college", "college of graduate studies", "horace h rackham school of graduate studies",
"school of graduate studies", "school of graduate studies and research", "graduate division", "university graduate school",
"college of graduate studies and research", "faculty of graduate studies", "robert b toulouse school of graduate studies",
"jack n averitt college of graduate studies", "jefferson college of graduate studies", "graduate and postgraduate studies",
"school of extended graduate studies", "graduate school and research", "toulouse school of graduate studies",
"graduate studies and research", "graduate and postdoctoral studies", "laney graduate school",
"graduate studies and sponsored programs", "research and graduate school", "graduate study", "research and graduate studies",
"graqvester ies and sponsored research", "research and graduate services", "college of graduate and outreach programs"]
graduateschool_pattern = r'^(?:' + '|'.join(re.escape(p) for p in GRADUATESCHOOL_WORDS) + r')$'
gradschool_idx = leader_data['WorkingSubInstitution'].str.contains(graduateschool_pattern, regex=True, case=False, na=False)
leader_data.loc[gradschool_idx, 'SubInstitutionCategory'] = "Graduate"

In [20]:
schoolsub = leader_data.dropna(subset=['WorkingSubInstitution'])
print(schoolsub.shape)
schoolsub = schoolsub[schoolsub['WorkingPosition'].str.contains('dean')]
print(schoolsub[schoolsub['SubInstitutionCategory'] == ""]['WorkingSubInstitution'].value_counts().shape)
print(schoolsub[schoolsub['SubInstitutionCategory'].isnull()]['WorkingSubInstitution'].value_counts().shape)

schoolsub[schoolsub['SubInstitutionCategory'].isnull()]['WorkingSubInstitution'].value_counts().iloc[:50]

(50482, 19)
(0,)
(178,)


WorkingSubInstitution
college                                                      29
irving i stone beit midrash program                          12
outreach school                                              10
university of arizona south                                   5
trinity college                                               5
outreach college                                              5
arizona state university main                                 4
university of wyoming/casper college center                   4
newcomb college                                               4
california state university san bernardino in palm desert     4
h john heinz iii college                                      4
usc beaufort                                                  3
cincinnati center                                             3
school of undergraduate studies                               3
speed scientific school                                       3
palm desert       

In [21]:
leader_data['AffiliationId'].unique()

array([  71965598,  181401687,  102298084,   55732556,   82497590,
        198089087,  157394403,  152479009,  103531236,  111088046,
        157417397,    6902469,  100005738,   27804330,  122411786,
        127339247,   67328108,  142934699,  184813773,   59897056,
         43369023,   26538001,   71838634,   74973139,   58956616,
         84470341,    1629065,  185071736,  174216632,  125687163,
         52064589,  130785548,   16944753,    8078737,  102607778,
         16285277,  167576493,   92446798,   78577930,  205783295,
        114493937,  107672454,  118353179,   72816309,  170897317,
        165102784,  119443389,       1003,  138216421,  150468666,
        106959904,  164389053,   66108857,  162714631,  193531525,
        184565670,  151328261,  136199984,  139290212,  137853757,
        106969075,  180949307,   47301684,   86420138,       1001,
       4210119109,   55769427,  173911158,   11883440,  145311948,
        189590672,  149910238,  186143895,   26347476,  138873

In [22]:


cosine_lower_bound = 0.8
use_threads = True
n_jobs = 2

school_id = 0
disambig_schools = []

school_map = pd.DataFrame([])
for affid in [51556381, 1317227900, 35777872]: #leader_data['AffiliationId'].unique():

    school_sub = leader_data[leader_data['AffiliationId'] == affid].dropna(subset=['WorkingSubInstitution']).reset_index(drop=True)
    
    name_popularity = school_sub['WorkingSubInstitution'].value_counts()

    tfidf = TfidfVectorizer(min_df=1, ngram_range = (3,3), analyzer='char', lowercase=False)
    #tfidf = CountVectorizer(min_df=1, ngram_range = (3,3), analyzer='char', lowercase=False)
 
    unique_school_names = name_popularity.index.values
    school_vectors = tfidf.fit_transform(unique_school_names)
    
    co= awesome_cossim_topn(school_vectors, school_vectors.T, ntop=unique_school_names.shape[0], 
                    lower_bound=cosine_lower_bound, 
                    use_threads=use_threads, n_jobs=n_jobs).todok()
    
    nopts = unique_school_names.shape[0]

    for idx in range(nopts):
        for jdx in range(nopts):
            if unique_school_names[idx] in unique_school_names[jdx]:
                co[idx, jdx] = 1
    n_components, labels = connected_components(csgraph=co, directed=False, return_labels=True)
    
    
    aff_school_map = pd.DataFrame(unique_school_names, columns=['OriginalName'])
    aff_school_map['MapName'] = ""
    aff_school_map['AffiliationId'] = affid

    isubschool = 0
    for icomp in range(n_components):
        comp_names = np.where(labels == icomp)[0]
        if 'school' in unique_school_names[comp_names[0]] or 'college' in unique_school_names[comp_names[0]]:
            aff_school_map.loc[comp_names, 'MapName'] = unique_school_names[comp_names[0]]
            aff_school_map.loc[comp_names, 'SchoolId'] = "{}.{}".format(affid, isubschool)
            isubschool += 1
        else:
            aff_school_map.loc[comp_names, 'MapName'] = unique_school_names[comp_names[0]]
            aff_school_map.loc[comp_names, 'SchoolId'] = None

    school_map = pd.concat([school_map, aff_school_map])
    
school_map

  co= awesome_cossim_topn(school_vectors, school_vectors.T, ntop=unique_school_names.shape[0],
  co= awesome_cossim_topn(school_vectors, school_vectors.T, ntop=unique_school_names.shape[0],
  co= awesome_cossim_topn(school_vectors, school_vectors.T, ntop=unique_school_names.shape[0],


Unnamed: 0,OriginalName,MapName,AffiliationId,SchoolId
0,colgate darden graduate school of business adm...,colgate darden graduate school of business adm...,51556381,51556381.0
1,college of arts and sciences,college of arts and sciences,51556381,51556381.1
2,school of law,school of law,51556381,51556381.2
3,school of engineering and applied science,school of engineering and applied science,51556381,51556381.3
4,mcintire school of commerce,mcintire school of commerce,51556381,51556381.4
5,college and graduate school of arts and sciences,college and graduate school of arts and sciences,51556381,51556381.5
6,school of nursing,school of nursing,51556381,51556381.6
7,college at wise,college at wise,51556381,51556381.7
8,graduate school of arts and sciences,college and graduate school of arts and sciences,51556381,51556381.5
9,curry school of education,curry school of education,51556381,51556381.8


In [23]:
school_sub['WorkingSubInstitution'].value_counts()

WorkingSubInstitution
college of engineering                               13
school of graduate studies                           12
school of business and economics                     10
school of education                                  10
school of nursing                                    10
school of technology                                 10
college of arts and sciences                         10
school of agriculture and environmental sciences      8
joint school of nanoscience and nanoengineering       3
school of agriculture                                 2
college of business and economics                     1
college of health and human sciences                  1
college of education                                  1
library services                                      1
college of arts humanities and social sciences        1
college of agriculture and environmental sciences     1
unami                                                 1
college of science and tec

In [24]:
school_sub[school_sub['WorkingSubInstitution'] == 'uvimco']

Unnamed: 0,Year,Name,Position,Institution,SubInstitution,Education,AffiliationId,carnegie_id,SystemId,RawName,RawInstitution,RawPosition,WorkingPosition,Seniority,IsActing,DesignationCategory,RawSubInstitution,WorkingSubInstitution,SubInstitutionCategory


In [25]:
nullsub = schoolsub[schoolsub['SubInstitutionCategory'].isnull()]

nullsub[nullsub['WorkingSubInstitution'].str.contains('graduate')]['WorkingSubInstitution'].value_counts()

WorkingSubInstitution
farquhar center for undergraduate studies    3
school of undergraduate studies              3
westchester graduate center                  1
division of undergraduate education          1
center for graduate studies                  1
hudson graduate centers                      1
Name: count, dtype: int64

In [26]:
# deans

s1 = schoolsub[schoolsub['WorkingSubInstitution'] == "library and learning assistance"]
s1

Unnamed: 0,Year,Name,Position,Institution,SubInstitution,Education,AffiliationId,carnegie_id,SystemId,RawName,RawInstitution,RawPosition,WorkingPosition,Seniority,IsActing,DesignationCategory,RawSubInstitution,WorkingSubInstitution,SubInstitutionCategory
180160,2013,Dr. Douglas Bates,,Tennessee Technological University,Library And Learning Assistance,"BYU 1982 BS, 1988 MLS: Kansas State 2002 EdD",63920570,221847.0,1003.0,Dr. Douglas Bates,Tennessee Technological University,"Dean, Library and Learning Assistance","dean, library and learning assistance",,False,Library,Library And Learning Assistance,library and learning assistance,library
198632,2018,Dr. Douglas Bates,,Tennessee Technological University,Library And Learning Assistance,"BYU 1982 BS, 1988 MLS; Kansas State 2002 EdD",63920570,221847.0,1003.0,Dr. Douglas Bates,Tennessee Technological University,"Dean, Library and Learning Assistance","dean, library and learning assistance",,False,Library,Library And Learning Assistance,library and learning assistance,library


In [27]:
leader_data[leader_data['Name'] == 'Terry L. Kuhn']

Unnamed: 0,Year,Name,Position,Institution,SubInstitution,Education,AffiliationId,carnegie_id,SystemId,RawName,RawInstitution,RawPosition,WorkingPosition,Seniority,IsActing,DesignationCategory,RawSubInstitution,WorkingSubInstitution,SubInstitutionCategory
4015,1999,Terry L. Kuhn,Vice Provost,Kent State University,,,149910238,203517.0,2802097000.0,Terry L. Kuhn,Kent State University,"Vice Provost and Dean, Undergraduate Studies",vice provost,,False,,,,
4016,1999,Terry L. Kuhn,,Kent State University,,,149910238,203517.0,2802097000.0,Terry L. Kuhn,Kent State University,"Vice Provost and Dean, Undergraduate Studies","dean, undergraduate studies",,False,Academic Affairs,,,
18965,2000,Terry L. Kuhn,Vice Provost,Kent State University,Undergraduate Studies,,149910238,203517.0,2802097000.0,Terry L. Kuhn,Kent State University,Vice Provost and Dean,"vice provost, undergraduate studies",,False,,Undergraduate Studies,,
18966,2000,Terry L. Kuhn,,Kent State University,Undergraduate Studies,,149910238,203517.0,2802097000.0,Terry L. Kuhn,Kent State University,Vice Provost and Dean,"dean, undergraduate studies",,False,,Undergraduate Studies,,
36110,2002,Terry L. Kuhn,Vice Provost,Kent State University,,,149910238,203517.0,2802097000.0,Terry L. Kuhn,Kent State University,"Vice Provost and Dean, Undergraduate Studies",vice provost,,False,,,,
36111,2002,Terry L. Kuhn,,Kent State University,,,149910238,203517.0,2802097000.0,Terry L. Kuhn,Kent State University,"Vice Provost and Dean, Undergraduate Studies","dean, undergraduate studies",,False,Academic Affairs,,,


In [28]:
DESIGNATION_SET['Faculty Affairs']

{'academic personnel',
 'academic personnels',
 'college faculty',
 'college facultys',
 'faculties',
 'facultiess',
 'faculty',
 'faculty advancement',
 'faculty advancements',
 'faculty affair',
 'faculty affairs',
 'faculty and staff',
 'faculty and staffs',
 'faculty development',
 'faculty developments',
 'facultys',
 'research and faculty development',
 'research and faculty developments',
 'staff relations',
 'staff relationss',
 'vice president, faculty',
 'vice president, facultys'}

# Clean Names

In [29]:
# clean periods
leader_data['Name'] = leader_data['Name'].str.replace(r'(?<![a-zA-Z])\.|\.{2,}(?![a-zA-Z])', '', regex=True)

leader_data['Name'] = leader_data['Name'].str.replace(r'[*_!?:]', '', regex=True)
leader_data['Name'] = leader_data['Name'].str.replace(r'\(Vacant\)', 'Vacant', regex=True)
leader_data['Name'] = leader_data['Name'].str.replace(r'.*\b[vV]acant\b.*', 'Vacant', regex=True)
leader_data['Name'] = leader_data['Name'].str.replace(r'(\S)(?=")', r'\1 ', regex=True)
leader_data['Name'] = leader_data['Name'].str.replace(r'\s+', ' ', regex=True)
leader_data['Name'] = leader_data['Name'].str.strip()

leader_data['Name'] = leader_data['Name'].str.replace(r'\([^)]*\)', '', regex=True)
#leader_data['Name'] = leader_data['Name'].transform(pyscisci.clean_person_names)
leader_data['Name'] = leader_data['Name'].str.title()

leader_data['Name'] = leader_data['Name'].str.replace(r'\bIl\b', 'Ii', regex=True)
leader_data['Name'] = leader_data['Name'].str.replace(r'\bIll\b', 'Iii', regex=True)
leader_data['Name'] = leader_data['Name'].str.replace(r'\bIli\b', 'Iii', regex=True)
leader_data['Name'] = leader_data['Name'].str.replace(r'\bIil\b', 'Iii', regex=True)
leader_data['Name'] = leader_data['Name'].str.replace(r'\bCm1\b', 'Cm', regex=True)

leader_data['Name'] = leader_data['Name'].str.replace('*', '', regex=False)

In [30]:
from nameparser.config import CONSTANTS

titles2add = ['trustee', 'chairman', 'very', 'msgr', 'radm']
for t in titles2add:
    CONSTANTS.titles.add(t)
    
suffix_acronyms2add = ['mn', 'op', 'facfas', 'ret', 'ri', 'osa', 'sj', 'cssi', 'svd', 'sm', 'std', 'sc', 
'ofm', 'kg', 'cssp', 'rsm', 'chfm', 'bsn', 'cec', 'mbbch']
for s in suffix_acronyms2add:
    CONSTANTS.suffix_acronyms.add(s)

suffix_acronyms2remove = ['caro', 'mai', 'pla', 'chi']
for s in suffix_acronyms2remove:
    CONSTANTS.suffix_acronyms.remove(s)

CONSTANTS.suffix_not_acronyms.remove('junior')

CONSTANTS.suffix_not_acronyms.add('facsm')

hnames = [HumanName(name, constants=CONSTANTS) for name in leader_data['Name'].values]

leader_data['FirstName'] = [h.first.strip() for h in hnames]
leader_data['MiddleName'] = [h.middle.strip() for h in hnames]
leader_data['LastName'] = [h.last.strip() for h in hnames]
leader_data['NickName'] = [h.nickname.strip() for h in hnames]
leader_data['SuffixName'] = [h.suffix.strip() for h in hnames]
leader_data['PrefixName'] = [h.title.strip() for h in hnames]
leader_data['MiddleInitials'] = ["".join([a[0] for a in h.middle.split(' ') if len(a) > 0]) for h in hnames]


leader_data['LastName'] = leader_data['LastName'].str.replace('.', '', regex=False)

def make_raw_name(prow, use_nickname=False):
    if use_nickname:
        fname_col = 'NickName'
    else:
        fname_col = 'FirstName'
    if len(prow[fname_col].replace('.', '')) <= 1:
        rawname = prow[fname_col] + " " + prow['MiddleName']
    else:
        rawname = prow[fname_col] + " " + prow['MiddleInitials']
    
    rawname += " " + prow['LastName']
    if bool(re.fullmatch(r'[Ii\s]+', prow['SuffixName'])):
        rawname += " " + prow['SuffixName']
    
    return rawname

leader_data['RawName4Match'] = [make_raw_name(row) for i,row in leader_data.iterrows()]

leader_data

Unnamed: 0,Year,Name,Position,Institution,SubInstitution,Education,AffiliationId,carnegie_id,SystemId,RawName,...,WorkingSubInstitution,SubInstitutionCategory,FirstName,MiddleName,LastName,NickName,SuffixName,PrefixName,MiddleInitials,RawName4Match
0,1999,Matthew Goldstein,President,Adelphi University,,,71965598,188429.0,,Matthew Goldstein,...,,,Matthew,,Goldstein,,,,,Matthew Goldstein
1,1999,Robert Ptachik,,Adelphi University,,,71965598,188429.0,,Robert Ptachik,...,,,Robert,,Ptachik,,,,,Robert Ptachik
2,1999,Carol Sabino,,Adelphi University,,,71965598,188429.0,,Carol Sabino,...,,,Carol,,Sabino,,,,,Carol Sabino
3,1999,Carol Sabino,,Adelphi University,,,71965598,188429.0,,Carol Sabino,...,,,Carol,,Sabino,,,,,Carol Sabino
4,1999,Armstrong S. Starkey,Provost,Adelphi University,,,71965598,188429.0,,Armstrong S. Starkey,...,,,Armstrong,S.,Starkey,,,,S,Armstrong S Starkey
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206927,2018,Moshael J. Straus,,Yeshiva University,,,19772626,197708.0,,Moshael J. Straus,...,,,Moshael,J.,Straus,,,,J,Moshael J Straus
206928,2018,Morry J. Weiss,,Yeshiva University,,,19772626,197708.0,,Morry J. Weiss,...,,,Morry,J.,Weiss,,,,J,Morry J Weiss
206929,2018,Mark Wilf,,Yeshiva University,,,19772626,197708.0,,Mark Wilf,...,,,Mark,,Wilf,,,,,Mark Wilf
206930,2018,"Zygmunt ""Zygi "" Wilf",,Yeshiva University,,,19772626,197708.0,,"Zygmunt ""Zygi"" Wilf",...,,,Zygmunt,,Wilf,Zygi,,,,Zygmunt Wilf


In [31]:
top_people = leader_data[['Year',
 'Name', 'RawName', 'AffiliationId',
 'Position',
 'Institution','RawPosition', 'FirstName',
 'MiddleName',
 'LastName',
 'NickName',
 'SuffixName',
 'PrefixName',
 'MiddleInitials',
 'RawName4Match']]
top_people = top_people[top_people['Position'] != ""]
top_people

Unnamed: 0,Year,Name,RawName,AffiliationId,Position,Institution,RawPosition,FirstName,MiddleName,LastName,NickName,SuffixName,PrefixName,MiddleInitials,RawName4Match
0,1999,Matthew Goldstein,Matthew Goldstein,71965598,President,Adelphi University,President,Matthew,,Goldstein,,,,,Matthew Goldstein
4,1999,Armstrong S. Starkey,Armstrong S. Starkey,71965598,Provost,Adelphi University,Provost,Armstrong,S.,Starkey,,,,S,Armstrong S Starkey
5,1999,Devin Thornburg,Devin Thornburg,71965598,Vice Provost,Adelphi University,Associate Provost,Devin,,Thornburg,,,,,Devin Thornburg
7,1999,Angelo Proto,Angelo Proto,71965598,Vice President,Adelphi University,"Vice President, Enrollment Management and Stud...",Angelo,,Proto,,,,,Angelo Proto
8,1999,Carl J. Rheins,Carl J. Rheins,71965598,Vice President,Adelphi University,"Vice President, External Affairs",Carl,J.,Rheins,,,,J,Carl J Rheins
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206829,2018,"Andrew Jay ""Avi "" Lauer","Andrew Jay ""Avi"" Lauer",19772626,Vice President,Yeshiva University,"Vice President, Legal Affairs, Secretary and G...",Andrew,Jay,Lauer,Avi,,,J,Andrew J Lauer
206832,2018,"Dr. Allen M. Spiegel, Md","Dr. Allen M. Spiegel, MD",19772626,Vice President,Yeshiva University,"Vice President, Medical Affairs and Marilyn an...",Allen,M.,Spiegel,,Md,Dr.,M,Allen M Spiegel
206834,2018,Dr. Herbert C. Dobrinsky,Dr. Herbert C. Dobrinsky,19772626,Vice President,Yeshiva University,"Vice President, University Affairs",Herbert,C.,Dobrinsky,,,Dr.,C,Herbert C Dobrinsky
206835,2018,Rabbi Kenneth Brander,Rabbi Kenneth Brander,19772626,Vice President,Yeshiva University,"Vice President, University and Community Life",Kenneth,,Brander,,,Rabbi,,Kenneth Brander


# Version 1

In [32]:
cosine_lower_bound = 0.7
use_threads = True
n_jobs = 10

lid = 0
disambig_leader = []

top_people['LeaderId'] = np.nan

for affid in leader_data['AffiliationId'].unique():
    affsub = top_people[top_people['AffiliationId'] == affid]
    iname = top_people[top_people['AffiliationId'] == affid]['Institution'].values[0]
    affsub = affsub[affsub['Name'] != 'Vacant'].reset_index(names=['Lindex']) #drop=True)
    
    tfidf = TfidfVectorizer(min_df=1, ngram_range = (3,3), analyzer='char', lowercase=False)
    #tfidf = CountVectorizer(min_df=1, ngram_range = (3,3), analyzer='char', lowercase=False)
    
    # expand names for people with nicknames
    expanded_name_list = affsub.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])
    expanded_name_list['RawName4Match'] = [make_raw_name(row, use_nickname=True) for i,row in expanded_name_list.iterrows()]
    exp_affsub = pd.concat([affsub, expanded_name_list], ignore_index=True)
    
    # expand name list to keep 
    expanded_name_list2 = affsub[affsub['MiddleName'].str.replace('.', '').str.len() > 1].reset_index(names=['Oidx'])
    expanded_name_list2['FirstName'] = [n[0] for n in expanded_name_list2['FirstName']]
    expanded_name_list2['RawName4Match'] = [make_raw_name(row, use_nickname=False) for i,row in expanded_name_list2.iterrows()]
    exp_affsub = pd.concat([exp_affsub, expanded_name_list2], ignore_index=True)
     
    name_char = tfidf.fit_transform(exp_affsub['RawName4Match'].values)
    #co= awesome_cossim_topn(name_char, name_char.T, ntop=affsub.shape[0], lower_bound=cosine_lower_bound, use_threads=use_threads, n_jobs=n_jobs).todok()
    co=  sp_matmul_topn(name_char, name_char.T, top_n=affsub.shape[0], threshold=cosine_lower_bound, n_threads=n_jobs).todok()
    for i,r in enumerate(expanded_name_list['Oidx'].values):
        co[i+affsub.shape[0], r] = 1
    for i,r in enumerate(expanded_name_list2['Oidx'].values):
        co[i+affsub.shape[0]+expanded_name_list.shape[0], r] = 1
    n_components, labels = connected_components(csgraph=co, directed=False, return_labels=True)

    for idx in np.unique(labels):
        candidate_idx = labels == idx
        #name_candidates = affsub['RawName'].loc[candidate_idx]
        pname = [affid,iname, lid]
        pname.extend(identify_true_name(exp_affsub, candidate_idx)) 
        
        top_people.loc[exp_affsub['Lindex'].loc[candidate_idx], 'LeaderId'] = lid
        syear = top_people.loc[exp_affsub['Lindex'].loc[candidate_idx], 'Year'].min()
        eyear = top_people.loc[exp_affsub['Lindex'].loc[candidate_idx], 'Year'].max()
        pname.extend([syear, eyear]) 
        disambig_leader.append(pname)

        
        exp_affsub.loc[candidate_idx, 'LeaderId'] = lid
        lid += 1

disambig_leader = pd.DataFrame(disambig_leader, columns=['AffiliationId', 'Institution', 'LeaderId', 'FirstName', 'MiddleName', 'MiddleInitials', 'LastName', 'NickName', 'SuffixName', 'PrefixName', 'StartYear', 'EndYear'])        

  expanded_name_list = affsub.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])
  expanded_name_list = affsub.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])
  expanded_name_list = affsub.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])
  expanded_name_list = affsub.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])
  expanded_name_list = affsub.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])
  expanded_name_list = affsub.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])
  expanded_name_list = affsub.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])
  expanded_name_list = affsub.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])
  expanded_name_list = a

  expanded_name_list = affsub.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])
  expanded_name_list = affsub.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])
  expanded_name_list = affsub.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])
  expanded_name_list = affsub.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])
  expanded_name_list = affsub.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])
  expanded_name_list = affsub.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])
  expanded_name_list = affsub.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])
  expanded_name_list = affsub.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])
  expanded_name_list = a

In [33]:
flagged_ids = []
def person_quality_check(persondf):
    if persondf.shape[0] > 1:
        numlastname = persondf['LastName'].nunique()
        numfirstname = persondf['FirstName'].nunique()
        msub = all([ mn1.issubset(mn2) or mn2.issubset(mn1) for mn1, mn2 in combinations([set(mn) for mn in persondf['MiddleInitials'].values],2)])
        if numlastname > 1 or not msub:
            #print(persondf)
            print(persondf.name, persondf['LastName'].unique(), numlastname)
            flagged_ids.append([persondf.name, numlastname, numfirstname, msub])
        #print(numlastname, numfirstname, msub)
        #print(persondf)
    

top_people.groupby('LeaderId').apply(person_quality_check)
flagged_ids = pd.DataFrame(flagged_ids, columns=['LeaderId', 'NumLastNames', 'NumFirstNames', 'MiddleInitialSubsets'])
flagged_ids[flagged_ids['MiddleInitialSubsets'] == False]
flagged_ids




96.0 ['Harrison'] 1
124.0 ['Snyder' 'Snyderswoon'] 2
149.0 ['Capaldi' 'Capaldinf'] 2
173.0 ['Langland' 'Langlandn'] 2
177.0 ['Hielmstad' 'Helmstad' 'Hjelmstad'] 3
257.0 ['Repp' 'Rep'] 2
283.0 ['Jeffrey'] 1
342.0 ['Graca' 'Grace'] 2
500.0 ['French' 'Frencha'] 2
543.0 ['Skousen' 'Skousenth'] 2
549.0 ['Stolton' 'Stohlton'] 2
730.0 ['Vinovorski' 'Vinovrski'] 2
748.0 ['Moffitt' 'Moffittben'] 2
814.0 ['Reuling' 'Ruling'] 2
861.0 ['Mason' 'Maxson'] 2
938.0 ['Mcgowen' 'Mcgowan'] 2
940.0 ['Zor' 'Zorn' 'Zom'] 3
1018.0 ['Turkken' 'Turkkan'] 2
1081.0 ['Loessin'] 1
1107.0 ['Ornt' 'Ort'] 2
1362.0 ['Edmonds' 'Edmond'] 2
1392.0 ['Cinelli' 'Cinell'] 2
1428.0 ['De Coste' 'Decoste'] 2
1445.0 ['Przirembl' 'Przirembel'] 2
1475.0 ['Nuru' 'Nuru-Holm'] 2
1488.0 ['Droney' 'Dronev'] 2
1668.0 ['During' 'Durning'] 2
1799.0 ['Willett' 'Willet'] 2
1805.0 ['Wering' 'Wernig'] 2
1815.0 ['Kosoko' 'Kosoko-Lasaki'] 2
1837.0 ['Danielsono' 'Danielson'] 2
2087.0 ['Merkx' 'Merky'] 2
2183.0 ['Bach' 'Bachg'] 2
2252.0 ['Shorroc

  top_people.groupby('LeaderId').apply(person_quality_check)


Unnamed: 0,LeaderId,NumLastNames,NumFirstNames,MiddleInitialSubsets
0,96.0,1,1,False
1,124.0,2,1,True
2,149.0,2,1,True
3,173.0,2,1,True
4,177.0,3,1,True
...,...,...,...,...
245,14843.0,2,1,True
246,14879.0,2,1,True
247,14951.0,2,1,True
248,15235.0,2,1,True


In [35]:
#top_people[top_people['LeaderId'] == flagged_ids['LeaderId'].values[0]]['LastName'].unique()

top_people[top_people['LeaderId'] == 12997.0]

Unnamed: 0,Year,Name,RawName,AffiliationId,Position,Institution,RawPosition,FirstName,MiddleName,LastName,NickName,SuffixName,PrefixName,MiddleInitials,RawName4Match,LeaderId
147770,2010,"William R. Elger, Cpa","William R. Elger, CPA",55302922,Vice President,University Of Texas Medical Branch At Galveston,Executive Vice President and Chief Business an...,William,R.,Elger,,Cpa,,R,William R Elger,12997.0
166529,2011,"William R. Elger, Cpa","William R. Elger, CPA",55302922,Vice President,University Of Texas Medical Branch At Galveston,Executive Vice President and Chief Business an...,William,R.,Elger,,Cpa,,R,William R Elger,12997.0
185745,2013,"William R. Elger, Cpad","William R. Elger, CPAD",55302922,Vice President,University Of Texas Medical Branch At Galveston,Executive Vice President and Chief Business an...,Cpad,,William R Elger,,,,,Cpad William R Elger,12997.0


In [28]:
print(top_people.shape)
top_people.dropna(subset=['LeaderId'])

(51522, 16)


Unnamed: 0,Year,Name,RawName,AffiliationId,Position,Institution,RawPosition,FirstName,MiddleName,LastName,NickName,SuffixName,PrefixName,MiddleInitials,RawName4Match,LeaderId
0,1999,Matthew Goldstein,Matthew Goldstein,71965598,President,Adelphi University,President,Matthew,,Goldstein,,,,,Matthew Goldstein,0.0
4,1999,Armstrong S. Starkey,Armstrong S. Starkey,71965598,Provost,Adelphi University,Provost,Armstrong,S.,Starkey,,,,S,Armstrong S Starkey,1.0
5,1999,Devin Thornburg,Devin Thornburg,71965598,Vice Provost,Adelphi University,Associate Provost,Devin,,Thornburg,,,,,Devin Thornburg,2.0
7,1999,Angelo Proto,Angelo Proto,71965598,Vice President,Adelphi University,"Vice President, Enrollment Management and Stud...",Angelo,,Proto,,,,,Angelo Proto,3.0
8,1999,Carl J. Rheins,Carl J. Rheins,71965598,Vice President,Adelphi University,"Vice President, External Affairs",Carl,J.,Rheins,,,,J,Carl J Rheins,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206829,2018,"Andrew Jay ""Avi "" Lauer","Andrew Jay ""Avi"" Lauer",19772626,Vice President,Yeshiva University,"Vice President, Legal Affairs, Secretary and G...",Andrew,Jay,Lauer,Avi,,,J,Andrew J Lauer,15023.0
206832,2018,"Dr. Allen M. Spiegel, Md","Dr. Allen M. Spiegel, MD",19772626,Vice President,Yeshiva University,"Vice President, Medical Affairs and Marilyn an...",Allen,M.,Spiegel,,Md,Dr.,M,Allen M Spiegel,15018.0
206834,2018,Dr. Herbert C. Dobrinsky,Dr. Herbert C. Dobrinsky,19772626,Vice President,Yeshiva University,"Vice President, University Affairs",Herbert,C.,Dobrinsky,,,Dr.,C,Herbert C Dobrinsky,15010.0
206835,2018,Rabbi Kenneth Brander,Rabbi Kenneth Brander,19772626,Vice President,Yeshiva University,"Vice President, University and Community Life",Kenneth,,Brander,,,Rabbi,,Kenneth Brander,15029.0


In [74]:
top_people[top_people['LeaderId'].isna()]

Unnamed: 0,Year,Name,RawName,AffiliationId,Position,Institution,RawPosition,FirstName,MiddleName,LastName,NickName,SuffixName,PrefixName,MiddleInitials,RawName4Match,LeaderId
1174,1999,Vacant,Vacant,74973139,Vice Provost,Carnegie Mellon University,"Vice Provost, Undergraduate Education",Vacant,,,,,,,Vacant,
1292,1999,Vacant,(Vacant),58956616,Vice President,Case Western Reserve University,"Vice President, Budgets and Planning",Vacant,,,,,,,Vacant,
1378,1999,Vacant,Vacant,84470341,Vice President,Catholic University Of America,"Vice President, Development",Vacant,,,,,,,Vacant,
1557,1999,Vacant,Vacant,174216632,Vice President,City University Of New York,"Vice Chancellor, Student Affairs, Student Serv...",Vacant,,,,,,,Vacant,
2355,1999,Vacant,Vacant,118353179,Vice President,Depaul University,"Vice President, Human Resources",Vacant,,,,,,,Vacant,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204178,2018,Vacant,Vacant,165951966,Vice President,University Of Texas Health Science Center At S...,"Vice President, Human Resources",Vacant,,,,,,,Vacant,
204769,2018,Vacant,Vacant,1289702989,Vice President,University System Of Georgia,"Vice Chancellor, Human Resources",Vacant,,,,,,,Vacant,
205098,2018,Vacant,Vacant,66946132,Vice President,University Of Maryland College Park,"Vice President, Information Technology and Chi...",Vacant,,,,,,,Vacant,
205143,2018,Vacant,Vacant,22407884,Vice President,University Of Maryland Eastern Shore,"Vice President, Administrative Affairs",Vacant,,,,,,,Vacant,


In [96]:
del disambig_leader['Lindex']

In [36]:
cosine_lower_bound = 0.8
use_threads = True
n_jobs = 2

lid = 0


tfidf = TfidfVectorizer(min_df=1, ngram_range = (3,3), analyzer='char', lowercase=False)

disambig_leader['RawName4Match'] = [make_raw_name(row) for i,row in disambig_leader.iterrows()]
disambig_leader['MergedLeaderId'] = np.nan
disambig_leader = disambig_leader.reset_index(names=['Lindex'])

# expand names for people with nicknames
expanded_name_list = disambig_leader.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])
expanded_name_list['RawName4Match'] = [make_raw_name(row, use_nickname=True) for i,row in expanded_name_list.iterrows()]
exp_affsub = pd.concat([disambig_leader, expanded_name_list], ignore_index=True)
    
# expand name list to keep 
expanded_name_list2 = disambig_leader[disambig_leader['MiddleName'].str.replace('.', '').str.len() > 1].reset_index(names=['Oidx'])
expanded_name_list2['FirstName'] = [n[0] for n in expanded_name_list2['FirstName']]
expanded_name_list2['RawName4Match'] = [make_raw_name(row, use_nickname=False) for i,row in expanded_name_list2.iterrows()]
exp_affsub = pd.concat([exp_affsub, expanded_name_list2], ignore_index=True)
     
name_char = tfidf.fit_transform(exp_affsub['RawName4Match'].values)
#co= awesome_cossim_topn(name_char, name_char.T, ntop=disambig_leader.shape[0], lower_bound=cosine_lower_bound, use_threads=use_threads, n_jobs=n_jobs).todok()
co=  sp_matmul_topn(name_char, name_char.T, top_n=disambig_leader.shape[0], threshold=cosine_lower_bound, n_threads=n_jobs).todok()

for i,r in enumerate(expanded_name_list['Oidx'].values):
    co[i+disambig_leader.shape[0], r] = 1
for i,r in enumerate(expanded_name_list2['Oidx'].values):
    co[i+disambig_leader.shape[0]+expanded_name_list.shape[0], r] = 1
n_components, labels = connected_components(csgraph=co, directed=False, return_labels=True)

for idx in np.unique(labels):
    candidate_idx = labels == idx
    
    disambig_leader.loc[exp_affsub['Lindex'].loc[candidate_idx], 'MergedLeaderId'] = lid
    disambig_leader.loc[exp_affsub['Lindex'].loc[candidate_idx], 'NumMatches'] = exp_affsub['Lindex'].loc[candidate_idx].nunique()
    #exp_affsub.loc[candidate_idx, 'MergedLeaderId'] = lid
    lid += 1

disambig_leader

Unnamed: 0,Lindex,AffiliationId,Institution,LeaderId,FirstName,MiddleName,MiddleInitials,LastName,NickName,SuffixName,PrefixName,StartYear,EndYear,RawName4Match,MergedLeaderId,NumMatches
0,0,71965598,Adelphi University,0,Matthew,,,Goldstein,,,,1999,1999,Matthew Goldstein,0.0,2.0
1,1,71965598,Adelphi University,1,Armstrong,S.,S,Starkey,,,,1999,1999,Armstrong S Starkey,1.0,1.0
2,2,71965598,Adelphi University,2,Devin,,,Thornburg,,,,1999,1999,Devin Thornburg,2.0,1.0
3,3,71965598,Adelphi University,3,Angelo,B.,B,Proto,,,,1999,2013,Angelo B Proto,3.0,1.0
4,4,71965598,Adelphi University,4,Carl,J.,J,Rheins,,,,1999,1999,Carl J Rheins,4.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15556,15556,2802096936,University System Of Ohio,15556,Rich,,,Petrick,,,,2011,2011,Rich Petrick,13934.0,1.0
15557,15557,2802096936,University System Of Ohio,15557,Harry,,,Andrist,,,,2011,2011,Harry Andrist,1462.0,2.0
15558,15558,2802096936,University System Of Ohio,15558,James,M.,M,Petro,Jmi,,,2013,2013,James M Petro,13935.0,1.0
15559,15559,2802096936,University System Of Ohio,15559,Gary,W.,W,Cates,,,,2013,2013,Gary W Cates,13936.0,1.0


In [103]:
print(disambig_leader['NumMatches'].max())
disambig_leader[disambig_leader['NumMatches'] > 1]['MergedLeaderId'].nunique()

8.0


1346

In [105]:
disambig_leader[disambig_leader['MergedLeaderId'] ==2167]

Unnamed: 0,Lindex,AffiliationId,Institution,LeaderId,FirstName,MiddleName,MiddleInitials,LastName,NickName,SuffixName,PrefixName,StartYear,EndYear,RawName4Match,MergedLeaderId,NumMatches
2193,2193,119443389,East Tennessee State University,2193,Robert,H.,H,Adams,,,,2005,2009,Robert H Adams,2167.0,6.0
4072,4072,169615421,Middle Tennessee State University,4072,Robert,H.,H,Adams,,,,2005,2009,Robert H Adams,2167.0,6.0
7285,7285,75256744,Tennessee State University,7285,Robert,H.,H,Adams,,,,2005,2009,Robert H Adams,2167.0,6.0
7329,7329,63920570,Tennessee Technological University,7329,Robert,H.,H,Adams,,,,2005,2009,Robert H Adams,2167.0,6.0
8385,8385,102401767,University Of Arkansas At Little Rock,8385,Robert,H.,H,Adams,Bob,Cpa,Dr.,2010,2013,Robert H Adams,2167.0,6.0
10393,10393,94658018,University Of Memphis,10393,Robert,H.,H,Adams,,,,2005,2009,Robert H Adams,2167.0,6.0


In [38]:
disambig_leader.to_csv('disambiguated_top_leaders_8_13_25.csv', index=False, header=True, mode='w')

In [106]:
flagged_ids = []
def person_quality_check(persondf):
    if persondf.shape[0] > 1:
        numlastname = persondf['LastName'].nunique()
        numfirstname = persondf['FirstName'].nunique()
        msub = all([ mn1.issubset(mn2) or mn2.issubset(mn1) for mn1, mn2 in combinations([set(mn) for mn in persondf['MiddleInitials'].values],2)])
        if numlastname > 1 or not msub:
            flagged_ids.append([persondf.name, numlastname, numfirstname, msub])
        #print(numlastname, numfirstname, msub)
        #print(persondf)
    

disambig_leader.groupby('MergedLeaderId').apply(person_quality_check)
flagged_ids = pd.DataFrame(flagged_ids, columns=['LeaderId', 'NumLastNames', 'NumFirstNames', 'MiddleInitialSubsets'])
flagged_ids[flagged_ids['MiddleInitialSubsets'] == False]
flagged_ids

  disambig_leader.groupby('MergedLeaderId').apply(person_quality_check)


Unnamed: 0,LeaderId,NumLastNames,NumFirstNames,MiddleInitialSubsets
0,95.0,2,1,True
1,764.0,2,1,True
2,1111.0,1,1,False
3,1173.0,2,1,True
4,1182.0,1,3,False
5,1533.0,1,1,False
6,1562.0,2,1,True
7,1679.0,1,1,False
8,1699.0,2,1,True
9,1846.0,1,2,False


In [111]:
disambig_leader[disambig_leader['MergedLeaderId'] ==6153]

Unnamed: 0,Lindex,AffiliationId,Institution,LeaderId,FirstName,MiddleName,MiddleInitials,LastName,NickName,SuffixName,PrefixName,StartYear,EndYear,RawName4Match,MergedLeaderId,NumMatches
6458,6458,63190737,State University Of New York At Buffalo,6458,Robert,J.,J,Wagner,,,,1999,2002,Robert J Wagner,6153.0,3.0
10298,10298,133738476,University Of Massachusetts At Lowell,10298,Robert,,,Wagner,,,,2002,2002,Robert Wagner,6153.0,3.0
14131,14131,121980950,Utah State University,14131,Robert,W.,W,Wagner,,,,2018,2018,Robert W Wagner,6153.0,3.0


In [31]:
aff_transfers = disambig_leader[disambig_leader['NumMatches'] > 1].sort_values(by=['MergedLeaderId', 'StartYear']).reset_index(drop=True)
aff_transfers

Unnamed: 0,Lindex,AffiliationId,Institution,LeaderId,FirstName,MiddleName,MiddleInitials,LastName,NickName,SuffixName,PrefixName,StartYear,EndYear,RawName4Match,MergedLeaderId,NumMatches
0,0,71965598,Adelphi University,0,Matthew,,,Goldstein,,,,1999,1999,Matthew Goldstein,0.0,2.0
1,1290,174216632,City University Of New York,1290,Matthew,,,Goldstein,,Phd,Dr.,2000,2013,Matthew Goldstein,0.0,2.0
2,5,71965598,Adelphi University,5,Catherine,,,Hennessy,,,,1999,2002,Catherine Hennessy,5.0,2.0
3,2827,139290212,Hofstra University,2827,Catherine,,,Hennessy,,,,2005,2018,Catherine Hennessy,5.0,2.0
4,7668,128956969,Texas Christian University,7668,James,A.,A,Mcgowan,,,,1999,1999,James A Mcgowan,15.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3789,15382,165690674,Oregon Health & Science University,15382,Peter,F.,F,Rapp,,,,2013,2013,Peter F Rapp,13284.0,2.0
3790,15392,4210088475,Texas Tech University System,15392,David,R.,R,Smith,,,,2002,2005,David R Smith,13310.0,2.0
3791,15425,926076166,Texas Tech University Health Sciences Center,15425,David,R.,R,Smith,,,,2002,2002,David R Smith,13310.0,2.0
3792,15540,120156002,Boise State University,15540,James,,,Maguire,,,,2008,2013,James Maguire,13363.0,2.0


In [32]:
def has_overlap(group):
    if len(group) < 2:
        return False
    intervals = sorted(zip(group['StartYear'], group['EndYear']))
    for i in range(len(intervals) - 1):
        _, end1 = intervals[i]
        start2, _ = intervals[i + 1]
        if start2 < end1:  # exclude touching by using '<' instead of '<='
            return True
    return False

overlap_flags = aff_transfers.groupby('MergedLeaderId', as_index=False).apply(has_overlap).rename(columns={None:'HasOverlap'})
aff_transfers = aff_transfers.merge(overlap_flags, on='MergedLeaderId')

aff_transfers[aff_transfers['HasOverlap']]

  overlap_flags = aff_transfers.groupby('MergedLeaderId', as_index=False).apply(has_overlap).rename(columns={None:'HasOverlap'})


Unnamed: 0,Lindex,AffiliationId,Institution,LeaderId,FirstName,MiddleName,MiddleInitials,LastName,NickName,SuffixName,PrefixName,StartYear,EndYear,RawName4Match,MergedLeaderId,NumMatches,HasOverlap
4,7668,128956969,Texas Christian University,7668,James,A.,A,Mcgowan,,,,1999,1999,James A Mcgowan,15.0,8.0,True
5,940,43369023,California State University San Bernardino,940,Robert,,,Mcgowan,,,,2002,2010,Robert Mcgowan,15.0,8.0,True
6,6735,19700959,Florida International University,6735,John,,,Mcgowan,,,,2002,2005,John Mcgowan,15.0,8.0,True
7,15,71965598,Adelphi University,15,James,,,Mcgowan,,,,2005,2010,James Mcgowan,15.0,8.0,True
8,12803,45438204,University Of Texas At San Antonio,12803,John,P.,P,Mcgowan,,,,2007,2009,John P Mcgowan,15.0,8.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3722,13472,1304256225,University Of Wisconsin System,13472,Martin,,,Cadwallader,,,,2007,2007,Martin Cadwallader,11816.0,2.0,True
3760,13988,126744593,University Of Maryland Baltimore,13988,Malinda,,,Orlin,,,,2002,2011,Malinda Orlin,12218.0,2.0,True
3761,13994,126744593,University Of Maryland Baltimore,13994,Malinda,,,Orin,,,,2009,2009,Malinda Orin,12218.0,2.0,True
3766,14008,66946132,University Of Maryland College Park,14008,C.,Daniel,D,Mote,Dan,Jr,,1999,2010,C. Daniel Mote,12231.0,2.0,True


In [90]:
aff_transfers[aff_transfers['HasOverlap'].isna()]

Unnamed: 0,Lindex,AffiliationId,Institution,LeaderId,FirstName,MiddleName,MiddleInitials,LastName,NickName,SuffixName,PrefixName,StartYear,EndYear,RawName4Match,MergedLeaderId,NumMatches,HasOverlap_x,HasOverlap_y,None,HasOverlap


In [32]:
import recordlinkage
from recordlinkage.preprocessing import clean
import networkx as nx


lid = 0
disambig_leader = []

top_people['LeaderId'] = np.nan

top_people = leader_data[['Year',
 'Name', 'RawName', 'AffiliationId',
 'Position',
 'Institution','RawPosition', 'FirstName',
 'MiddleName',
 'LastName',
 'NickName',
 'SuffixName',
 'PrefixName',
 'MiddleInitials',
 'RawName4Match']]
top_people = top_people[top_people['Position'] != ""]

def expand_name_df(df):

    # expand names for people with nicknames
    expanded_name_list = df.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])
    expanded_name_list['RawName4Match'] = [make_raw_name(row, use_nickname=True) for i,row in expanded_name_list.iterrows()]
    expanded_df = pd.concat([df, expanded_name_list], ignore_index=True)
    
    # expand name list to keep 
    expanded_name_list2 = df[df['MiddleName'].str.replace('.', '').str.len() > 1].reset_index(names=['Oidx'])
    expanded_name_list2['FirstName'] = [n[0] for n in expanded_name_list2['FirstName']]
    expanded_name_list2['RawName4Match'] = [make_raw_name(row, use_nickname=False) for i,row in expanded_name_list2.iterrows()]
    expanded_df = pd.concat([expanded_df, expanded_name_list2], ignore_index=True)
    
    return expanded_df
    

for affid in top_people['AffiliationId'].unique(): #[51556381, 1317227900, 35777872]:
    affsub = top_people[top_people['AffiliationId'] == affid]
    iname = top_people[top_people['AffiliationId'] == affid]['Institution'].values[0]
    affsub = affsub[affsub['Name'] != 'Vacant'].reset_index(names=['Lindex']) #drop=True)
    
    #tfidf = TfidfVectorizer(min_df=1, ngram_range = (3,3), analyzer='char', lowercase=False)
    #tfidf = CountVectorizer(min_df=1, ngram_range = (3,3), analyzer='char', lowercase=False)
    
    exp_affsub = expand_name_df(affsub)
     
    exp_affsub['RawName4Match_clean'] = clean(exp_affsub['RawName4Match'])
    
    indexer = recordlinkage.Index()
    indexer.full()
    pairs = indexer.index(exp_affsub)

    compare = recordlinkage.Compare()
    compare.string('RawName4Match_clean', 'RawName4Match_clean', method='jarowinkler', threshold=0.85, label='name_match')
    features = compare.compute(pairs, exp_affsub)
    
    matches = features[features['name_match'] == 1].index.tolist()

    G = nx.Graph()
    G.add_nodes_from(exp_affsub.index)
    G.add_edges_from(matches)
    
    for i in range(exp_affsub.shape[0] - affsub.shape[0]):
        G.add_edge(i+affsub.shape[0], exp_affsub['Oidx'].values[i+affsub.shape[0]])
    
    for component in nx.connected_components(G):
        #print(component)
        candidate_idx = np.sort(list(component))
        
        canonical = identify_true_name(exp_affsub, candidate_idx)  # most frequent
        pname = [affid, iname, lid]
        pname.extend(canonical) 
        
        top_people.loc[exp_affsub['Lindex'].loc[candidate_idx], 'LeaderId'] = lid
        syear = exp_affsub.loc[candidate_idx, 'Year'].min()
        eyear = exp_affsub.loc[candidate_idx, 'Year'].max()
        pname.extend([syear, eyear]) 
        disambig_leader.append(pname)
        exp_affsub.loc[candidate_idx, 'LeaderId'] = lid
        lid += 1



disambig_leader = pd.DataFrame(disambig_leader, columns=['AffiliationId', 'Institution', 'LeaderId', 'FirstName', 'MiddleName', 'MiddleInitials', 'LastName', 'NickName', 'SuffixName', 'PrefixName', 'StartYear', 'EndYear'])        



  expanded_name_list = df.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])




  expanded_name_list = df.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])




  expanded_name_list = df.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])




  expanded_name_list = df.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])




  expanded_name_list = df.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])




  expanded_name_list = df.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])




  expanded_name_list = df.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])




  expanded_name_list = df.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])




  expanded_name_list = df.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])




  expanded_name_list = df.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])
  expanded_name_list = df.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])




  expanded_name_list = df.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])
  expanded_name_list = df.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])
  expanded_name_list = df.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])
  expanded_name_list = df.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])




  expanded_name_list = df.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])
  expanded_name_list = df.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])




  expanded_name_list = df.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])




  expanded_name_list = df.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])




  expanded_name_list = df.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])
  expanded_name_list = df.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])




  expanded_name_list = df.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])




  expanded_name_list = df.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])




  expanded_name_list = df.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])




  expanded_name_list = df.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])
  expanded_name_list = df.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])
  expanded_name_list = df.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])
  expanded_name_list = df.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])
  expanded_name_list = df.replace({'NickName': {'': np.nan}}).dropna(subset=['NickName']).reset_index(names=['Oidx'])


In [42]:


disambig_leader['RawName4Match'] = [make_raw_name(row) for i,row in disambig_leader.iterrows()]
disambig_leader['MergedLeaderId'] = np.nan
#disambig_leader = disambig_leader.reset_index(names=['Lindex'])

exp_disambig_leader = expand_name_df(disambig_leader)
exp_disambig_leader['RawName4Match_clean'] = clean(exp_disambig_leader['RawName4Match'])

indexer = recordlinkage.Index()
indexer.full()
pairs = indexer.index(exp_disambig_leader)

compare = recordlinkage.Compare()
compare.string('RawName4Match_clean', 'RawName4Match_clean', method='jarowinkler', threshold=0.98, label='name_match')
features = compare.compute(pairs, exp_disambig_leader)
    
matches = features[features['name_match'] == 1].index.tolist()

G = nx.Graph()
G.add_nodes_from(exp_disambig_leader.index)
G.add_edges_from(matches)
for i in range(exp_disambig_leader.shape[0] - disambig_leader.shape[0]):
    G.add_edge(i+disambig_leader.shape[0], exp_disambig_leader['Oidx'].values[i+disambig_leader.shape[0]])

merged_lid = 0
    
for component in nx.connected_components(G):

    candidate_idx = np.sort(list(component))
        
    disambig_leader.loc[exp_disambig_leader['Lindex'].loc[candidate_idx], 'MergedLeaderId'] = merged_lid
    disambig_leader.loc[exp_disambig_leader['Lindex'].loc[candidate_idx], 'NumMatches'] = exp_disambig_leader['Lindex'].loc[candidate_idx].nunique()
    merged_lid += 1

disambig_leader



Unnamed: 0,Lindex,AffiliationId,Institution,LeaderId,FirstName,MiddleName,MiddleInitials,LastName,NickName,SuffixName,PrefixName,StartYear,EndYear,RawName4Match,MergedLeaderId,NumMatches
0,0,71965598,Adelphi University,0,Matthew,,,Goldstein,,,,1999,1999,Matthew Goldstein,0.0,2.0
1,1,71965598,Adelphi University,1,Armstrong,S.,S,Starkey,,,,1999,1999,Armstrong S Starkey,1.0,1.0
2,2,71965598,Adelphi University,2,Devin,,,Thornburg,,,,1999,1999,Devin Thornburg,2.0,1.0
3,3,71965598,Adelphi University,3,Angelo,B.,B,Proto,,,,1999,2013,Angelo B Proto,3.0,1.0
4,4,71965598,Adelphi University,4,Carl,J.,J,Rheins,,,,1999,1999,Carl J Rheins,4.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14047,14047,2802096936,University System Of Ohio,14047,Rich,,,Petrick,,,,2011,2011,Rich Petrick,12954.0,1.0
14048,14048,2802096936,University System Of Ohio,14048,Harry,,,Andrist,,,,2011,2011,Harry Andrist,12955.0,1.0
14049,14049,2802096936,University System Of Ohio,14049,James,M.,M,Petro,Jmi,,,2013,2013,James M Petro,12956.0,1.0
14050,14050,2802096936,University System Of Ohio,14050,Gary,W.,W,Cates,,,,2013,2013,Gary W Cates,12957.0,1.0


In [43]:
print(disambig_leader['MergedLeaderId'].nunique())


12959


In [45]:
flagged_ids = []
def person_quality_check(persondf):
    if persondf.shape[0] > 1:
        numlastname = persondf['LastName'].nunique()
        numfirstname = persondf['FirstName'].nunique()
        msub = all([ mn1.issubset(mn2) or mn2.issubset(mn1) for mn1, mn2 in combinations([set(mn) for mn in persondf['MiddleInitials'].values],2)])
        if numlastname > 1:
            #print(persondf)
            print(persondf.name, persondf['LastName'].unique(), numlastname)
            flagged_ids.append([persondf.name, numlastname, numfirstname, msub])
        #print(numlastname, numfirstname, msub)
        #print(persondf)
    

disambig_leader.groupby('MergedLeaderId').apply(person_quality_check)
flagged_ids = pd.DataFrame(flagged_ids, columns=['LeaderId', 'NumLastNames', 'NumFirstNames', 'MiddleInitialSubsets'])
flagged_ids[flagged_ids['MiddleInitialSubsets'] == False]
flagged_ids




1078.0 [np.str_('Dedominicisn') np.str_('Dedominicis')] 2
1466.0 [np.str_('Fry') np.str_('Frey')] 2
2451.0 [np.str_('Abell') np.str_('Bell')] 2
2572.0 [np.str_('Grimes1') np.str_('Grimes')] 2
3788.0 [np.str_('Gallagher') np.str_('Gallager')] 2
5437.0 [np.str_('Haile') np.str_('Hailey')] 2
7426.0 [np.str_('Morgan') np.str_('Margon')] 2
8292.0 [np.str_('Schlor') np.str_('Schloer')] 2


  disambig_leader.groupby('MergedLeaderId').apply(person_quality_check)


Unnamed: 0,LeaderId,NumLastNames,NumFirstNames,MiddleInitialSubsets
0,1078.0,2,2,True
1,1466.0,2,1,True
2,2451.0,2,1,True
3,2572.0,2,1,True
4,3788.0,2,1,True
5,5437.0,2,1,True
6,7426.0,2,1,True
7,8292.0,2,1,True


In [55]:
merged_leaders_2_split = [2451.0,3788.0,5437.0 ]
merged_lid = disambig_leader['MergedLeaderId'].max() + 1
for mid in merged_leaders_2_split:
    split_sub = disambig_leader[disambig_leader['MergedLeaderId'] == mid]
    disambig_leader.loc[split_sub['Lindex'].values[1], 'MergedLeaderId'] = merged_lid
    merged_lid += 1

In [54]:
disambig_leader[disambig_leader['MergedLeaderId'] == flagged_ids['LeaderId'].values[7]]

Unnamed: 0,Lindex,AffiliationId,Institution,LeaderId,FirstName,MiddleName,MiddleInitials,LastName,NickName,SuffixName,PrefixName,StartYear,EndYear,RawName4Match,MergedLeaderId,NumMatches
8770,8770,157725225,University Of Illinois At Urbana Champaign,8770,Wolfgang,,,Schlor,,,,2011,2013,Wolfgang Schlor,8292.0,2.0
12475,12475,181565077,Georgia State University,12475,Wolfgang,,,Schloer,,,Dr.,2018,2018,Wolfgang Schloer,8292.0,2.0


In [84]:
aff_transfers = disambig_leader[disambig_leader['NumMatches'] > 1].sort_values(by=['MergedLeaderId', 'StartYear']).reset_index(drop=True)

def has_overlap(group):
    if len(group) < 2:
        return False
    intervals = sorted(zip(group['StartYear'], group['EndYear']))
    for i in range(len(intervals) - 1):
        _, end1 = intervals[i]
        start2, _ = intervals[i + 1]
        if start2 < end1:  # exclude touching by using '<' instead of '<='
            return True
    return False

overlap_flags = aff_transfers.groupby('MergedLeaderId', as_index=False).apply(has_overlap).rename(columns={None:'HasOverlap'})
aff_transfers = aff_transfers.merge(overlap_flags, on='MergedLeaderId')

aff_transfers[aff_transfers['HasOverlap']]

  overlap_flags = aff_transfers.groupby('MergedLeaderId', as_index=False).apply(has_overlap).rename(columns={None:'HasOverlap'})


Unnamed: 0,Lindex,AffiliationId,Institution,LeaderId,FirstName,MiddleName,MiddleInitials,LastName,NickName,SuffixName,PrefixName,StartYear,EndYear,RawName4Match,MergedLeaderId,NumMatches,HasOverlap
4,2879,11883440,James Madison University,2879,Robert,L.,L,Scott,,,,1999,2002,Robert L Scott,10.0,3.0,True
5,10,71965598,Adelphi University,10,Robert,Allyn,A,Scott,,,Dr.,2002,2018,Robert A Scott,10.0,3.0,True
6,4035,4210127926,North Dakota University System,4035,Robert,L.,L,Potts,,,,2005,2005,Robert L Potts,10.0,3.0,True
9,4650,126345244,Portland State University,4650,Michael,F.,F,Burton,,,,1999,2018,Michael F Burton,20.0,2.0,True
10,20,71965598,Adelphi University,20,Michael,E.,E,Breton,,,,2008,2011,Michael E Breton,20.0,2.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2750,12129,1304256225,University Of Wisconsin System,12129,Martin,,,Cadwallader,,,,2007,2007,Martin Cadwallader,10973.0,2.0,True
2781,13563,83399316,Rockefeller University,13563,Gerald,,,Latter,,,,2007,2013,Gerald Latter,11770.0,2.0,True
2782,13081,72951846,Washington State University,13081,Gerald,,,Schlatter,,,,2010,2010,Gerald Schlatter,11770.0,2.0,True
2785,13126,204465549,Washington University,13126,John,R.,R,Loya,,,,1999,2018,John R Loya,11811.0,2.0,True


In [75]:
affid = 181401687
affsub = top_people[top_people['AffiliationId'] == affid]
iname = top_people[top_people['AffiliationId'] == affid]['Institution'].values[0]
affsub = affsub[affsub['Name'] != 'Vacant'].reset_index(names=['Lindex']) #drop=True)

    
exp_affsub = expand_name_df(affsub)
exp_affsub['RawName4Match_clean'] = clean(exp_affsub['RawName4Match'])
    
indexer = recordlinkage.Index()
indexer.full()
pairs = indexer.index(exp_affsub)

compare = recordlinkage.Compare()
compare.string('RawName4Match_clean', 'RawName4Match_clean', method='jarowinkler', threshold=0.85, label='name_match')
features = compare.compute(pairs, exp_affsub)
    
matches = features[features['name_match'] == 1].index.tolist()

G = nx.Graph()
G.add_nodes_from(exp_affsub.index)
G.add_edges_from(matches)

for i in range(exp_affsub.shape[0] - affsub.shape[0]):
    G.add_edge(i+affsub.shape[0], exp_affsub['Oidx'].values[i+affsub.shape[0]])

test_lid = 0
test_disambig_leader = []    
for component in nx.connected_components(G):
        
    candidate_idx = np.sort(list(component))
        
    canonical = identify_true_name(exp_affsub, candidate_idx)  # most frequent
    pname = [affid, iname, test_lid]
    pname.extend(canonical) 
        
    
    syear = exp_affsub.loc[candidate_idx, 'Year'].min()
    eyear = exp_affsub.loc[candidate_idx, 'Year'].max()
    pname.extend([syear, eyear]) 

    test_disambig_leader.append(pname)
        
    test_lid += 1

test_disambig_leader = pd.DataFrame(test_disambig_leader, columns=['AffiliationId', 'Institution', 'LeaderId', 'FirstName', 'MiddleName', 'MiddleInitials', 'LastName', 'NickName', 'SuffixName', 'PrefixName', 'StartYear', 'EndYear'])
test_disambig_leader



Unnamed: 0,AffiliationId,Institution,LeaderId,FirstName,MiddleName,MiddleInitials,LastName,NickName,SuffixName,PrefixName,StartYear,EndYear
0,181401687,American University,0,Benjamin,,,Ladner,,,,1999,2005
1,181401687,American University,1,Cornelius,M.,M,Kerwin,Neil,,Dr.,1999,2013
2,181401687,American University,2,Mary,E.,E,Kennard,,,,1999,2013
3,181401687,American University,3,Linda,B.,B,Nelson,,,,1999,1999
4,181401687,American University,4,Thomas,J.,J,Minar,Tom,,Dr.,1999,2013
5,181401687,American University,5,Donald,L.,L,Myers,Don,,,1999,2013
6,181401687,American University,6,Gail,Short,S,Hanson,,,Dr.,1999,2013
7,181401687,American University,7,Albert,,,Checcio,,,,2002,2005
8,181401687,American University,8,Cheryl,,,Storie,,,,2005,2007
9,181401687,American University,9,Robert,A.,A,Pastor,,,,2005,2008


In [56]:
disambig_leader.to_csv('disambiguated_top_leaders_8_13_25.csv', index=False, header=True, mode='w')
top_people.to_csv('disambiguated_top_leaders_positionpersonaffiliation_8_13_25.csv', index=False, header=True, mode='w')