In [1]:
import pandas as pd
import os
import json
import numpy as np


# Bank of position-title keywords. Order is kept exactly as authored; note that
# multi-word titles such as "Department Head" come before their shorter forms
# ("Head").
POSITION_BANK = [
    "President", "Chancellor", "Provost", "Director", "Dean", "Controller",
    "Trustee", "Member", "Regent", "Chairman", "Overseer", "Assistant",
    "Librarian", "Secretary", "Chaplain", "Minister", "Treasurer",
    "Senior Counsel", "General Counsel", "Legal Counsel", "University Counsel",
    "College Counsel", "Special Counsel", "Corporation Counsel",
    "Officer", "Chief", "Professor", "Commissioner", "Fellow", "Chairperson",
    "Manager", "Clergy", "Coordinator", "Auditor", "Governor", "Representative",
    "Stockbroker", "Advisor", "Commandant", "Rector", "Attorney", "Curator",
    "Clerk", "Department Head", "Pastor", "Head", "Comptroller", "Deputy",
    "Inspector General",
]
# Lower-case keywords for classifying a position as dean / administration.
# Several entries are deliberate stems (e.g. "university librar") so that one
# keyword covers multiple word endings when used as a substring.
# NOTE(review): "internation affair" was corrected to "international affair";
# the old spelling is not a substring of "international affairs" and so could
# never match it. Confirm against the code that consumes this list.
DEAN_WORDS = [
    "summer", "student", "faculty", "academic service", "academics",
    "academic program", "admissions", "admission", "enrollment", "student life",
    "housing", "academic support", "advising", "enrollment management",
    "student relations", "academic computing", "academic records",
    "student service", "student affairs", "student development", "registrar",
    "financial aid", "media service", "library service", "university librar",
    "international affair", "special program", "continuing education",
    "external relation", "development", "services",
]

# Lower-case keywords for the sub-institution (per the original note; the code
# that consumes this list is not in the visible cells). "university librar" is
# a stem, not a typo.
ADMINISTRATION_WORDS = [
    "academic service", "academics", "academic program", "admissions",
    "admission", "enrollment service", "student life", "housing",
    "academic support", "advising", "enrollment management",
    "student relations", "academic computing", "academic records",
    "student service", "student affairs", "student development", "registrar",
    "financial aid", "media service", "library service", "university librar",
]


In [2]:
# Root folder of the connected-data CSVs. The default is the original author's
# local directory; set the CONNECTED_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
path_connected_data = os.environ.get(
    "CONNECTED_DATA_DIR",
    "C:\\Users\\tykun\\OneDrive\\Documents\\SchoolDocs\\VSCodeProjects\\connectedData\\",
)
# The paths below are built by plain string concatenation, so the root must end
# with a separator; append the platform one when an override omits it.
if not path_connected_data.endswith(("\\", "/")):
    path_connected_data += os.sep
# Reuse whichever separator the root ends with, so the default keeps its exact
# original value.
path_temp_data = f"{path_connected_data}temporaryData{path_connected_data[-1]}"
# Year of the data set being processed; it prefixes both file names below.
year = "2009"

# Input CSV loaded into the full dataframe.
path_read = f"{path_connected_data}{year}_split_positions.csv"
# Output CSV for the extracted provost rows.
provost_path = f"{path_temp_data}{year}_provost.csv"

In [3]:
#Extract the names of all the institutions for validation
def extract_institutions(df):
    """Return the distinct "Institution" values of ``df`` in first-seen order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an "Institution" column.

    Returns
    -------
    list
        Each institution once, ordered by its first appearance.

    Raises
    ------
    KeyError
        If ``df`` has no "Institution" column.
    """
    # A dict keeps insertion order, so this de-duplicates in a single pass
    # instead of re-scanning a growing list for every row.
    return list(dict.fromkeys(df["Institution"].tolist()))


def extract_first_member(df):
    """Return the first row of every run of consecutive equal institutions.

    A row is kept when its "Institution" differs from the row directly above
    it (the very first row is always kept). An institution that appears in two
    separate runs therefore contributes two rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an "Institution" column.

    Returns
    -------
    pandas.DataFrame
        A copy of the selected rows with the original index labels, columns
        and dtypes. When no row is selected the result is empty but still
        carries the input's columns, so later column lookups keep working.
    """
    if df.empty:
        # Nothing to scan. This also covers a frame without any columns, for
        # which the "Institution" lookup below would raise.
        return df.copy()

    institution = df["Institution"]
    # Compare each row with its predecessor. The shifted series starts with a
    # missing value, and a comparison against a missing value counts as
    # "different", so the first row and rows with a missing institution are
    # kept, matching the row-by-row `!=` test this replaces.
    starts_new_run = institution.ne(institution.shift()).fillna(True).to_numpy(dtype=bool)
    return df[starts_new_run].copy()

def extract_first_member_exclude_string(df):
    """Return the first row of each institution run unless its title is excluded.

    A row qualifies when (a) its "Institution" differs from the row directly
    above it, and (b) its "Position" contains neither "associate" nor "vice"
    (case-insensitive).

    NOTE(review): as before, when the first row of a run is excluded the whole
    run contributes nothing; later rows of the same institution are not
    considered. Confirm that this is the intended selection rule.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing "Institution" and "Position" columns.

    Returns
    -------
    pandas.DataFrame
        A copy of the qualifying rows with the original index labels and
        columns. An empty result still carries the input's columns.
    """
    if df.empty:
        # Covers both zero rows and a column-less frame, for which the column
        # lookups below would raise.
        return df.copy()

    institution = df["Institution"]
    starts_new_run = institution.ne(institution.shift()).fillna(True).to_numpy(dtype=bool)

    # Cast to text first so that a missing position is treated as "contains no
    # excluded word" instead of raising AttributeError on `.lower()`.
    position = df["Position"].astype(str).str.lower()
    has_associate = position.str.contains("associate", regex=False, na=False).to_numpy(dtype=bool)
    has_vice = position.str.contains("vice", regex=False, na=False).to_numpy(dtype=bool)

    keep = starts_new_run & ~(has_associate | has_vice)
    return df[keep].copy()

def replace_values(df, string):
    """Overwrite the whole "Position" column of ``df`` with ``string``.

    The frame is modified in place (the column is created if absent) and the
    same object is returned for convenience.
    """
    position_column = "Position"
    df[position_column] = string
    return df

def find_missing_institutions(institutions, df):
    """Return the entries of ``institutions`` that do not occur in ``df``.

    Matching is case-insensitive against the "Institution" column of ``df``.

    Parameters
    ----------
    institutions : iterable of str
        Names to look for.
    df : pandas.DataFrame
        Frame containing an "Institution" column of strings.

    Returns
    -------
    list of str
        The unmatched names, in input order and with their original casing
        (duplicates in the input are preserved).
    """
    # Build the lookup once; a set makes each membership test O(1) instead of
    # scanning the whole column for every candidate name.
    known_lowercase = set(df["Institution"].str.lower().dropna())
    return [name for name in institutions if name.lower() not in known_lowercase]


def rename_previous_value(df):
    """Rename 'Previous_Value' to 'Fixed Position' and blank that column.

    The rename and the assignment both happen in place on ``df``. Every value
    in 'Fixed Position' is then set to the empty string (the column is created
    if 'Previous_Value' was absent). A new DataFrame object built from the
    mutated frame is returned.
    """
    old_name, new_name = 'Previous_Value', 'Fixed Position'
    df.rename(columns={old_name: new_name}, inplace=True)
    df[new_name] = ""
    return pd.DataFrame(df)


def count_universities(full_df):
    """Return the distinct "Institution" values of ``full_df`` as a list.

    Despite the name, this returns the institutions themselves (in order of
    first appearance), not a count.
    """
    institution_column = full_df["Institution"]
    return [*institution_column.unique()]

In [4]:
def determine_missing(university_list, presidents_df):
    """Return the universities that have no row in ``presidents_df``.

    Names on both sides are compared after stripping surrounding whitespace
    and lower-casing.

    Parameters
    ----------
    university_list : iterable
        Institution names to check. Missing values (None/NaN) are ignored.
    presidents_df : pandas.DataFrame
        Frame whose "Institution" column lists the covered institutions. A
        frame without that column (e.g. an empty extraction result) is treated
        as covering nothing.

    Returns
    -------
    list of str
        The normalised (stripped, lower-cased) names from ``university_list``
        that are not covered, in input order.
    """
    def _normalize(names):
        # Missing values carry no name to compare and would otherwise raise
        # AttributeError on `.strip()`.
        return [str(name).strip().lower() for name in names if not pd.isna(name)]

    if "Institution" in presidents_df.columns:
        # A set gives O(1) membership tests instead of scanning a list per name.
        covered = set(_normalize(presidents_df["Institution"]))
    else:
        covered = set()

    return [uni for uni in _normalize(university_list) if uni not in covered]

In [5]:
#Provost
#Extract every row with the substring provost in the title
def extract_provost(df):
    """Return every row whose title contains "provost" (case-insensitive).

    NOTE(review): the title is read from the second column by position, as in
    the original implementation, while sibling helpers address it by the name
    "Position". Confirm both refer to the same column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose second column holds the position title.

    Returns
    -------
    pandas.DataFrame
        A copy of the matching rows with the original index labels and
        columns. When nothing matches, the result is empty but still carries
        the input's columns.
    """
    if df.empty:
        # Covers zero rows as well as a frame with no columns at all.
        return df.copy()

    # Cast to text so a missing title is simply a non-match instead of raising
    # AttributeError on `.lower()`.
    titles = df.iloc[:, 1].astype(str).str.lower()
    is_provost = titles.str.contains("provost", regex=False, na=False).to_numpy(dtype=bool)
    return df[is_provost].copy()


def create_provost_df(full_df):
    """Build the provost table from the full positions frame.

    Rows whose title mentions "provost" are extracted, then reduced with
    ``extract_first_member_exclude_string`` to the first row per institution
    run, dropping "associate"/"vice" titles.
    """
    return pd.DataFrame(
        extract_first_member_exclude_string(extract_provost(full_df))
    )

In [6]:
# Load the split-positions CSV for the configured year (path_read is set in an
# earlier cell).
full_dataframe = pd.read_csv(path_read)

In [7]:
# Build the provost table, then work out which institutions in the full data
# set have no row in it.
provost_df = create_provost_df(full_dataframe)
university_list = count_universities(full_dataframe)
missing_provost = determine_missing(university_list, provost_df)

# determine_missing returns lower-cased names; title-case them for display.
for inst in missing_provost:
    print(inst.title())

# Tag every retained row with the fixed title "Provost" and write the table to
# provost_path without the index column.
provost_df["FixedPosition"] = "Provost"
provost_df.to_csv(provost_path, index = False)

Agnes Scott College
Albion College
Allegheny College
Amherst College
Auburn University At Montgomery
Baldwin Wallace College
Bard College
Bates College
Baylor College Of Medicine
Beloit College
Bowdoin College
Brigham Young University
The California State University System
California State University, Fullerton
California State University San Bernardino
Carleton College
Carthage College
Centre College
Chapman University
Chestnut Hill College
Christian Brothers University
Claremont Mckenna College
Clark Atlanta University
Colby College
College Of Mount St Joseph
College Of Notre Dame Of Maryland
College Of St Catherine
College Of St Scholastica
College Of The Holy Cross
College Of Wooster
Colorado College
Colorado State University System
Concordia University
Connecticut College
Converse College
Creighton University
Davidson College
Drury University
Duke University Health System
Eastern Michigan University
Elmira College
Emerson College
Fairfield University
Fordham University
Furman Univ