In [1]:
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
from wikidata.client import Client

In [20]:
data_path = "/zfs/projects/faculty/amirgo-management/HathiTrust/"
df = pd.read_csv(data_path + "post45fiction.csv")

In [38]:
# proportion of authors having wikidata ids
print("Proportion of authors having wikidata ids: ", df["author_wikidata_qid"].nunique() / df['author_viaf'].nunique())
# proportion of documents having authors with wikidata ids
print("Proportion of documents having authors with wikidata ids: ", df["author_wikidata_qid"].count() / df.shape[0])
# number of unique authors
print("Number of unique authors: ", len(df["author_wikidata_qid"].unique()))

Proportion of authors having wikidata ids:  0.5869628479968959
Proportion of documents having authors with wikidata ids:  0.7598546488664192
Number of unique authors:  18154


In [4]:
def property_value_extraction(entity, prop):
    claims = entity.data.get('claims', {})
    property_claims = claims.get(prop, []) # usually start with p
    values = []
    for claim in property_claims:
        mainsnak = claim.get('mainsnak', {})
        datavalue = mainsnak.get('datavalue', {})
        value = datavalue.get('value', {}).get('id', None)
        if value:
            values.append(value)
    return values

def time_extraction(entity, prop):
    claims = entity.data.get('claims', {})
    property_claims = claims.get(prop, []) # usually start with p
    values = []
    for claim in property_claims:
        mainsnak = claim.get('mainsnak', {})
        datavalue = mainsnak.get('datavalue', {})
        value = datavalue.get('value', {}).get('time', None)
        if value:
            values.append(value)
    return values

# sometimes, education information is nested
def get_education(entity):
    # if degree or major is nested in education
    institution_values = []
    degree_values = []
    major_values = []

    claims = entity.data.get('claims', {})
    education_claims = claims.get('P69', []) # usually start with p
    for education in education_claims:
        # main entry: institution
        mainsnak = education.get('mainsnak', {})
        datavalue = mainsnak.get('datavalue', {})
        value = datavalue.get('value', {}).get('id', None)
        if value:
            institution_values.append(value)
        # search qualifiers for degree and major, in case they are nested
        qualifiers = education.get('qualifiers', {})
        if "P512" in qualifiers:
            degree_id = qualifiers["P512"][0].get('datavalue', {}).get('value', {}).get('id')
            degree_values.append(degree_id)
        if "P812" in qualifiers:
            major_id = qualifiers["P812"][0].get('datavalue', {}).get('value', {}).get('id')
            major_values.append(major_id)
    
    # search for degree and major if they are not nested
    degree_values += property_value_extraction(entity, 'P512')
    major_values += property_value_extraction(entity, 'P812')

    # remove duplicates
    institution_values = list(set(institution_values))
    degree_values = list(set(degree_values))
    major_values = list(set(major_values))

    return institution_values, degree_values, major_values    

def get_SES_characteristics(entity):
    # gender
    gender_values = property_value_extraction(entity, 'P21')
    # birth date
    birth_date_values = time_extraction(entity, 'P569')
    # death date
    death_date_values = time_extraction(entity, 'P570')
    # citizenship
    citizenship_values = property_value_extraction(entity, 'P27')
    # ethinic group
    ethinic_group_values = property_value_extraction(entity, 'P172')
    # education
    education_values, degree_values, major_values = get_education(entity)
    # student of (this will be sparse) -- if students of famous economists are included, this will be useful
    student_of_values = property_value_extraction(entity, 'P1066')
    # occupation
    occupation_values = property_value_extraction(entity, 'P106')
    # employer in the past
    employer_values = property_value_extraction(entity, 'P108')
    # political party (this will be sparse)
    political_party_values = property_value_extraction(entity, 'P102')
    # ideology (this will be sparse)
    ideology_values = property_value_extraction(entity, 'P1142')

    # dictionary
    SES_characteristics = {"gender": gender_values, "birth_date": birth_date_values, "death_date": death_date_values,
                            "citizenship": citizenship_values, "ethinic_group": ethinic_group_values, "education": education_values,
                            "degree": degree_values,"major": major_values, "student_of": student_of_values, "occupation": occupation_values,
                            "employer": employer_values, "political_party": political_party_values, "ideology": ideology_values}
    return SES_characteristics

In [None]:
# total entities to be queried
total_entities = df["author_wikidata_qid"].unique()
# remove nan: 18153 entities in total
total_entities = [x for x in total_entities if str(x) != 'nan']
print("Total entities to be queried: ", len(total_entities))

In [6]:
# for each entity, get SES characteristics
client = Client()
total_ses_characteristics = []
error_qids = []
for qid in tqdm(total_entities):
    try:
        entity = client.get(qid, load=True)
        SES_characteristics = get_SES_characteristics(entity)
        SES_characteristics["qid"] = qid
        total_ses_characteristics.append(SES_characteristics)
        # wait for 0.2 second
        time.sleep(0.2)
    except:
        print("Error: ", qid)
        error_qids.append(qid)
        time.sleep(30) # wait for 30 seconds

  0%|          | 0/18153 [00:00<?, ?it/s]

100%|██████████| 18153/18153 [2:18:07<00:00,  2.19it/s]  


In [None]:
# save the SES characteristics
ses_df = pd.DataFrame(total_ses_characteristics)
ses_df.to_csv(data_path + "ses_characteristics.csv", index=False)
# pkl version
ses_df.to_pickle(data_path + "ses_characteristics.pkl")

In [6]:
# save the SES characteristics
ses_df = pd.read_pickle(data_path + "ses_characteristics.pkl")

# Convert QID to label

In [6]:
data_path = "/zfs/projects/faculty/amirgo-management/HathiTrust/"
ses_df = pd.read_pickle(data_path + "fiction_ses_characteristics.pkl")

In [8]:
# Save label dictionary
import pickle
with open("/zfs/projects/faculty/amirgo-management/opus/processed/qid_to_label.pkl", "rb") as f:
    qid_to_label_dict = pickle.load(f)

In [9]:
ses_df['gender'] = ses_df['gender'].apply(lambda x: [qid_to_label_dict[qid] for qid in x if qid!=None])
ses_df['citizenship'] = ses_df['citizenship'].apply(lambda x: [qid_to_label_dict[qid] for qid in x if qid!=None])
ses_df['ethinic_group'] = ses_df['ethinic_group'].apply(lambda x: [qid_to_label_dict[qid] for qid in x if qid!=None])
ses_df['education'] = ses_df['education'].apply(lambda x: [qid_to_label_dict[qid] for qid in x if qid!=None])
ses_df['degree'] = ses_df['degree'].apply(lambda x: [qid_to_label_dict[qid] for qid in x if qid!=None])
ses_df['major'] = ses_df['major'].apply(lambda x: [qid_to_label_dict[qid] for qid in x if qid!=None])
ses_df['student_of'] = ses_df['student_of'].apply(lambda x: [qid_to_label_dict[qid] for qid in x if qid!=None])
ses_df['occupation'] = ses_df['occupation'].apply(lambda x: [qid_to_label_dict[qid] for qid in x if qid!=None])
ses_df['employer'] = ses_df['employer'].apply(lambda x: [qid_to_label_dict[qid] for qid in x if qid!=None])
ses_df['political_party'] = ses_df['political_party'].apply(lambda x: [qid_to_label_dict[qid] for qid in x if qid!=None])
ses_df['ideology'] = ses_df['ideology'].apply(lambda x: [qid_to_label_dict[qid] for qid in x if qid!=None])

In [11]:
ses_df.to_pickle(data_path + "fiction_ses_characteristics_labelled.pkl")

# Key indicator extraction

In [2]:
data_path = "/zfs/projects/faculty/amirgo-management/HathiTrust/"
ses_df = pd.read_pickle(data_path + "fiction_ses_characteristics.pkl")
ses_df_labelled = pd.read_pickle(data_path + "fiction_ses_characteristics_labelled.pkl")
bschools = pd.read_csv("/zfs/projects/faculty/amirgo-management/opus/processed/business_schools.csv")
bschools['qid'] = bschools['business_school'].apply(lambda x: x.split('/')[-1])

In [6]:
# business education
# if someone went to business school: if education institution is in the list of business schools; or if degree is business
business_degree = set(['Q798129', 'Q12580940','Q191701'])
business_schools = set(bschools['qid'].tolist())
def if_biz_ed(row):
    education = row['education']
    degree = row['degree']
    if len(education) == 0 and len(degree) == 0:
        return "Missing"
    else:
        if_biz = False
        for edu in education:
            if edu in business_schools:
                if_biz = True
                break
        for deg in degree:
            if deg in business_degree:
                if_biz = True
                break
        return str(if_biz)

ses_df['if_business_ed'] = ses_df.apply(lambda x: if_biz_ed(x), axis=1)

In [7]:
# college education (a corse classification)
# if the education institution has 'university' or 'college' in the name;
college_degrees = ['bachelor', 'master', 'doctor', 'phd', 'ba', 'ma', 'bs', 'ms', 'mba', 'jd', 'llb', 'llm', 'md']
def if_college_ed(row):
    education = row['education']
    degree = row['degree']
    if len(education) == 0 and len(degree) == 0:
        return "Missing"
    else:
        if_college = False
        for edu in education:
            if 'university' in edu.lower() or 'college' in edu.lower():
                if_college = True
                break
        for deg in degree:
            deg = deg.lower()
            if any(col in deg for col in college_degrees):
                if_college = True
                break
        return str(if_college)

ses_df_labelled['if_college_ed'] = ses_df_labelled.apply(lambda x: if_college_ed(x), axis=1)

In [8]:
# business occupation
occupations = ['businessperson','business executive','entrepreneur', 'businessman', 'business',
 'investor', 'executive', 'ceo', 'banker','manager', 'consultant','chief executive officer', 'finance', 'managing', 'executive','investment']

def if_bis_occuptation(row):
    occupation = row['occupation']
    if len(occupation) == 0:
        return "Missing"
    else:
        if_bis = False
        for occ in occupation:
            occ = occ.lower()
            for x in occupations:
                if x in occ:
                    if_bis = True
                    break
        return str(if_bis)

ses_df_labelled['if_business_occupation'] = ses_df_labelled.apply(lambda x: if_bis_occuptation(x), axis=1)

In [None]:
ses_df_labelled['if_business_ed']  =ses_df['if_business_ed']
ses_df_labelled.to_pickle(data_path + "fiction_ses_characteristics_labelled.pkl")

# check data distribution

In [18]:
data_path = "/zfs/projects/faculty/amirgo-management/HathiTrust/"
ses_df_labelled = pd.read_pickle(data_path + "fiction_ses_characteristics_labelled.pkl")

In [19]:
def if_american_citizen(ls):
    if len(ls) == 0:
        return "Missing"
    else:
        if_american = False
        for x in ls:
            if x == 'United States of America':
                if_american = True
                break
        return str(if_american)

In [20]:
ses_df_labelled['US_citizen'] = ses_df_labelled['citizenship'].apply(lambda x: if_american_citizen(x))

In [21]:
# def convert gender to string
def convert_gender(ls):
    if len(ls) == 0:
        return 'Missing'
    elif len(ls) == 1:
        if ls[0] == 'male' or ls[0] == 'female':
            return ls[0]
        else:
            return "Other"
    else:
        return "Other"

In [22]:
ses_df_labelled['gender_str'] = ses_df_labelled['gender'].apply(lambda x: convert_gender(x))
ses_df_labelled['birth_date'] = ses_df_labelled['birth_date'].apply(lambda x: x[0] if len(x) > 0 else "Missing")
ses_df_labelled['birth_year'] = ses_df_labelled['birth_date'].apply(lambda x: int(x[1:5]) if x != "Missing" else "Missing")
ses_df_labelled[['qid','birth_year','gender_str','if_college_ed','if_business_occupation','if_business_ed','US_citizen']].to_csv(data_path + "fiction_ses_characteristics_labelled_subset.csv", index=False)

In [11]:
ses_df_labelled['US_citizen'].value_counts()

US_citizen
False      8762
True       6520
Missing    2870
Name: count, dtype: int64

In [12]:
ses_df_labelled['if_business_ed'].value_counts()

if_business_ed
False      9836
Missing    8282
True         34
Name: count, dtype: int64

In [13]:
ses_df_labelled['if_business_occupation'].value_counts()

if_business_occupation
False      17121
Missing      902
True         129
Name: count, dtype: int64

In [14]:
ses_df_labelled['if_college_ed'].value_counts()

if_college_ed
True       8411
Missing    8282
False      1459
Name: count, dtype: int64

In [15]:
ses_df_labelled['gender_str'].value_counts()

gender_str
male       12415
female      5455
Missing      252
Other         30
Name: count, dtype: int64

In [17]:
ses_df_labelled[['qid','birth_year','gender_str','if_college_ed','if_business_occupation','if_business_ed','US_citizen']]

Unnamed: 0,qid,birth_year,gender_str,if_college_ed,if_business_occupation,if_business_ed,US_citizen
0,Q931181,1954,male,True,True,True,True
1,Q42511,1866,male,True,False,False,False
2,Q57068193,1951,male,True,False,False,True
3,Q6524734,1873,male,Missing,False,Missing,True
4,Q5546247,1821,male,True,False,False,False
...,...,...,...,...,...,...,...
18147,Q20751719,1885,male,Missing,False,Missing,False
18148,Q3195283,1880,male,Missing,True,Missing,True
18149,Q3778505,1881,male,False,False,False,False
18150,Q64684835,1904,male,Missing,False,Missing,Missing


In [27]:
meta_Df.merge(ses_df_labelled[['qid','birth_year','gender_str','if_college_ed','if_business_occupation','if_business_ed']], left_on='author_wikidata_qid', right_on='qid', how='left')

Unnamed: 0,id,docid,author_authorized_heading,author_lccn,author_viaf,author_wikidata_qid,oldauthor,author,author_marc,authordate,...,shorttitle,instances,juvenileprob,nonficprob,qid,birth_year,gender_str,if_college_ed,if_business_occupation,if_business_ed
0,75954,mdp.39015092163339,"Davis, John, 1954-",no2009108480,233022338,Q931181,"Davis, John A","Davis, John A","Davis, John A",d.1897.,...,Qing guo nu li : fu ren Zhongguo fu nü sheng h...,1,0.791218,0.287677,Q931181,1954.0,male,True,True,True
1,75953,ien.35556042103887,"Wells, H. G. (Herbert George), 1866-1946",n79063613,97006424,Q42511,"Wells, H. G. (Herbert George)","Wells, H. G. (Herbert George)","Wells, H. G. (Herbert George), 1866-1946",1866-1946.,...,Certain personal matters,1,0.006842,0.578157,Q42511,1866.0,male,True,False,False
2,75952,inu.30000127716409,"Jeon, Heecheon",n2010066094,137450559,,"Jeon, Heecheon","Jeon, Heecheon","Jeon, Heecheon",,...,Subjectivity of différance : a poiesis of deco...,1,0.026401,0.991856,,,,,,
3,75951,mdp.39076002906829,"Wilson, Steven E. (Steven Eugene), 1951-",n93800210,267738931,Q57068193,"Wison, Steven E. (Steven Eugene)","Wison, Steven E. (Steven Eugene)","Wilson, Steven E. (Steven Eugene), 1951-",1951-,...,The ghosts of Anatolia : an epic journey to fo...,1,0.304493,0.290158,Q57068193,1951.0,male,True,False,False
4,75950,umn.31951d030369342,"Kobrin, Leon, 1872-1946",n89117653,62354406,Q6524734,,,"Kobrin, Leon, 1872-1946",,...,Six plays of the yiddish theatre,1,0.387223,0.889030,Q6524734,1873.0,male,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75949,5,mdp.39015063779485,"Aldington, Richard, 1892-1962",n80057251,7402360,Q514998,"Aldington, Richard","Aldington, Richard","Aldington, Richard, 1892-1962",1892-1962.,...,"The romance of Casanova, a novel",1,0.004011,0.048060,Q514998,1892.0,male,True,False,False
75950,4,uc1.$b803918,"Steen, Marguerite",n50023624,32243514,Q6760816,"Steen, Marguerite","Steen, Marguerite","Steen, Marguerite",,...,Rose Timson; a novel,1,0.012550,0.101755,Q6760816,1894.0,female,False,False,False
75951,3,uc1.$b399374,"Lucas, Curtis, 1914-",nr2003039880,68876905,,"Lucas, Curtis","Lucas, Curtis","Lucas, Curtis",,...,"Third ward, Newark",1,0.160681,0.188503,,,,,,
75952,2,mdp.39015002713835,"Raddall, Thomas H., 1903-1994",n50052424,115893402,Q2161620,"Raddall, Thomas H","Raddall, Thomas H","Raddall, Thomas H., 1903-1994",1903-1994.,...,Pride's Fancy,1,0.008902,0.035858,Q2161620,1903.0,male,False,False,False
