In [2]:
import pandas as pd

<h2>Reading in Data</h2>

In [3]:
# Inventor tables for each field, as extracted from the PatentsView dataset by Kathryn.
pharma_inventors = pd.read_csv(filepath_or_buffer="data/working/pharma_inventors.csv")  # pharma inventors
civil_inventors = pd.read_csv(filepath_or_buffer="data/working/civil_inventors.csv")  # civil inventors

In [4]:
#Print Lengths for testing

In [5]:
pharma_inventors.shape[0]  # number of rows in the pharma table

380546

In [6]:
civil_inventors.shape[0]  # number of rows in the civil table

263460

In [7]:
#Taking a peek at the data

In [8]:
civil_inventors.head(n=5)  # first five civil rows

Unnamed: 0,location_id,inventor_id,name_first,name_last,male_flag,attribution_status,city,state,country,latitude,longitude,county,state_fips,county_fips
0,00006da3-cb90-11eb-9615-121df0c29c1e,fl:jo_ln:kephart-1,John W.,"Kephart, Jr.",1.0,1,Alder,MT,US,45.3247,-112.108,Madison,30.0,30057.0
1,0005ce99-cb8f-11eb-9615-121df0c29c1e,fl:ha_ln:marti-2,Hanspeter,Marti,1.0,1,Kottwil,,CH,47.1623,8.04498,,,
2,0010f898-cb91-11eb-9615-121df0c29c1e,fl:er_ln:booth-4,Eric Lee,Booth,1.0,1,Wlllacoochee,GA,US,,,,13.0,
3,00171e6a-cb90-11eb-9615-121df0c29c1e,fl:do_ln:english-3,Donald,English,1.0,1,Mount Herman,NJ,US,39.6176,-74.5943,,34.0,
4,002a61fb-cb8f-11eb-9615-121df0c29c1e,t6hjb0czspbxykv41mjt31rix,Terry L.,Saucke,1.0,1,Portland,ND,US,47.4982,-97.3704,Traill,38.0,38097.0


In [9]:
pharma_inventors.head(n=5)  # first five pharma rows

Unnamed: 0,location_id,inventor_id,name_first,name_last,male_flag,attribution_status,city,state,country,latitude,longitude,county,state_fips,county_fips
0,00054e5e-cb90-11eb-9615-121df0c29c1e,fl:ro_ln:chambers-18,Robert J.,Chambers,1.0,1,Msytic,CT,US,,,,9.0,
1,0007f8eb-cb90-11eb-9615-121df0c29c1e,fl:ph_ln:bussat-2,Philippe,Bussat,1.0,1,Fiegeres,,FR,,,,,
2,00104bda-cb90-11eb-9615-121df0c29c1e,fl:te_ln:mazer-3,Terrence B.,Mazer,1.0,1,Reynoldsbury,OH,US,,,,39.0,
3,00108172-cb90-11eb-9615-121df0c29c1e,fl:lo_ln:deurer-1,Lothar,Deurer,1.0,1,Kolbenz,,DE,,,,,
4,002161f5-cb8f-11eb-9615-121df0c29c1e,fl:fl_ln:nielsen-4,Flemming S.,Nielsen,1.0,1,Federikssund,,DK,,,,,


<h3> We note that the <i>name_first</i> column can contain spaces and middle initials. We need to strip those to obtain a clean first name for gender identification.</h3>

In [10]:
# Keep only the first whitespace-delimited token of name_first, which drops
# middle names and initials such as the "W." in "John W.".
# Splitting on any run of whitespace (no explicit separator) instead of a
# literal ' ' means a leading or doubled space can no longer produce an empty
# first token.
pharma_inventors['cleaned_name'] = pharma_inventors['name_first'].str.split().str.get(0)
civil_inventors['cleaned_name'] = civil_inventors['name_first'].str.split().str.get(0)

In [11]:
# Lower-case the first-name token in both inventor tables.
for inventors in (civil_inventors, pharma_inventors):
    inventors['cleaned_name'] = inventors['cleaned_name'].str.lower()

In [12]:
civil_inventors.loc[:, 'cleaned_name'].head(n=5)  # spot-check the cleaned civil names

0         john
1    hanspeter
2         eric
3       donald
4        terry
Name: cleaned_name, dtype: object

In [13]:
pharma_inventors.loc[:, 'cleaned_name'].head(n=5)  # spot-check the cleaned pharma names

0      robert
1    philippe
2    terrence
3      lothar
4    flemming
Name: cleaned_name, dtype: object

<h3> Method 1 - Using the WGND1 dataset </h3>

In [14]:
from zipfile import ZipFile

# Read the WGND 1.1 dictionary CSV directly from the zip archive.
# Both the archive and the member stream are context-managed, so the member
# handle is closed explicitly once pandas has finished parsing it (the
# original left the handle returned by myzip.open() unclosed).
with ZipFile("wgnd_1_1.zip") as myzip, myzip.open("dictionary_source_v1.1.csv") as wgnd1_file:
    wgnd1 = pd.read_csv(wgnd1_file)

In [15]:
wgnd1.head(n=5)  # first five dictionary rows

Unnamed: 0,gender,name,source,code
0,M,A G,Albertagvt,CA
1,M,A HANNAN,Albertagvt,CA
2,M,A JAY,Albertagvt,CA
3,M,A K I L,Albertagvt,CA
4,F,A LAH,Albertagvt,CA


In [16]:
# Lower-case the dictionary names so they line up with the lower-cased
# cleaned_name column used as the merge key.
wgnd1_names_lower = wgnd1['name'].str.lower()
wgnd1['name'] = wgnd1_names_lower

In [17]:
# Left-join each inventor table to the WGND1 dictionary on the cleaned first
# name; inventors without a dictionary entry are kept (how='left').
civil_inventors_wgnd = civil_inventors.merge(
    wgnd1, how='left', left_on='cleaned_name', right_on='name'
)
pharma_inventors_wgnd = pharma_inventors.merge(
    wgnd1, how='left', left_on='cleaned_name', right_on='name'
)

<h3> Method 2 - Using the WGND2 dataset </h3>

In [18]:
# Read the WGND 2.0 name/gender/code CSV directly from the zip archive.
# The member stream is context-managed alongside the archive so its handle is
# closed explicitly (the original left the handle from myzip.open() unclosed).
with ZipFile("wgnd_2_0.zip") as myzip, myzip.open("wgnd_2_0_name-gender-code.csv") as wgnd2_file:
    wgnd2 = pd.read_csv(wgnd2_file)

In [19]:
wgnd2.head(n=5)  # first five dictionary rows

Unnamed: 0,name,code,gender,wgt
0,"""baby""",AU,F,1.0
1,'aisyah,AU,F,1.0
2,'anela,CA,F,1.0
3,'fiyinfoluwa,CA,F,1.0
4,'olioni,AU,M,1.0


In [20]:
# Remove quote characters wrapped around the names: single quotes are stripped
# from both ends first, then double quotes.
wgnd2['name'] = wgnd2['name'].str.strip("'").str.strip('"')

In [None]:
# Left-join the WGND1-annotated tables to the WGND2 dictionary on the cleaned
# first name. Column names shared by both sides receive pandas' _x/_y suffixes.
civil_inventors_wgnd2 = civil_inventors_wgnd.merge(
    wgnd2, how='left', left_on='cleaned_name', right_on='name'
)
pharma_inventors_wgnd2 = pharma_inventors_wgnd.merge(
    wgnd2, how='left', left_on='cleaned_name', right_on='name'
)

In [None]:
# Remove columns that are not needed downstream so that the row-level
# de-duplication that follows is effective.
_redundant_columns = ['source', 'code_x', 'name_x', 'name_y']
civil_inventors_wgnd2 = civil_inventors_wgnd2.drop(columns=_redundant_columns)
pharma_inventors_wgnd2 = pharma_inventors_wgnd2.drop(columns=_redundant_columns)

In [None]:
# Collapse rows that are identical across every remaining column.
civil_inventors_wgnd2 = civil_inventors_wgnd2.drop_duplicates()

In [38]:
# De-duplicate the merged pharma frame itself, mirroring the civil cell above.
# The original targeted pharma_inventors_wgnd2_country_cleaned, which is only
# created in the following cell, so a top-to-bottom run failed with NameError.
# Removing exact-duplicate rows before the country filter yields the same
# filtered rows as removing them afterwards, so the saved result is unchanged.
pharma_inventors_wgnd2.drop_duplicates(inplace=True)

387880

In [34]:
# Keep only the rows where the WGND2 code column (code_y after the merge)
# equals the inventor's country column.
_civil_same_country = civil_inventors_wgnd2['code_y'].eq(civil_inventors_wgnd2['country'])
civil_inventors_wgnd2_country_cleaned = civil_inventors_wgnd2.loc[_civil_same_country]

_pharma_same_country = pharma_inventors_wgnd2['code_y'].eq(pharma_inventors_wgnd2['country'])
pharma_inventors_wgnd2_country_cleaned = pharma_inventors_wgnd2.loc[_pharma_same_country]

In [None]:
# Persist the country-matched pharma table; it is read back in the sampling section.
pharma_inventors_wgnd2_country_cleaned.to_csv(path_or_buf="pharma_wgnd1_wgnd2.csv")

In [None]:
# Persist the country-matched civil table; it is read back in the sampling section.
civil_inventors_wgnd2_country_cleaned.to_csv(path_or_buf="civil_wgnd1_wgnd2.csv")

<h2> Generating Samples of Inventors </h2>

In [2]:
import pandas as pd
# Reload the merged, country-matched tables written by the previous section.
pharma_inventors = pd.read_csv(filepath_or_buffer="pharma_wgnd1_wgnd2.csv")
civil_inventors = pd.read_csv(filepath_or_buffer="civil_wgnd1_wgnd2.csv")

In [3]:
#Eliminate duplicate Inventor IDs: order rows by 'Unnamed: 0' descending, keep the
#first row seen for each inventor_id, then put the survivors back in index order.
civil_inventors = (
    civil_inventors
    .sort_values(by='Unnamed: 0', ascending=False)
    .drop_duplicates(subset='inventor_id', keep='first')
    .sort_index()
)
pharma_inventors = (
    pharma_inventors
    .sort_values(by='Unnamed: 0', ascending=False)
    .drop_duplicates(subset='inventor_id', keep='first')
    .sort_index()
)

In [4]:
#Use Male_Flag to split into subgroups: rows with male_flag equal to 1.0 versus
#everything else (any other value, including a missing flag).
_civil_is_male = civil_inventors['male_flag'] == 1.0
civil_inventors_male = civil_inventors[_civil_is_male]
civil_inventors_nonmale = civil_inventors[~_civil_is_male]

_pharma_is_male = pharma_inventors['male_flag'] == 1.0
pharma_inventors_male = pharma_inventors[_pharma_is_male]
pharma_inventors_nonmale = pharma_inventors[~_pharma_is_male]

In [None]:
#subsampling by country, taking 10% of each country's rows (frac=.1)

In [5]:
# First-stage stratified sample: draw 10% of the rows within every country.
# A fixed random_state makes the draw repeatable across kernel restarts; the
# original sampled without a seed, so every run produced different samples.
FIRST_STAGE_SEED = 42
pharma_inventors_male_samples=pharma_inventors_male.groupby('country',group_keys=False).apply(lambda x: x.sample(frac=.1, random_state=FIRST_STAGE_SEED))
pharma_inventors_nonmale_samples=pharma_inventors_nonmale.groupby('country',group_keys=False).apply(lambda x: x.sample(frac=.1, random_state=FIRST_STAGE_SEED))
civil_inventors_nonmale_samples=civil_inventors_nonmale.groupby('country',group_keys=False).apply(lambda x: x.sample(frac=.1, random_state=FIRST_STAGE_SEED))
civil_inventors_male_samples=civil_inventors_male.groupby('country',group_keys=False).apply(lambda x: x.sample(frac=.1, random_state=FIRST_STAGE_SEED))

In [6]:
#applying sampling again to get sets of 1000 rows.
# Each subgroup has its own per-country fraction (presumably tuned so each set
# lands near 1000 rows -- TODO confirm against the first-stage sample sizes).
# A fixed random_state makes the draw repeatable; the original was unseeded.
SECOND_STAGE_SEED = 43
civil_inventors_male_samples_large=civil_inventors_male_samples.groupby('country',group_keys=False).apply(lambda x: x.sample(frac=.054, random_state=SECOND_STAGE_SEED))
civil_inventors_nonmale_samples_large=civil_inventors_nonmale_samples.groupby('country',group_keys=False).apply(lambda x: x.sample(frac=0.692, random_state=SECOND_STAGE_SEED))
pharma_inventors_male_samples_large=pharma_inventors_male_samples.groupby('country',group_keys=False).apply(lambda x: x.sample(frac=.05865, random_state=SECOND_STAGE_SEED))
pharma_inventors_nonmale_samples_large=pharma_inventors_nonmale_samples.groupby('country',group_keys=False).apply(lambda x: x.sample(frac=.1709, random_state=SECOND_STAGE_SEED))

In [None]:
#Apply Gender Guesser

In [8]:
import gender_guesser.detector as gender
# cleaned_name is lower-cased earlier in the notebook, but Detector matches
# names case-sensitively by default, so lower-case input would come back as
# 'unknown'. case_sensitive=False makes the lookup ignore case.
d = gender.Detector(case_sensitive=False)

In [None]:
# Add a 'gender-guesser' column to every sample by running the shared detector
# over each cleaned first name.
for sample in (
    civil_inventors_male_samples_large,
    civil_inventors_nonmale_samples_large,
    pharma_inventors_male_samples_large,
    pharma_inventors_nonmale_samples_large,
):
    sample['gender-guesser'] = sample['cleaned_name'].apply(d.get_gender)



In [None]:
#apply Genderize

In [9]:
from genderize import Genderize

In [None]:
def genderize(name):
    """Return the gender the Genderize client reports for a single first name.

    The API key is read from the GENDERIZE_API_KEY environment variable instead
    of being committed in the notebook. When the variable is not set, api_key
    is None and the client is created without a key.

    NOTE(review): the key that was previously hard-coded here has been exposed
    in the notebook history and should be revoked/rotated.

    Parameters
    ----------
    name : str
        First name to look up.

    Returns
    -------
    The 'gender' field of the first (and only) result returned for ``name``.
    """
    import os  # local import keeps this cell self-contained

    client = Genderize(
        user_agent='GenderizeDocs/0.0',
        api_key=os.environ.get('GENDERIZE_API_KEY'),
        timeout=5.0)
    # One name is sent per request, so the single result is at position 0.
    return client.get([name])[0]['gender']

In [None]:
# Add a 'genderized' column to every sample by looking each cleaned first name
# up through genderize(). Only the cleaned_name column is needed, so the lookup
# is applied column-wise (Series.apply) rather than row-wise over the whole
# frame; this matches the gender-guesser cell and also works on an empty sample.
for sample in (
    civil_inventors_male_samples_large,
    civil_inventors_nonmale_samples_large,
    pharma_inventors_male_samples_large,
    pharma_inventors_nonmale_samples_large,
):
    sample['genderized'] = sample['cleaned_name'].apply(genderize)

In [None]:
# Write the four annotated samples to disk.
# The output directory is created first: nothing earlier in the notebook makes
# it, and to_csv would otherwise fail after the lookups have already been run.
import os

os.makedirs("samples", exist_ok=True)
civil_inventors_male_samples_large.to_csv("samples/civil_inventors_male_samples_large.csv")
civil_inventors_nonmale_samples_large.to_csv("samples/civil_inventors_nonmale_samples_large.csv")
pharma_inventors_male_samples_large.to_csv("samples/pharma_inventors_male_samples_large.csv")
pharma_inventors_nonmale_samples_large.to_csv("samples/pharma_inventors_nonmale_samples_large.csv")