### Initializing

In [149]:
import pandas as pd
import json
import numpy as np
import os

In [150]:
def create_profile(x, url):
    with open(str(url)+str(x)) as f:
        dict1 = json.load(f)
    list_col= ['experience', 'education', 'languages','languages','publications', 'certifications','volunteer', 'honors', 'projects']
    for n in list_col:
        if n in dict1:
            for i in range(len(dict1[n])): 
                dict1[str(n+str(i+1))]= dict1[n][i]
    data = pd.DataFrame.from_dict(dict1, orient='index').T
    return data

In [151]:
file_list = [f for f in os.listdir('../data/jsons') if f.endswith(".json")]
len(file_list)

49

In [152]:
initial_profile=create_profile('brachetantoine.json', '../data/jsons/')

In [153]:
profiles = initial_profile
for i in file_list:
    a= create_profile(i, '../data/jsons/')
    profiles= pd.concat((profiles,a),axis=0)

In [154]:
file_list2 = [f for f in os.listdir('../data/jsons2') if f.endswith(".json")]
len(file_list2)

124

In [155]:
profiles2 = initial_profile
for i in file_list2:
    a= create_profile(i,'../data/jsons2/')
    profiles2= pd.concat((profiles2,a),axis=0)

In [156]:
data=pd.concat([profiles, profiles2]).reset_index(drop=True)
data.head(2)

Unnamed: 0,summary,industryName,lastName,student,geoCountryName,geoCountryUrn,geoLocationBackfilled,elt,industryUrn,firstName,...,publications9,publications10,maidenName,projects8,certifications5,certifications6,certifications7,certifications8,certifications9,certifications10
0,Antoine croit à l’intelligence de tous et à la...,IT Services and IT Consulting,Brachet,False,France,urn:li:fs_geo:105015875,False,False,urn:li:fs_industry:96,Antoine,...,,,,,,,,,,
1,French Entrepreneur - Founder and Managing Par...,Venture Capital and Private Equity Principals,Demri,False,France,urn:li:fs_geo:105015875,False,False,urn:li:fs_industry:106,Bobby,...,,,,,,,,,,


### Initial dataset cleaning

In [157]:
data.shape

(173, 89)

In [158]:
print(list(data.columns))

['summary', 'industryName', 'lastName', 'student', 'geoCountryName', 'geoCountryUrn', 'geoLocationBackfilled', 'elt', 'industryUrn', 'firstName', 'entityUrn', 'geoLocation', 'geoLocationName', 'location', 'headline', 'displayPictureUrl', 'img_100_100', 'img_200_200', 'img_400_400', 'img_800_800', 'profile_id', 'profile_urn', 'member_urn', 'public_id', 'experience', 'education', 'languages', 'publications', 'certifications', 'volunteer', 'honors', 'projects', 'experience1', 'experience2', 'experience3', 'experience4', 'experience5', 'education1', 'education2', 'education3', 'languages1', 'languages2', 'languages3', 'publications1', 'publications2', 'publications3', 'publications4', 'publications5', 'publications6', 'publications7', 'publications8', 'locationName', 'honors1', 'honors2', 'honors3', 'honors4', 'honors5', 'honors6', 'honors7', 'volunteer1', 'volunteer2', 'volunteer3', 'projects1', 'projects2', 'projects3', 'birthDate', 'languages4', 'projects4', 'projects5', 'address', 'lan

### Drop duplicates in rows

In [159]:
mask = data.duplicated(subset=['headline', 'firstName', 'lastName'], keep='first')
dropped = data.loc[mask]
dropped

Unnamed: 0,summary,industryName,lastName,student,geoCountryName,geoCountryUrn,geoLocationBackfilled,elt,industryUrn,firstName,...,publications9,publications10,maidenName,projects8,certifications5,certifications6,certifications7,certifications8,certifications9,certifications10
3,Antoine croit à l’intelligence de tous et à la...,IT Services and IT Consulting,Brachet,False,France,urn:li:fs_geo:105015875,False,False,urn:li:fs_industry:96,Antoine,...,,,,,,,,,,
48,Antoine croit à l’intelligence de tous et à la...,IT Services and IT Consulting,Brachet,False,France,urn:li:fs_geo:105015875,False,False,urn:li:fs_industry:96,Antoine,...,,,,,,,,,,
50,French Entrepreneur - Founder and Managing Par...,Venture Capital and Private Equity Principals,Demri,False,France,urn:li:fs_geo:105015875,False,False,urn:li:fs_industry:106,Bobby,...,,,,,,,,,,
51,Inspired by the power we can build to change t...,Civic and Social Organizations,Durieux,False,France,urn:li:fs_geo:105015875,False,False,urn:li:fs_industry:90,Sarah,...,,,,,,,,,,
84,,Internet,Paret,False,France,urn:li:fs_geo:105015875,False,True,urn:li:fs_industry:6,Jeremie,...,,,,,,,,,,


In [160]:
data.drop_duplicates(subset=['headline', 'firstName', 'lastName'], keep='first', inplace=True)

In [161]:
print(sorted(list(data.lastName.str.lower())))

['albet', 'alemanno', 'ami', 'andré', 'arrouard', 'auclair', 'auclair', 'baillot', 'banzet', 'barbaut', 'barbero', 'barbier', 'barre', 'bauquet', 'behar', 'behar', 'benkirane', 'bensa', 'berenguer-moncada', 'bertrand', 'bertrand', 'bessoles', 'besson', 'besson', 'bianchin-fabre', 'bloch', 'bouché', 'boudehen', 'bourlitio', 'brachet', 'briante guillemont', 'buchotte', 'bézard', 'canivet', 'chadenet', 'chaput', 'chatellier', 'chaygneaud-dupuy', 'chevallereau', 'choisne', 'claret', 'cochin', 'colas', 'colas', 'colombe', 'combaz', 'coudard', 'coutant', 'coutant', 'd', 'd.', 'daniel', 'daniel', 'dardier', 'dauchez', 'de bodman', 'de briey', 'de marignan', 'de pimodan', 'de roquefeuil', 'de royer', 'de sousa', 'de sousa', 'deffrennes', 'deleris', 'deleris', 'dembele', 'demri', 'demri', 'dernoncourt', 'des gachons', 'desmaison', 'deville', 'diguet', 'dubreuil', 'dubreuil', 'duguet', 'duperthuy', 'durieux', 'duriez', 'duriez', 'duval', 'désigaud', 'désigaud', 'escoubes', 'eve', 'forest', 'fox'

In [162]:
dups=(data.loc[(data.lastName.str.lower()=="auclair")|(data.lastName.str.lower()=="behar")|(data.lastName.str.lower()=="bertrand")|(data.lastName.str.lower()=="besson")|(data.lastName.str.lower()=="colas")|(data.lastName.str.lower()=="coutant")|(data.lastName.str.lower()=="daniel")|(data.lastName.str.lower()=="de sousa")|(data.lastName.str.lower()=="deleris")|(data.lastName.str.lower()=="demri")|(data.lastName.str.lower()=="dubreuil")|(data.lastName.str.lower()=="duriez")|(data.lastName.str.lower()=="désigaud")|(data.lastName.str.lower()=="kergall")|(data.lastName.str.lower()=="l.")|(data.lastName.str.lower()=="le roux")|(data.lastName.str.lower()=="legros")|(data.lastName.str.lower()=="lenoir")|(data.lastName.str.lower()=="levoir")|(data.lastName.str.lower()=="trèves")])[["firstName", "lastName"]].sort_values(by="lastName")
print(list(dups.index)[0::2])
data.drop(labels=list(dups.index)[0::2], axis=0, inplace=True)

[155, 152, 114, 130, 111, 147, 89, 54, 125, 126, 127, 69, 162, 65, 59, 87, 64, 133, 168, 74]


### Dropping unnecessary columns (specific to LIn, personal information, etc.)

In [163]:
pd.options.display.max_rows = 100

In [164]:
todrop=['industryName', 'experience', 'education', 'languages','languages','publications', 
        'certifications','volunteer', 'honors', 'projects', 
        'displayPictureUrl', 'img_100_100', 'img_200_200', 'img_400_400', 
        'img_800_800', 'student', 'maidenName', 'birthDate', 
        'address','geoCountryUrn', 'geoLocationBackfilled', 'elt', 'industryUrn',
        'entityUrn', 'geoLocation', 'location', 'locationName', 
        'profile_id', 'profile_urn', 'member_urn', 'public_id']
data.drop(columns=todrop, inplace=True)

In [165]:
data.describe().T.sort_index()

Unnamed: 0,count,unique,top,freq
certifications1,39,39,{'name': 'Psychology'},1
certifications10,1,1,{'authority': 'The London School of Economics ...,1
certifications2,18,18,"{'authority': 'OpenClassrooms', 'name': 'Learn...",1
certifications3,9,9,"{'authority': 'COLIBRIS', 'name': 'Quelle démo...",1
certifications4,4,4,"{'authority': 'A Cloud Guru', 'name': 'Introdu...",1
certifications5,3,3,"{'authority': 'Google', 'name': 'Google Ads Ce...",1
certifications6,3,3,"{'authority': 'Google', 'name': 'Certification...",1
certifications7,3,3,"{'authority': 'HubSpot', 'name': 'Hubspot Inbo...",1
certifications8,1,1,{'authority': 'ANSSI - Agence nationale de la ...,1
certifications9,1,1,"{'authority': 'Institut français', 'name': 'Cu...",1


### Check who responded to what

In [166]:
data.isna().sum().sort_values(ascending=False)

certifications10    147
certifications9     147
certifications8     147
projects8           147
languages7          146
honors7             146
certifications7     145
certifications6     145
certifications5     145
certifications4     144
projects7           144
projects6           144
honors6             142
languages6          141
honors5             141
certifications3     139
honors4             139
projects5           138
honors3             138
publications10      135
projects4           135
publications9       132
publications8       130
projects3           130
certifications2     130
languages5          129
honors2             129
publications7       127
publications6       124
publications5       121
projects2           120
honors1             119
publications4       113
publications3       112
certifications1     109
volunteer3          108
publications2       104
projects1           103
languages4          102
volunteer2           90
publications1        87
volunteer1      

In [167]:
data["lang_stated"]=1
data["honors_stated"]=1
data["pubs_stated"]=1
data["projects_stated"]=1
data["certif_stated"]=1
data.head(1)

Unnamed: 0,summary,lastName,geoCountryName,firstName,geoLocationName,headline,experience1,experience2,experience3,experience4,...,certifications6,certifications7,certifications8,certifications9,certifications10,lang_stated,honors_stated,pubs_stated,projects_stated,certif_stated
0,Antoine croit à l’intelligence de tous et à la...,Brachet,France,Antoine,Greater Paris Metropolitan Region,"Directeur associé de bluenove, initiateur du m...","{'locationName': 'Paris Area, France', 'entity...",{'entityUrn': 'urn:li:fs_position:(ACoAAAA615E...,"{'locationName': 'Paris', 'entityUrn': 'urn:li...","{'locationName': 'Paris Area, France', 'entity...",...,,,,,,1,1,1,1,1


In [168]:
data.loc[data["languages1"].isna(), "lang_stated"]=0
data.loc[data["honors1"].isna(), "honors_stated"]=0
data.loc[data["projects1"].isna(), "projects_stated"]=0
data.loc[data["certifications1"].isna(), "certif_stated"]=0
data.loc[data["publications1"].isna(), "pubs_stated"]=0

In [169]:
print(sum(data["lang_stated"]))
print(sum(data["honors_stated"]))
print(sum(data["projects_stated"]))
print(sum(data["certif_stated"]))
print(sum(data["pubs_stated"]))

107
29
45
39
61


In [170]:
data["over_2_lang"]=1

In [171]:
data.loc[data["languages3"].isna(), "over_2_lang"]=0
print(sum(data["over_2_lang"]))

86


In [172]:
data["lin_details_remplis"]=0

In [173]:
details_col_list= [i for i in data.columns if "stated" in i]
details_col_list

['lang_stated',
 'honors_stated',
 'pubs_stated',
 'projects_stated',
 'certif_stated']

In [174]:
data["lin_details_remplis"]= data["lang_stated"]+data["honors_stated"]+data["pubs_stated"]+data["projects_stated"]+data["certif_stated"]
print(list(data["lin_details_remplis"]))

[2, 1, 2, 3, 1, 2, 3, 1, 3, 3, 2, 4, 0, 2, 5, 0, 3, 3, 1, 4, 1, 1, 1, 1, 1, 1, 2, 3, 1, 2, 0, 2, 1, 0, 2, 2, 2, 1, 0, 5, 5, 1, 2, 2, 3, 2, 0, 1, 1, 3, 5, 3, 0, 2, 0, 3, 3, 2, 3, 3, 0, 4, 2, 2, 0, 0, 3, 1, 2, 2, 0, 3, 1, 0, 2, 2, 2, 1, 1, 2, 2, 3, 0, 3, 0, 0, 0, 4, 2, 4, 2, 2, 3, 3, 3, 2, 2, 3, 1, 1, 0, 3, 3, 0, 3, 0, 0, 1, 4, 3, 4, 2, 3, 2, 1, 2, 3, 0, 1, 2, 2, 2, 2, 1, 1, 2, 2, 5, 3, 4, 2, 2, 1, 2, 1, 3, 2, 2, 1, 0, 2, 2, 1, 2, 2, 2, 3, 2]


### Exploding columns

In [175]:
print(list(data.columns))

['summary', 'lastName', 'geoCountryName', 'firstName', 'geoLocationName', 'headline', 'experience1', 'experience2', 'experience3', 'experience4', 'experience5', 'education1', 'education2', 'education3', 'languages1', 'languages2', 'languages3', 'publications1', 'publications2', 'publications3', 'publications4', 'publications5', 'publications6', 'publications7', 'publications8', 'honors1', 'honors2', 'honors3', 'honors4', 'honors5', 'honors6', 'honors7', 'volunteer1', 'volunteer2', 'volunteer3', 'projects1', 'projects2', 'projects3', 'languages4', 'projects4', 'projects5', 'languages5', 'languages6', 'certifications1', 'projects6', 'projects7', 'certifications2', 'certifications3', 'certifications4', 'languages7', 'publications9', 'publications10', 'projects8', 'certifications5', 'certifications6', 'certifications7', 'certifications8', 'certifications9', 'certifications10', 'lang_stated', 'honors_stated', 'pubs_stated', 'projects_stated', 'certif_stated', 'over_2_lang', 'lin_details_rem

In [176]:
for i in range (1,4): 
    data=pd.concat([data, 
    pd.DataFrame(data['education'+str(i)].apply(pd.Series)).add_prefix('ed_'+str(i)), 
    pd.DataFrame(data['volunteer'+str(i)].apply(pd.Series)).add_prefix('vol_'+str(i))], 
    axis=1).reset_index(drop=True)
    data.drop(columns=['education'+str(i),'volunteer'+str(i)], inplace=True)   

In [177]:
for i in range (1,6): 
    data=pd.concat([data, 
    pd.DataFrame(data['experience'+str(i)].apply(pd.Series)).add_prefix('exp_'+str(i))], 
    axis=1).reset_index(drop=True)
    data.drop(columns='experience'+str(i), inplace=True)   

In [178]:
for i in range (1,8): 
    data=pd.concat([data, 
    pd.DataFrame(data['honors'+str(i)].apply(pd.Series)).add_prefix('hon_'+str(i)), 
    pd.DataFrame(data['languages'+str(i)].apply(pd.Series)).add_prefix('lang_'+str(i))], 
    axis=1).reset_index(drop=True)
    data.drop(columns=['honors'+str(i),'languages'+str(i)], inplace=True)    

In [179]:
for i in range (1,9): 
    data=pd.concat([data, 
    pd.DataFrame(data['projects'+str(i)].apply(pd.Series)).add_prefix('proj_'+str(i))], 
    axis=1).reset_index(drop=True)
    data.drop(columns='projects'+str(i), inplace=True)

In [180]:
for i in range (1,11): 
    data=pd.concat([data, 
    pd.DataFrame(data['certifications'+str(i)].apply(pd.Series)).add_prefix('cert_'+str(i)), 
    pd.DataFrame(data['publications'+str(i)].apply(pd.Series)).add_prefix('pub_'+str(i))], 
    axis=1).reset_index(drop=True)
    data.drop(columns=['certifications'+str(i),'publications'+str(i)], inplace=True)

In [181]:
list_columns=[i for i in data.columns if ("Urn" in i or "Logo" in i or "_region" in i or '_abuse' in i or '_geo' in i or '_proj' in i or '_organi' in i or '_0' in i)]
#to check before dropping 
data[list_columns].T
data.drop(columns=list_columns, inplace=True)

In [182]:
reg_columns=[i for i in data.columns if "region" in i]
reg_columns
data.drop(columns=reg_columns, inplace=True)

### Dropping columns with too many missing values

In [183]:
data.shape

(148, 383)

In [184]:
print(list(data.columns))

['summary', 'lastName', 'geoCountryName', 'firstName', 'geoLocationName', 'headline', 'lang_stated', 'honors_stated', 'pubs_stated', 'projects_stated', 'certif_stated', 'over_2_lang', 'lin_details_remplis', 'ed_1school', 'ed_1timePeriod', 'ed_1degreeName', 'ed_1schoolName', 'ed_1fieldOfStudy', 'ed_1activities', 'ed_1description', 'ed_1grade', 'ed_1courses', 'ed_10', 'ed_1projects', 'ed_1honors', 'vol_10', 'vol_1role', 'vol_1companyName', 'vol_1timePeriod', 'vol_1cause', 'vol_1description', 'vol_1company', 'ed_2timePeriod', 'ed_2schoolName', 'ed_2school', 'ed_2fieldOfStudy', 'ed_2degreeName', 'ed_20', 'ed_2activities', 'ed_2grade', 'ed_2description', 'ed_2projects', 'ed_2honors', 'ed_2courses', 'vol_20', 'vol_2role', 'vol_2companyName', 'vol_2timePeriod', 'vol_2cause', 'vol_2description', 'vol_2company', 'ed_3grade', 'ed_3timePeriod', 'ed_3degreeName', 'ed_3schoolName', 'ed_3fieldOfStudy', 'ed_3school', 'ed_3description', 'ed_30', 'ed_3activities', 'ed_3courses', 'ed_3honors', 'ed_3proj

In [185]:
full_null=[i for i in data.columns if data[i].isna().sum()>147]
data.drop(columns=full_null, inplace=True)

In [186]:
data.shape

(148, 330)

In [187]:
nullvalues=pd.DataFrame(data.isna().sum().sort_values(ascending=False))
print(list(nullvalues.index)[0:50])

['hon_7description', 'proj_8title', 'exp_1courses', 'cert_7url', 'cert_9company', 'cert_9timePeriod', 'cert_9name', 'cert_9authority', 'proj_8description', 'proj_8timePeriod', 'proj_8members', 'proj_6occupation', 'proj_8occupation', 'cert_7displaySource', 'cert_8company', 'exp_2courses', 'cert_6licenseNumber', 'cert_6url', 'cert_7licenseNumber', 'cert_8authority', 'exp_2honors', 'cert_9displaySource', 'proj_8url', 'cert_4displaySource', 'cert_4url', 'cert_2licenseNumber', 'cert_6displaySource', 'exp_2organizations', 'cert_10company', 'cert_10timePeriod', 'cert_10name', 'cert_10authority', 'ed_3projects', 'exp_5courses', 'cert_3displaySource', 'cert_3url', 'cert_9url', 'cert_8name', 'ed_3honors', 'exp_2projects', 'proj_6url', 'exp_5honors', 'lang_7name', 'exp_3projects', 'exp_5organizations', 'hon_7occupation', 'hon_7title', 'exp_4courses', 'exp_1organizations', 'lang_7proficiency']


In [188]:
data.reset_index(inplace=True)
data.set_index("index")

Unnamed: 0_level_0,summary,lastName,geoCountryName,firstName,geoLocationName,headline,lang_stated,honors_stated,pubs_stated,projects_stated,...,cert_10authority,cert_10name,cert_10timePeriod,cert_10company,pub_10date,pub_10name,pub_10publisher,pub_10description,pub_10url,pub_10authors
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Antoine croit à l’intelligence de tous et à la...,Brachet,France,Antoine,Greater Paris Metropolitan Region,"Directeur associé de bluenove, initiateur du m...",1,0,1,0,...,,,,,,,,,,
1,French Entrepreneur - Founder and Managing Par...,Demri,France,Bobby,"Paris, Île-de-France",Founder & Managing Partner at ROCH Ventures,0,1,0,0,...,,,,,,,,,,
2,Inspired by the power we can build to change t...,Durieux,France,Sarah,"Paris, Île-de-France",Co-director Multitudes Foundation - Activist a...,1,0,1,0,...,,,,,,,,,,
3,"IT Consultant, Technical Expert, Lead develope...",Jaillot,France,Bastien,Greater Paris Metropolitan Region,Web expert chez Jolicode,1,0,1,1,...,,,,,,,,,,
4,,Vanneroy,France,Coline,Greater Paris Metropolitan Region,Directrice des opérations chez Cap Collectif,0,0,0,1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143,Mes projets du moment : \n- Le pilotage de la ...,Gresset,France,Sarah,,Directrice de projets RH,1,0,0,0,...,,,,,,,,,,
144,,Briante Guillemont,France,Sophie,Greater Paris Metropolitan Region,Political adviser,1,0,1,0,...,,,,,,,,,,
145,,Gabadou Santiago,France,Eloïse,,Democracy and Innovation,1,0,0,1,...,,,,,,,,,,
146,"Telecom & Environment engineer, I enjoy to con...",Coutant,France,Ludovic,Greater Paris Metropolitan Region,Engineer specialized in the digital & energy t...,1,0,1,1,...,,,,,,,,,,


In [189]:
columns= data.columns
columnstodrop=[i for i in data.columns if (data[i].isna().sum() > len(data)/2+10) or (i=="index")]
print(columnstodrop)

add_data=data[["index", "lastName", "firstName"]]
add_data=add_data.merge(data[columnstodrop], how='left', on="index")
add_data.head()

['index', 'ed_1activities', 'ed_1description', 'ed_1grade', 'ed_1courses', 'ed_1projects', 'ed_1honors', 'vol_1cause', 'vol_1description', 'vol_1company', 'ed_2activities', 'ed_2grade', 'ed_2description', 'ed_2projects', 'ed_2honors', 'ed_2courses', 'vol_2role', 'vol_2companyName', 'vol_2timePeriod', 'vol_2cause', 'vol_2description', 'vol_2company', 'ed_3grade', 'ed_3description', 'ed_3activities', 'ed_3courses', 'ed_3honors', 'ed_3projects', 'vol_3role', 'vol_3companyName', 'vol_3timePeriod', 'vol_3description', 'vol_3company', 'vol_3cause', 'exp_1honors', 'exp_1organizations', 'exp_1courses', 'exp_2organizations', 'exp_2projects', 'exp_2honors', 'exp_2courses', 'exp_3projects', 'exp_3courses', 'exp_3honors', 'exp_4projects', 'exp_4honors', 'exp_4courses', 'exp_5honors', 'exp_5projects', 'exp_5organizations', 'exp_5courses', 'hon_1description', 'hon_1occupation', 'hon_1title', 'hon_1issueDate', 'hon_1issuer', 'hon_2description', 'hon_2occupation', 'hon_2title', 'hon_2issueDate', 'hon_

Unnamed: 0,index,lastName,firstName,ed_1activities,ed_1description,ed_1grade,ed_1courses,ed_1projects,ed_1honors,vol_1cause,...,cert_10authority,cert_10name,cert_10timePeriod,cert_10company,pub_10date,pub_10name,pub_10publisher,pub_10description,pub_10url,pub_10authors
0,0,Brachet,Antoine,,,,,,,,...,,,,,,,,,,
1,1,Demri,Bobby,Junior Entreprise. Département Moyen-Orient.,,,,,,,...,,,,,,,,,,
2,2,Durieux,Sarah,,The journey to transformative change in our so...,,,,,POLITICS,...,,,,,,,,,,
3,3,Jaillot,Bastien,,,,,,,,...,,,,,,,,,,
4,4,Vanneroy,Coline,,"Mémoire : ""Les conceptions ordinaires de l'Eur...",,,,,,...,,,,,,,,,,


In [190]:
add_data.to_csv("//Users/tatianadeferaudy/Desktop/bacasable/marche_CT/additional_data_low_response.csv", sep=';', index=True)

In [191]:
data.drop(columns=columnstodrop, inplace=True)

In [192]:
data.shape

(148, 72)

In [193]:
nullvalues=pd.DataFrame(data.isna().sum().sort_values(ascending=False))
nullvalues.head()

Unnamed: 0,0
vol_1timePeriod,83
vol_1role,79
vol_1companyName,79
ed_3fieldOfStudy,69
lang_3proficiency,67


### Cleaning time, industry and company columns 

In [194]:
print(data.columns)

Index(['summary', 'lastName', 'geoCountryName', 'firstName', 'geoLocationName',
       'headline', 'lang_stated', 'honors_stated', 'pubs_stated',
       'projects_stated', 'certif_stated', 'over_2_lang',
       'lin_details_remplis', 'ed_1school', 'ed_1timePeriod', 'ed_1degreeName',
       'ed_1schoolName', 'ed_1fieldOfStudy', 'vol_1role', 'vol_1companyName',
       'vol_1timePeriod', 'ed_2timePeriod', 'ed_2schoolName', 'ed_2school',
       'ed_2fieldOfStudy', 'ed_2degreeName', 'ed_3timePeriod',
       'ed_3degreeName', 'ed_3schoolName', 'ed_3fieldOfStudy', 'ed_3school',
       'exp_1locationName', 'exp_1geoLocationName', 'exp_1companyName',
       'exp_1timePeriod', 'exp_1description', 'exp_1company', 'exp_1title',
       'exp_2companyName', 'exp_2timePeriod', 'exp_2company', 'exp_2title',
       'exp_2locationName', 'exp_2geoLocationName', 'exp_2description',
       'exp_3locationName', 'exp_3geoLocationName', 'exp_3companyName',
       'exp_3timePeriod', 'exp_3company', 'exp_3title'

In [195]:
for i in range (1,6):
    data['exp_'+str(i)+'startDate_month']= (pd.DataFrame(data['exp_'+str(i)+'timePeriod'].apply(pd.Series))['startDate']).apply(pd.Series)['month']
    data['exp_'+str(i)+'startDate_year']= (pd.DataFrame(data['exp_'+str(i)+'timePeriod'].apply(pd.Series))['startDate']).apply(pd.Series)['year']
    data['exp_'+str(i)+'endDate_month']= (pd.DataFrame(data['exp_'+str(i)+'timePeriod'].apply(pd.Series))['endDate']).apply(pd.Series)['month']
    data['exp_'+str(i)+'endDate_year']= (pd.DataFrame(data['exp_'+str(i)+'timePeriod'].apply(pd.Series))['endDate']).apply(pd.Series)['year']
    data['exp_'+str(i)+'industry']= (pd.DataFrame(data['exp_'+str(i)+'company'].apply(pd.Series))['industries'])
    data['exp_'+str(i)+'company_empl_low']= (pd.DataFrame(data['exp_'+str(i)+'company'].apply(pd.Series))['employeeCountRange']).apply(pd.Series)['start']
    data['exp_'+str(i)+'company_empl_high']= (pd.DataFrame(data['exp_'+str(i)+'company'].apply(pd.Series))['employeeCountRange']).apply(pd.Series)['end']

  data['exp_'+str(i)+'industry']= (pd.DataFrame(data['exp_'+str(i)+'company'].apply(pd.Series))['industries'])
  data['exp_'+str(i)+'company_empl_low']= (pd.DataFrame(data['exp_'+str(i)+'company'].apply(pd.Series))['employeeCountRange']).apply(pd.Series)['start']
  data['exp_'+str(i)+'company_empl_high']= (pd.DataFrame(data['exp_'+str(i)+'company'].apply(pd.Series))['employeeCountRange']).apply(pd.Series)['end']


In [196]:
for i in range (1,4):
    data['ed_'+str(i)+'startDate_month']= (pd.DataFrame(data['ed_'+str(i)+'timePeriod'].apply(pd.Series))['startDate']).apply(pd.Series)['month']
    data['ed_'+str(i)+'startDate_year']= (pd.DataFrame(data['ed_'+str(i)+'timePeriod'].apply(pd.Series))['startDate']).apply(pd.Series)['year']

In [197]:
print(list(data.columns))

['summary', 'lastName', 'geoCountryName', 'firstName', 'geoLocationName', 'headline', 'lang_stated', 'honors_stated', 'pubs_stated', 'projects_stated', 'certif_stated', 'over_2_lang', 'lin_details_remplis', 'ed_1school', 'ed_1timePeriod', 'ed_1degreeName', 'ed_1schoolName', 'ed_1fieldOfStudy', 'vol_1role', 'vol_1companyName', 'vol_1timePeriod', 'ed_2timePeriod', 'ed_2schoolName', 'ed_2school', 'ed_2fieldOfStudy', 'ed_2degreeName', 'ed_3timePeriod', 'ed_3degreeName', 'ed_3schoolName', 'ed_3fieldOfStudy', 'ed_3school', 'exp_1locationName', 'exp_1geoLocationName', 'exp_1companyName', 'exp_1timePeriod', 'exp_1description', 'exp_1company', 'exp_1title', 'exp_2companyName', 'exp_2timePeriod', 'exp_2company', 'exp_2title', 'exp_2locationName', 'exp_2geoLocationName', 'exp_2description', 'exp_3locationName', 'exp_3geoLocationName', 'exp_3companyName', 'exp_3timePeriod', 'exp_3company', 'exp_3title', 'exp_3description', 'exp_4locationName', 'exp_4geoLocationName', 'exp_4companyName', 'exp_4time

In [198]:
for i in range (1,2): 
    data=pd.concat([data, 
    pd.DataFrame(data['vol_'+str(i)+'timePeriod'].apply(pd.Series)).add_prefix('vol_'+str(i))], 
    axis=1).reset_index(drop=True)
    data.drop(columns=['vol_'+str(i)+'timePeriod'], inplace=True)   

In [199]:
data.shape

(148, 115)

In [203]:
columns=(data.columns.sort_values())
data=data[columns]

In [206]:
timecols= [i for i in data.columns if "time" in i]
data.drop(columns=timecols, inplace=True)


In [208]:
data.isna().sum().sort_values(ascending=False)[0:14]
data.drop(columns="vol_10", inplace=True)
#data.fillna("[]", inplace=True)
# we are filling na with [] because special characters will be removed later on. 

In [210]:
data.drop(columns=["ed_1school", "ed_2school", "ed_3school"], inplace=True)

In [220]:
pd.options.display.max_rows = 115
data.head().T

Unnamed: 0,0,1,2,3,4
certif_stated,0,0,0,0,0
ed_1degreeName,Master,,Global messaging,Master informatique,Master 2
ed_1fieldOfStudy,Business/Managerial Economics,Relations Internationales et Sciences Politiques,Communication orale et rhétorique,Architecture logicielle distribuée,Sciences politiques
ed_1schoolName,ESCP Europe,Ecole des Hautes Etudes Politiques,ASO communications / New Economy Organizers Ne...,Université Bordeaux I,Université Paris 1 Panthéon-Sorbonne
ed_1startDate_month,,,3.0,,
ed_1startDate_year,1998.0,2006.0,2022.0,2004.0,2005.0
ed_2degreeName,,,Training program to prepare women to political...,,Licence
ed_2fieldOfStudy,,Relations et affaires internationales,Political sciences and government,,Droit
ed_2schoolName,Prépa Saint Jean de Douai,The Hebrew University,Investies,,Université Lille 2 Droit et Santé
ed_2startDate_month,,,9.0,,


In [221]:
data.shape

(148, 98)

### Export dataset and description dataframe

In [222]:
data.to_csv("//Users/tatianadeferaudy/Desktop/Bacasable/marche_CT/dataset3.csv", sep=';', index=False)

In [223]:
data.describe(include='O').T
general_pop_data= pd.DataFrame(data.describe(include="O").T)
general_pop_data.to_csv("//Users/tatianadeferaudy/Desktop/Bacasable/marche_CT/general_pop_data3.csv", sep=';')

### Making a copy

In [225]:
data2=data.copy()

In [226]:
pd.options.display.max_rows = 105
data.head(5).T

Unnamed: 0,0,1,2,3,4
certif_stated,0,0,0,0,0
ed_1degreeName,Master,,Global messaging,Master informatique,Master 2
ed_1fieldOfStudy,Business/Managerial Economics,Relations Internationales et Sciences Politiques,Communication orale et rhétorique,Architecture logicielle distribuée,Sciences politiques
ed_1schoolName,ESCP Europe,Ecole des Hautes Etudes Politiques,ASO communications / New Economy Organizers Ne...,Université Bordeaux I,Université Paris 1 Panthéon-Sorbonne
ed_1startDate_month,,,3.0,,
ed_1startDate_year,1998.0,2006.0,2022.0,2004.0,2005.0
ed_2degreeName,,,Training program to prepare women to political...,,Licence
ed_2fieldOfStudy,,Relations et affaires internationales,Political sciences and government,,Droit
ed_2schoolName,Prépa Saint Jean de Douai,The Hebrew University,Investies,,Université Lille 2 Droit et Santé
ed_2startDate_month,,,9.0,,


#### Education EDA

In [229]:
ed_columns=[i for i in data.columns if "ed_" in i]
print(ed_columns)

['ed_1degreeName', 'ed_1fieldOfStudy', 'ed_1schoolName', 'ed_1startDate_month', 'ed_1startDate_year', 'ed_2degreeName', 'ed_2fieldOfStudy', 'ed_2schoolName', 'ed_2startDate_month', 'ed_2startDate_year', 'ed_3degreeName', 'ed_3fieldOfStudy', 'ed_3schoolName', 'ed_3startDate_month', 'ed_3startDate_year']


In [233]:
data[ed_columns].describe()

Unnamed: 0,ed_1startDate_month,ed_1startDate_year,ed_2startDate_month,ed_2startDate_year,ed_3startDate_month,ed_3startDate_year
count,11.0,138.0,5.0,123.0,3.0,99.0
mean,6.181818,2008.478261,7.0,2008.512195,6.0,2006.474747
std,3.970345,9.33498,3.082207,8.165668,3.605551,8.700626
min,1.0,1968.0,2.0,1983.0,2.0,1982.0
25%,3.0,2003.0,6.0,2003.0,4.5,2002.0
50%,9.0,2010.0,9.0,2010.0,7.0,2009.0
75%,9.0,2015.0,9.0,2015.0,8.0,2013.0
max,12.0,2023.0,9.0,2022.0,9.0,2023.0


In [232]:
data[ed_columns].describe(include='O')

Unnamed: 0,ed_1degreeName,ed_1fieldOfStudy,ed_1schoolName,ed_2degreeName,ed_2fieldOfStudy,ed_2schoolName,ed_3degreeName,ed_3fieldOfStudy,ed_3schoolName
count,130,114,143,110,104,130,92,79,107
unique,108,105,103,94,97,107,73,75,96
top,Master's degree,Marketing,Sciences Po,Master,"Business Administration and Management, General",Sciences Po,Baccalauréat,Science politique,Sciences Po
freq,10,3,10,7,3,7,6,2,6


In [None]:
# Education info : 
# education start dates are around 2007 (2009, 2009, 2006), with an std of 8 to 9. 
# these people were in university mainly between 1998 and 2016 
# although there are some that state education in 1968 (min), and the max is 2023 as a start date
# most frequent degree is master's, and most frequent school is Sciences Po. 
# most frequent fields of study, however, are business administration and management and marketing. 

In [311]:
from unidecode import unidecode 

def recoding_degree(x):
    x=str(x).lower()
    x=unidecode(x)
    if ("master" in x) or ("msc" in x) or ("m.sc" in x) or ("maitrise" in x) or ('m1' in x) or ('m2' in x) or ("diplome d" in x) or ("polytechnique" in x) or ('sciences po' in x):
        return 'master'
    elif ("baccalaureat" in x): 
        return 'baccalaureat'
    elif ("bachelor" in x) or ("licence" in x) or ("deug" in x) :
        return 'licence'
    elif ("phd" in x) or ("dea" in x) or ("dess" in x) or ("doctorat" in x) or ("ph.d" in x):
        return 'dea_dess_phd'
    elif ("prepa" in x) or ("hypokhagne" in x): 
        return 'prepa'
    elif ("mba" in x) or ("executive education" in x) : 
        return 'mba'
    elif ("bac" in x): 
        return 'baccalaureat'
    elif ("bts" in x) or  ("dut" in x): 
        return 'bts_dut'
    else:
        return x

In [362]:
data['recoded_degree_1']="NA"
data['recoded_degree_2']="NA"
data['recoded_degree_3']="NA"
data["recoded_degree_1"]= data["ed_1degreeName"].apply(recoding_degree)
data["recoded_degree_2"]= data["ed_2degreeName"].apply(recoding_degree)
data["recoded_degree_3"]= data["ed_3degreeName"].apply(recoding_degree)

#### Total degrees table

In [363]:
degree1=pd.DataFrame(data['recoded_degree_1'].value_counts().reset_index())
degree1.columns=["degree", "count"]

In [364]:
degree2= pd.DataFrame(data['recoded_degree_2'].value_counts().reset_index())
degree2.columns=["degree", "count"]

In [365]:
degree3=pd.DataFrame(data['recoded_degree_3'].value_counts().reset_index())
degree3.columns=["degree", "count"]

In [366]:
degree2.head()

Unnamed: 0,degree,count
0,master,54
1,,38
2,licence,10
3,dea_dess_phd,8
4,baccalaureat,4


In [367]:
total_degrees= degree1.merge(degree2, how='outer', on="degree").merge(degree3, how="outer", on="degree")

In [368]:
total_degrees["total"]=0
total_degrees.fillna(0, inplace=True)
total_degrees["total"]= total_degrees["count_x"]+total_degrees["count_y"]+total_degrees["count"]
(total_degrees[["degree", "total"]].sort_values(by="total", ascending=False))[0:10]

Unnamed: 0,degree,total
0,master,168.0
1,,112.0
5,licence,34.0
2,dea_dess_phd,20.0
6,baccalaureat,14.0
3,mba,10.0
33,prepa,8.0
4,bts_dut,4.0
60,exchange program,3.0
34,associate's degree,3.0


#### degree variables

In [400]:
(data['recoded_degree_1'].value_counts())[0:10]
(data['recoded_degree_2'].value_counts())[0:10]
(data['recoded_degree_3'].value_counts())[0:10]

nan                         56
master                      32
licence                     21
baccalaureat                 8
prepa                        4
dea_dess_phd                 3
exchange program             3
mba                          3
wset certificate level 2     1
label hf management          1
Name: recoded_degree_3, dtype: int64

In [402]:
data['dea_dess_phd']=0
data['master']=0
data['mba']=0
data['licence']=0
data["prepa"]=0
data.loc[(data.recoded_degree_1 =='master')|(data.recoded_degree_2 =='master')|(data.recoded_degree_3 =='master'), 'master']=1
data.loc[(data.recoded_degree_1 =='master')|(data.recoded_degree_2 =='master')|(data.recoded_degree_3 =='master'), 'licence']=1
data['master'].sum()

110

In [403]:
data.loc[(data.recoded_degree_1 =='dea_dess_phd')|(data.recoded_degree_2 =='dea_dess_phd')|(data.recoded_degree_3 =='dea_dess_phd'), 'dea_dess_phd']=1
data.loc[(data.recoded_degree_1 =='dea_dess_phd')|(data.recoded_degree_2 =='dea_dess_phd')|(data.recoded_degree_3 =='dea_dess_phd'), 'master']=1
data.loc[(data.recoded_degree_1 =='dea_dess_phd')|(data.recoded_degree_2 =='dea_dess_phd')|(data.recoded_degree_3 =='dea_dess_phd'), 'licence']=1
data['dea_dess_phd'].sum()

19

In [404]:
data.loc[(data.recoded_degree_1 =='mba')|(data.recoded_degree_2 =='mba')|(data.recoded_degree_3 =='mba'), 'mba']=1
data['mba'].sum()

8

In [405]:
data.loc[(data.recoded_degree_1 =='prepa')|(data.recoded_degree_2 =='prepa')|(data.recoded_degree_3 =='prepa'), 'prepa']=1
data['prepa'].sum()

8

In [406]:
#Licence, bts, DUT 
data.loc[(data.recoded_degree_1 =='bts_dut')|(data.recoded_degree_2 =='bts_dut')|(data.recoded_degree_3 =='bts_dut'), 'licence']=1
data.loc[(data.recoded_degree_1 =='licence')|(data.recoded_degree_2 =='licence')|(data.recoded_degree_3 =='licence'), 'licence']=1
data['licence'].sum()

126

#### field of study

In [422]:
data[["ed_1fieldOfStudy", 'ed_2fieldOfStudy', "ed_3fieldOfStudy"]].fillna('NR', inplace=True)
# NA= 34, 44, 69

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[["ed_1fieldOfStudy", 'ed_2fieldOfStudy', "ed_3fieldOfStudy"]].fillna('NR', inplace=True)


In [447]:
print(list(data["ed_1fieldOfStudy"]))
print(list(data["ed_2fieldOfStudy"]))
print(list(data["ed_3fieldOfStudy"]))

['Business/Managerial Economics', 'Relations Internationales et Sciences Politiques', 'Communication orale et rhétorique', 'Architecture logicielle distribuée', 'Sciences politiques', nan, 'Written & Audiovisual Journalism', nan, 'Strategy', 'Management des Ressources Humaines', 'Conception de Produits Industriels', 'International Politics', nan, 'Master of Digital Business', 'Management', 'Management', 'Focus: Democratic Innovation', 'Public Affairs', nan, 'Financial Accounting', 'Business and Marketing', 'Computer Science', 'Circular Economy', 'Etudes politiques', 'Sciences politiques', 'Relations et affaires internationales', 'Product Management', nan, 'Finance and Consulting', 'Specialization in Entrepreneurship', 'Économie', 'in Global Leadership and Public Policy for the 21st century', nan, 'Sciences Economiques', "Politiques de l'Environnement", 'Community management', nan, 'Political science', nan, 'Business School', 'Marketing', 'Relations et affaires internationales', 'Multim

In [497]:
def recoding_field_study(x):
    x=str(x).lower()
    y=[]
    x=unidecode(x)
    z='NR'
    if x=="nan":
        return z
    else:
        if ("science po" in x) or ("sciences po" in x) or ("politic" in x) or ('politique' in x) or ("gouvernement" in x) or ("government" in x):
            y.append("sciences_po")
        if ("action publique" in x) or ("affaires publiques" in x) or ("public" in x) or ("administration publique" in x) or ("gestion publique" in x) or ("policy" in x) or ("relations publiques" in x) :
            y.append("AP")
        if ("economi" in x) or ('economy' in x):
            y.append("economie")
        if ("droit" in x) or ("law" in x):
            y.append("droit")
        if ("international relations" in x) or ("relations internationales" in x) or ("affaires internationales" in x) or ("foreign" in x) or ("affaires internationales" in x):
            y.append("RI")
        if ("lettres" in x) or ("litter" in x) or ("langu" in x) or ("liter" in x):
            y.append("lettres")
        if ("business" in x) or ("commerc" in x):
            y.append("business")
        if ("manage" in x):
            y.append("management")
        if ("computer" in x) or ("informatique" in x) or ("web development" in x) or ("information" in x):
            y.append("informatique")
        if ("amenagement" in x) or ("urba" in x):
            y.append("urba_amgt")
        if ("ingenie" in x) or ("engineer" in x):
            y.append("ingenierie")
        if ("communication" in x):
            y.append("communication")
        if ("histoire" in x) or ("history" in x) or ("philosophie" in x) or ("philosophy" in x):
            y.append("hist_philo")
        if ("media" in x):
            y.append("media")
        if ("financ" in x):
            y.append("finance")  
        if 'journal' in x:
            y.append("journalisme")
        if 'strateg' in x:
            y.append("strategie")
        if ("digital" in x) or ("logiciel" in x):
            y.append("numerique")
        if ("international" in x) or ("europe" in x):
            y.append("international") 
        if ('design' in x) or ('conception' in x):
            y.append('design')
        if ("ecolog"in x) or ('environment' in x) or ("biodiv" in x) or ("environnement" in x) or ("sustain" in x) or ("durable" in x):
            y.append("environnement")
        if ("entrepr" in x):
            y.append("entrepreneuriat")
        if ('math' in x):
            y.append ("maths")
        y=' '.join(y)
        if len(y)<1:
            return 'autre'
        else:
            return y

In [498]:
data["fieldofstudy1"]='other'
data['fieldofstudy1']=data["ed_1fieldOfStudy"].apply(recoding_field_study)
data.fieldofstudy1.value_counts()

NR                                       34
autre                                    21
sciences_po                               8
management                                7
environnement                             4
economie                                  4
business                                  4
RI international                          3
urba_amgt                                 3
AP                                        3
business management                       3
numerique                                 3
sciences_po international                 3
business international                    2
ingenierie design                         2
finance                                   2
strategie                                 2
journalisme                               2
communication                             2
entrepreneuriat                           2
ingenierie                                2
droit international                       2
international                   

In [499]:
data["fieldofstudy2"]='other'
data['fieldofstudy2']=data["ed_2fieldOfStudy"].apply(recoding_field_study)
data.fieldofstudy2.value_counts()

NR                                                        44
autre                                                     17
sciences_po                                               10
economie                                                   6
AP                                                         5
management                                                 4
business management                                        3
business international                                     3
lettres                                                    3
communication                                              3
hist_philo                                                 2
design                                                     2
business                                                   2
urba_amgt                                                  2
RI international                                           2
media                                                      2
sciences_po RI internati

In [500]:
data["fieldofstudy3"]='other'
data['fieldofstudy3']=data["ed_3fieldOfStudy"].apply(recoding_field_study)
data.fieldofstudy3.value_counts()

NR                                           69
autre                                        14
sciences_po                                   8
economie                                      5
business                                      4
communication media                           3
droit                                         3
hist_philo                                    3
maths                                         2
RI international                              2
strategie                                     2
informatique                                  2
environnement                                 2
AP management finance                         1
economie business                             1
urba_amgt                                     1
sciences_po RI hist_philo international       1
sciences_po urba_amgt hist_philo              1
economie informatique maths                   1
AP environnement                              1
sciences_po RI international            

In [501]:
categories=['sciences_po', 'AP', "economie", 'droit', 'RI', 'lettres', 'business', 'management', 
            'informatique', 'urba_amgt', "ingenierie", "communication", 'hist_philo', 'media', 'finance',
            'journalisme', "strategie", "numerique", "international", 'design', 'environnement', "entrepreneuriat", 'maths']

for i in categories:
    data[i]=0

# data.loc[(data.recoded_degree_1 =='master')|(data.recoded_degree_2 =='master')|(data.recoded_degree_3 =='master'), 'master']=1
# data.loc[(data.recoded_degree_1 =='master')|(data.recoded_degree_2 =='master')|(data.recoded_degree_3 =='master'), 'licence']=1
# data['master'].sum()

In [511]:
def field_study_coding(x, col_list):
    y=0
    for c in col_list:
        if x in data[c]:
            y=1
        if y==1:
            return y
            break

In [512]:
categories=['sciences_po', 'AP', "economie", 'droit', 'RI', 'lettres', 'business', 'management', 
            'informatique', 'urba_amgt', "ingenierie", "communication", 'hist_philo', 'media', 'finance',
            'journalisme', "strategie", "numerique", "international", 'design', 'environnement', "entrepreneuriat", 'maths']
cols= ["ed_1fieldOfStudy", 'ed_2fieldOfStudy', "ed_3fieldOfStudy"]
for a in categories:
    data[a]=field_study_coding(a,cols)

In [519]:
type(data['sciences_po'])


pandas.core.series.Series

In [505]:
for i in categories:
    for a in cols:
        if i in data[a]:
            data[i]=1
        else:
            continue

In [517]:
data[["finance", "journalisme", "sciences_po"]].sum()

finance        0
journalisme    0
sciences_po    0
dtype: object

In [488]:
print(len(categories))

23


In [520]:
## from previous files 

In [None]:
# recoding location columns 
# note : this type of formatting flattens elements with different locations 
# giving priority to the french one

def recoding_location(x):
    x=str(x)
    if ("Paris" in x) or ("PAris" in x) or ("Montreuil" in x) or ("Puteaux" in x):
        return 'Paris Metropolitan Region'
    elif ("Brussels" in x) or ("Bruxelles" in x): 
         return 'Brussels Metropolitan Region'
    elif "Berlin" in x: 
         return 'Berlin Metropolitan Region'
    elif "Nantes" in x: 
         return 'Nantes Metropolitan Region'
    elif "Bordeaux" in x: 
         return 'Bordeaux Metropolitan Region'
    elif "Lyon" in x: 
         return 'Lyon Metropolitan Region'
    elif "Marseille" in x: 
         return 'Marseille Metropolitan Region'
    elif "Lille" in x: 
         return 'Lille Metropolitan Region'
    else:
        return x

# possible improvement with geopy library 

# replace " France" (if it is only the word, not Ile de France) by nothing? 

In [None]:
for i in profiles_experience.columns:
    if ('locationName' in i) or ('LocationName' in i):
        profiles_experience[i]= profiles_experience[i].apply(recoding_location)

profiles_experience.head()

In [None]:
list_companies=list(profiles_experience.exp1_companyName.str.lower())+list(profiles_experience.exp2_companyName.str.lower())+list(profiles_experience.exp3_companyName.str.lower())+list(profiles_experience.exp4_companyName.str.lower())+ list(profiles_experience.exp5_companyName.str.lower())
print(len(list_companies)), print(len(set(list_companies)))
df=pd.DataFrame(set(list_companies))
df.columns=['company']
dict_companies=df.company.value_counts().sort_index().to_dict()

In [None]:
def recoding_company(x):
    x=str(x).lower()
    if (x=="gov") or ("la météo de l" in x):
        return 'GOV la meteo de lopinion'
    elif ("make.org" in x): 
         return 'make.org'
    elif "orange" in x: 
         return 'Orange'
    elif ("freelance" in x) or ("self-employed" in x): 
         return 'freelance'
    elif "stig" in x: 
         return 'STIG'
    elif "voxe" in x: 
         return 'VOXE'
    else:
        return x

In [None]:
#test with 
# profiles_experience.exp1_companyName.apply(recoding_company)
#apply: 
for i in profiles_experience.columns:
    if ('companyName' in i):
        profiles_experience[i]= profiles_experience[i].apply(recoding_company)

print(len(list_companies)), print(len(set(list_companies)))

#list(profiles_experience.exp5_companyName)

In [None]:
list_titles=list(profiles_experience.exp1_title.str.lower())+list(profiles_experience.exp2_title.str.lower())+list(profiles_experience.exp3_title.str.lower())+list(profiles_experience.exp4_title.str.lower())+ list(profiles_experience.exp5_title.str.lower())
print(len(list_titles)), print(len(set(list_titles)))
df=pd.DataFrame(set(list_titles))
df.columns=['title']
dict_titles=df.title.value_counts().sort_index().to_dict()

In [None]:
# if includes "ceo" or "coo" or "cfo" or "président" or "directeur" or "directrice" or "director" or "cto" or "cpo" or "general manager" or "president" or "head of"
# if includes "consultant" or "conseiller" or "conseillère"
# if includes "founder" or "fondateur" or "fondatrice"

In [None]:
def recoding_title_dir(x):
     x=str(x).lower()
     dir=["ceo", "coo", "cfo", "président", "directeur", "directrice", "director", 
          "cto", "cpo", "general manager", "president", "head of"] 
     if any([y in x for y in dir]):
          return 1
     else: 
          return 0

def recoding_title_cs(x):
     x=str(x).lower()
     cs= ["consultant", "conseiller", "conseillère"]
     if any([y in x for y in cs]):
          return 1
     else: 
          return 0

def recoding_title_fond(x): 
     x=str(x).lower()
     fond=["founder", "fondateur", "fondatrice"]
     if any([y in x for y in fond]):
         return 1
     else: 
          return 0

In [None]:
list_title_columns= [i for i in profiles_experience.columns if 'title' in i]
print(list_title_columns)

for i in list_title_columns:
    profiles_experience[str(i)+'_direction']=profiles_experience[i].apply(recoding_title_dir)
for i in list_title_columns:
    profiles_experience[str(i)+'consulting']=profiles_experience[i].apply(recoding_title_cs)
for i in list_title_columns:
    profiles_experience[str(i)+'founder']=profiles_experience[i].apply(recoding_title_fond)

profiles_experience.columns

In [None]:
profiles_experience['consulting_roles']= profiles_experience['exp1_titleconsulting']+profiles_experience['exp2_titleconsulting']+profiles_experience['exp3_titleconsulting']+profiles_experience['exp4_titleconsulting']+profiles_experience['exp5_titleconsulting']
profiles_experience['direction_roles']= profiles_experience['exp1_title_direction']+profiles_experience['exp2_title_direction']+profiles_experience['exp3_title_direction']+profiles_experience['exp4_title_direction']+profiles_experience['exp5_title_direction']
profiles_experience['founding_roles']= profiles_experience['exp1_titlefounder']+profiles_experience['exp2_titlefounder']+profiles_experience['exp3_titlefounder']+profiles_experience['exp4_titlefounder']+profiles_experience['exp5_titlefounder']
print(profiles_experience['consulting_roles'].value_counts()), print(profiles_experience['direction_roles'].value_counts()), print(profiles_experience.founding_roles.value_counts())

In [None]:
people_experience_db['startDate_month']=np.array(people_experience_db['startDate_month'], np.int16) 
people_experience_db['startDate_year']=np.array(people_experience_db['startDate_year'], np.int16) 
people_experience_db['endDate_month']=np.array(people_experience_db['endDate_month'], np.int16) 
people_experience_db['endDate_year']=np.array(people_experience_db['endDate_year'], np.int16) 
people_experience_db['company_empl_low']=np.array(people_experience_db['company_empl_low'], np.int16) 
people_experience_db['company_empl_high']= np.array(people_experience_db['company_empl_high'], np.int16) 
