## Importing information

In [1]:
import pandas as pd
import json
import numpy as np
import os

In [2]:
def create_profile(x, url):
    """Load one JSON profile export and return it as a single-row DataFrame.

    Parameters
    ----------
    x : str
        File name of the JSON export (e.g. ``'brachetantoine.json'``).
    url : str
        Directory prefix that is concatenated directly in front of ``x``,
        so it has to end with a path separator.

    Returns
    -------
    pandas.DataFrame
        One row (index label ``0``) with one column per top-level JSON key.
        For every list-valued section named in ``list_col`` the i-th element
        is additionally copied into a ``<section><i>`` column (1-based),
        e.g. ``experience1``, ``experience2``, ...
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(str(url) + str(x), encoding='utf-8') as f:
        dict1 = json.load(f)
    list_col = ['experience', 'education', 'languages', 'publications',
                'certifications', 'volunteer', 'honors', 'projects']
    for n in list_col:
        entries = dict1.get(n)
        # Only real lists are exploded; a missing or null section is skipped.
        if isinstance(entries, list):
            for i, entry in enumerate(entries, start=1):
                dict1[n + str(i)] = entry
    # An object Series keeps every value (nested lists and dicts included) in
    # a single cell, whatever the type of the first value in the record.
    data = pd.Series(dict1, dtype=object).to_frame().T
    return data

In [3]:
# Names of all JSON profile exports found in the first data folder.
file_list = list(filter(lambda name: name.endswith(".json"), os.listdir('../data/jsons')))
len(file_list)

49

In [4]:
# Parse the reference profile once; it is used as the first row of both
# accumulated frames (profiles and profiles2).
initial_profile = create_profile(x='brachetantoine.json', url='../data/jsons/')

In [5]:
# Parse every export of the first folder and stack the rows with one concat:
# calling pd.concat inside the loop re-copies the accumulated frame on every
# iteration.
# NOTE(review): initial_profile stays the first row exactly as before; if
# 'brachetantoine.json' is also listed in file_list that profile occurs twice
# here - confirm that the later de-duplication step is meant to absorb it.
profiles = pd.concat(
    [initial_profile] + [create_profile(i, '../data/jsons/') for i in file_list],
    axis=0,
)

In [6]:
# Names of all JSON profile exports found in the second data folder.
file_list2 = list(filter(lambda name: name.endswith(".json"), os.listdir('../data/jsons2')))
len(file_list2)

124

In [7]:
# Parse every export of the second folder and stack the rows with one concat
# instead of re-copying the accumulated frame on every loop iteration.
# NOTE(review): initial_profile is again used as the first row, exactly as
# before, so the reference profile is present in this batch as well - confirm
# that the later de-duplication step is meant to absorb it.
profiles2 = pd.concat(
    [initial_profile] + [create_profile(i, '../data/jsons2/') for i in file_list2],
    axis=0,
)

In [8]:
# Stack both batches; ignore_index renumbers the rows 0..n-1 in the same step.
data = pd.concat([profiles, profiles2], ignore_index=True)
data.head(2)

Unnamed: 0,summary,industryName,lastName,student,geoCountryName,geoCountryUrn,geoLocationBackfilled,elt,industryUrn,firstName,...,publications9,publications10,maidenName,projects8,certifications5,certifications6,certifications7,certifications8,certifications9,certifications10
0,Antoine croit à l’intelligence de tous et à la...,IT Services and IT Consulting,Brachet,False,France,urn:li:fs_geo:105015875,False,False,urn:li:fs_industry:96,Antoine,...,,,,,,,,,,
1,French Entrepreneur - Founder and Managing Par...,Venture Capital and Private Equity Principals,Demri,False,France,urn:li:fs_geo:105015875,False,False,urn:li:fs_industry:106,Bobby,...,,,,,,,,,,


## Initial dataset cleaning

In [9]:
# (rows, columns) of the combined frame before any cleaning.
data.shape

(173, 89)

In [10]:
# Plain-text listing of every column name of the combined frame.
print(data.columns.tolist())

['summary', 'industryName', 'lastName', 'student', 'geoCountryName', 'geoCountryUrn', 'geoLocationBackfilled', 'elt', 'industryUrn', 'firstName', 'entityUrn', 'geoLocation', 'geoLocationName', 'location', 'headline', 'displayPictureUrl', 'img_100_100', 'img_200_200', 'img_400_400', 'img_800_800', 'profile_id', 'profile_urn', 'member_urn', 'public_id', 'experience', 'education', 'languages', 'publications', 'certifications', 'volunteer', 'honors', 'projects', 'experience1', 'experience2', 'experience3', 'experience4', 'experience5', 'education1', 'education2', 'education3', 'languages1', 'languages2', 'languages3', 'publications1', 'publications2', 'publications3', 'publications4', 'publications5', 'publications6', 'publications7', 'publications8', 'locationName', 'honors1', 'honors2', 'honors3', 'honors4', 'honors5', 'honors6', 'honors7', 'volunteer1', 'volunteer2', 'volunteer3', 'projects1', 'projects2', 'projects3', 'birthDate', 'languages4', 'projects4', 'projects5', 'address', 'lan

### Drop duplicates in rows

In [11]:
# Rows that repeat an earlier (headline, firstName, lastName) combination,
# shown for inspection.
identity_cols = ['headline', 'firstName', 'lastName']
mask = data.duplicated(subset=identity_cols, keep='first')
dropped = data[mask]
dropped

Unnamed: 0,summary,industryName,lastName,student,geoCountryName,geoCountryUrn,geoLocationBackfilled,elt,industryUrn,firstName,...,publications9,publications10,maidenName,projects8,certifications5,certifications6,certifications7,certifications8,certifications9,certifications10
3,Antoine croit à l’intelligence de tous et à la...,IT Services and IT Consulting,Brachet,False,France,urn:li:fs_geo:105015875,False,False,urn:li:fs_industry:96,Antoine,...,,,,,,,,,,
48,Antoine croit à l’intelligence de tous et à la...,IT Services and IT Consulting,Brachet,False,France,urn:li:fs_geo:105015875,False,False,urn:li:fs_industry:96,Antoine,...,,,,,,,,,,
50,French Entrepreneur - Founder and Managing Par...,Venture Capital and Private Equity Principals,Demri,False,France,urn:li:fs_geo:105015875,False,False,urn:li:fs_industry:106,Bobby,...,,,,,,,,,,
51,Inspired by the power we can build to change t...,Civic and Social Organizations,Durieux,False,France,urn:li:fs_geo:105015875,False,False,urn:li:fs_industry:90,Sarah,...,,,,,,,,,,
84,,Internet,Paret,False,France,urn:li:fs_geo:105015875,False,True,urn:li:fs_industry:6,Jeremie,...,,,,,,,,,,


In [12]:
# Keep only the first occurrence of each (headline, firstName, lastName).
data = data.drop_duplicates(subset=['headline', 'firstName', 'lastName'], keep='first')

### Dropping unnecessary columns (LinkedIn-specific fields, personal information, etc.)

In [13]:
# Let pandas render up to 100 rows before truncating a displayed frame.
pd.set_option("display.max_rows", 100)

In [14]:
# Columns removed from the working frame: the raw list-valued sections,
# picture URLs, URN/identifier fields, location variants and personal details.
# ('languages' used to be listed twice; one entry is enough.)
todrop = [
    'industryName', 'experience', 'education', 'languages', 'publications',
    'certifications', 'volunteer', 'honors', 'projects',
    'displayPictureUrl', 'img_100_100', 'img_200_200', 'img_400_400',
    'img_800_800', 'student', 'maidenName', 'birthDate',
    'address', 'geoCountryUrn', 'geoLocationBackfilled', 'elt', 'industryUrn',
    'entityUrn', 'geoLocation', 'location', 'locationName',
    'profile_id', 'profile_urn', 'member_urn', 'public_id',
]
# Reassign instead of mutating in place; every listed column must exist,
# otherwise drop() raises KeyError as before.
data = data.drop(columns=todrop)

In [15]:
# One summary line per column (count / unique / top / freq), sorted by name.
data.describe().transpose().sort_index()

Unnamed: 0,count,unique,top,freq
certifications1,42,42,{'name': 'Psychology'},1
certifications10,3,3,"{'authority': 'LinkedIn', 'name': 'Project Man...",1
certifications2,21,21,"{'authority': 'OpenClassrooms', 'name': 'Learn...",1
certifications3,12,12,"{'authority': 'COLIBRIS', 'name': 'Quelle démo...",1
certifications4,7,7,"{'authority': 'A Cloud Guru', 'name': 'Introdu...",1
certifications5,6,6,"{'authority': 'Google', 'name': 'Google Ads Ce...",1
certifications6,6,6,"{'authority': 'Google', 'name': 'Certification...",1
certifications7,6,6,"{'authority': 'HubSpot', 'name': 'Hubspot Inbo...",1
certifications8,4,4,"{'authority': 'LinkedIn', 'name': 'Become an H...",1
certifications9,4,4,"{'authority': 'ETS Global', 'name': 'TOEIC', '...",1


### Exploding columns

In [16]:
# TODO: the five cells below repeat one pattern (expand a column of dicts into
# prefixed columns); factor it into a single helper function.

In [17]:
# Expand each education<i> / volunteer<i> cell into one column per key
# (prefixed 'ed_<i>' / 'vol_<i>'), then remove the source columns.
for i in range(1, 4):
    ed_col = f'education{i}'
    vol_col = f'volunteer{i}'
    ed_fields = pd.DataFrame(data[ed_col].apply(pd.Series)).add_prefix(f'ed_{i}')
    vol_fields = pd.DataFrame(data[vol_col].apply(pd.Series)).add_prefix(f'vol_{i}')
    data = pd.concat([data, ed_fields, vol_fields], axis=1).reset_index(drop=True)
    data = data.drop(columns=[ed_col, vol_col])

In [18]:
# Expand each experience<i> cell into one column per key (prefixed
# 'exp_<i>'), then remove the source column.
for i in range(1, 6):
    exp_col = f'experience{i}'
    exp_fields = pd.DataFrame(data[exp_col].apply(pd.Series)).add_prefix(f'exp_{i}')
    data = pd.concat([data, exp_fields], axis=1).reset_index(drop=True)
    data = data.drop(columns=exp_col)

In [19]:
# Expand each honors<i> / languages<i> cell into one column per key
# (prefixed 'hon_<i>' / 'lang_<i>'), then remove the source columns.
for i in range(1, 8):
    hon_col = f'honors{i}'
    lang_col = f'languages{i}'
    hon_fields = pd.DataFrame(data[hon_col].apply(pd.Series)).add_prefix(f'hon_{i}')
    lang_fields = pd.DataFrame(data[lang_col].apply(pd.Series)).add_prefix(f'lang_{i}')
    data = pd.concat([data, hon_fields, lang_fields], axis=1).reset_index(drop=True)
    data = data.drop(columns=[hon_col, lang_col])

In [20]:
# Expand each projects<i> cell into one column per key (prefixed
# 'proj_<i>'), then remove the source column.
for i in range(1, 9):
    proj_col = f'projects{i}'
    proj_fields = pd.DataFrame(data[proj_col].apply(pd.Series)).add_prefix(f'proj_{i}')
    data = pd.concat([data, proj_fields], axis=1).reset_index(drop=True)
    data = data.drop(columns=proj_col)

In [21]:
# Expand each certifications<i> / publications<i> cell into one column per
# key (prefixed 'cert_<i>' / 'pub_<i>'), then remove the source columns.
for i in range(1, 11):
    cert_col = f'certifications{i}'
    pub_col = f'publications{i}'
    cert_fields = pd.DataFrame(data[cert_col].apply(pd.Series)).add_prefix(f'cert_{i}')
    pub_fields = pd.DataFrame(data[pub_col].apply(pd.Series)).add_prefix(f'pub_{i}')
    data = pd.concat([data, cert_fields, pub_fields], axis=1).reset_index(drop=True)
    data = data.drop(columns=[cert_col, pub_col])

In [22]:
# Columns whose name contains one of these fragments are discarded.
# NOTE(review): the exploded names are '<prefix>_<i><key>' with no underscore
# in front of <key>, so fragments such as '_region' or '_0' may not match the
# columns they were meant for - TODO confirm the fragment list.
unwanted_fragments = ("Urn", "Logo", "_region", '_abuse', '_geo', '_proj', '_organi', '_0')
list_columns = [
    col for col in data.columns
    if any(fragment in col for fragment in unwanted_fragments)
]
# to check before dropping (not rendered: only a cell's last expression is shown)
data[list_columns].T
data = data.drop(columns=list_columns)

### We seem to still have duplicates

In [23]:
# Lower-cased last names in alphabetical order, to spot repeated entries.
print(sorted(data["lastName"].str.lower().tolist()))

['albet', 'alemanno', 'ami', 'andré', 'arrouard', 'auclair', 'auclair', 'baillot', 'banzet', 'barbaut', 'barbero', 'barbier', 'barre', 'bauquet', 'behar', 'behar', 'benkirane', 'bensa', 'berenguer-moncada', 'bertrand', 'bertrand', 'bessoles', 'besson', 'besson', 'bianchin-fabre', 'bloch', 'bouché', 'boudehen', 'bourlitio', 'brachet', 'briante guillemont', 'buchotte', 'bézard', 'canivet', 'chadenet', 'chaput', 'chatellier', 'chaygneaud-dupuy', 'chevallereau', 'choisne', 'claret', 'cochin', 'colas', 'colas', 'colombe', 'combaz', 'coudard', 'coutant', 'coutant', 'd', 'd.', 'daniel', 'daniel', 'dardier', 'dauchez', 'de bodman', 'de briey', 'de marignan', 'de pimodan', 'de roquefeuil', 'de royer', 'de sousa', 'de sousa', 'deffrennes', 'deleris', 'deleris', 'dembele', 'demri', 'demri', 'dernoncourt', 'des gachons', 'desmaison', 'deville', 'diguet', 'dubreuil', 'dubreuil', 'duguet', 'duperthuy', 'durieux', 'duriez', 'duriez', 'duval', 'désigaud', 'désigaud', 'escoubes', 'eve', 'forest', 'fox'

In [24]:
# Last names to look at more closely (compared case-insensitively); one
# isin() call replaces twenty chained equality tests.
suspect_last_names = [
    "auclair", "behar", "bertrand", "besson", "colas", "coutant", "daniel",
    "de sousa", "deleris", "demri", "dubreuil", "duriez", "désigaud",
    "kergall", "l.", "le roux", "legros", "lenoir", "levoir", "trèves",
]
# Not rendered: only a cell's last expression is displayed.
data.loc[
    data.lastName.str.lower().isin(suspect_last_names),
    ["firstName", "lastName"],
].sort_values(by="firstName")
# NOTE(review): these row labels are hard-coded, so they only point at the
# intended rows while the profiles are loaded and numbered in the same order
# as when they were chosen (os.listdir returns names in arbitrary order) -
# TODO confirm, or select the rows by name instead.
data = data.drop(
    labels=[69, 121, 161, 82, 84, 93, 116, 147, 153, 106,
            64, 113, 166, 128, 124, 50, 67, 163, 150, 157],
    axis=0,
)

### Dropping columns with too many missing values

In [25]:
# Reorder the columns alphabetically.
a = sorted(data.columns)
data = data[a]

In [26]:
# Missing-value count per column, largest first, as a one-column frame.
nullvalues = data.isna().sum().sort_values(ascending=False).to_frame()

In [27]:
# The 50 columns with the most missing values.
print(nullvalues.index[:50].tolist())

['cert_10', 'cert_60', 'proj_40', 'proj_50', 'exp_20', 'proj_60', 'exp_10', 'proj_70', 'ed_30', 'cert_100', 'ed_20', 'pub_10', 'pub_100', 'pub_20', 'pub_30', 'pub_40', 'ed_10', 'cert_90', 'cert_80', 'proj_30', 'proj_20', 'proj_10', 'exp_50', 'hon_30', 'hon_40', 'hon_20', 'hon_50', 'hon_10', 'hon_60', 'hon_70', 'lang_10', 'exp_30', 'lang_20', 'lang_30', 'exp_40', 'lang_40', 'lang_50', 'lang_60', 'lang_70', 'cert_70', 'proj_80', 'cert_40', 'pub_70', 'vol_30', 'vol_20', 'pub_90', 'pub_80', 'cert_20', 'vol_10', 'cert_50']


In [28]:
# (rows, columns) after exploding the nested sections and removing duplicates.
data.shape

(148, 395)

In [29]:
# Keep the current row labels as a regular 'index' column; the cell that
# builds add_data selects and merges on that column.
# The guard makes the cell safe to re-run: an unconditional reset_index()
# would add a further 'level_0' column on a second execution.
if "index" not in data.columns:
    data = data.reset_index()
# Display only: the re-indexed view is not assigned back to `data`.
data.set_index("index")

Unnamed: 0_level_0,cert_10,cert_100,cert_10authority,cert_10company,cert_10displaySource,cert_10licenseNumber,cert_10name,cert_10timePeriod,cert_10url,cert_1authority,...,vol_2description,vol_2role,vol_2timePeriod,vol_30,vol_3cause,vol_3company,vol_3companyName,vol_3description,vol_3role,vol_3timePeriod
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,Empow’her supports women entrepreneurs so the...,Board Member,"{'endDate': {'month': 3, 'year': 2021}, 'start...",,,,Rencontre des Justices,Rencontre des Justices is a grassroots movemen...,Co-founder,"{'endDate': {'month': 7, 'year': 2020}, 'start..."
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,Électeurs en herbe est un programme d'éducatio...,Présidente-fondatrice,"{'startDate': {'month': 7, 'year': 2016}}",,,{'miniCompany': {'objectUrn': 'urn:li:company:...,Viking Club Paris,,Dirigeante - Coach de l'équipe féminine U11 - U13,"{'endDate': {'month': 6, 'year': 2021}, 'start..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160,,,,,,,,,,,...,,Animateur d’ateliers,"{'startDate': {'month': 9, 'year': 2018}}",,EDUCATION,{'miniCompany': {'objectUrn': 'urn:li:company:...,Toi Aussi !,,Animateur d’interventions,
162,,,,,,,,,,Ministère chargé de l'Éducation nationale,...,,,,,,,,,,
164,,,,,,,,,,,...,,,,,,,,,,
165,,,,,,,,,,,...,,,,,,,,,,


In [30]:
columns = data.columns
# A column is set aside when its number of missing values exceeds half of the
# rows plus 10. The helper 'index' column is always included: it is the merge
# key used below.
null_threshold = len(data)/2+10
columnstodrop = []
for col in data.columns:
    if col == "index" or data[col].isna().sum() > null_threshold:
        columnstodrop.append(col)
print(columnstodrop)

# Side table: identity columns plus every set-aside column, joined on 'index'.
identity = data[["index", "lastName", "firstName"]]
add_data = identity.merge(data[columnstodrop], how='left', on="index")
add_data.head()

['index', 'cert_10', 'cert_100', 'cert_10authority', 'cert_10company', 'cert_10displaySource', 'cert_10licenseNumber', 'cert_10name', 'cert_10timePeriod', 'cert_10url', 'cert_1authority', 'cert_1company', 'cert_1displaySource', 'cert_1licenseNumber', 'cert_1name', 'cert_1timePeriod', 'cert_1url', 'cert_20', 'cert_2authority', 'cert_2company', 'cert_2displaySource', 'cert_2licenseNumber', 'cert_2name', 'cert_2timePeriod', 'cert_2url', 'cert_30', 'cert_3authority', 'cert_3company', 'cert_3displaySource', 'cert_3licenseNumber', 'cert_3name', 'cert_3timePeriod', 'cert_3url', 'cert_40', 'cert_4authority', 'cert_4company', 'cert_4displaySource', 'cert_4licenseNumber', 'cert_4name', 'cert_4timePeriod', 'cert_4url', 'cert_50', 'cert_5authority', 'cert_5company', 'cert_5displaySource', 'cert_5licenseNumber', 'cert_5name', 'cert_5timePeriod', 'cert_5url', 'cert_60', 'cert_6authority', 'cert_6company', 'cert_6displaySource', 'cert_6licenseNumber', 'cert_6name', 'cert_6timePeriod', 'cert_6url', 'c

Unnamed: 0,index,lastName,firstName,cert_10,cert_100,cert_10authority,cert_10company,cert_10displaySource,cert_10licenseNumber,cert_10name,...,vol_2description,vol_2role,vol_2timePeriod,vol_30,vol_3cause,vol_3company,vol_3companyName,vol_3description,vol_3role,vol_3timePeriod
0,0,Brachet,Antoine,,,,,,,,...,,,,,,,,,,
1,1,Demri,Bobby,,,,,,,,...,,,,,,,,,,
2,2,Durieux,Sarah,,,,,,,,...,Empow’her supports women entrepreneurs so the...,Board Member,"{'endDate': {'month': 3, 'year': 2021}, 'start...",,,,Rencontre des Justices,Rencontre des Justices is a grassroots movemen...,Co-founder,"{'endDate': {'month': 7, 'year': 2020}, 'start..."
3,3,Jaillot,Bastien,,,,,,,,...,,,,,,,,,,
4,4,Vanneroy,Coline,,,,,,,,...,Électeurs en herbe est un programme d'éducatio...,Présidente-fondatrice,"{'startDate': {'month': 7, 'year': 2016}}",,,{'miniCompany': {'objectUrn': 'urn:li:company:...,Viking Club Paris,,Dirigeante - Coach de l'équipe féminine U11 - U13,"{'endDate': {'month': 6, 'year': 2021}, 'start..."


In [46]:
# Export the set-aside, sparsely filled columns as a ';'-separated CSV.
# The folder defaults to the original hard-coded location; set the
# MARCHE_CT_OUTPUT_DIR environment variable to write somewhere else.
output_dir = os.environ.get(
    "MARCHE_CT_OUTPUT_DIR",
    "//Users/tatianadeferaudy/Desktop/bacasable/marche_CT",
)
add_data.to_csv(f"{output_dir}/additional_data_low_response.csv", sep=';', index=True)

In [32]:
# Remove the set-aside columns (including the helper 'index' column).
data = data.drop(columns=columnstodrop)

In [33]:
# Recompute the missing-value counts on the reduced frame, largest first.
nullvalues = data.isna().sum().sort_values(ascending=False).to_frame()
nullvalues.head()

Unnamed: 0,0
vol_1timePeriod,82
exp_5region,82
exp_2region,81
exp_3region,80
vol_1role,78


### Other recoding - could be improved

In [35]:
# Remove the 'school' column of each of the three education entries.
data = data.drop(columns=[f'ed_{n}school' for n in range(1, 4)])

In [41]:
# First five rows of the cleaned frame.
data.head(5)

Unnamed: 0,ed_1degreeName,ed_1fieldOfStudy,ed_1schoolName,ed_1timePeriod,ed_2degreeName,ed_2fieldOfStudy,ed_2schoolName,ed_2timePeriod,ed_3degreeName,ed_3fieldOfStudy,...,lang_1proficiency,lang_2name,lang_2proficiency,lang_3name,lang_3proficiency,lastName,summary,vol_1companyName,vol_1role,vol_1timePeriod
0,Master,Business/Managerial Economics,ESCP Europe,"{'endDate': {'year': 2002}, 'startDate': {'yea...",,,Prépa Saint Jean de Douai,"{'endDate': {'year': 1998}, 'startDate': {'yea...",Baccalauréat,Economics,...,FULL_PROFESSIONAL,French,NATIVE_OR_BILINGUAL,German,ELEMENTARY,Brachet,Antoine croit à l’intelligence de tous et à la...,,,
1,,Relations Internationales et Sciences Politiques,Ecole des Hautes Etudes Politiques,"{'endDate': {'year': 2009}, 'startDate': {'yea...",,Relations et affaires internationales,The Hebrew University,"{'endDate': {'year': 2006}, 'startDate': {'yea...",Baccalauréat,Sciences Economiques et Sociales Option Anglais,...,,,,,,Demri,French Entrepreneur - Founder and Managing Par...,,,
2,Global messaging,Communication orale et rhétorique,ASO communications / New Economy Organizers Ne...,"{'endDate': {'month': 6, 'year': 2022}, 'start...",Training program to prepare women to political...,Political sciences and government,Investies,"{'endDate': {'month': 6, 'year': 2021}, 'start...",Public narrative,Leadership organisationnel,...,NATIVE_OR_BILINGUAL,Espagnol,,Italien,,Durieux,Inspired by the power we can build to change t...,Collectif Quartier Général,Co-Founder,"{'endDate': {'month': 1, 'year': 2022}, 'start..."
3,Master informatique,Architecture logicielle distribuée,Université Bordeaux I,"{'endDate': {'year': 2009}, 'startDate': {'yea...",,,,,,,...,,,,,,Jaillot,"IT Consultant, Technical Expert, Lead develope...",,,
4,Master 2,Sciences politiques,Université Paris 1 Panthéon-Sorbonne,"{'endDate': {'year': 2007}, 'startDate': {'yea...",Licence,Droit,Université Lille 2 Droit et Santé,"{'endDate': {'year': 2005}, 'startDate': {'yea...",,,...,,,,,,Vanneroy,,La Cravate Solidaire,Administratrice,"{'endDate': {'month': 6, 'year': 2021}, 'start..."


#### Time columns could be split

In [29]:
# Columns whose name contains "time" (case-sensitive).
timecols = list(filter(lambda name: "time" in name, data.columns))
timecols

['ed_1timePeriod',
 'ed_2timePeriod',
 'ed_3timePeriod',
 'exp_1timePeriod',
 'exp_2timePeriod',
 'exp_3timePeriod',
 'exp_4timePeriod',
 'exp_5timePeriod']

In [30]:
#data.isna().sum()
#data.fillna("[]", inplace=True)
# we are filling na with [] because special characters will be removed later on. 

In [42]:
# (rows, columns) of the frame that is about to be exported.
data.shape

(148, 66)

### Build a summary table describing the dataset (modes, counts, ...) - not informative while the dataframe is at this stage of transformation

In [33]:
#data.describe().T
#general_pop_data= pd.DataFrame(data.describe(include="O").T)
#general_pop_data.to_csv("general_pop_data.csv", sep=';')

In [43]:
# Show up to 100 rows so the transposed preview (one line per column) is not
# truncated.
pd.set_option("display.max_rows", 100)
data.head(5).transpose()

Unnamed: 0,0,1,2,3,4
ed_1degreeName,Master,,Global messaging,Master informatique,Master 2
ed_1fieldOfStudy,Business/Managerial Economics,Relations Internationales et Sciences Politiques,Communication orale et rhétorique,Architecture logicielle distribuée,Sciences politiques
ed_1schoolName,ESCP Europe,Ecole des Hautes Etudes Politiques,ASO communications / New Economy Organizers Ne...,Université Bordeaux I,Université Paris 1 Panthéon-Sorbonne
ed_1timePeriod,"{'endDate': {'year': 2002}, 'startDate': {'yea...","{'endDate': {'year': 2009}, 'startDate': {'yea...","{'endDate': {'month': 6, 'year': 2022}, 'start...","{'endDate': {'year': 2009}, 'startDate': {'yea...","{'endDate': {'year': 2007}, 'startDate': {'yea..."
ed_2degreeName,,,Training program to prepare women to political...,,Licence
ed_2fieldOfStudy,,Relations et affaires internationales,Political sciences and government,,Droit
ed_2schoolName,Prépa Saint Jean de Douai,The Hebrew University,Investies,,Université Lille 2 Droit et Santé
ed_2timePeriod,"{'endDate': {'year': 1998}, 'startDate': {'yea...","{'endDate': {'year': 2006}, 'startDate': {'yea...","{'endDate': {'month': 6, 'year': 2021}, 'start...",,"{'endDate': {'year': 2005}, 'startDate': {'yea..."
ed_3degreeName,Baccalauréat,Baccalauréat,Public narrative,,
ed_3fieldOfStudy,Economics,Sciences Economiques et Sociales Option Anglais,Leadership organisationnel,,


### Exporting our dataset

In [47]:
# Export the cleaned dataset as a ';'-separated CSV without the row index.
# The folder defaults to the original hard-coded location; set the
# MARCHE_CT_OUTPUT_DIR environment variable to write somewhere else.
# NOTE(review): this default spells the folder 'Bacasable' while the
# additional-data export uses 'bacasable' - confirm which one is right on a
# case-sensitive file system.
output_dir = os.environ.get(
    "MARCHE_CT_OUTPUT_DIR",
    "//Users/tatianadeferaudy/Desktop/Bacasable/marche_CT",
)
data.to_csv(f"{output_dir}/dataset3.csv", sep=';', index=False)