# Importing json profiles into a database 

In [58]:
import pandas as pd
import json
import numpy as np

In [59]:
def create_profile(x):
    with open('../data/jsons/'+str(x)) as f:
        dict1 = json.load(f)
    list_col= ['experience', 'education', 'languages']
    for n in list_col:
        if n in dict1:
            for i in range(len(dict1[n])): 
                dict1[str(n+str(i+1))]= dict1[n][i]
    data = pd.DataFrame.from_dict(dict1, orient='index').T
    return data

In [60]:
import os
file_list = [f for f in os.listdir('../data/jsons') if f.endswith(".json")]
len(file_list)
# you can replace the list file_list by whatever list of jsons you want to import

49

In [61]:
initial_profile=create_profile('brachetantoine.json')

In [62]:
list_profiles=file_list
profiles = initial_profile
for i in list_profiles:
    a= create_profile(i)
    profiles= pd.concat((profiles,a),axis=0)

In [63]:
profiles.reset_index(inplace=True)

# Initial cleaning

### dropping duplicate columns and rows

In [64]:
pd.options.display.max_rows = 100
profiles.describe(include="O").T

# total 48 rows

# COLUMNS 
# ['profile_id'] = str : linkedin id - num and letters, 47 unique 
# ['lastName'] - full 
# ['firstName'] - full 
# ['geoCountryName'] = country name, 3 values, 46 filled
# ['geoLocationName'] = region name, 11 unique - check and recode 
# ['birthDate'] = 35 missing  

# ['summary'] = text, includes \n, 32 unique
# ['industryName'] = short text, full, 16 unique
# ['headline'] = professional headline, 47 unique 

# ['experience'] = liste de dictionnaires. chaque élément est accessible par [0], [1] - full 
# ['education'][0] = liste de dictionnaires - full 
# ['languages'] = list de dictionnaires - name, proficiency - 32 
# ['publications'] =  16 
# ['certifications'] = 11
# ['volunteer'] = 17 
# ['honors'] =  12
# ['projects'] =  22

# to drop (specific to linkedin, not relevant, repeated): 
# displayPictureUrl', 'img_100_100', 'img_200_200', 'img_400_400', 'img_800_800',
# ['student'] = boolean T/F - only false 
# ['geoCountryUrn'] === CODED - internal LIn code 
# ['geoLocationBackfilled'] === boolean T/F - only false 
# ['elt'] === boolean T/F - only false 
# ['industryUrn'] === CODED (format urn:li:fs_industry:96) // to industryName 
# ['displayPictureUrl'] = url to picture (not accessible without linkedin )
# ['img_100_100', 'img_200_200', 'img_400_400', 'img_800_800'] = ref of image not accessible 
# ['profile_urn'] = str : format urn:li:fs_miniProfile:ACoAAAA615EBaNquQR5gOz_oFr9emeCr0ZNw67M 
# ['member_urn'] = str : format urn:li:member:3856273
# ['public_id'] = str : lastnamefirstname lower ()
# ['entityUrn'] === CODED (format urn:li:fs_profile:ACoAAAA615EBaNquQR5gOz_oFr9emeCr0ZNw67M)
# ['geoLocation']) === CODED, 12 unique 
# ['location'] === dictionary with one nested dictionary, country code - 4 unique 
# ['locationName'] = country name, 4 values (incl. other), 46 filled
# ['address'] = only 1 value (email)

Unnamed: 0,count,unique,top,freq
summary,33,32,Antoine croit à l’intelligence de tous et à la...,2
industryName,48,16,IT Services and IT Consulting,11
lastName,48,47,Brachet,2
student,48,1,False,48
geoCountryName,46,3,France,39
geoCountryUrn,46,3,urn:li:fs_geo:105015875,39
geoLocationBackfilled,48,1,False,48
elt,48,1,False,48
industryUrn,48,16,urn:li:fs_industry:96,11
firstName,48,41,Antoine,3


In [65]:
profiles.columns

Index(['index', 'summary', 'industryName', 'lastName', 'student',
       'geoCountryName', 'geoCountryUrn', 'geoLocationBackfilled', 'elt',
       'industryUrn', 'firstName', 'entityUrn', 'geoLocation',
       'geoLocationName', 'location', 'headline', 'displayPictureUrl',
       'img_100_100', 'img_200_200', 'img_400_400', 'img_800_800',
       'profile_id', 'profile_urn', 'member_urn', 'public_id', 'experience',
       'education', 'languages', 'publications', 'certifications', 'volunteer',
       'honors', 'projects', 'experience1', 'experience2', 'experience3',
       'experience4', 'experience5', 'education1', 'education2', 'education3',
       'languages1', 'languages2', 'languages3', 'locationName', 'birthDate',
       'languages4', 'address', 'languages5', 'languages6', 'languages7'],
      dtype='object')

In [66]:
# New database with only the columns that interest us
profiles = profiles[['index', 'profile_id', 'lastName', 'firstName', 'geoCountryName', 'geoLocationName', 
                                  'summary', 'industryName', 'headline', 'experience', 'education', 
                                  'languages', 'publications', 'certifications', 'volunteer', 'honors', 'projects', 
                                  'experience1','experience2', 'experience3', 'experience4', 'experience5',
                                  'education1', 'education2', 'education3', 'languages1', 'languages2',
                                  'languages3','languages4', 'languages5', 'languages6', 'languages7']]

In [67]:
mask = profiles.duplicated(subset=['profile_id'], keep='first')
dropped = profiles.loc[mask]
dropped

Unnamed: 0,index,profile_id,lastName,firstName,geoCountryName,geoLocationName,summary,industryName,headline,experience,...,education1,education2,education3,languages1,languages2,languages3,languages4,languages5,languages6,languages7
3,0,ACoAAAA615EBaNquQR5gOz_oFr9emeCr0ZNw67M,Brachet,Antoine,France,Greater Paris Metropolitan Region,Antoine croit à l’intelligence de tous et à la...,IT Services and IT Consulting,"Directeur associé de bluenove, initiateur du m...","[{'locationName': 'Paris Area, France', 'entit...",...,{'entityUrn': 'urn:li:fs_education:(ACoAAAA615...,{'entityUrn': 'urn:li:fs_education:(ACoAAAA615...,{'entityUrn': 'urn:li:fs_education:(ACoAAAA615...,"{'name': 'English', 'proficiency': 'FULL_PROFE...","{'name': 'French', 'proficiency': 'NATIVE_OR_B...","{'name': 'German', 'proficiency': 'ELEMENTARY'}",,,,


In [68]:
profiles.drop_duplicates(subset=['profile_id'], keep='first', inplace=True)

In [69]:
# We may also have duplicates because some people have a CV in two languages 
mask = profiles.duplicated(subset=['lastName', 'firstName'], keep='first')
dropped = profiles.loc[mask]
print(len(dropped))
profiles.drop_duplicates(subset=['lastName', 'firstName'], keep='first', inplace=True)

0


### Recoding the honors, publications, volunteer, projects, certifications and languages columns

In [70]:
print(len(profiles))

47


In [71]:
profiles.isna().sum().sort_values(ascending=False)
# we have 47 rows 

languages7         46
languages6         44
languages5         44
languages4         36
languages3         23
languages2         18
summary            15
languages1         15
education3         12
education2          6
geoLocationName     4
experience5         3
geoCountryName      2
experience4         2
education1          1
experience2         0
experience3         0
index               0
experience1         0
profile_id          0
honors              0
volunteer           0
certifications      0
publications        0
languages           0
education           0
experience          0
headline            0
industryName        0
firstName           0
lastName            0
projects            0
dtype: int64

In [72]:
# we want to drop the columns with too many missing values
# for columns projects, publications, certifications, and honors, there were a lot of missing values
# (our original code exploded them like the experience, language and education information)
# for these columns, we will create one column that states whether this section was filled
# and keep the detail in just the original column as text. 
# for languages, we will do the same but also keep whether 3 or more languages were stated

profiles['languages3'] = (profiles['languages3']).astype(str)
profiles['languages_over2']=np.where(profiles['languages3']=='nan', '0', '1')
print(profiles['languages_over2'].value_counts())

list_columns=['honors', 'publications', 'volunteer', 'projects', 'certifications', 'languages']
for i in list_columns:
    profiles[i]=(profiles[i]).astype(str)
    profiles[i+'_stated']=np.where(profiles[i]=='[]', '0', '1')
    print(profiles[i+'_stated'].value_counts())

profiles.head()

1    24
0    23
Name: languages_over2, dtype: int64
0    36
1    11
Name: honors_stated, dtype: int64
0    32
1    15
Name: publications_stated, dtype: int64
0    31
1    16
Name: volunteer_stated, dtype: int64
0    26
1    21
Name: projects_stated, dtype: int64
0    37
1    10
Name: certifications_stated, dtype: int64
1    32
0    15
Name: languages_stated, dtype: int64


Unnamed: 0,index,profile_id,lastName,firstName,geoCountryName,geoLocationName,summary,industryName,headline,experience,...,languages5,languages6,languages7,languages_over2,honors_stated,publications_stated,volunteer_stated,projects_stated,certifications_stated,languages_stated
0,0,ACoAAAA615EBaNquQR5gOz_oFr9emeCr0ZNw67M,Brachet,Antoine,France,Greater Paris Metropolitan Region,Antoine croit à l’intelligence de tous et à la...,IT Services and IT Consulting,"Directeur associé de bluenove, initiateur du m...","[{'locationName': 'Paris Area, France', 'entit...",...,,,,1,0,1,0,0,0,1
1,0,ACoAAAfK9YwBjZr16cDaVuxZICOg0QUnbPiUoXE,Demri,Bobby,France,"Paris, Île-de-France",French Entrepreneur - Founder and Managing Par...,Venture Capital and Private Equity Principals,Founder & Managing Partner at ROCH Ventures,[{'entityUrn': 'urn:li:fs_position:(ACoAAAfK9Y...,...,,,,0,1,0,0,0,0,0
2,0,ACoAAAUn_5ABqO0mSShQxo4gFyTaCDoaYUk5Fm8,Durieux,Sarah,France,"Paris, Île-de-France",Inspired by the power we can build to change t...,Civic and Social Organizations,Co-director Multitudes Foundation - Activist a...,[{'entityUrn': 'urn:li:fs_position:(ACoAAAUn_5...,...,,,,1,0,1,1,0,0,1
4,0,ACoAAARi9fcBfH1GjbFHnWc1QqHftfh3LQjRUEg,Jaillot,Bastien,France,Greater Paris Metropolitan Region,"IT Consultant, Technical Expert, Lead develope...",IT Services and IT Consulting,Web expert chez Jolicode,[{'entityUrn': 'urn:li:fs_position:(ACoAAARi9f...,...,,,,0,0,1,0,1,0,1
5,0,ACoAAApOhv4B3_GF-OGg8-ipxxkMf6AOggWigl8,Vanneroy,Coline,France,Greater Paris Metropolitan Region,,IT Services and IT Consulting,Directrice des opérations chez Cap Collectif,[{'entityUrn': 'urn:li:fs_position:(ACoAAApOhv...,...,,,,0,0,0,1,1,0,0


In [73]:
# DROPPING THE LANGUAGE COLUMN 
profiles.drop(columns=['languages1', 'languages2',
       'languages3', 'languages4', 'languages5', 'languages6', 'languages7'], inplace=True)

Notes for later : 
* there is surely a better way to deal with the languages, publications, honors, volunteers, project and certifications information. However, we have so many missing values that a yes/no column for each of these categories (filled or not) already provides some information. 
* the languages category could be split in different ways. We chose to highlight the profiles who state 3 or more languages spoken. For this category, the name of languages could be treated as text.

**Important** 
We could change the import code to not split the languages, publications, honors, volunteers, projects and certification columns, but instead just add a column based on the existing ones. 


### Dealing with the experience and education columns

In [74]:
profiles.describe(include='O').T

Unnamed: 0,count,unique,top,freq
profile_id,47,47,ACoAAAA615EBaNquQR5gOz_oFr9emeCr0ZNw67M,1
lastName,47,47,Brachet,1
firstName,47,41,Nicolas,3
geoCountryName,45,3,France,38
geoLocationName,43,11,"Paris, Île-de-France",15
summary,32,32,Antoine croit à l’intelligence de tous et à la...,1
industryName,47,16,IT Services and IT Consulting,10
headline,47,47,"Directeur associé de bluenove, initiateur du m...",1
experience,47,47,"[{'locationName': 'Paris Area, France', 'entit...",1
education,47,47,[{'entityUrn': 'urn:li:fs_education:(ACoAAAA61...,1


In [75]:
profiles_experience=pd.concat([profiles, 
pd.DataFrame(profiles['experience1'].apply(pd.Series)).add_prefix('exp1_'), 
pd.DataFrame(profiles['experience2'].apply(pd.Series)).add_prefix('exp2_'), 
pd.DataFrame(profiles['experience3'].apply(pd.Series)).add_prefix('exp3_'), 
pd.DataFrame(profiles['experience4'].apply(pd.Series)).add_prefix('exp4_'), 
pd.DataFrame(profiles['experience5'].apply(pd.Series)).add_prefix('exp5_')], 
axis=1).reset_index(drop=True)

In [76]:
profiles_experience= pd.concat([profiles_experience, 
pd.DataFrame(profiles_experience['education1'].apply(pd.Series)).add_prefix('ed1_'), 
pd.DataFrame(profiles_experience['education2'].apply(pd.Series)).add_prefix('ed2_'), 
pd.DataFrame(profiles_experience['education3'].apply(pd.Series)).add_prefix('ed3_')], 
axis=1).reset_index(drop=True)

In [77]:
print(list(profiles_experience.columns))

['index', 'profile_id', 'lastName', 'firstName', 'geoCountryName', 'geoLocationName', 'summary', 'industryName', 'headline', 'experience', 'education', 'languages', 'publications', 'certifications', 'volunteer', 'honors', 'projects', 'experience1', 'experience2', 'experience3', 'experience4', 'experience5', 'education1', 'education2', 'education3', 'languages_over2', 'honors_stated', 'publications_stated', 'volunteer_stated', 'projects_stated', 'certifications_stated', 'languages_stated', 'exp1_locationName', 'exp1_entityUrn', 'exp1_geoLocationName', 'exp1_companyName', 'exp1_timePeriod', 'exp1_description', 'exp1_company', 'exp1_title', 'exp1_companyUrn', 'exp1_companyLogoUrl', 'exp1_$anti_abuse_metadata', 'exp1_geoUrn', 'exp1_region', 'exp1_honors', 'exp1_organizations', 'exp2_entityUrn', 'exp2_companyName', 'exp2_timePeriod', 'exp2_company', 'exp2_title', 'exp2_companyUrn', 'exp2_companyLogoUrl', 'exp2_locationName', 'exp2_geoLocationName', 'exp2_geoUrn', 'exp2_description', 'exp2_r

In [78]:
for i in range (1,6):
    profiles_experience['exp'+str(i)+'_startDate_month']= (pd.DataFrame(profiles_experience['exp'+str(i)+'_timePeriod'].apply(pd.Series))['startDate']).apply(pd.Series)['month']
    profiles_experience['exp'+str(i)+'_startDate_year']= (pd.DataFrame(profiles_experience['exp'+str(i)+'_timePeriod'].apply(pd.Series))['startDate']).apply(pd.Series)['year']
    profiles_experience['exp'+str(i)+'_endDate_month']= (pd.DataFrame(profiles_experience['exp'+str(i)+'_timePeriod'].apply(pd.Series))['endDate']).apply(pd.Series)['month']
    profiles_experience['exp'+str(i)+'_endDate_year']= (pd.DataFrame(profiles_experience['exp'+str(i)+'_timePeriod'].apply(pd.Series))['endDate']).apply(pd.Series)['year']
    profiles_experience['exp'+str(i)+'_industry']= (pd.DataFrame(profiles_experience['exp1_company'].apply(pd.Series))['industries'])
    profiles_experience['exp'+str(i)+'_company_empl_low']= (pd.DataFrame(profiles_experience['exp1_company'].apply(pd.Series))['employeeCountRange']).apply(pd.Series)['start']
    profiles_experience['exp'+str(i)+'_company_empl_high']= (pd.DataFrame(profiles_experience['exp1_company'].apply(pd.Series))['employeeCountRange']).apply(pd.Series)['end']

In [79]:
profiles_experience.exp1_industry.value_counts()

[Computer Software]                      11
[Information Technology and Services]     9
[Civic & Social Organization]             7
[Management Consulting]                   4
[Higher Education]                        4
[Internet]                                2
[Venture Capital & Private Equity]        1
[Nonprofit Organization Management]       1
[Graphic Design]                          1
[Performing Arts]                         1
[Marketing and Advertising]               1
[Research]                                1
Name: exp1_industry, dtype: int64

In [80]:
for i in range (1,4):
    profiles_experience['ed'+str(i)+'_startDate_month']= (pd.DataFrame(profiles_experience['ed'+str(i)+'_timePeriod'].apply(pd.Series))['startDate']).apply(pd.Series)['month']
    profiles_experience['ed'+str(i)+'_startDate_year']= (pd.DataFrame(profiles_experience['ed'+str(i)+'_timePeriod'].apply(pd.Series))['startDate']).apply(pd.Series)['year']

In [81]:
print(list(profiles_experience.columns))

['index', 'profile_id', 'lastName', 'firstName', 'geoCountryName', 'geoLocationName', 'summary', 'industryName', 'headline', 'experience', 'education', 'languages', 'publications', 'certifications', 'volunteer', 'honors', 'projects', 'experience1', 'experience2', 'experience3', 'experience4', 'experience5', 'education1', 'education2', 'education3', 'languages_over2', 'honors_stated', 'publications_stated', 'volunteer_stated', 'projects_stated', 'certifications_stated', 'languages_stated', 'exp1_locationName', 'exp1_entityUrn', 'exp1_geoLocationName', 'exp1_companyName', 'exp1_timePeriod', 'exp1_description', 'exp1_company', 'exp1_title', 'exp1_companyUrn', 'exp1_companyLogoUrl', 'exp1_$anti_abuse_metadata', 'exp1_geoUrn', 'exp1_region', 'exp1_honors', 'exp1_organizations', 'exp2_entityUrn', 'exp2_companyName', 'exp2_timePeriod', 'exp2_company', 'exp2_title', 'exp2_companyUrn', 'exp2_companyLogoUrl', 'exp2_locationName', 'exp2_geoLocationName', 'exp2_geoUrn', 'exp2_description', 'exp2_r

In [82]:
list_columns=[i for i in profiles_experience.columns if ("Urn" in i or "Logo" in i or "_region" in i or '_abuse' in i or '_geo' in i or '_proj' in i or '_organi' in i or '_0' in i)]
#to check before dropping 
#profiles_experience[list_columns]
profiles_experience.drop(columns=list_columns, inplace=True)

In [83]:
# dropping timePeriod columns (since they were split)
for i in profiles_experience.columns:
    if 'timePeriod' in i:
        profiles_experience.drop(columns=i, inplace=True)

In [84]:
print(len(profiles_experience.columns))

127


In [85]:
profiles_experience.drop(columns='index', inplace=True)
profiles_experience.reset_index(inplace=True)
profiles_experience.head()

Unnamed: 0,index,profile_id,lastName,firstName,geoCountryName,geoLocationName,summary,industryName,headline,experience,...,exp5_endDate_year,exp5_industry,exp5_company_empl_low,exp5_company_empl_high,ed1_startDate_month,ed1_startDate_year,ed2_startDate_month,ed2_startDate_year,ed3_startDate_month,ed3_startDate_year
0,0,ACoAAAA615EBaNquQR5gOz_oFr9emeCr0ZNw67M,Brachet,Antoine,France,Greater Paris Metropolitan Region,Antoine croit à l’intelligence de tous et à la...,IT Services and IT Consulting,"Directeur associé de bluenove, initiateur du m...","[{'locationName': 'Paris Area, France', 'entit...",...,2017.0,[Management Consulting],11.0,50.0,,1998.0,,1996.0,,1993.0
1,1,ACoAAAfK9YwBjZr16cDaVuxZICOg0QUnbPiUoXE,Demri,Bobby,France,"Paris, Île-de-France",French Entrepreneur - Founder and Managing Par...,Venture Capital and Private Equity Principals,Founder & Managing Partner at ROCH Ventures,[{'entityUrn': 'urn:li:fs_position:(ACoAAAfK9Y...,...,2017.0,[Venture Capital & Private Equity],2.0,10.0,,2006.0,,2005.0,,1998.0
2,2,ACoAAAUn_5ABqO0mSShQxo4gFyTaCDoaYUk5Fm8,Durieux,Sarah,France,"Paris, Île-de-France",Inspired by the power we can build to change t...,Civic and Social Organizations,Co-director Multitudes Foundation - Activist a...,[{'entityUrn': 'urn:li:fs_position:(ACoAAAUn_5...,...,2021.0,[Nonprofit Organization Management],2.0,10.0,3.0,2022.0,9.0,2020.0,9.0,2020.0
3,3,ACoAAARi9fcBfH1GjbFHnWc1QqHftfh3LQjRUEg,Jaillot,Bastien,France,Greater Paris Metropolitan Region,"IT Consultant, Technical Expert, Lead develope...",IT Services and IT Consulting,Web expert chez Jolicode,[{'entityUrn': 'urn:li:fs_position:(ACoAAARi9f...,...,,,,,,2004.0,,,,
4,4,ACoAAApOhv4B3_GF-OGg8-ipxxkMf6AOggWigl8,Vanneroy,Coline,France,Greater Paris Metropolitan Region,,IT Services and IT Consulting,Directrice des opérations chez Cap Collectif,[{'entityUrn': 'urn:li:fs_position:(ACoAAApOhv...,...,2009.0,[Computer Software],11.0,50.0,,2005.0,,2002.0,,


# Table 1 - Names

In [86]:
profiles_experience.rename(columns={"index": "ind_id"}, inplace=True)

In [87]:
names=profiles_experience[["ind_id", "lastName", 'firstName']]
profiles_experience.drop(columns=['profile_id', "lastName", 'firstName'], inplace=True)

In [88]:
names.to_csv("names.csv", sep=';', index=False)

# Table 2 - General individuals table

### Cleaning columns

In [89]:
print(list(profiles_experience.columns))

['ind_id', 'geoCountryName', 'geoLocationName', 'summary', 'industryName', 'headline', 'experience', 'education', 'languages', 'publications', 'certifications', 'volunteer', 'honors', 'projects', 'experience1', 'experience2', 'experience3', 'experience4', 'experience5', 'education1', 'education2', 'education3', 'languages_over2', 'honors_stated', 'publications_stated', 'volunteer_stated', 'projects_stated', 'certifications_stated', 'languages_stated', 'exp1_locationName', 'exp1_companyName', 'exp1_description', 'exp1_company', 'exp1_title', 'exp1_honors', 'exp2_companyName', 'exp2_company', 'exp2_title', 'exp2_locationName', 'exp2_description', 'exp2_honors', 'exp3_locationName', 'exp3_companyName', 'exp3_company', 'exp3_title', 'exp3_description', 'exp4_locationName', 'exp4_companyName', 'exp4_description', 'exp4_company', 'exp4_title', 'exp5_locationName', 'exp5_companyName', 'exp5_description', 'exp5_company', 'exp5_title', 'exp5_honors', 'ed1_school', 'ed1_degreeName', 'ed1_schoolN

#### Location

In [90]:
profiles_experience.geoLocationName.value_counts()
profiles_experience.exp5_locationName.value_counts(dropna=False)

NaN                                      16
Paris Area, France                       10
Région de Paris, France                   5
Paris                                     2
Paris, France                             2
Tunisia                                   1
Berlin, Germany                           1
Ville de Paris, Île-de-France, France     1
Beyrouth, Liban                           1
Antananarivo (Madagascar)                 1
Région de Lisbonne, Portugal              1
75011                                     1
Région de Nantes, France                  1
Paris, Île-de-France, France              1
Chengdu                                   1
Dubai, United Arab Emirates               1
Geneva Area, Switzerland                  1
Name: exp5_locationName, dtype: int64

In [91]:
# recoding location columns 
# note : this type of formatting flattens elements with different locations 
# giving priority to the french one

def recoding_location(x):
    x=str(x)
    if ("Paris" in x) or ("PAris" in x) or ("Montreuil" in x) or ("Puteaux" in x):
        return 'Paris Metropolitan Region'
    elif ("Brussels" in x) or ("Bruxelles" in x): 
         return 'Brussels Metropolitan Region'
    elif "Berlin" in x: 
         return 'Berlin Metropolitan Region'
    elif "Nantes" in x: 
         return 'Nantes Metropolitan Region'
    elif "Bordeaux" in x: 
         return 'Bordeaux Metropolitan Region'
    elif "Lyon" in x: 
         return 'Lyon Metropolitan Region'
    elif "Marseille" in x: 
         return 'Marseille Metropolitan Region'
    elif "Lille" in x: 
         return 'Lille Metropolitan Region'
    else:
        return x

# possible improvement with geopy library 

# replace " France" (if it is only the word, not Ile de France) by nothing? 

In [92]:
for i in profiles_experience.columns:
    if ('locationName' in i) or ('LocationName' in i):
        profiles_experience[i]= profiles_experience[i].apply(recoding_location)

profiles_experience.head()

Unnamed: 0,ind_id,geoCountryName,geoLocationName,summary,industryName,headline,experience,education,languages,publications,...,exp5_endDate_year,exp5_industry,exp5_company_empl_low,exp5_company_empl_high,ed1_startDate_month,ed1_startDate_year,ed2_startDate_month,ed2_startDate_year,ed3_startDate_month,ed3_startDate_year
0,0,France,Paris Metropolitan Region,Antoine croit à l’intelligence de tous et à la...,IT Services and IT Consulting,"Directeur associé de bluenove, initiateur du m...","[{'locationName': 'Paris Area, France', 'entit...",[{'entityUrn': 'urn:li:fs_education:(ACoAAAA61...,"[{'name': 'English', 'proficiency': 'FULL_PROF...","[{'date': {'month': 9, 'year': 2017, 'day': 1}...",...,2017.0,[Management Consulting],11.0,50.0,,1998.0,,1996.0,,1993.0
1,1,France,Paris Metropolitan Region,French Entrepreneur - Founder and Managing Par...,Venture Capital and Private Equity Principals,Founder & Managing Partner at ROCH Ventures,[{'entityUrn': 'urn:li:fs_position:(ACoAAAfK9Y...,[{'entityUrn': 'urn:li:fs_education:(ACoAAAfK9...,[],[],...,2017.0,[Venture Capital & Private Equity],2.0,10.0,,2006.0,,2005.0,,1998.0
2,2,France,Paris Metropolitan Region,Inspired by the power we can build to change t...,Civic and Social Organizations,Co-director Multitudes Foundation - Activist a...,[{'entityUrn': 'urn:li:fs_position:(ACoAAAUn_5...,[{'entityUrn': 'urn:li:fs_education:(ACoAAAUn_...,"[{'name': 'Anglais', 'proficiency': 'NATIVE_OR...","[{'date': {'month': 1, 'year': 2021, 'day': 21...",...,2021.0,[Nonprofit Organization Management],2.0,10.0,3.0,2022.0,9.0,2020.0,9.0,2020.0
3,3,France,Paris Metropolitan Region,"IT Consultant, Technical Expert, Lead develope...",IT Services and IT Consulting,Web expert chez Jolicode,[{'entityUrn': 'urn:li:fs_position:(ACoAAARi9f...,[{'entityUrn': 'urn:li:fs_education:(ACoAAARi9...,[{'name': 'Anglais'}],"[{'date': {'month': 2, 'year': 2015, 'day': 3}...",...,,,,,,2004.0,,,,
4,4,France,Paris Metropolitan Region,,IT Services and IT Consulting,Directrice des opérations chez Cap Collectif,[{'entityUrn': 'urn:li:fs_position:(ACoAAApOhv...,[{'entityUrn': 'urn:li:fs_education:(ACoAAApOh...,[],[],...,2009.0,[Computer Software],11.0,50.0,,2005.0,,2002.0,,


In [93]:
profiles_experience.geoCountryName.value_counts()

France     38
Germany     6
Belgium     1
Name: geoCountryName, dtype: int64

#### Company

In [94]:
list_companies=list(profiles_experience.exp1_companyName.str.lower())+list(profiles_experience.exp2_companyName.str.lower())+list(profiles_experience.exp3_companyName.str.lower())+list(profiles_experience.exp4_companyName.str.lower())+ list(profiles_experience.exp5_companyName.str.lower())
print(len(list_companies)), print(len(set(list_companies)))
df=pd.DataFrame(set(list_companies))
df.columns=['company']
dict_companies=df.company.value_counts().sort_index().to_dict()

235
149


In [95]:
def recoding_company(x):
    x=str(x).lower()
    if (x=="gov") or ("la météo de l" in x):
        return 'GOV la meteo de lopinion'
    elif ("make.org" in x): 
         return 'make.org'
    elif "orange" in x: 
         return 'Orange'
    elif ("freelance" in x) or ("self-employed" in x): 
         return 'freelance'
    elif "stig" in x: 
         return 'STIG'
    elif "voxe" in x: 
         return 'VOXE'
    else:
        return x

In [96]:
#test with 
# profiles_experience.exp1_companyName.apply(recoding_company)
#apply: 
for i in profiles_experience.columns:
    if ('companyName' in i):
        profiles_experience[i]= profiles_experience[i].apply(recoding_company)

print(len(list_companies)), print(len(set(list_companies)))

#list(profiles_experience.exp5_companyName)

235
149


(None, None)

#### Titles

In [97]:
list_titles=list(profiles_experience.exp1_title.str.lower())+list(profiles_experience.exp2_title.str.lower())+list(profiles_experience.exp3_title.str.lower())+list(profiles_experience.exp4_title.str.lower())+ list(profiles_experience.exp5_title.str.lower())
print(len(list_titles)), print(len(set(list_titles)))
df=pd.DataFrame(set(list_titles))
df.columns=['title']
dict_titles=df.title.value_counts().sort_index().to_dict()

235
205


In [98]:
# if includes "ceo" or "coo" or "cfo" or "président" or "directeur" or "directrice" or "director" or "cto" or "cpo" or "general manager" or "president" or "head of"
# if includes "consultant" or "conseiller" or "conseillère"
# if includes "founder" or "fondateur" or "fondatrice"

In [99]:
def recoding_title_dir(x):
     x=str(x).lower()
     dir=["ceo", "coo", "cfo", "président", "directeur", "directrice", "director", 
          "cto", "cpo", "general manager", "president", "head of"] 
     if any([y in x for y in dir]):
          return 1
     else: 
          return 0

def recoding_title_cs(x):
     x=str(x).lower()
     cs= ["consultant", "conseiller", "conseillère"]
     if any([y in x for y in cs]):
          return 1
     else: 
          return 0

def recoding_title_fond(x): 
     x=str(x).lower()
     fond=["founder", "fondateur", "fondatrice"]
     if any([y in x for y in fond]):
         return 1
     else: 
          return 0

In [100]:
print(list(profiles_experience.columns))

['ind_id', 'geoCountryName', 'geoLocationName', 'summary', 'industryName', 'headline', 'experience', 'education', 'languages', 'publications', 'certifications', 'volunteer', 'honors', 'projects', 'experience1', 'experience2', 'experience3', 'experience4', 'experience5', 'education1', 'education2', 'education3', 'languages_over2', 'honors_stated', 'publications_stated', 'volunteer_stated', 'projects_stated', 'certifications_stated', 'languages_stated', 'exp1_locationName', 'exp1_companyName', 'exp1_description', 'exp1_company', 'exp1_title', 'exp1_honors', 'exp2_companyName', 'exp2_company', 'exp2_title', 'exp2_locationName', 'exp2_description', 'exp2_honors', 'exp3_locationName', 'exp3_companyName', 'exp3_company', 'exp3_title', 'exp3_description', 'exp4_locationName', 'exp4_companyName', 'exp4_description', 'exp4_company', 'exp4_title', 'exp5_locationName', 'exp5_companyName', 'exp5_description', 'exp5_company', 'exp5_title', 'exp5_honors', 'ed1_school', 'ed1_degreeName', 'ed1_schoolN

In [101]:
#test with 
# profiles_experience.exp1_title.apply(recoding_title_dir)

In [102]:
list_title_columns= [i for i in profiles_experience.columns if 'title' in i]
print(list_title_columns)

for i in list_title_columns:
    profiles_experience[str(i)+'_direction']=profiles_experience[i].apply(recoding_title_dir)
for i in list_title_columns:
    profiles_experience[str(i)+'consulting']=profiles_experience[i].apply(recoding_title_cs)
for i in list_title_columns:
    profiles_experience[str(i)+'founder']=profiles_experience[i].apply(recoding_title_fond)

profiles_experience.columns

['exp1_title', 'exp2_title', 'exp3_title', 'exp4_title', 'exp5_title']


Index(['ind_id', 'geoCountryName', 'geoLocationName', 'summary',
       'industryName', 'headline', 'experience', 'education', 'languages',
       'publications',
       ...
       'exp1_titleconsulting', 'exp2_titleconsulting', 'exp3_titleconsulting',
       'exp4_titleconsulting', 'exp5_titleconsulting', 'exp1_titlefounder',
       'exp2_titlefounder', 'exp3_titlefounder', 'exp4_titlefounder',
       'exp5_titlefounder'],
      dtype='object', length=139)

In [103]:
profiles_experience['consulting_roles']= profiles_experience['exp1_titleconsulting']+profiles_experience['exp2_titleconsulting']+profiles_experience['exp3_titleconsulting']+profiles_experience['exp4_titleconsulting']+profiles_experience['exp5_titleconsulting']
profiles_experience['direction_roles']= profiles_experience['exp1_title_direction']+profiles_experience['exp2_title_direction']+profiles_experience['exp3_title_direction']+profiles_experience['exp4_title_direction']+profiles_experience['exp5_title_direction']
profiles_experience['founding_roles']= profiles_experience['exp1_titlefounder']+profiles_experience['exp2_titlefounder']+profiles_experience['exp3_titlefounder']+profiles_experience['exp4_titlefounder']+profiles_experience['exp5_titlefounder']
print(profiles_experience['consulting_roles'].value_counts()), print(profiles_experience['direction_roles'].value_counts()), print(profiles_experience.founding_roles.value_counts())

0    36
1    10
2     1
Name: consulting_roles, dtype: int64
3    13
2    12
1    12
0     6
4     3
5     1
Name: direction_roles, dtype: int64
0    21
1    13
2    10
3     3
Name: founding_roles, dtype: int64


(None, None, None)

### Main database production

In [104]:
print(list(profiles_experience.columns))
profiles_experience.to_csv("../modeling/firstround.csv", sep=';', index=False)
people_main_db=profiles_experience[['ind_id', 'geoCountryName', 'geoLocationName', 'summary', 'industryName', 'headline', 'languages', 'publications', 'certifications', 'volunteer', 'honors', 'projects',  'experience1', 'experience2', 'experience3', 'experience4', 'experience5', 'education1', 'education2', 'education3', 'languages_over2', 'honors_stated', 'publications_stated', 'volunteer_stated', 'projects_stated', 'certifications_stated', 'languages_stated', 'consulting_roles', 'direction_roles', 'founding_roles']]
people_main_db.to_csv("people_main_db.csv", sep=';', index=False)
# Note here we have removed the education and experience columns because they exist in exploded version

['ind_id', 'geoCountryName', 'geoLocationName', 'summary', 'industryName', 'headline', 'experience', 'education', 'languages', 'publications', 'certifications', 'volunteer', 'honors', 'projects', 'experience1', 'experience2', 'experience3', 'experience4', 'experience5', 'education1', 'education2', 'education3', 'languages_over2', 'honors_stated', 'publications_stated', 'volunteer_stated', 'projects_stated', 'certifications_stated', 'languages_stated', 'exp1_locationName', 'exp1_companyName', 'exp1_description', 'exp1_company', 'exp1_title', 'exp1_honors', 'exp2_companyName', 'exp2_company', 'exp2_title', 'exp2_locationName', 'exp2_description', 'exp2_honors', 'exp3_locationName', 'exp3_companyName', 'exp3_company', 'exp3_title', 'exp3_description', 'exp4_locationName', 'exp4_companyName', 'exp4_description', 'exp4_company', 'exp4_title', 'exp5_locationName', 'exp5_companyName', 'exp5_description', 'exp5_company', 'exp5_title', 'exp5_honors', 'ed1_school', 'ed1_degreeName', 'ed1_schoolN

# Table 3- Experience

In [45]:
exp1db=profiles_experience[[i for i in profiles_experience.columns if (i=='ind_id'or "exp1_" in i)]]
exp2db=profiles_experience[[i for i in profiles_experience.columns if (i=='ind_id'or "exp2_" in i)]]
exp3db=profiles_experience[[i for i in profiles_experience.columns if (i=='ind_id'or "exp3_" in i)]]
exp4db=profiles_experience[[i for i in profiles_experience.columns if (i=='ind_id'or "exp4_" in i)]]
exp5db=profiles_experience[[i for i in profiles_experience.columns if (i=='ind_id'or "exp5_" in i)]]

In [46]:
exp1db.shape, exp2db.shape, exp3db.shape, exp4db.shape, exp5db.shape

((47, 17), (47, 17), (47, 16), (47, 16), (47, 17))

In [47]:
exp1db.columns= ['ind_id', 'locationName', 'companyName', 'description',
       'company', 'title', 'honors', 'startDate_month',
       'startDate_year', 'endDate_month', 'endDate_year',
       'industry', 'company_empl_low', 'company_empl_high',
       'title_direction', 'titleconsulting', 'titlefounder']

In [48]:
exp2db.columns=['ind_id', 'companyName', 'company', 'title', 'locationName',
       'description', 'honors', 'startDate_month',
       'startDate_year', 'endDate_month', 'endDate_year',
       'industry', 'company_empl_low', 'company_empl_high',
       'title_direction', 'titleconsulting', 'titlefounder']

In [49]:
exp3db.columns= ['ind_id', 'locationName', 'companyName', 'company', 'title',
       'description', 'startDate_month', 'startDate_year',
       'endDate_month', 'endDate_year', 'industry',
       'company_empl_low', 'company_empl_high',
       'title_direction', 'titleconsulting', 'titlefounder']

In [50]:
exp4db.columns=['ind_id', 'locationName', 'companyName', 'description',
       'company', 'title', 'startDate_month',
       'startDate_year', 'endDate_month', 'endDate_year',
       'industry', 'company_empl_low', 'company_empl_high',
       'title_direction', 'titleconsulting', 'titlefounder']

In [51]:
exp5db.columns=['ind_id', 'locationName', 'companyName', 'description',
       'company', 'title', 'honors', 'startDate_month',
       'startDate_year', 'endDate_month', 'endDate_year',
       'industry', 'company_empl_low', 'company_empl_high',
       'title_direction', 'titleconsulting', 'titlefounder']

In [52]:
people_experience_db= pd.concat([exp1db,exp2db, exp3db, exp4db, exp5db]).sort_values(by='ind_id')
type(people_experience_db)

pandas.core.frame.DataFrame

In [53]:
people_experience_db.drop(columns='honors', inplace=True)
people_experience_db.isna().sum().sort_values()

ind_id                0
locationName          0
companyName           0
title_direction       0
titleconsulting       0
titlefounder          0
title                 5
startDate_year        5
startDate_month      12
industry             20
company_empl_low     25
company_empl_high    25
company              33
description          57
endDate_year         79
endDate_month        82
dtype: int64

In [54]:
people_experience_db.dtypes

ind_id                 int64
locationName          object
companyName           object
description           object
company               object
title                 object
startDate_month      float64
startDate_year       float64
endDate_month        float64
endDate_year         float64
industry              object
company_empl_low     float64
company_empl_high    float64
title_direction        int64
titleconsulting        int64
titlefounder           int64
dtype: object

In [55]:
people_experience_db.reset_index(inplace=True)

In [56]:
people_experience_db.drop(columns="company", inplace=True)

In [57]:
# drop description, 
# check company name that was weird (with #?)

In [58]:
people_experience_db['startDate_month']=np.array(people_experience_db['startDate_month'], np.int16) 
people_experience_db['startDate_year']=np.array(people_experience_db['startDate_year'], np.int16) 
people_experience_db['endDate_month']=np.array(people_experience_db['endDate_month'], np.int16) 
people_experience_db['endDate_year']=np.array(people_experience_db['endDate_year'], np.int16) 
people_experience_db['company_empl_low']=np.array(people_experience_db['company_empl_low'], np.int16) 
people_experience_db['company_empl_high']= np.array(people_experience_db['company_empl_high'], np.int16) 


In [59]:
people_experience_db.dtypes.sort_values()

startDate_month       int16
startDate_year        int16
endDate_month         int16
endDate_year          int16
company_empl_low      int16
company_empl_high     int16
index                 int64
ind_id                int64
title_direction       int64
titleconsulting       int64
titlefounder          int64
locationName         object
companyName          object
description          object
title                object
industry             object
dtype: object

In [60]:
people_experience_db.company_empl_high.value_counts()

10       95
50       80
0        25
500      10
200      10
1000      5
1         5
10000     5
Name: company_empl_high, dtype: int64

In [61]:
# without_= people_experience_db.drop(columns='description')
# without_desc.to_csv("without_desc.csv", sep=';', index=False)

In [62]:
people_experience_db.to_csv("people_experience_db.csv", sep=';', index=False)

In [63]:
people_experience_db.shape

(235, 16)

# Table 4 - Education

In [64]:
ed1db=profiles_experience[[i for i in profiles_experience.columns if (i=='ind_id'or "ed1_" in i)]]
ed2db=profiles_experience[[i for i in profiles_experience.columns if (i=='ind_id'or "ed2_" in i)]]
ed3db=profiles_experience[[i for i in profiles_experience.columns if (i=='ind_id'or "ed3_" in i)]]

In [65]:
ed1db.columns= [i if i=='ind_id' else i.split("_", 1)[1] for i in list(ed1db.columns)]
ed2db.columns= [i if i=='ind_id' else i.split("_", 1)[1] for i in list(ed2db.columns)]
ed3db.columns= [i if i=='ind_id' else i.split("_", 1)[1] for i in list(ed3db.columns)]


In [66]:
people_education_db= pd.concat([ed1db, ed2db, ed3db]).sort_values(by='ind_id')

In [67]:
people_education_db

Unnamed: 0,ind_id,school,degreeName,schoolName,fieldOfStudy,activities,description,grade,courses,honors,startDate_month,startDate_year
0,0,"{'objectUrn': 'urn:li:school:19908', 'entityUr...",Master,ESCP Europe,Business/Managerial Economics,,,,,,,1998.0
0,0,,,Prépa Saint Jean de Douai,,,,,,,,1996.0
0,0,,Baccalauréat,Lycée Kernanec,Economics,,,Mention très bien,,,,1993.0
1,1,"{'objectUrn': 'urn:li:school:12330', 'entityUr...",,Ecole des Hautes Etudes Politiques,Relations Internationales et Sciences Politiques,Junior Entreprise. Département Moyen-Orient.,,,,,,2006.0
1,1,"{'objectUrn': 'urn:li:school:13392', 'entityUr...",,The Hebrew University,Relations et affaires internationales,,,,,,,2005.0
...,...,...,...,...,...,...,...,...,...,...,...,...
45,45,"{'objectUrn': 'urn:li:school:21235', 'entityUr...",Master 2,IFP - Institut français de presse,Journalisme,,,,,,,2007.0
45,45,,Diplôme Universitaire,Université Paris 7,Mandarin,,,,,,,2002.0
46,46,"{'objectUrn': 'urn:li:school:12548', 'entityUr...",master 2,Université Paris-Sorbonne,"communication, marketing",,,,,,,2003.0
46,46,,Magistère,CELSA,"Communication, General",,,,,,,2004.0


In [68]:
### Removing columns with too many missing values
a= pd.DataFrame(people_education_db.isna().sum().sort_values(ascending=False))
a.columns=["column"]
coltodrop= list(a.loc[a['column']>100].index)
people_education_db.drop(columns=coltodrop, inplace=True)
# possible improvement with more data
#coltodrop= list(a.loc[a['column'].isna()>len(people_education_db)/2].index)

In [69]:
people_education_db.to_csv("people_education_db.csv", sep=';', index=False)