## Importing information

In [1]:
import pandas as pd
import json
import numpy as np
import os

In [2]:
def create_profile(x, url):
    with open(str(url)+str(x)) as f:
        dict1 = json.load(f)
    list_col= ['experience', 'education', 'languages','languages','publications', 'certifications','volunteer', 'honors', 'projects']
    for n in list_col:
        if n in dict1:
            for i in range(len(dict1[n])): 
                dict1[str(n+str(i+1))]= dict1[n][i]
    data = pd.DataFrame.from_dict(dict1, orient='index').T
    return data

In [3]:
file_list = [f for f in os.listdir('../data/jsons') if f.endswith(".json")]
len(file_list)

49

In [4]:
initial_profile=create_profile('brachetantoine.json', '../data/jsons/')

In [5]:
profiles = initial_profile
for i in file_list:
    a= create_profile(i, '../data/jsons/')
    profiles= pd.concat((profiles,a),axis=0)

In [6]:
file_list2 = [f for f in os.listdir('../data/jsons2') if f.endswith(".json")]
len(file_list2)

124

In [7]:
profiles2 = initial_profile
for i in file_list2:
    a= create_profile(i,'../data/jsons2/')
    profiles2= pd.concat((profiles2,a),axis=0)

In [8]:
data=pd.concat([profiles, profiles2]).reset_index(drop=True)
data.head(2)

Unnamed: 0,summary,industryName,lastName,student,geoCountryName,geoCountryUrn,geoLocationBackfilled,elt,industryUrn,firstName,...,publications9,publications10,maidenName,projects8,certifications5,certifications6,certifications7,certifications8,certifications9,certifications10
0,Antoine croit √† l‚Äôintelligence de tous et √† la...,IT Services and IT Consulting,Brachet,False,France,urn:li:fs_geo:105015875,False,False,urn:li:fs_industry:96,Antoine,...,,,,,,,,,,
1,French Entrepreneur - Founder and Managing Par...,Venture Capital and Private Equity Principals,Demri,False,France,urn:li:fs_geo:105015875,False,False,urn:li:fs_industry:106,Bobby,...,,,,,,,,,,


## Initial dataset cleaning

In [9]:
data.shape

(173, 89)

In [10]:
print(list(data.columns))

['summary', 'industryName', 'lastName', 'student', 'geoCountryName', 'geoCountryUrn', 'geoLocationBackfilled', 'elt', 'industryUrn', 'firstName', 'entityUrn', 'geoLocation', 'geoLocationName', 'location', 'headline', 'displayPictureUrl', 'img_100_100', 'img_200_200', 'img_400_400', 'img_800_800', 'profile_id', 'profile_urn', 'member_urn', 'public_id', 'experience', 'education', 'languages', 'publications', 'certifications', 'volunteer', 'honors', 'projects', 'experience1', 'experience2', 'experience3', 'experience4', 'experience5', 'education1', 'education2', 'education3', 'languages1', 'languages2', 'languages3', 'publications1', 'publications2', 'publications3', 'publications4', 'publications5', 'publications6', 'publications7', 'publications8', 'locationName', 'honors1', 'honors2', 'honors3', 'honors4', 'honors5', 'honors6', 'honors7', 'volunteer1', 'volunteer2', 'volunteer3', 'projects1', 'projects2', 'projects3', 'birthDate', 'languages4', 'projects4', 'projects5', 'address', 'lan

### Drop duplicates in rows

In [11]:
mask = data.duplicated(subset=['headline'], keep='last')
dropped = data.loc[mask]
dropped

Unnamed: 0,summary,industryName,lastName,student,geoCountryName,geoCountryUrn,geoLocationBackfilled,elt,industryUrn,firstName,...,publications9,publications10,maidenName,projects8,certifications5,certifications6,certifications7,certifications8,certifications9,certifications10
0,Antoine croit √† l‚Äôintelligence de tous et √† la...,IT Services and IT Consulting,Brachet,False,France,urn:li:fs_geo:105015875,False,False,urn:li:fs_industry:96,Antoine,...,,,,,,,,,,
1,French Entrepreneur - Founder and Managing Par...,Venture Capital and Private Equity Principals,Demri,False,France,urn:li:fs_geo:105015875,False,False,urn:li:fs_industry:106,Bobby,...,,,,,,,,,,
2,Inspired by the power we can build to change t...,Civic and Social Organizations,Durieux,False,France,urn:li:fs_geo:105015875,False,False,urn:li:fs_industry:90,Sarah,...,,,,,,,,,,
3,Antoine croit √† l‚Äôintelligence de tous et √† la...,IT Services and IT Consulting,Brachet,False,France,urn:li:fs_geo:105015875,False,False,urn:li:fs_industry:96,Antoine,...,,,,,,,,,,
11,‚áí Rendez-vous sur jerem.io pour t√©l√©charger mo...,"Technology, Information and Internet",Paret,False,France,urn:li:fs_geo:105015875,False,False,urn:li:fs_industry:6,Jeremie,...,,,,,,,,,,


In [12]:
data.drop_duplicates(subset=['headline'], keep='last', inplace=True)

### Dropping unnecessary columns (specific to LIn, personal information, etc.)

In [13]:
pd.options.display.max_rows = 100

In [14]:
todrop=['industryName', 'experience', 'education', 'languages','languages','publications', 
        'certifications','volunteer', 'honors', 'projects', 
        'displayPictureUrl', 'img_100_100', 'img_200_200', 'img_400_400', 
        'img_800_800', 'student', 'lastName', 'firstName', 'maidenName', 'birthDate', 
        'address','geoCountryUrn', 'geoLocationBackfilled', 'elt', 'industryUrn',
        'entityUrn', 'geoLocation', 'location', 'locationName', 
        'profile_id', 'profile_urn', 'member_urn', 'public_id']
data.drop(columns=todrop, inplace=True)

In [15]:
data.describe().T.sort_index()

Unnamed: 0,count,unique,top,freq
certifications1,42,42,{'name': 'Psychology'},1
certifications10,3,3,"{'authority': 'LinkedIn', 'name': 'Project Man...",1
certifications2,21,21,"{'authority': 'OpenClassrooms', 'name': 'Learn...",1
certifications3,12,12,"{'authority': 'COLIBRIS', 'name': 'Quelle d√©mo...",1
certifications4,7,7,"{'authority': 'A Cloud Guru', 'name': 'Introdu...",1
certifications5,6,6,"{'authority': 'Google', 'name': 'Google Ads Ce...",1
certifications6,6,6,"{'authority': 'Google', 'name': 'Certification...",1
certifications7,6,6,"{'authority': 'HubSpot', 'name': 'Hubspot Inbo...",1
certifications8,4,4,"{'authority': 'LinkedIn', 'name': 'Become an H...",1
certifications9,4,4,"{'authority': 'ETS Global', 'name': 'TOEIC', '...",1


### Exploding columns

In [16]:
# there must be a more functional way to do what is happening below
# but i'm too tired :)

In [17]:
for i in range (1,4): 
    data=pd.concat([data, 
    pd.DataFrame(data['education'+str(i)].apply(pd.Series)).add_prefix('ed_'+str(i)), 
    pd.DataFrame(data['volunteer'+str(i)].apply(pd.Series)).add_prefix('vol_'+str(i))], 
    axis=1).reset_index(drop=True)
    data.drop(columns=['education'+str(i),'volunteer'+str(i)], inplace=True)   

In [18]:
for i in range (1,6): 
    data=pd.concat([data, 
    pd.DataFrame(data['experience'+str(i)].apply(pd.Series)).add_prefix('exp_'+str(i))], 
    axis=1).reset_index(drop=True)
    data.drop(columns='experience'+str(i), inplace=True)   

In [19]:
for i in range (1,8): 
    data=pd.concat([data, 
    pd.DataFrame(data['honors'+str(i)].apply(pd.Series)).add_prefix('hon_'+str(i)), 
    pd.DataFrame(data['languages'+str(i)].apply(pd.Series)).add_prefix('lang_'+str(i))], 
    axis=1).reset_index(drop=True)
    data.drop(columns=['honors'+str(i),'languages'+str(i)], inplace=True)    

In [20]:
for i in range (1,9): 
    data=pd.concat([data, 
    pd.DataFrame(data['projects'+str(i)].apply(pd.Series)).add_prefix('proj_'+str(i))], 
    axis=1).reset_index(drop=True)
    data.drop(columns='projects'+str(i), inplace=True)

In [21]:
for i in range (1,11): 
    data=pd.concat([data, 
    pd.DataFrame(data['certifications'+str(i)].apply(pd.Series)).add_prefix('cert_'+str(i)), 
    pd.DataFrame(data['publications'+str(i)].apply(pd.Series)).add_prefix('pub_'+str(i))], 
    axis=1).reset_index(drop=True)
    data.drop(columns=['certifications'+str(i),'publications'+str(i)], inplace=True)

In [22]:
list_columns=[i for i in data.columns if ("Urn" in i or "Logo" in i or "_region" in i or '_abuse' in i or '_geo' in i or '_proj' in i or '_organi' in i or '_0' in i)]
#to check before dropping 
#data[list_columns].T
data.drop(columns=list_columns, inplace=True)

### Dropping columns with too many missing values

In [23]:
a= list(data.columns)
a.sort()
data= data[a]

In [24]:
nullvalues=pd.DataFrame(data.isna().sum().sort_values(ascending=False))

In [25]:
print(list(nullvalues.index))

['cert_10', 'pub_20', 'ed_30', 'cert_40', 'lang_40', 'proj_10', 'exp_50', 'hon_30', 'hon_70', 'cert_50', 'proj_80', 'ed_20', 'cert_60', 'pub_10', 'pub_100', 'lang_70', 'ed_10', 'exp_30', 'cert_70', 'exp_40', 'lang_10', 'lang_60', 'lang_50', 'lang_20', 'lang_30', 'cert_90', 'cert_80', 'proj_70', 'hon_60', 'proj_20', 'vol_20', 'cert_100', 'pub_90', 'hon_20', 'pub_80', 'vol_30', 'cert_20', 'proj_30', 'proj_40', 'proj_60', 'hon_40', 'exp_10', 'pub_70', 'hon_10', 'exp_20', 'pub_60', 'pub_50', 'proj_50', 'hon_50', 'cert_30', 'pub_40', 'pub_30', 'vol_10', 'exp_2courses', 'exp_1courses', 'proj_8title', 'cert_9licenseNumber', 'proj_8url', 'proj_8timePeriod', 'proj_8occupation', 'proj_8members', 'proj_8description', 'proj_6occupation', 'exp_2organizations', 'exp_2honors', 'cert_8licenseNumber', 'exp_1projects', 'exp_5courses', 'cert_3licenseNumber', 'cert_10licenseNumber', 'hon_7description', 'cert_5licenseNumber', 'exp_4courses', 'exp_5organizations', 'cert_4licenseNumber', 'proj_6url', 'proj_7

In [26]:
#columns= data.columns()
columnstodrop=[i for i in data.columns if data[i].isna().sum() > len(data)/2+10]
data.drop(columns=columnstodrop, inplace=True)

In [27]:
nullvalues=pd.DataFrame(data.isna().sum().sort_values(ascending=False))
nullvalues.head()

Unnamed: 0,0
exp_2region,94
vol_1role,93
exp_3region,93
vol_1companyName,93
exp_1region,83


### Other recoding - could be improved

In [28]:
data.drop(columns=['ed_1school', 'ed_2school','ed_3school'], inplace=True)

#### Time columns could be split

In [29]:
timecols= [i for i in data.columns if "time" in i]
timecols

['ed_1timePeriod',
 'ed_2timePeriod',
 'ed_3timePeriod',
 'exp_1timePeriod',
 'exp_2timePeriod',
 'exp_3timePeriod',
 'exp_4timePeriod',
 'exp_5timePeriod']

In [30]:
#data.isna().sum()
data.fillna("[]", inplace=True)
# we are filling na with [] because special characters will be removed later on. 

In [31]:
data.head(10)

Unnamed: 0,ed_1degreeName,ed_1fieldOfStudy,ed_1schoolName,ed_1timePeriod,ed_2degreeName,ed_2fieldOfStudy,ed_2schoolName,ed_2timePeriod,ed_3degreeName,ed_3fieldOfStudy,...,headline,lang_1name,lang_1proficiency,lang_2name,lang_2proficiency,lang_3name,lang_3proficiency,summary,vol_1companyName,vol_1role
0,Master informatique,Architecture logicielle distribu√©e,Universit√© Bordeaux I,"{'endDate': {'year': 2009}, 'startDate': {'yea...",[],[],[],[],[],[],...,Web expert chez Jolicode,Anglais,[],[],[],[],[],"IT Consultant, Technical Expert, Lead develope...",[],[]
1,Master 2,Sciences politiques,Universit√© Paris 1 Panth√©on-Sorbonne,"{'endDate': {'year': 2007}, 'startDate': {'yea...",Licence,Droit,Universit√© Lille 2 Droit et Sant√©,"{'endDate': {'year': 2005}, 'startDate': {'yea...",[],[],...,Directrice des op√©rations chez Cap Collectif,[],[],[],[],[],[],[],La Cravate Solidaire,Administratrice
2,Master's 2 Degree Marketing & Strategy - Inte...,[],Universit√© Paris Dauphine,"{'endDate': {'year': 2015}, 'startDate': {'yea...",Postgraduate master‚Äôs degree in Information an...,[],Universit√© Sorbonne Nouvelle (Paris III),"{'endDate': {'year': 2013}, 'startDate': {'yea...",Bachelor's degree,"Communication, r√©daction et multim√©dia",...,"Regional Director (France, Italy, Russia and S...",Anglais,FULL_PROFESSIONAL,Espagnol,LIMITED_WORKING,Fran√ßais,NATIVE_OR_BILINGUAL,People Power,Samu Social de Paris,Volunteer
3,Masters Degree,Written & Audiovisual Journalism,Universit√© libre de Bruxelles,"{'endDate': {'year': 2006}, 'startDate': {'yea...",Creating Collaborative Solutions,Political Science and Government,Harvard Kennedy School,"{'endDate': {'year': 2017}, 'startDate': {'yea...",9 weeks full stack bootcamp,Computer Science,...,Co-founder & CEO @Fluicity üí° Co-founder Associ...,Arabic,LIMITED_WORKING,English,[],French,NATIVE_OR_BILINGUAL,16+ years of experience driving change in the ...,BeCode.org,Board Member
4,"Master of management, organisations and govern...",[],The London School of Economics and Political S...,"{'endDate': {'year': 2009}, 'startDate': {'yea...",[],[],Institut National des Etudes Territoriales (INET),"{'endDate': {'year': 2011}, 'startDate': {'yea...",Master Affaires Publiques,[],...,Directrice g√©n√©rale adjointe chez Adie,Anglais,PROFESSIONAL_WORKING,Fran√ßais,NATIVE_OR_BILINGUAL,[],[],[],[],[]
5,MBA,Strategy,ESSEC - ESSEC Business School,"{'endDate': {'year': 2006}, 'startDate': {'yea...",Master of Business Administration (MBA),Strategy and organisation,ESSEC - ESSEC Business School,"{'endDate': {'year': 2006}, 'startDate': {'yea...",Master of Business Administration - MBA,[],...,Directeur Associ√© - Chief Revenue Officer (CRO...,[],[],[],[],[],[],"17 ans d'exp√©rience, intelligence collective, ...",[],[]
6,Master 2,Management des Ressources Humaines,"Pantheon ASSAS University, Paris","{'endDate': {'year': 2005}, 'startDate': {'yea...",[],[],[],[],[],[],...,CEO & Co-founder at Civocracy\n!We are hiring!,Chinese,PROFESSIONAL_WORKING,English,FULL_PROFESSIONAL,French,NATIVE_OR_BILINGUAL,After eight years of social entrepreneurship i...,[],[]
7,Masters,International Politics,Sciences Po Bordeaux,"{'endDate': {'year': 2010}, 'startDate': {'yea...",Social entrepreneurship summer training,"Business Administration and Management, General",University of Cambridge,"{'endDate': {'year': 2015}, 'startDate': {'yea...",Licenciatura,Ciencias Politicas y de la Administracion,...,#onrecrute - CEO de Voxe m√©dia-√©cole d'empower...,English,NATIVE_OR_BILINGUAL,French,NATIVE_OR_BILINGUAL,Spanish,NATIVE_OR_BILINGUAL,Experienced Chief Executive Officer with a dem...,Empow'Her,Mentor du programme Woman'Act
8,Dut SRC,[],Universit√© de Caen Basse Normandie,[],[],[],Universit√© du Maine-Le Mans-Laval,[],[],[],...,"Directeur Technique chez Monsieur SLOOP, ABCDE...",[],[],[],[],[],[],[],[],[]
9,Start-up Launchpad,Master of Digital Business,HEC Paris,"{'endDate': {'year': 2017}, 'startDate': {'yea...",Master's degree,Public affairs and management,Sciences Po Lille,"{'endDate': {'year': 2016}, 'startDate': {'yea...",Bachelor,Law,...,Belgium France @Sdui | Tech-For-Good Leader & ...,[],[],[],[],[],[],Drawing on 7 years of experience as a Team Lea...,Anacej,Vice-Pr√©sident


In [32]:
data.shape

(168, 62)

### Make dataframe with described dataset (modes, counts...) - not interesting if the dataframe is at this level of transformation

In [33]:
#data.describe().T
#general_pop_data= pd.DataFrame(data.describe(include="O").T)
#general_pop_data.to_csv("general_pop_data.csv", sep=';')

In [34]:
pd.options.display.max_rows = 100
data.head(5).T

Unnamed: 0,0,1,2,3,4
ed_1degreeName,Master informatique,Master 2,Master's 2 Degree Marketing & Strategy - Inte...,Masters Degree,"Master of management, organisations and govern..."
ed_1fieldOfStudy,Architecture logicielle distribu√©e,Sciences politiques,[],Written & Audiovisual Journalism,[]
ed_1schoolName,Universit√© Bordeaux I,Universit√© Paris 1 Panth√©on-Sorbonne,Universit√© Paris Dauphine,Universit√© libre de Bruxelles,The London School of Economics and Political S...
ed_1timePeriod,"{'endDate': {'year': 2009}, 'startDate': {'yea...","{'endDate': {'year': 2007}, 'startDate': {'yea...","{'endDate': {'year': 2015}, 'startDate': {'yea...","{'endDate': {'year': 2006}, 'startDate': {'yea...","{'endDate': {'year': 2009}, 'startDate': {'yea..."
ed_2degreeName,[],Licence,Postgraduate master‚Äôs degree in Information an...,Creating Collaborative Solutions,[]
ed_2fieldOfStudy,[],Droit,[],Political Science and Government,[]
ed_2schoolName,[],Universit√© Lille 2 Droit et Sant√©,Universit√© Sorbonne Nouvelle (Paris III),Harvard Kennedy School,Institut National des Etudes Territoriales (INET)
ed_2timePeriod,[],"{'endDate': {'year': 2005}, 'startDate': {'yea...","{'endDate': {'year': 2013}, 'startDate': {'yea...","{'endDate': {'year': 2017}, 'startDate': {'yea...","{'endDate': {'year': 2011}, 'startDate': {'yea..."
ed_3degreeName,[],[],Bachelor's degree,9 weeks full stack bootcamp,Master Affaires Publiques
ed_3fieldOfStudy,[],[],"Communication, r√©daction et multim√©dia",Computer Science,[]


### Exporting our dataset

In [35]:
data.to_csv("dataset2.csv", sep=';', index=False)