# Random Forest Model

## Initializing

In [223]:
import pandas as pd
import numpy as np

In [224]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
pd.set_option("display.max_colwidth", 500)

In [225]:
# only if starting from this step 
df= pd.read_csv("dataset.csv", sep=';')

In [226]:
print(df.shape)
print(list(df.columns))

(156, 72)
['ID', 'geoCountryName', 'geoLocationName', 'summary', 'industryName', 'headline', 'languages', 'publications', 'certifications', 'volunteer', 'honors', 'projects', 'exp1_locationName', 'exp1_companyName', 'exp1_description', 'exp1_title', 'exp1_startDate_month', 'exp1_startDate_year', 'exp1_industry', 'exp1_company_empl_low', 'exp2_locationName', 'exp2_companyName', 'exp2_description', 'exp2_title', 'exp2_startDate_month', 'exp2_startDate_year', 'exp2_endDate_month', 'exp2_endDate_year', 'exp2_industry', 'exp2_company_empl_low', 'exp3_locationName', 'exp3_companyName', 'exp3_title', 'exp3_description', 'exp3_startDate_month', 'exp3_startDate_year', 'exp3_endDate_month', 'exp3_endDate_year', 'exp3_industry', 'exp3_company_empl_low', 'exp4_locationName', 'exp4_companyName', 'exp4_description', 'exp4_title', 'exp4_startDate_month', 'exp4_startDate_year', 'exp4_endDate_month', 'exp4_endDate_year', 'exp4_industry', 'exp4_company_empl_low', 'exp5_locationName', 'exp5_companyName',

In [227]:
df.exp1_industry.value_counts()

Computer Software                      19
Information Technology and Services    17
Civic Social Organization              13
Government Administration              12
Management Consulting                   8
                                       ..
Education Management                    1
Investment Banking                      1
Market Research                         1
Retail                                  1
Political Organization                  1
Name: exp1_industry, Length: 43, dtype: int64

In [228]:
len(df.exp1_industry.unique())

44

In [229]:
print(len(list(df.exp1_companyName.unique())))
print(list(df.exp1_companyName.unique()))

128
['bluenove', 'jolicode', 'cap collectif', 'change.org', 'fluicity', 'adie', 'civocracy', 'VOXE', 'abcdeep', 'sdui', 'leto', 'make.org', 'klarna', 'open source politics', 'sell - syndicat des éditeurs de logiciels de loisirs', 'freelance', 'ad education', 'nestlé france', 'analog sport', 'la cour des contes', 'malt', 'decidim', 'sloop', 'impact hub berlin', 'the one campaign', 'billy', 'institut des politiques publiques', 'egis', 'roch ventures', 'multitudes foundation', "ville d'alfortville", 'afd', 'trikoï conseil', 'google', 'sciences po ofce', 'orchestre national de jazz', 'hippocrate développement', 'sobi - swedish orphan biovitrum ab (publ)', 'capgemini invent', 'impact tank', 'ville de lyon', 'datagora', 'grandvision france', 'cour des comptes', "l'institut paris region", 'umi', 'udaf 71', 'avolta', 'university of paris i: panthéon-sorbonne', 'ville de cachan', 'edf', 'combo (ex-snapshift)', 'cerba healthcare', 'accenta.ai', 'sicoval', 'haute autorité de santé', "journaliste 

## Preparing the dataframe

In [230]:
# our "y" here is the exp1_industry 
# it could be the companies but there are a lot of them (128)
# next we are going to drop everything else related to the 1st (last) experience
# ['exp1_locationName', 'exp1_companyName',
#       'exp1_description', 'exp1_title', 'exp1_startDate_month',
#       'exp1_startDate_year', 'exp1_industry', 'exp1_company_empl_low']
# as well as the ID 

In [231]:
df.set_index(['exp1_industry'], drop=True, inplace=True)

In [232]:
df.drop(columns=["ID", 'exp1_locationName', 'exp1_companyName',
       'exp1_description', 'exp1_title', 'exp1_startDate_month',
       'exp1_startDate_year', 'exp1_company_empl_low'], inplace=True)

In [233]:
list_df_columns= list(df.columns)
print(list_df_columns)

['geoCountryName', 'geoLocationName', 'summary', 'industryName', 'headline', 'languages', 'publications', 'certifications', 'volunteer', 'honors', 'projects', 'exp2_locationName', 'exp2_companyName', 'exp2_description', 'exp2_title', 'exp2_startDate_month', 'exp2_startDate_year', 'exp2_endDate_month', 'exp2_endDate_year', 'exp2_industry', 'exp2_company_empl_low', 'exp3_locationName', 'exp3_companyName', 'exp3_title', 'exp3_description', 'exp3_startDate_month', 'exp3_startDate_year', 'exp3_endDate_month', 'exp3_endDate_year', 'exp3_industry', 'exp3_company_empl_low', 'exp4_locationName', 'exp4_companyName', 'exp4_description', 'exp4_title', 'exp4_startDate_month', 'exp4_startDate_year', 'exp4_endDate_month', 'exp4_endDate_year', 'exp4_industry', 'exp4_company_empl_low', 'exp5_locationName', 'exp5_companyName', 'exp5_description', 'exp5_title', 'exp5_startDate_month', 'exp5_startDate_year', 'exp5_endDate_month', 'exp5_endDate_year', 'exp5_industry', 'exp5_company_empl_low', 'ed1_degreeNa

In [234]:
df=df.applymap(str)

In [235]:
df["text"] = df[list_df_columns].apply(lambda x: " ".join(x), axis =1)
df.reset_index(inplace=True)
mod_df=df[["exp1_industry", "text"]]
mod_df.head(2)

Unnamed: 0,exp1_industry,text
0,Management Consulting,"France Paris Metropolitan Region Antoine croit à l’intelligence de tous et à la responsabilité de chacun pour réinventer un monde qui reste à hauteur d’homme.\nIl est à ce titre partie prenante de nombreux projets qui tous visent à rallumer les soleils humains, et notamment : \nDirecteur associé de Bluenove, qui assume que les organisations positives les plus impactantes de demain sont celles qui feront levier de l’intelligence collective\nA l’initiative du projet #BrightMirror visant à perm..."
1,,"France Paris Metropolitan Region IT Consultant, Technical Expert, Lead developer, I have an extensive experience in Web development and software architecture. I like solving complex situation, both human and technical.\n\nExpertises: \n- PHP development, CMS (huge experience with Drupal), ""components"" (Symfony2, Silex, Slim), MVC frameworks (particularly Jelix, symfony 1), \n- Software architecture, using the *right* number of moving part to easily scale your infrastructure while enhancing t..."


In [236]:
# mod_df.isna().sum()
# keep this dataframe because it could be useful to test the model later
exp1_industry_nan= mod_df.loc[mod_df["exp1_industry"].isna()]
mod_df= mod_df.loc[mod_df["exp1_industry"].notna()]

In [237]:
# String with special characters 
special_string="sPe@#$ci8<\n7al*& m_ot.h[r f{uc'6r"
print("String before conversion: ",special_string)
# Create a list with normal characters using the isalnum() method
# use the join() function to convert the list to string
normal_string="".join(ch for ch in special_string if ch.isalnum() or ch==' ')
# print the normal string 
print("string after conversion:",normal_string)

String before conversion:  sPe@#$ci8<
7al*& m_ot.h[r f{uc'6r
string after conversion: sPeci87al mothr fuc6r


In [238]:
def clean_text_col(x):
    x=str(x)
    x= "".join(ch for ch in x if ch.isalnum() or ch==' ')
    x= x.lower()
    return x

In [239]:
mod_df.columns

Index(['exp1_industry', 'text'], dtype='object')

In [240]:
mod_df["text"] = mod_df['text'].apply(clean_text_col)

In [241]:
mod_df.head(2)

Unnamed: 0,exp1_industry,text
0,Management Consulting,france paris metropolitan region antoine croit à lintelligence de tous et à la responsabilité de chacun pour réinventer un monde qui reste à hauteur dhommeil est à ce titre partie prenante de nombreux projets qui tous visent à rallumer les soleils humains et notamment directeur associé de bluenove qui assume que les organisations positives les plus impactantes de demain sont celles qui feront levier de lintelligence collectivea linitiative du projet brightmirror visant à permettre la scénar...
2,Computer Software,france paris metropolitan region nan it services and it consulting directrice des opérations chez cap collectif nan nan nan role administratrice companyname la cravate solidaire timeperiod enddate month 6 year 2021 startdate month 9 year 2016 company minicompany objecturn urnlicompany5186825 entityurn urnlifsminicompany5186825 name la cravate solidaire showcase false active true logo comlinkedincommonvectorimage artifacts width 200 fileidentifyingurlpathsegment 20020001519903535287e169344000...


In [242]:
mod_df.exp1_industry.value_counts()

Computer Software                      19
Information Technology and Services    17
Civic Social Organization              13
Government Administration              12
Management Consulting                   8
                                       ..
Education Management                    1
Investment Banking                      1
Market Research                         1
Retail                                  1
Political Organization                  1
Name: exp1_industry, Length: 43, dtype: int64

In [243]:
factor = pd.factorize(mod_df['exp1_industry'])
mod_df.exp1_industry = factor[0]
definitions = factor[1]
print(mod_df.exp1_industry.head())
print(definitions)


0    0
2    1
3    2
4    3
5    4
Name: exp1_industry, dtype: int64
Index(['Management Consulting', 'Computer Software', 'Internet',
       'Information Technology and Services', 'Civic Social Organization',
       'Higher Education', 'Performing Arts', 'Marketing and Advertising',
       'Research', 'Civil Engineering', 'Venture Capital Private Equity',
       'Nonprofit Organization Management', 'Government Relations', 'Banking',
       'Education Management', 'Pharmaceuticals', 'Government Administration',
       'Information Services', 'Retail', 'Market Research',
       'Investment Banking', 'Renewables Environment', 'Hospital Health Care',
       'Chemicals', 'Human Resources', 'Public Relations and Communications',
       'International Affairs', 'Real Estate', 'Environmental Services',
       'Think Tanks', 'Farming', 'Architecture Planning',
       'Leisure Travel Tourism', 'Online Media', 'Computer Network Security',
       'Professional Training Coaching', 'Security and Inv

In [244]:
mod_df.head()

Unnamed: 0,exp1_industry,text
0,0,france paris metropolitan region antoine croit à lintelligence de tous et à la responsabilité de chacun pour réinventer un monde qui reste à hauteur dhommeil est à ce titre partie prenante de nombreux projets qui tous visent à rallumer les soleils humains et notamment directeur associé de bluenove qui assume que les organisations positives les plus impactantes de demain sont celles qui feront levier de lintelligence collectivea linitiative du projet brightmirror visant à permettre la scénar...
2,1,france paris metropolitan region nan it services and it consulting directrice des opérations chez cap collectif nan nan nan role administratrice companyname la cravate solidaire timeperiod enddate month 6 year 2021 startdate month 9 year 2016 company minicompany objecturn urnlicompany5186825 entityurn urnlifsminicompany5186825 name la cravate solidaire showcase false active true logo comlinkedincommonvectorimage artifacts width 200 fileidentifyingurlpathsegment 20020001519903535287e169344000...
3,2,france paris metropolitan region people power civic and social organizations regional director france italy russia and spain at changeorg name anglais proficiency fullprofessional name espagnol proficiency limitedworking name français proficiency nativeorbilingual name hindi proficiency elementary nan nan role volunteer companyname samu social de paris timeperiod enddate month 12 year 2012 startdate month 12 year 2009 cause socialservices description giving meals to homeles during christmas...
4,3,france paris metropolitan region 16 years of experience driving change in the tech media industrygovernment business development expertisedelivering results in versatile and fastpaced environmentsawarded mit innovator under 35 top social innovator tedxparis inspiring 50 award for women in techin parallel of my role of ceo fluicity i volunteer in ngos as board member becode or founding member association civic tech europe it services and it consulting cofounder ceo fluicity cofounder asso...
5,4,france paris metropolitan region nan nonprofit organizations directrice générale adjointe chez adie name anglais proficiency professionalworking name français proficiency nativeorbilingual nan nan nan nan nan paris metropolitan region adie nan directrice des relations institutionnelles et du plaidoyer 90 20190 nan nan civic social organization 5010 paris metropolitan region fluicity responsable du développement start up de la civic tech plateforme numérique de participation citoyennedévelop...


## Setting X, y, test & train sets and vectorize

In [245]:
X= mod_df["text"]
y= mod_df["exp1_industry"]

In [246]:
texts_train, texts_test, Y_train, Y_test = train_test_split(X, y, test_size=0.15,
                                                            random_state=432)

In [247]:
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)

## Random Forest Modeling

In [248]:
model_rf = RandomForestClassifier()#(class_weight='balanced_subsample')

In [249]:
# to check if there is a problem (and there often is) 
# note XGBoost works for binary classification only 
# even though the error message says somthing else 

model_rf.fit(X_train, Y_train)

In [250]:
y_pred = model_rf.predict(X_test)

In [251]:
#Reverse factorize (converting y_pred from 0s,1s and 2s to industries)
#reversefactor = dict(zip(range(44),definitions))
#Y_test = np.vectorize(reversefactor.get)(Y_test)
#y_pred = np.vectorize(reversefactor.get)(y_pred)
# Making the Confusion Matrix
print(pd.crosstab(Y_test, y_pred, rownames=['Industry'], colnames=['Predicted Industry']))


Predicted Industry  0   1   3   4   6   8   16
Industry                                      
0                    1   1   0   0   0   0   0
1                    0   2   0   0   0   0   0
3                    0   0   4   0   0   0   0
4                    0   1   0   3   0   0   0
5                    0   0   1   0   0   0   0
...                 ..  ..  ..  ..  ..  ..  ..
19                   0   0   0   0   0   1   0
20                   0   0   1   0   0   0   0
31                   0   0   0   1   0   0   0
32                   0   0   0   0   1   0   0
33                   0   1   0   0   0   0   0

[13 rows x 7 columns]


In [252]:
accuracy_score(Y_test, y_pred)

0.5238095238095238

## Additional work

In [253]:
# try to get the ROC-AUC score 
# Clean the columns in original dataframe to avoid getting URNs - languages, publications, etc
# try vectorizer without stemming but with stop words and vectorizer with stemming 
# try TDIF 
# Try KNN (cf. Thomas's message)

# remove numbers ? 
# impute industry for the missing ones 
# handle imbalance ? 
# should some words have more weight than others ? 

# Notes for later : 
# cosine similarity instead of TDIF vectorizer ? 
# use PCA and make clusters of people 
# change the Y to 1/0 based on a civic tech list (e.g. the one that appears when creating companies SQL tables)
# work with companies' data to make clusters of similar companies and predict for that/ recommend closest neighbors (and NGOs)


### Vectorizer without stemming

In [254]:
vectorizer = CountVectorizer(stop_words="english", ngram_range=(1, 2))

In [255]:
vectors = vectorizer.fit_transform(X).todense()

In [256]:
pd.set_option("display.max_rows", 10)

In [257]:
vocabulary = vectorizer.get_feature_names_out()

In [258]:
pd.DataFrame(vectors, columns=vocabulary, index=X).shape

(140, 85569)

### Vectorizer with stemming 

In [259]:
from nltk.stem.snowball import EnglishStemmer, FrenchStemmer
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS

In [260]:
stemmer = EnglishStemmer()

In [261]:
default_analyzer = CountVectorizer(stop_words=ENGLISH_STOP_WORDS).build_analyzer()

In [262]:
def analyze_with_stemming(text):
    unstemmed_words = default_analyzer(text)
    return (stemmer.stem(word) for word in unstemmed_words)

In [263]:
stemmer_vectorizer = CountVectorizer(analyzer=analyze_with_stemming)

In [264]:
vectors = stemmer_vectorizer.fit_transform(X).todense()
vocabulary = stemmer_vectorizer.get_feature_names_out()
pd.DataFrame(vectors, columns=vocabulary, index=X).shape

(140, 17549)