# Random Forest Model

## Initializing

In [57]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [58]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS

from nltk.stem.snowball import EnglishStemmer, FrenchStemmer

from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import NearestNeighbors

from sklearn.metrics import (confusion_matrix,
    accuracy_score, precision_score, recall_score,
    precision_recall_curve, roc_curve, roc_auc_score)

from sklearn.utils.class_weight import compute_sample_weight

pd.set_option("display.max_colwidth", 500)

In [59]:
# import file
df= pd.read_csv("dataset.csv", sep=';')

In [60]:
# check it 
print(df.shape)
print(list(df.columns))

(168, 63)
['ed_1degreeName', 'ed_1fieldOfStudy', 'ed_1schoolName', 'ed_1timePeriod', 'ed_2degreeName', 'ed_2fieldOfStudy', 'ed_2schoolName', 'ed_2timePeriod', 'ed_3degreeName', 'ed_3fieldOfStudy', 'ed_3schoolName', 'ed_3timePeriod', 'exp_1company', 'exp_1companyName', 'exp_1description', 'exp_1geoLocationName', 'exp_1locationName', 'exp_1region', 'exp_1timePeriod', 'exp_1title', 'exp_2company', 'exp_2companyName', 'exp_2description', 'exp_2geoLocationName', 'exp_2locationName', 'exp_2region', 'exp_2timePeriod', 'exp_2title', 'exp_3company', 'exp_3companyName', 'exp_3description', 'exp_3geoLocationName', 'exp_3locationName', 'exp_3region', 'exp_3timePeriod', 'exp_3title', 'exp_4company', 'exp_4companyName', 'exp_4description', 'exp_4geoLocationName', 'exp_4locationName', 'exp_4timePeriod', 'exp_4title', 'exp_5company', 'exp_5companyName', 'exp_5description', 'exp_5geoLocationName', 'exp_5locationName', 'exp_5timePeriod', 'exp_5title', 'geoCountryName', 'geoLocationName', 'headline', 'in

## Creating the target column

In [61]:
import re

In [62]:
def create_target(x):
    x=str(x)
    pattern = r"\['.*'\]"
    a= re.findall(pattern, x)
    a= ''.join(a)
    return a[2:-2]

In [63]:
# testing the function 
#text = "{'employeeCountRange': {'start': 2, 'end': 10}, 'industries': ['Computer Software']} "
#create_target(text)

In [64]:
df['target']= df['exp_1company'].apply(create_target)

In [65]:
print(len(df.target.unique()))
df.target.value_counts()

47


Computer Software                      19
                                       17
Information Technology and Services    17
Government Administration              14
Civic & Social Organization            13
                                       ..
Pharmaceuticals                         1
Education Management                    1
Government Relations                    1
Civil Engineering                       1
Political Organization                  1
Name: target, Length: 47, dtype: int64

In [66]:
companies= list(df.exp_1companyName.unique())
companies.sort()
print(len(companies))
print(companies)

141
['1001mots', 'A Voté', 'AD Education', 'AFD', 'AI Builders', "ANSSI - Agence nationale de la sécurité des systèmes d'information", 'APALA', "ASFE - Alliance Solidaire des Français de l'Etranger", 'Accenta.ai', 'Adie', 'Agence Française de Développement', 'Agence Proches', 'Agence Régionale du Tourisme Grand Est [ART GE]', 'AgroParisTech', 'Altermakers', 'Amethis', 'Analog Sport', 'Antidox', 'ArsLonga', 'Assistance Publique - Hôpitaux de Paris', 'Asterion Ventures', 'Avere-France', 'Avolta', 'BESNARD CHARPENTE SARL', 'BINGE AUDIO', 'Billy', 'Boston Consulting Group (BCG)', 'CERBA HEALTHCARE', "CNIL - Commission Nationale de l'Informatique et des Libertés", "CRT Côte d'Azur France", 'Cap Collectif', 'Capgemini Invent', 'Carbone 4', 'Change.org', 'Citipo - citipo.com', 'Civocracy', 'Ckatalyzen', 'Coca-Cola Europacific Partners', 'Colas Rail', 'Combo (ex-Snapshift)', 'Communauté urbaine du Grand Reims', 'Cour des comptes', 'Crédit Agricole CIB', 'Dailymotion', 'Datagora', 'Decidim', 'D

## Preparing the dataframe

In [67]:
# our "y" here is the exp1_industry 
# it could be the companies but there are a lot of them (128)
# next we are going to drop everything else related to the 1st (last) experience
# ['exp1_locationName', 'exp1_companyName',
#       'exp1_description', 'exp1_title', 'exp1_startDate_month',
#       'exp1_startDate_year', 'exp1_industry', 'exp1_company_empl_low']

In [68]:
exp1_columns= [i for i in df.columns if 'exp_1' in i]
df.drop(columns=exp1_columns, inplace=True)

In [69]:
df=df.applymap(str)

In [70]:
# creating a new database with only 2 columns (joining all text)
df.set_index("target", drop=True, inplace=True)
list_df_columns=list(df.columns)
df["text"] = df[list_df_columns].apply(lambda x: " ".join(x), axis =1)
df.reset_index(inplace=True)
mod_df=df[["target", "text"]]
mod_df.head(2)

Unnamed: 0,target,text
0,,"Master informatique Architecture logicielle distribuée Université Bordeaux I {'endDate': {'year': 2009}, 'startDate': {'year': 2004}} [] [] [] [] [] [] [] [] {'employeeCountRange': {'start': 201, 'end': 500}, 'industries': ['Information Technology and Services']} Clever Age Expert PHP : audit, performance, méthodologie autour des développements framework (symfony, Jelix), et CMS (Drupal). Accompagnement dans le développement et formation. [] [] [] {'endDate': {'month': 1, 'year': 2012}, 'sta..."
1,Computer Software,"Master 2 Sciences politiques Université Paris 1 Panthéon-Sorbonne {'endDate': {'year': 2007}, 'startDate': {'year': 2005}} Licence Droit Université Lille 2 Droit et Santé {'endDate': {'year': 2005}, 'startDate': {'year': 2002}} [] [] [] [] {'employeeCountRange': {'start': 11, 'end': 50}, 'industries': ['Computer Software']} Cap Collectif Key figures : 250 clients / 2,5 M€ CA / 30 ETP. \nPrincipaux clients : ministères (République numérique, Grand débat national...), collectivités territorial..."


In [71]:
def clean_text_col(x):
    x=str(x)
    x= "".join(ch for ch in x if ch.isalnum() or ch==' ')
    x= x.lower()
    return x

In [72]:
mod_df["text"] = mod_df['text'].apply(clean_text_col)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mod_df["text"] = mod_df['text'].apply(clean_text_col)


In [73]:
# keep this dataframe because it could be useful to test the model later
exp1_industry_nan= mod_df.loc[mod_df["target"]=='']
mod_df= mod_df.loc[mod_df["target"]!='']
exp1_industry_nan.head()

Unnamed: 0,target,text
0,,master informatique architecture logicielle distribuée université bordeaux i enddate year 2009 startdate year 2004 employeecountrange start 201 end 500 industries information technology and services clever age expert php audit performance méthodologie autour des développements framework symfony jelix et cms drupal accompagnement dans le développement et formation enddate month 1 year 2012 startdate month 9 year 2009 consultant expert php solution interactive enddate month 9 ...
23,,graphiste plurimédia école des métiers de linformation scop sa enddate year 2022 startdate year 2021 communication général université sorbonne nouvelle enddate year 1994 startdate year 1991 sociologie et linguistique université paris descartes enddate year 1991 startdate year 1990 employeecountrange start 201 end 500 industries civic social organization mouvement international atd quart monde stage pratique de fin de formation au sein de la direction de la communication et de lengagement...
27,,diploma in global leadership and public policy for the 21st century harvard kennedy school enddate year 2013 startdate year 2013 ecole nationale dadministration enddate year 2002 startdate year 2000 economics and finance political science sciences po enddate year 1990 startdate year 1987 sustainable leading sme towards sustainability to generate shared value along the supply chain strategy sustainability consulting optimizing blue chip companies supply chain through sustainability ...
29,,sciences economiques université paris 1 panthéonsorbonne enddate year 1974 startdate year 1968 librairie résistances paris paris enddate month 2 year 2020 startdate month 9 year 2012 gérant industries graphic design typical paris paris enddate month 2 year 2020 startdate month 1 year 1995 cofondateur gérant sepelco signalétique paris paris enddate month 10 year 2010 startdate month 5 year 1980 cofondateur gérant employeecountrange start 10001 industries public policy ministèr...
77,,centre de formation des journalistes enddate year 2015 startdate year 2013 bachelor in international business international business grenoble ecole de management enddate year 2012 startdate year 2009 sciences politiques et gouvernement classe préparatoire intégrale enddate year 2009 startdate year 2008 supplément dâme startdate month 11 year 2018 créatrice du podcast supplément dâme employeecountrange start 1001 end 5000 industries international affairs un environment région de p...


In [74]:
factor = pd.factorize(mod_df['target'])
mod_df.target = factor[0]
definitions = factor[1]
print(list(definitions))


['Computer Software', 'Internet', 'Information Technology and Services', 'Civic & Social Organization', 'Management Consulting', 'Higher Education', 'Performing Arts', 'Marketing and Advertising', 'Research', 'Civil Engineering', 'Venture Capital & Private Equity', 'Nonprofit Organization Management', 'Government Relations', 'Banking', 'Education Management', 'Pharmaceuticals', 'Government Administration', 'Information Services', 'Retail', 'Market Research', 'Investment Banking', 'Transportation/Trucking/Railroad', 'Consumer Goods', 'Renewables & Environment', 'Hospital & Health Care', 'Chemicals', 'Human Resources', 'Public Relations and Communications', 'International Affairs', 'Real Estate', 'Environmental Services', 'Think Tanks', 'Online Media', 'Farming', 'Architecture & Planning', 'Leisure, Travel & Tourism', 'Building Materials', 'Computer & Network Security', 'Professional Training & Coaching', 'Security and Investigations', 'Restaurants', 'Telecommunications', 'Insurance', 'F

In [75]:
mod_df.head(2)

Unnamed: 0,target,text
1,0,master 2 sciences politiques université paris 1 panthéonsorbonne enddate year 2007 startdate year 2005 licence droit université lille 2 droit et santé enddate year 2005 startdate year 2002 employeecountrange start 11 end 50 industries computer software cap collectif key figures 250 clients 25 m ca 30 etp principaux clients ministères république numérique grand débat national collectivités territoriales 100 communes départements et régions entreprises la poste edf carrefour colas ass...
2,1,masters 2 degree marketing strategy integrated marketing communication imc université paris dauphine enddate year 2015 startdate year 2014 postgraduate masters degree in information and communication sorbonne nouvelle university université sorbonne nouvelle paris iii enddate year 2013 startdate year 2009 bachelors degree communication rédaction et multimédia université de sherbrooke sherbrooke university enddate year 2012 startdate year 2011 employeecountrange start 0 end 1 industries...


## Random Forest with basic CountVectorizer

In [76]:
X= mod_df["text"]
y= mod_df["target"]
texts_train, texts_test, Y_train, Y_test = train_test_split(X, y, test_size=0.15,
                                                            random_state=432)

In [77]:
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)

In [78]:
model_rf = RandomForestClassifier()#(class_weight='balanced_subsample')
model_rf.fit(X_train, Y_train)

In [79]:
y_pred = model_rf.predict(X_test)

In [80]:
accuracy_score(Y_test, y_pred)

0.13043478260869565

In [81]:
#Reverse factorize (converting y_pred from 0s,1s and 2s to industries)
#reversefactor = dict(zip(range(44),definitions))
#Y_test = np.vectorize(reversefactor.get)(Y_test)
#y_pred = np.vectorize(reversefactor.get)(y_pred)

# Making the Confusion Matrix is actually useless with 44 categories 
#print(pd.crosstab(Y_test, y_pred, rownames=['Industry'], colnames=['Predicted Industry']))

*Notes on this try*  
Accuracy is around 20. Compared to 1/47, it's ok.  
But adding the information on languages, publications, honors, etc. actually confuses the model :)  
Regarding measures : ROC AUC apparently only works for binary (which makes sense)  

## Random Forest with CountVectorizer and stop words

In [82]:
X= mod_df["text"]
y= mod_df["target"]
texts_train, texts_test, Y_train, Y_test = train_test_split(X, y, test_size=0.15,
                                                            random_state=432)

In [83]:
vectorizer = CountVectorizer(stop_words="english", ngram_range=(1, 2))
vectors = vectorizer.fit_transform(X).todense()
pd.set_option("display.max_rows", 10)
vocabulary = vectorizer.get_feature_names_out()
pd.DataFrame(vectors, columns=vocabulary, index=X).shape

(151, 39648)

In [84]:
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)
model_rf = RandomForestClassifier()
model_rf.fit(X_train, Y_train)

In [85]:
y_pred = model_rf.predict(X_test)
accuracy_score(Y_test, y_pred)

0.21739130434782608

Notes on this try :  
Returns approximately the same accuracy score

### Random Forest with CountVectorizer, stop words & stemming 

In [86]:
X= mod_df["text"]
y= mod_df["target"]
texts_train, texts_test, Y_train, Y_test = train_test_split(X, y, test_size=0.15,
                                                            random_state=432)

In [87]:
stemmer = EnglishStemmer()
default_analyzer = CountVectorizer(stop_words=ENGLISH_STOP_WORDS).build_analyzer()
def analyze_with_stemming(text):
    unstemmed_words = default_analyzer(text)
    return (stemmer.stem(word) for word in unstemmed_words)
stemmer_vectorizer = CountVectorizer(analyzer=analyze_with_stemming)

In [88]:
vectors = stemmer_vectorizer.fit_transform(X).todense()
vocabulary = stemmer_vectorizer.get_feature_names_out()
pd.DataFrame(vectors, columns=vocabulary, index=X).shape

(151, 7205)

In [89]:
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)
model_rf = RandomForestClassifier()#(class_weight='balanced_subsample')
model_rf.fit(X_train, Y_train)

In [90]:
y_pred = model_rf.predict(X_test)
accuracy_score(Y_test, y_pred)

0.21739130434782608

This doesn't change anything either... 

### Random Forest with TDIF vectorizer

In [91]:
X= mod_df["text"]
y= mod_df["target"]
texts_train, texts_test, Y_train, Y_test = train_test_split(X, y, test_size=0.15,
                                                            random_state=432)

In [92]:
vectorizer = TfidfVectorizer(stop_words="english")
vectors = vectorizer.fit_transform(X).todense()
pd.set_option("display.max_rows", 10)
vocabulary = vectorizer.get_feature_names_out()
pd.DataFrame(vectors, columns=vocabulary, index=X).shape

(151, 9123)

In [93]:
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)
model_rf = RandomForestClassifier()
model_rf.fit(X_train, Y_train)

In [94]:
y_pred = model_rf.predict(X_test)
accuracy_score(Y_test, y_pred)

0.13043478260869565

Scores evolve a lot  
Could that be related to the model not having a random state set ?

### Random Forest with count vectorizer and stemming, but with imbalance handling

In [95]:
X= mod_df["text"]
y= mod_df["target"]
texts_train, texts_test, Y_train, Y_test = train_test_split(X, y, test_size=0.15,
                                                            random_state=432)

In [96]:
vectorizer = CountVectorizer(stop_words="english", strip_accents='unicode', max_df=0.5) 
vectors = vectorizer.fit_transform(X).todense()
pd.set_option("display.max_rows", 10)
vocabulary = vectorizer.get_feature_names_out()
pd.DataFrame(vectors, columns=vocabulary, index=X).shape

(151, 8902)

In [97]:
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)
model_rf = RandomForestClassifier(class_weight='balanced_subsample')
model_rf.fit(X_train, Y_train)

In [98]:
y_pred = model_rf.predict(X_test)
accuracy_score(Y_test, y_pred)

0.17391304347826086

In [99]:
# stripping accents doesn't seem to do much...  
# same for setting max_df at different ranges (0.5, 0.7, 0.8) - not many words seem to be shared in the corpus, which is odd. 


# ADDITIONAL STUFF TO TRY

Model tweaking with Gridsearch & parameters for the vectorizers: 
* Count : https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
* TDIF : https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
* Random Forest parameters: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

Try nearest neighbors: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html#sklearn.neighbors.NearestNeighbors  
And the associated user guide: https://scikit-learn.org/stable/modules/neighbors.html#unsupervised-neighbors  
Given an array or dataframe of vectors (each row representing an item) and a new vector (just one row, representing the reference item chosen by the user), it will give you the k closest items (smallest distance).  


Text preparation issues : 
* remove numbers ? 
* impute industry for the missing ones 
* should some words have more weight than others ? 

### Notes for later : 
* cosine similarity instead of TDIF vectorizer ? 
* use PCA and make clusters of people ? 
* change the Y to 1/0 based on a civic tech list (e.g. the one that appears when creating companies SQL tables)
* work with companies' data to make clusters of similar companies and predict for that/ recommend closest neighbors (and NGOs)
