# Random Forest

- Un random forest est constitué d’un ensemble d’arbres de décision indépendants, ou l'algorithme essai de séparé les différents types de données en fonction de la décision majoritaire de chaque arbre de décision

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from matplotlib import pyplot as plt
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier 
from sklearn import svm


In [2]:
# resumé de l'étape : cleaning
data = pd.read_csv('./Data/SMSSpamCollection.txt', sep='\t', header=None)
data.columns = ['label', 'content']

en_stop_words = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()


def clean_data(texte):
    return [ps.stem(word) for word in re.split("\W+", "".join([ch for ch in texte if ch not in string.punctuation])) if word not in en_stop_words]


# resumé de l'étape : vectorisation & Feature Engineering
def count_punct(texte):
    return round(sum([1 for ch in texte if ch in string.punctuation])/(len(texte) - texte.count(" ")), 4)*100, (len(texte) - texte.count(" "))


data['punct_rate'] = data['content'].apply(lambda x: count_punct(x)[0])
data['content_len'] = data['content'].apply(lambda x: count_punct(x)[1])

vectorization_full_tf = TfidfVectorizer(analyzer=clean_data)
vect_final_tf = vectorization_full_tf.fit_transform(data['content'])
pd_vect_final_tf = pd.DataFrame(vect_final_tf.toarray())
pd_vect_final_tf.columns = vectorization_full_tf.get_feature_names_out()

all_data = pd.concat([pd_vect_final_tf,
                     data['punct_rate'], data['content_len']], axis=1)

all_data


Unnamed: 0,Unnamed: 1,0,008704050406,0089mi,0121,01223585236,01223585334,0125698789,02,020603,...,zogtoriu,zoom,zouk,zyada,é,ü,üll,〨ud,punct_rate,content_len
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,9.78,92
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,25.00,24
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,4.69,128
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,15.38,39
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,4.08,49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,6.11,131
5568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.311086,0.0,0.0,3.45,29
5569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,14.58,48
5570,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,1.00,100


In [3]:
X_train, X_test, Y_train, Y_test = train_test_split(all_data, data['label'], test_size=0.2)
algo_random_forest = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)
model = algo_random_forest.fit(X_train, Y_train)

In [4]:
predictions = model.predict(X_test)
predictions

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

In [5]:
# result
precision, recall, fscore, _ = score(Y_test, predictions, pos_label='spam', average='binary')
print("precision = {}, recall = {}, accuracy = {}".format
      (round(precision,3), round(recall, 3), round((predictions == Y_test).sum()/len(predictions), 3)))

precision = 1.0, recall = 0.695, accuracy = 0.961


## RandomForest_CrossValidation

In [6]:
algo_random_forest = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)
k_fold = KFold(n_splits=4)
accuray_rf = cross_val_score(algo_random_forest, all_data, data['label'], scoring='accuracy', cv=k_fold, n_jobs=-1)
print("accuracy pour chaque batch = {}\naccuracy totale = {}".format(
    accuray_rf, sum(accuray_rf)/4))


accuracy pour chaque batch = [0.93754487 0.95262024 0.94903087 0.94615937]
accuracy totale = 0.9463388370423547
