### Entraînement sur une répartition 50-50, 1 million de lignes et dataset amélioré

In [1]:
import pandas as pd
import numpy as np
import sklearn

In [2]:
# Load the two Excel workbooks that are concatenated into the training set.
# `read_excel` has no `encoding` parameter (a workbook carries its own text
# encoding); recent pandas versions reject the unknown keyword with a
# TypeError, so it is not passed.
df_ad = pd.read_excel('adresses_50_50_am.xlsx')
df_bruit = pd.read_excel('bruit.xlsx')

In [3]:
# Stack the two frames row-wise into a single frame.
df_50_50 = pd.concat(objs=[df_ad, df_bruit], axis=0)

In [4]:
# Shuffle the rows. A fixed seed makes the permutation, and therefore the
# train/test split and every score computed from it, reproducible across
# kernel restarts.
from sklearn.utils import shuffle

df_50_50 = shuffle(df_50_50, random_state=42)

In [5]:
# Split the sample into a training part and a test part.
from sklearn.model_selection import train_test_split

X = df_50_50['Adresses']  # text column, vectorized further down
y = df_50_50['Target']    # label column

# Hold out 33% of the rows for testing; the split itself uses a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)
(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

((683004,), (336406,), (683004,), (336406,))

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# Fit the TF-IDF vocabulary on the training texts only, then reuse the fitted
# vectorizer to transform the test texts.
# astype('U') casts every value to a unicode string before vectorizing.
tfidf_vect = TfidfVectorizer()
X_train_tf = tfidf_vect.fit_transform(X_train.astype('U'))
vectorize = tfidf_vect  # alias kept for any code that refers to this name

# Persist the fitted vectorizer. The context manager guarantees the file is
# flushed and closed instead of leaving the handle open.
with open("vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorize, vectorizer_file)

X_test_tf = tfidf_vect.transform(X_test.astype('U'))
X_train_tf.shape, X_test_tf.shape

((683004, 279655), (336406, 279655))

In [7]:
# Display the repr of the TF-IDF training matrix.
X_train_tf

<683004x279655 sparse matrix of type '<class 'numpy.float64'>'
	with 2877230 stored elements in Compressed Sparse Row format>

- ### Modèle numéro 1 : Naïve Bayes multinomial 

In [19]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes (alpha=0.01, fit_prior=False) fitted on the
# TF-IDF training matrix; fit() returns the estimator itself.
clf = MultinomialNB(alpha=0.01, fit_prior=False).fit(X_train_tf, y_train)

# Predictions for the TF-IDF test matrix.
y_pred = clf.predict(X_test_tf)

In [20]:
# Print the evaluation metrics of y_pred against the test labels.
from sklearn import metrics
from sklearn.metrics import accuracy_score

# Row names shown in the report.
# NOTE(review): presumably label 0 is 'bruit' and label 1 is 'adresse' — confirm against the Target column.
target_names = ['bruit', 'adresse']
print(
    metrics.classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=target_names,
    )
)
print("accuracy of the model is :", accuracy_score(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

       bruit       1.00      0.94      0.97    165799
     adresse       0.95      1.00      0.97    170607

    accuracy                           0.97    336406
   macro avg       0.97      0.97      0.97    336406
weighted avg       0.97      0.97      0.97    336406

accuracy of the model is : 0.9706307259680267


In [14]:
# Confusion matrix of the test labels against the current predictions.
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[156080,   9719],
       [   161, 170446]], dtype=int64)

In [15]:
from sklearn.model_selection import GridSearchCV
import time

# The grid search runs on the training split only.
# NOTE: X and y are rebound here to the TF-IDF training matrix and its labels;
# the cross-validation cell that follows reads these same names.
X = X_train_tf
y = y_train

# NOTE(review): the alpha grid starts with the two separate values 3 and 2 —
# if a single value 3.2 was intended, fix the literal.
parameters = {
    'alpha': (3, 2, 1.3, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001, 0.0000001),
    'fit_prior': (True, False),
}
gs_clf = GridSearchCV(MultinomialNB(), parameters, cv=8, n_jobs=-1)

# Time the search. `time` is the module, so the clock must be read through
# one of its functions and a start value has to be captured before fitting.
t0 = time.perf_counter()
gs_clf.fit(X, y)
print("done in {0:.1f}s".format(time.perf_counter() - t0))

print("Best score: {0}".format(gs_clf.best_score_))
print("Best parameters set:")
best_parameters = gs_clf.best_estimator_.get_params()
# Report only the searched parameters, in alphabetical order.
for param_name in sorted(parameters):
    print("\t{0}: {1}".format(param_name, best_parameters[param_name]))

Best score: 0.9709503897488155
Best parameters set:
	alpha: 0.01
	fit_prior: False


In [18]:
from sklearn.model_selection import cross_val_score

# 8-fold cross-validation of MultinomialNB(alpha=0.01, fit_prior=False)
# on the current X and y.
scores = cross_val_score(
    estimator=MultinomialNB(alpha=0.01, fit_prior=False),
    X=X,
    y=y,
    cv=8,
)
print("Cross-validation scores: {}".format(scores))

Cross-validation scores: [0.97140883 0.97166651 0.97074119 0.97156109 0.9709634  0.97016691
 0.97028404 0.97081113]


- ### Modèle numéro 2 : Régression Logistique

In [21]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the TF-IDF features (lbfgs solver, at most 200
# iterations, C=1.2, fixed seed).
# class_weight is left at its default (None, all classes weighted equally).
# The literal string 'dict' is not an accepted value: only None, 'balanced'
# or an actual {label: weight} mapping are. scikit-learn releases that
# validate constructor parameters raise on it when fit() is called.
Lr = LogisticRegression(random_state=0, max_iter=200, solver='lbfgs', C=1.2)
Lr = Lr.fit(X_train_tf, y_train)

# Predictions for the TF-IDF test matrix.
y_pred = Lr.predict(X_test_tf)

In [22]:
# Print the evaluation metrics of y_pred against the test labels.
from sklearn import metrics
from sklearn.metrics import accuracy_score

# Row names shown in the report.
# NOTE(review): presumably label 0 is 'bruit' and label 1 is 'adresse' — confirm against the Target column.
target_names = ['bruit', 'adresse']
print(
    metrics.classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=target_names,
    )
)
print("accuracy of the model is :", accuracy_score(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

       bruit       0.99      1.00      0.99    165799
     adresse       1.00      0.99      0.99    170607

    accuracy                           0.99    336406
   macro avg       0.99      0.99      0.99    336406
weighted avg       0.99      0.99      0.99    336406

accuracy of the model is : 0.9931957218361147


- ### Modèle numéro 3 : Random Forest Classifier

In [23]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the TF-IDF features.
# max_features='sqrt' is what the former 'auto' alias meant for classifiers.
# 'auto' was deprecated and then removed from scikit-learn, so the explicit
# value keeps the same model while remaining valid on current releases.
Rf = RandomForestClassifier(
    n_estimators=350,
    max_depth=8,
    min_samples_split=5,
    max_features='sqrt',
    bootstrap=True,
    random_state=0,
)
Rf = Rf.fit(X_train_tf, y_train)

# Predictions for the TF-IDF test matrix.
y_pred = Rf.predict(X_test_tf)

In [24]:
# Print the evaluation metrics of y_pred against the test labels.
from sklearn import metrics
from sklearn.metrics import accuracy_score

# Row names shown in the report.
# NOTE(review): presumably label 0 is 'bruit' and label 1 is 'adresse' — confirm against the Target column.
target_names = ['bruit', 'adresse']
print(
    metrics.classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=target_names,
    )
)
print("accuracy of the model is :", accuracy_score(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

       bruit       0.96      1.00      0.98    165799
     adresse       1.00      0.96      0.98    170607

    accuracy                           0.98    336406
   macro avg       0.98      0.98      0.98    336406
weighted avg       0.98      0.98      0.98    336406

accuracy of the model is : 0.9758387186911054


- ### Modèle numéro 4 : Gradient Boosting Classifier

In [78]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting classifier (100 estimators, max depth 8, learning rate
# 1.0, fixed seed) fitted on the TF-IDF training matrix; fit() returns the
# estimator itself.
gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=8,
    random_state=0,
).fit(X_train_tf, y_train)

# Predictions for the TF-IDF test matrix.
y_pred = gb.predict(X_test_tf)

In [77]:
# Print the evaluation metrics of y_pred against the test labels.
from sklearn import metrics
from sklearn.metrics import accuracy_score

# Row names shown in the report.
# NOTE(review): presumably label 0 is 'bruit' and label 1 is 'adresse' — confirm against the Target column.
target_names = ['bruit', 'adresse']
print(
    metrics.classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=target_names,
    )
)
print("accuracy of the model is :", accuracy_score(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

       bruit       0.99      1.00      0.99    165799
     adresse       1.00      0.99      0.99    170607

    accuracy                           0.99    336406
   macro avg       0.99      0.99      0.99    336406
weighted avg       0.99      0.99      0.99    336406

accuracy of the model is : 0.9929906125336647


- Une fois les modèles entraînés, testés puis améliorés, on peut les enregistrer

In [67]:
import pickle

# Persist every trained model. All four files are written because the
# evaluation cells further down load nbm.pkl, Lr.pkl, Rf.pkl and gb.pkl;
# skipping any of them makes those cells fail on a fresh run, or silently
# evaluate a stale file left over from an earlier session.
models_to_save = {
    'nbm.pkl': clf,
    'Lr.pkl': Lr,
    'Rf.pkl': Rf,
    'gb.pkl': gb,
}
for model_path, fitted_model in models_to_save.items():
    # The context manager closes each file as soon as it has been written.
    with open(model_path, 'wb') as model_file:
        pickle.dump(fitted_model, model_file)

#### Test sur deux répartitions biaisées 

In [30]:
# Load the two skewed evaluation workbooks.
# `read_excel` has no `encoding` parameter (a workbook carries its own text
# encoding); recent pandas versions reject the unknown keyword with a
# TypeError, so it is not passed.
df_991 = pd.read_excel('99-1.xlsx')
df_199 = pd.read_excel('1-99.xlsx')


### Répartition 99% d'adresses vs 1% de bruit

In [68]:


# Text column and label column of the df_991 evaluation frame.
X, y = df_991['Adresses'], df_991['Target']


In [69]:
import pickle

# Reload the vectorizer saved in vectorizer.pkl and apply it to the current
# texts. Only unpickle files written by this notebook: unpickling can execute
# arbitrary code.
# The context manager closes the file once the object has been read.
with open("vectorizer.pkl", "rb") as vectorizer_file:
    vectorisation = pickle.load(vectorizer_file)
X_tf = vectorisation.transform(X.astype('U'))

X_tf.shape, y.shape

((100001, 279655), (100001,))

In [33]:
import pickle

# Load the model stored in nbm.pkl from disk; the context manager closes the
# file once the object has been read instead of leaving the handle open.
with open("nbm.pkl", "rb") as model_file:
    nbm = pickle.load(model_file)
y_pred = nbm.predict(X_tf)


In [34]:
# Print the evaluation metrics of y_pred against the labels in y.
from sklearn import metrics
from sklearn.metrics import accuracy_score

# Row names shown in the report.
# NOTE(review): presumably label 0 is 'bruit' and label 1 is 'adresse' — confirm against the Target column.
target_names = ['bruit', 'adresse']
print(
    metrics.classification_report(
        y_true=y,
        y_pred=y_pred,
        target_names=target_names,
    )
)
print("accuracy of the model is :", accuracy_score(y_true=y, y_pred=y_pred))

              precision    recall  f1-score   support

       bruit       0.64      0.78      0.70      1002
     adresse       1.00      1.00      1.00     98999

    accuracy                           0.99    100001
   macro avg       0.82      0.89      0.85    100001
weighted avg       0.99      0.99      0.99    100001

accuracy of the model is : 0.993360066399336


In [35]:
import pickle

# Load the model stored in Lr.pkl from disk; the context manager closes the
# file once the object has been read instead of leaving the handle open.
with open("Lr.pkl", "rb") as model_file:
    lr = pickle.load(model_file)
y_pred = lr.predict(X_tf)

In [36]:
# Print the evaluation metrics of y_pred against the labels in y.
from sklearn import metrics
from sklearn.metrics import accuracy_score

# Row names shown in the report.
# NOTE(review): presumably label 0 is 'bruit' and label 1 is 'adresse' — confirm against the Target column.
target_names = ['bruit', 'adresse']
print(
    metrics.classification_report(
        y_true=y,
        y_pred=y_pred,
        target_names=target_names,
    )
)
print("accuracy of the model is :", accuracy_score(y_true=y, y_pred=y_pred))

              precision    recall  f1-score   support

       bruit       0.27      1.00      0.42      1002
     adresse       1.00      0.97      0.99     98999

    accuracy                           0.97    100001
   macro avg       0.63      0.99      0.70    100001
weighted avg       0.99      0.97      0.98    100001

accuracy of the model is : 0.972370276297237


In [37]:
import pickle

# Load the model stored in Rf.pkl from disk; the context manager closes the
# file once the object has been read instead of leaving the handle open.
with open("Rf.pkl", "rb") as model_file:
    rf = pickle.load(model_file)
y_pred = rf.predict(X_tf)


In [38]:
# Print the evaluation metrics of y_pred against the labels in y.
from sklearn import metrics
from sklearn.metrics import accuracy_score

# Row names shown in the report.
# NOTE(review): presumably label 0 is 'bruit' and label 1 is 'adresse' — confirm against the Target column.
target_names = ['bruit', 'adresse']
print(
    metrics.classification_report(
        y_true=y,
        y_pred=y_pred,
        target_names=target_names,
    )
)
print("accuracy of the model is :", accuracy_score(y_true=y, y_pred=y_pred))

              precision    recall  f1-score   support

       bruit       0.11      1.00      0.19      1002
     adresse       1.00      0.92      0.96     98999

    accuracy                           0.92    100001
   macro avg       0.55      0.96      0.57    100001
weighted avg       0.99      0.92      0.95    100001

accuracy of the model is : 0.9159008409915901


In [70]:
import pickle

# Load the model stored in gb.pkl from disk; the context manager closes the
# file once the object has been read instead of leaving the handle open.
with open("gb.pkl", "rb") as model_file:
    gb = pickle.load(model_file)
y_pred = gb.predict(X_tf)

In [71]:
# Print the evaluation metrics of y_pred against the labels in y.
from sklearn import metrics
from sklearn.metrics import accuracy_score

# Row names shown in the report.
# NOTE(review): presumably label 0 is 'bruit' and label 1 is 'adresse' — confirm against the Target column.
target_names = ['bruit', 'adresse']
print(
    metrics.classification_report(
        y_true=y,
        y_pred=y_pred,
        target_names=target_names,
    )
)
print("accuracy of the model is :", accuracy_score(y_true=y, y_pred=y_pred))

              precision    recall  f1-score   support

       bruit       0.24      1.00      0.39      1002
     adresse       1.00      0.97      0.98     98999

    accuracy                           0.97    100001
   macro avg       0.62      0.98      0.69    100001
weighted avg       0.99      0.97      0.98    100001

accuracy of the model is : 0.9684403155968441


### Répartition 1% d'adresses vs 99% de bruit

In [72]:
# Text column and label column of the df_199 evaluation frame
# (the other skewed frame is df_991).
X, y = df_199['Adresses'], df_199['Target']

In [73]:
import pickle

# Reload the vectorizer saved in vectorizer.pkl and apply it to the current
# texts. Only unpickle files written by this notebook: unpickling can execute
# arbitrary code.
# The context manager closes the file once the object has been read.
with open("vectorizer.pkl", "rb") as vectorizer_file:
    vectorisation = pickle.load(vectorizer_file)
X_tf199 = vectorisation.transform(X.astype('U'))

X_tf199.shape, y.shape

((99999, 279655), (99999,))

In [49]:
import pickle

# Load the model stored in nbm.pkl from disk; the context manager closes the
# file once the object has been read instead of leaving the handle open.
with open("nbm.pkl", "rb") as model_file:
    nbm = pickle.load(model_file)
y_pred = nbm.predict(X_tf199)


In [50]:
# Print the evaluation metrics of y_pred against the labels in y.
from sklearn import metrics
from sklearn.metrics import accuracy_score

# Row names shown in the report.
# NOTE(review): presumably label 0 is 'bruit' and label 1 is 'adresse' — confirm against the Target column.
target_names = ['bruit', 'adresse']
print(
    metrics.classification_report(
        y_true=y,
        y_pred=y_pred,
        target_names=target_names,
    )
)
print("accuracy of the model is :", accuracy_score(y_true=y, y_pred=y_pred))

              precision    recall  f1-score   support

       bruit       1.00      0.76      0.87     98999
     adresse       0.04      1.00      0.08      1000

    accuracy                           0.77     99999
   macro avg       0.52      0.88      0.47     99999
weighted avg       0.99      0.77      0.86     99999

accuracy of the model is : 0.7658176581765818


In [51]:
import pickle

# Load the model stored in Lr.pkl from disk; the context manager closes the
# file once the object has been read instead of leaving the handle open.
with open("Lr.pkl", "rb") as model_file:
    lr = pickle.load(model_file)
y_pred = lr.predict(X_tf199)

In [52]:
# Print the evaluation metrics of y_pred against the labels in y.
from sklearn import metrics
from sklearn.metrics import accuracy_score

# Row names shown in the report.
# NOTE(review): presumably label 0 is 'bruit' and label 1 is 'adresse' — confirm against the Target column.
target_names = ['bruit', 'adresse']
print(
    metrics.classification_report(
        y_true=y,
        y_pred=y_pred,
        target_names=target_names,
    )
)
print("accuracy of the model is :", accuracy_score(y_true=y, y_pred=y_pred))

              precision    recall  f1-score   support

       bruit       1.00      0.98      0.99     98999
     adresse       0.33      0.96      0.49      1000

    accuracy                           0.98     99999
   macro avg       0.66      0.97      0.74     99999
weighted avg       0.99      0.98      0.98     99999

accuracy of the model is : 0.979839798397984


In [55]:
import pickle

# Load the model stored in Rf.pkl from disk; the context manager closes the
# file once the object has been read instead of leaving the handle open.
with open("Rf.pkl", "rb") as model_file:
    rf = pickle.load(model_file)
y_pred = rf.predict(X_tf199)

In [56]:
# Print the evaluation metrics of y_pred against the labels in y.
from sklearn import metrics
from sklearn.metrics import accuracy_score

# Row names shown in the report.
# NOTE(review): presumably label 0 is 'bruit' and label 1 is 'adresse' — confirm against the Target column.
target_names = ['bruit', 'adresse']
print(
    metrics.classification_report(
        y_true=y,
        y_pred=y_pred,
        target_names=target_names,
    )
)
print("accuracy of the model is :", accuracy_score(y_true=y, y_pred=y_pred))

              precision    recall  f1-score   support

       bruit       1.00      0.95      0.97     98999
     adresse       0.15      0.90      0.26      1000

    accuracy                           0.95     99999
   macro avg       0.58      0.92      0.62     99999
weighted avg       0.99      0.95      0.97     99999

accuracy of the model is : 0.94959949599496


In [74]:
import pickle

# Load the model stored in gb.pkl from disk; the context manager closes the
# file once the object has been read instead of leaving the handle open.
with open("gb.pkl", "rb") as model_file:
    gb = pickle.load(model_file)
y_pred = gb.predict(X_tf199)

In [75]:
# Print the evaluation metrics of y_pred against the labels in y.
from sklearn import metrics
from sklearn.metrics import accuracy_score

# Row names shown in the report.
# NOTE(review): presumably label 0 is 'bruit' and label 1 is 'adresse' — confirm against the Target column.
target_names = ['bruit', 'adresse']
print(
    metrics.classification_report(
        y_true=y,
        y_pred=y_pred,
        target_names=target_names,
    )
)
print("accuracy of the model is :", accuracy_score(y_true=y, y_pred=y_pred))

              precision    recall  f1-score   support

       bruit       1.00      0.98      0.99     98999
     adresse       0.28      0.96      0.44      1000

    accuracy                           0.98     99999
   macro avg       0.64      0.97      0.71     99999
weighted avg       0.99      0.98      0.98     99999

accuracy of the model is : 0.975449754497545
