# Classification de texte equipe: MEN OF NIGHT

### 1. Objectifs

*   Entrainer un modèle de base pour la classification de texte 
*   Utiliser les vecteurs pré-entraînés glove pour la classification de texte

### 2. Plan
  - Prétraitement du corpus
    - Nettoyage du corpus
    - Tokenisation
  - Vectorisation 
    - Télécharger glove
    - Document to vector
  - Modélisation
    - Selection de modèle
    - Entrainement du meilleur modèle
    - Évaluation
  - Prédiction 

## Prétraitement du corpus

### Téléchargement du dataset

In [2]:
import os
import re
import random
import pandas as pd
from glob import glob
from typing import Union
%matplotlib inline

In [3]:
from tqdm.auto import tqdm
tqdm.pandas()

In [4]:
df = pd.read_csv('./train.csv')

In [34]:
df.head()
# df.drop(['Unnamed: 0'], axis=1, inplace=True)
df.head()

Unnamed: 0,text,labels
0,A witness testimony IS evidence held up in cou...,4
1,that’s literally glorifying someones trauma. i...,4
2,not only was I raped by Jorge but also by his ...,4
3,You know what's unbelievable to me is these wo...,4
4,"owner of this house, he does not lock door...",4


### Nettoyage du corpus

In [5]:
def clean_text(text: str) -> str:
    """Cette fonction nettoie le texte qu'il prend en entrée.
    """
    # Put all test in lower case
    text = str(text).lower()
    text = text.replace("\n", " ")
    # keep only word, remove special char emoj
    text = re.sub(r'[^\w\s]', '', str(text))
    # remove all digit
    text = re.sub('[0-9]+', ' ', text)
    return text.strip()

In [6]:
df["text_clean"] = df["text"].apply(clean_text)

In [37]:
df.head()

Unnamed: 0,text,labels,text_clean
0,A witness testimony IS evidence held up in cou...,4,a witness testimony is evidence held up in cou...
1,that’s literally glorifying someones trauma. i...,4,thats literally glorifying someones trauma if ...
2,not only was I raped by Jorge but also by his ...,4,not only was i raped by jorge but also by his ...
3,You know what's unbelievable to me is these wo...,4,you know whats unbelievable to me is these wom...
4,"owner of this house, he does not lock door...",4,owner of this house he does not lock doors and...


- Afficher tous les caractères du corpus

In [38]:
print(sorted(set("".join(df.text_clean).lower())))

[' ', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\xa0', '½', 'ß', 'à', 'á', 'â', 'ä', 'ç', 'è', 'é', 'ê', 'ì', 'í', 'ï', 'ñ', 'ó', 'ö', 'ø', 'ú', 'ü', 'ć', 'č', 'ę', 'ī', 'ı', 'ł', 'ň', 'ş', 'š', 'ŷ', 'ƹ', 'ȃ', 'ȇ', 'ȋ', 'ȏ', 'ȓ', 'ȗ', 'ɔ', 'ɛ', 'ʒ', 'ʖ', 'а', 'в', 'д', 'е', 'и', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 'ь', 'є', 'ӂ', 'ء', 'أ', 'إ', 'ئ', 'ا', 'ب', 'ة', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ك', 'ل', 'م', 'ن', 'ه', 'و', 'ى', 'ي', 'ٱ', 'چ', 'ڈ', 'ڑ', 'ک', 'گ', 'ھ', 'ہ', 'ی', 'ے', 'आ', 'ए', 'क', 'ग', 'झ', 'ड', 'त', 'द', 'न', 'प', 'ब', 'म', 'र', 'ल', 'ळ', 'व', 'श', 'स', 'ह', 'இ', 'க', 'ங', 'ச', 'ட', 'த', 'ப', 'ய', 'ர', 'ಠ', 'መ', 'ሚ', 'ሴ', 'ን', 'ከ', 'ዘ', '៹', 'ẹ', 'ọ', '\u200a', '⁹', 'ℹ', '⓿', 'あ', 'い', 'え', 'か', 'さ', 'じ', 'せ', 'た', 'だ', 'つ', 'で', 'と', 'に', 'の', 'は', 'る', 'れ', 'を', 'ん', 'イ', 'エ', 'ツ', 'ト', 'ヒ', 'ミ', 'ュ', 'リ', '

On remarque qu'il y a plusieurs texte en d'autre langues. Nous les suprimerons

In [7]:
toremove = "".join(['आ', 'ए', 'क', 'ग', 'झ', 'ड', 'त', 'द', 'न', 'प', 'ब', 'म', 'र', 'ल', 'ळ', 'व', 'श', 'स', 'ह', 'இ', 'க', 'ங', 'ச', 'ட', 'த', 'ப', 'ய', 'ர', 'ಠ', 'መ', 'ሚ', 'ሴ', 'ን', 'ከ', 'ዘ', '៹', 'ẹ', 'ọ',])
to_rm = "".join(['ö', 'ø', 'ú', 'ü', 'ć', 'č', 'ę', 'ī', 'ı', 'ł', 'ň', 'ş', 'š', 'ŷ', 'ƹ', 'ȃ', 'ȇ', 'ȋ', 'ȏ', 'ȓ', 'ȗ', 'ɔ', 'ɛ', 'ʒ', 'ʖ', 'а', 'в', 'д', 'е', 'и', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', '\u200a', '⁹', 'ℹ', '⓿', 'あ', 'い', 'え', 'か', 'さ', 'じ', 'せ', 'た', 'だ', 'つ', 'で', 'と', 'に', 'の', 'は', 'る', 'れ', 'を', 'ん', 'イ', 'エ', 'ツ', 'ト', 'ヒ', 'ミ', 'ュ', 'リ', 'ー', 'ㅅ', 'ㅤ', '一', '上', '了', '些', '人', '使', '例', '信', '個', '們', '先', '兒', '公', '出', '到', '前', '単', '取', '多', '大', '天', '姦', '子', '孩', '害', '少', '就', '屈', '希', '常', '強', '彼', '恥', '息', '感', '我', '文', '是', '最', '望', '正', '殺', '然', '照', '現', '用', '的', '看', '童', '義', '能', '臨', '英', '被', '覧', '語', '讀', '賦', '身', '辱', '這', '靈', '윤', '호', 'ﷺ',])
punc = "".join(['½', 'ß', 'à', 'á', 'â', 'ä', 'ç', 'è', 'é', 'ê', 'ì', 'í', 'ï', 'ñ', 'ó', '𝑒', '𝒸', '𝒻', '𝒽', '𝓀', '𝓅', '𝓈', '𝓉', '𝓊', '𝓬', '𝓸', '𝔻', '𝕔', '𝕦', '𝘦', '𝘩', '𝘰', '𝘴', '𝘵', '𝙴', '𝙻', '𝚂', '𝚊', '𝚘', '𝚙', '𝚛', '𝚜', '𝚝', '𝚟'])


In [11]:
to_remove = "ьєӂءأإئابةتثجحخدذرزسشصضطظعغفقكلمنهوىيٱچڈڑکگھہیےआएकगझडतदनपबमरलळवशसहஇகஙசடதபயரಠመሚሴንከዘ៹ẹọöøúüćčęīıłňşšŷƹȃȇȋȏȓȗɔɛʒʖавдеиклмнопрс ⁹ℹ⓿あいえかさじせただつでとにのはるれをんイエツトヒミュリーㅅㅤ一上了些人使例信個們先兒公出到前単取多大天姦子孩害少就屈希常強彼恥息感我文是最望正殺然照現用的看童義能臨英被覧語讀賦身辱這靈윤호ﷺ½ßàáâäçèéêìíïñó𝑒𝒸𝒻𝒽𝓀𝓅𝓈𝓉𝓊𝓬𝓸𝔻𝕔𝕦𝘦𝘩𝘰𝘴𝘵𝙴𝙻𝚂𝚊𝚘𝚙𝚛𝚜𝚝𝚟"

In [10]:
def remove_undesirable(text):
    text = re.sub('['+to_remove +']+', '', text)
    text = re.sub('\xa0', ' ', text)
    text = re.sub('_', ' ', text)
    return text


df["text_clean"] = df["text_clean"].apply(remove_undesirable)

In [42]:
print(sorted(set("".join(df.text_clean).lower())))

[' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


###  Tokenisation du corpus
spacy

In [12]:
import spacy

In [13]:
spacy.cli.download("en_core_web_lg")
nlp = spacy.load("en_core_web_lg", exclude=["parser", "tagger"])

Collecting en-core-web-lg==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.3.0/en_core_web_lg-3.3.0-py3-none-any.whl (400.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 400.7/400.7 MB 3.7 MB/s eta 0:00:00




[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [45]:
df["text_tokenized"] = df["text_clean"].progress_apply(lambda x: [i.text for i in nlp(x) if not (i.is_punct or i.is_space)])
df.head()

  0%|          | 0/31720 [00:00<?, ?it/s]



Unnamed: 0,text,labels,text_clean,text_tokenized
0,A witness testimony IS evidence held up in cou...,4,a witness testimony is evidence held up in cou...,"[a, witness, testimony, is, evidence, held, up..."
1,that’s literally glorifying someones trauma. i...,4,thats literally glorifying someones trauma if ...,"[that, s, literally, glorifying, someones, tra..."
2,not only was I raped by Jorge but also by his ...,4,not only was i raped by jorge but also by his ...,"[not, only, was, i, raped, by, jorge, but, als..."
3,You know what's unbelievable to me is these wo...,4,you know whats unbelievable to me is these wom...,"[you, know, what, s, unbelievable, to, me, is,..."
4,"owner of this house, he does not lock door...",4,owner of this house he does not lock doors and...,"[owner, of, this, house, he, does, not, lock, ..."


### Remove stopword

In [14]:
stopwords = nlp.Defaults.stop_words

In [47]:
df["text_tokenized"] = df["text_tokenized"].progress_apply(lambda x: [word for word in x if (word not in stopwords) and len(word) > 2])
df.head()

  0%|          | 0/31720 [00:00<?, ?it/s]

Unnamed: 0,text,labels,text_clean,text_tokenized
0,A witness testimony IS evidence held up in cou...,4,a witness testimony is evidence held up in cou...,"[witness, testimony, evidence, held, court, wi..."
1,that’s literally glorifying someones trauma. i...,4,thats literally glorifying someones trauma if ...,"[literally, glorifying, someones, trauma, says..."
2,not only was I raped by Jorge but also by his ...,4,not only was i raped by jorge but also by his ...,"[raped, jorge, girlfriends, son, groomed, forc..."
3,You know what's unbelievable to me is these wo...,4,you know whats unbelievable to me is these wom...,"[know, unbelievable, women, support, child, pr..."
4,"owner of this house, he does not lock door...",4,owner of this house he does not lock doors and...,"[owner, house, lock, doors, windows, white, pe..."


- Suppression des doc vide si ils existent

In [22]:
df.loc[df["text_tokenized"].progress_apply(lambda x: len(x)) == 0]

  0%|          | 0/31720 [00:00<?, ?it/s]

Unnamed: 0,text,labels,text_clean,text_tokenized


Il n'y a pas de document vide

### Corpus final

In [23]:
corpus = df.text_tokenized
labels = df.labels
corpus.head()

0    [witness, testimony, evidence, held, court, wi...
1    [literally, glorifying, someones, trauma, says...
2    [raped, jorge, girlfriends, son, groomed, forc...
3    [know, unbelievable, women, support, child, pr...
4    [owner, house, lock, doors, windows, white, pe...
Name: text_tokenized, dtype: object

In [24]:
labels.shape

(31720,)

## Vectorisation 

### Load pre-trained word vector: GloVe

In [15]:
!pip install gensim
import gensim.downloader as api

[0m

In [18]:
glove = api.load("glove-wiki-gigaword-50")

### document to vector

In [21]:
import numpy as np

In [33]:
def doc2vec(doc):
    """Convert doc to vector. The vector of the document is
    the mean of word vectors of the document. 
    Args:
    - doc (array): list or np.array of word

    Returns:
    """
    vectors = [glove.get_vector(word) for word in doc if word in glove.key_to_index]
    if len(vectors) > 0 : 
        return np.mean(vectors, axis=0)
    else :
        return np.zeros(50)

In [None]:
features = np.array([doc2vec(x) for x in corpus])

In [None]:
features=pd.DataFrame(features)
features.to_csv(r'./features.csv',index=False)
labels=pd.DataFrame(labels)
labels.to_csv(r'./labels.csv',index=False)

In [22]:
features=np.array(pd.read_csv('./features.csv'))
labels = df.labels


Chaque ligne de `feature` est le vecteur représentant le document correspodant dans notre corpus.


## Modélisation

### Entrainement et comparaison de modèles

Nous comparons la capacité predictive de plusieurs modèles de classification :
**Random Forest**, **Multinomial Logistic Regression**,**AdaBoostClassifier**, **XGBClassifier**,
**LGBMClassifier**, **PassiveAggressiveClassifier, RidgeClassifier, SGDClassifier, 
KNeighborsClassifier, MLPClassifier, SVClassifier**



In [24]:
!pip install lightgbm
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.multiclass import OneVsRestClassifier,OneVsOneClassifier
from sklearn.ensemble import ExtraTreesClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
import lightgbm as lgb
import xgboost as xgb
import numpy as np
import pandas
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
std_slc=StandardScaler()
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier,RidgeClassifier,SGDClassifier
import sklearn
from sklearn.model_selection import StratifiedKFold
X, y = features,labels

[0m

In [35]:
models = [
    SVC(kernel='linear'), LogisticRegression(random_state=0),RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    PassiveAggressiveClassifier(), RidgeClassifier(), SGDClassifier(), LGBMClassifier(n_estimators=500,max_depth=None),
    XGBClassifier(n_estimators=500), AdaBoostClassifier(n_estimators=100, learning_rate=0.5, random_state=1),KNeighborsClassifier(n_neighbors=2),
]

In [36]:
CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []
for model in models:
  model_name = model.__class__.__name__
  accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
  for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))
    
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [37]:
cv_df

Unnamed: 0,model_name,fold_idx,accuracy
0,SVC,0,0.968001
1,SVC,1,0.968001
2,SVC,2,0.967055
3,SVC,3,0.967528
4,SVC,4,0.967528
5,LogisticRegression,0,0.966267
6,LogisticRegression,1,0.967686
7,LogisticRegression,2,0.965637
8,LogisticRegression,3,0.967686
9,LogisticRegression,4,0.967844


In [38]:
cv_df.groupby("model_name").mean()

Unnamed: 0_level_0,fold_idx,accuracy
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1
AdaBoostClassifier,2.0,0.918253
KNeighborsClassifier,2.0,0.929319
LGBMClassifier,2.0,0.966204
LogisticRegression,2.0,0.967024
PassiveAggressiveClassifier,2.0,0.959458
RandomForestClassifier,2.0,0.884363
RidgeClassifier,2.0,0.934615
SGDClassifier,2.0,0.961003
SVC,2.0,0.967623
XGBClassifier,2.0,0.966299


**Observation et ajustement**

En general on obtient d'assez bons scores pour les modèles. Toutefois il faut noter le faible score du Random forest, l'un des meilleurs modèle de classification. Cela est du au fait que ce dernier est plus un classificateur binaire et que nous sommes en face d'une classification multi-class.
On règle ce problème en utilisant la methode heuristique OvR (One-vs-Rest).

In [12]:
models=[OneVsRestClassifier(RandomForestClassifier())]

Testons à nouveau son score de validation croisé

In [13]:
CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []
for model in models:
  model_name = model.__class__.__name__
  accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
  for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))
    
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

In [14]:
cv_df

Unnamed: 0,model_name,fold_idx,accuracy
0,OneVsRestClassifier,0,0.946091
1,OneVsRestClassifier,1,0.943884
2,OneVsRestClassifier,2,0.947037
3,OneVsRestClassifier,3,0.944672
4,OneVsRestClassifier,4,0.945618


In [15]:
cv_df.groupby("model_name").mean()

Unnamed: 0_level_0,fold_idx,accuracy
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1
OneVsRestClassifier,2.0,0.94546


### Selection des meilleurs modèles et optimisation de leurs paramètres

les scores moyens en validation croisée  des modèles suivant sont les meilleures:  **régression logistique multinomiale, Support vector machine(SVC), Light gbm(LGBMClassifier), xgboost(XGBMClassifier) et neural_network(MLPClassifier)** 
Grâce à GridSearchCV nous allons effectuer une optimisation rapide de certains paramètres de ces modèles.

In [8]:
#SVC

# define model
svc=SVC()
# define search space
param_grid = {'C': [0.1,1, 10, 100],
              'gamma': [1,0.1,0.01,0.001],
              'kernel': ['linear','rbf', 'poly', 'sigmoid']}
# define search
grid = GridSearchCV(svc,param_grid,scoring='accuracy',refit=True,verbose=2)
grid.fit(X,y)
#see best estimator
grid.best_params_ 

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ......................C=100, gamma=1, kernel=linear; total time=  28.4s
[CV] END ......................C=100, gamma=1, kernel=linear; total time=  29.2s
[CV] END ......................C=100, gamma=1, kernel=linear; total time=  27.6s
[CV] END ......................C=100, gamma=1, kernel=linear; total time=  27.8s
[CV] END ......................C=100, gamma=1, kernel=linear; total time=  26.8s


{'C': 100, 'gamma': 1, 'kernel': 'linear'}

In [None]:
#Logistic Regression

# define model
model = LogisticRegression()
# define evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define search space
space = dict()
space['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
space['penalty'] = ['none', 'l1', 'l2', 'elasticnet']
space['C'] = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
# define search
search = GridSearchCV(model, space, scoring='accuracy', n_jobs=-1, cv=cv)
search.fit(X,y)
#see best estimator
search.best_estimator_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [26]:
search.best_params_

{'C': 1e-05, 'penalty': 'none', 'solver': 'lbfgs'}

In [62]:
#MLPClassifier

# define model
mlp = MLPClassifier()
# define search space
parameter_space={
    'max_iter':[200,300,100],
    'hidden_layer_sizes':[(200,),(50,),(100,)],
    'activation':['tanh', 'relu'],
    'solver':['sgd', 'adam'],
    'alpha':[0.0001],
    'learning_rate':['constant','adaptive']
}

# define search
mlp_gs= GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=5)
mlp_gs.fit(X, y) 
#see best estimator
mlp_gs.best_estimator_

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

In [67]:
mlp_gs.best_params_

{'activation': 'tanh',
 'alpha': 0.0001,
 'hidden_layer_sizes': (50,),
 'learning_rate': 'adaptive',
 'max_iter': 200,
 'solver': 'adam'}

In [None]:
#XGBClassifier

# define model
xgb = XGBClassifier()
# define search space
param_grid = {'n_estimators': [200,500],
             'learning_rate': [0.001,0.1,0.05]
             }
#define search
xgb_rs = GridSearchCV(xgb, param_grid, cv=5, scoring='neg_mean_absolute_percentage_error')
# Search better parameters for xgb_model 
xgb_rs.fit(X,y)
#see best parameters
best_xgb = xgb_rs.best_params_

In [None]:
#LGBMClassifier

# define model
lgb = LGBMClassifier()
param_grid={
    'n_estimators': [400, 500, 700],
    'max_depth': [15,20,25],
    'num_leaves': [50, 100, 200],
}
#define search
lgb_gs= GridSearchCV(lgb, param_grid, cv=4, scoring='accuracy')
# Search better parameters for lgb_model 
lgb_gs.fit(X,y)
#see best parameters
best_gb = lgb_gs.best_params_

**MODELE FINALE**

Notre modèle finale sera construit à partir des meilleurs modèles dont on a optimisé les parmetres ci-haut. Il s'agira d'un **Voting**

In [7]:
estimators = [("lgbm", LGBMClassifier(n_estimators=500,max_depth=None)), ("svm", SVC(C=100, gamma=1, kernel='linear',probability=True)),
              ("xgb", XGBClassifier(n_estimators=500)), ("lr",LogisticRegression(C=1e-5,penalty='none',solver='newton-cg')),
              ("mlp",MLPClassifier(activation='tanh',alpha=0.0001,hidden_layer_sizes=(50,),learning_rate='adaptive',max_iter=200,solver='adam'))]

voting = VotingClassifier(estimators=estimators, voting="soft",
                          weights = [1, 2,1,2,2.9])
forecast_voting = np.zeros(labels.shape)
compt = 1
n_splits = 5
folds = StratifiedKFold(n_splits=n_splits, shuffle=True,random_state=42)
folds = list(folds.split(features, labels))
for train, test in folds:
    
    target_train, target_test = labels[train], labels[test]
    features_train, features_test = features[train], features[test]
    
    voting.fit(features_train, target_train)
    
    forecast_voting[test] = voting.predict(features_test)
    compt += 1




In [8]:
from sklearn.metrics import accuracy_score
accuracy=accuracy_score(labels,forecast_voting)
print(accuracy)

0.9729192938209331


In [26]:
voting.fit(features, labels)



In [27]:
voting.score(features, labels)

0.9863493064312736

## Prédiction

In [28]:
test = pd.read_csv("./test.csv")

In [29]:
test.head()

Unnamed: 0,id,text
0,542,"Nah let me be honest with you, you do not take..."
1,929,Um.....did you think about this before you typ...
2,645,"humiliated me, and I know in his mind it felt ..."
3,619,This tweet is 5 hours old but he has some new ...
4,303,He may not have raped me but he facilitated it...


In [30]:
test["text"] = test["text"].apply(clean_text)
test["text"] = test["text"].apply(remove_undesirable)
test["text"] = test["text"].progress_apply(lambda x: [i.text for i in nlp(x) if not (i.is_punct or i.is_space)])
test["text"] = test["text"].progress_apply(lambda x: [word for word in x if (word not in stopwords) and len(word) > 2])
test.head()

  0%|          | 0/7930 [00:00<?, ?it/s]



  0%|          | 0/7930 [00:00<?, ?it/s]

Unnamed: 0,id,text
0,542,"[nah, let, honest, plea, deal, crime, commit, ..."
1,929,"[umdid, think, typed, like, accepting, husband..."
2,645,"[humiliated, know, mind, felt, justified, sure..."
3,619,"[tweet, hours, old, new, ones, notice, upset, ..."
4,303,"[raped, facilitated]"


In [31]:
test_corpus = test.text
test_corpus.head()

0    [nah, let, honest, plea, deal, crime, commit, ...
1    [umdid, think, typed, like, accepting, husband...
2    [humiliated, know, mind, felt, justified, sure...
3    [tweet, hours, old, new, ones, notice, upset, ...
4                                 [raped, facilitated]
Name: text, dtype: object

In [34]:
test_features = np.array([doc2vec(x) for x in test_corpus])
test_features.shape

(7930, 50)

In [35]:
pred = voting.predict(test_features)

In [36]:
pred

array([4, 1, 4, ..., 4, 4, 4])

In [38]:
test["prediction"] = pred
test.head()

Unnamed: 0,id,text,prediction
0,542,"[nah, let, honest, plea, deal, crime, commit, ...",4
1,929,"[umdid, think, typed, like, accepting, husband...",1
2,645,"[humiliated, know, mind, felt, justified, sure...",4
3,619,"[tweet, hours, old, new, ones, notice, upset, ...",4
4,303,"[raped, facilitated]",4


In [39]:
test[['id', 'prediction']].to_csv("submission.csv", index=False)