# Scikitlearn

### Lecture et preprocessing

In [14]:
categories = ['sadness', 'anger', 'love', 'surprise', 'fear', 'happy']

In [15]:
import pandas as pd
data = pd.read_csv("data/Emotion_final.csv")

In [16]:
from prepocessing import preprocessing
data = preprocessing(data)

### Mise en place GridSearch

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import  GridSearchCV
vect_nltk = CountVectorizer() 
text_clf = Pipeline([('vect', vect_nltk),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])
tuned_parameters = {
    'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': [1, 1e-1, 1e-2]
}

In [18]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.2,  random_state=42, shuffle=True, stratify= data["Emotion"])

In [19]:
clf = text_clf
clf.fit(train["clean_text"], train["Emotion"])

In [20]:
from sklearn.metrics import accuracy_score, classification_report

predicted_nltk = clf.predict(test["clean_text"])
accuracy_score(test.Emotion, predicted_nltk)

0.6703168685927307

In [21]:
print(classification_report(test["Emotion"], predicted_nltk, target_names=categories))

              precision    recall  f1-score   support

     sadness       0.94      0.32      0.47       599
       anger       0.90      0.25      0.40       530
        love       0.62      0.98      0.76      1406
    surprise       1.00      0.07      0.13       328
        fear       0.67      0.92      0.78      1253
       happy       1.00      0.02      0.03       176

    accuracy                           0.67      4292
   macro avg       0.86      0.43      0.43      4292
weighted avg       0.76      0.67      0.60      4292



In [22]:
clf = GridSearchCV(text_clf, tuned_parameters)
clf.fit(train["clean_text"], train["Emotion"])

On récupére les meilleurs paramètres

In [23]:
clf.best_params_

{'clf__alpha': 0.01,
 'tfidf__norm': 'l2',
 'tfidf__use_idf': False,
 'vect__ngram_range': (1, 2)}

### Modéle final

In [24]:
vect_nltk = CountVectorizer(ngram_range=(1,2)) 
text_clf = Pipeline([('vect', vect_nltk),
                     ('tfidf', TfidfTransformer(use_idf=False, norm='l2')),
                     ('clf', MultinomialNB(alpha=0.014))])


In [25]:
clf = text_clf
clf.fit(train["clean_text"], train["Emotion"])
predicted_nltk = clf.predict(test["clean_text"])
accuracy_score(test.Emotion, predicted_nltk)

0.78215284249767

In [26]:
print(classification_report(test["Emotion"], predicted_nltk, target_names=categories))

              precision    recall  f1-score   support

     sadness       0.85      0.68      0.76       599
       anger       0.83      0.65      0.73       530
        love       0.76      0.92      0.83      1406
    surprise       0.84      0.38      0.52       328
        fear       0.76      0.90      0.82      1253
       happy       0.87      0.34      0.49       176

    accuracy                           0.78      4292
   macro avg       0.82      0.65      0.69      4292
weighted avg       0.79      0.78      0.77      4292

