In [1]:
import pandas as pd
from pandas.core.frame import DataFrame
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

import warnings 
warnings.filterwarnings("ignore")

### Lecture du jeu de données Tweets:

In [2]:
 # the return value will be an iterable object of type TextFileReader:
chunks = pd.read_csv("french_tweets.csv",delimiter=",",chunksize=100000) ## 16*chunk
df = pd.concat([chunk for chunk in chunks]) 
liste_tweets = df["text"].values.tolist()  #1.7s

In [4]:
df.head()

Unnamed: 0,label,text
0,0,"- Awww, c'est un bummer. Tu devrais avoir davi..."
1,0,Est contrarié qu'il ne puisse pas mettre à jou...
2,0,J'ai plongé plusieurs fois pour la balle. A ré...
3,0,Tout mon corps a des démangeaisons et comme si...
4,0,"Non, il ne se comporte pas du tout. je suis en..."


In [3]:
# compter NB des tweets
text = df["text"]
print("Il y a %s tweets dans le jeu de données."%(len(text)))
# compter les mots
tokenizer = nltk.RegexpTokenizer("([A-Z]\.[A-Z]?\.?[0-9]?|[0-9]+[,.][0-9]+|[cdjls]'|qu'|[\w'-]+|\S)")
l = [len(tokenizer.tokenize(i)) for i in text]
nb_mots = sum(l)
print("Il y a %s mots dans le jeu de données."%(nb_mots))
# compter les instances négatives et positives
post_negatif_tweets = df["label"].value_counts()
print("Il y a %s tweets négatifs dans le jeu de données."%(post_negatif_tweets[0]))
print("Il y a %s tweets positifs dans le jeu de données."%(post_negatif_tweets[1]))

nb_ph= 0
for i in text:
    longueur_text = len(sent_tokenize(i))
    nb_ph += longueur_text
print("Il y a %s phrases dans l'ensemble du jeu de données."%(nb_ph))

nb_carac = 0
for ii in text:
    longueur_carac = len(i)
    nb_carac += longueur_carac
print("Il y a %s caractères dans le jeu de données."%(nb_carac)) #42.1s

Il y a 1526724 tweets dans le jeu de données.
Il y a 25979719 mots dans le jeu de données.
Il y a 771604 tweets négatifs dans le jeu de données.
Il y a 755120 tweets positifs dans le jeu de données.
Il y a 2572315 phrases dans l'ensemble du jeu de données.
Il y a 119084472 caractères dans le jeu de données.


### Vectorisation du jeu de données

In [56]:
vec = TfidfVectorizer()
X = vec.fit_transform(liste_tweets)
y = df['label']
# séparer les datasets 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) #14.8s

In [57]:
## chargement des expériences déjà faites
import json
import os
chemin_expes = "dic_expes_tweets.json"
if os.path.exists(chemin_expes):
    f = open(chemin_expes)
    dic_expes = json.load(f)
    f.close()
else:
    dic_expes = {}
print("Expériences stockées : %s"%len(dic_expes))

Expériences stockées : 116


In [58]:
liste = sorted([ [score,str]for str,score in dic_expes.items()],reverse = True)
liste[:5]

[[0.8038133872467894, "['Logistic Regression', 1, 3, False, False, 40000]"],
 [0.801715216432542, "['linear_svc', 1, 3, False, False, 40000]"],
 [0.7905038666602623, "['Logistic Regression', 1, 3, False, False, 50000]"],
 [0.7902091184189267, "['Logistic Regression', 1, 4, False, False, 40000]"],
 [0.790209118418926, "['Logistic Regression', 1, 1, False, False, 40000]"]]

### Les paramètres max_depth, min_samples_split, max_features

In [37]:
# max_depthint, default = None # plus de depth, plus de précision. En outre, plus de temps d'éxecution
for i in range(1,20):
    DTC = DecisionTreeClassifier(max_depth=i) 
    DTC = DTC.fit(X_train,y_train)
    y_pred = DTC.predict(X_test)
    nom_classes = ["ham", "spam"]
    print("max_depth = ",i," ", accuracy_score(y_test, y_pred))
    #report = classification_report(y_test,y_pred,target_names=nom_classes) 
    #print(report)

max_depth =  1   0.5765144601303879
max_depth =  2   0.5901973284892733
max_depth =  3   0.5902955779030519
max_depth =  4   0.6016248269718657
max_depth =  5   0.6022448899388234
max_depth =  6   0.6098122781200739
max_depth =  7   0.6165238047412984
max_depth =  8   0.62195372234279
max_depth =  9   0.6229951661288421
max_depth =  10   0.6277635376775585
max_depth =  11   0.6344444978144964
max_depth =  12   0.6424856665021899
max_depth =  13   0.647943967267662
max_depth =  14   0.6521621420992189
max_depth =  15   0.6565506159146584
max_depth =  16   0.6602360605915052
max_depth =  17   0.662766528826378
max_depth =  18   0.6629106279665865
max_depth =  19   0.6650109821011402


In [5]:
#min_samples_split int or float, default=2. Nb de min_samples_split va nuancer la présicion de classification. 
for i in range(2,6):
    DTC = DecisionTreeClassifier(max_depth=20,min_samples_split=i) 
    DTC = DTC.fit(X_train,y_train)
    y_pred = DTC.predict(X_test)
    nom_classes = ["positif", "negatif"]
    print("min_samples_split = ",i,accuracy_score(y_test, y_pred))
    #report = classification_report(y_test,y_pred,target_names=nom_classes)
    #print(report)
    print("-"*20) 

min_samples_split =  2 0.672351304970547
--------------------
min_samples_split =  3 0.6723185551659542
--------------------
min_samples_split =  4 0.6723425716893222
--------------------
min_samples_split =  5 0.6722989052831985
--------------------


In [60]:
liste_classifieurs= [["naive_bayes",MultinomialNB()],
    ["Perceptron", Perceptron(eta0=0.1, random_state=0)],
    ["Logistic Regression", LogisticRegression()],
    ["Decision Tree", DecisionTreeClassifier(max_depth=5)],
    ["linear_svc", LinearSVC()]]
    #["multi_couches",MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(8, 2), random_state=1)]]#solver='sgd', max_iter=100000, random_state=0)]]
    

### TfidfVectoriser : Les paramètres lowercase, stop_words, max_features, analyzer = "char" ou "word"

In [40]:
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop

min_N, max_N = 1, 4  #1,1  #1,3

for enlever_stopwords in [False,True]:
  liste_stopwords = None
  if enlever_stopwords==True:
    liste_stopwords = fr_stop
    
  for en_minuscules in [False, True]:
    print(f"Stopwords : {enlever_stopwords}, Minuscules : {en_minuscules}")
    for max_F in [20000]:
        V = TfidfVectorizer(lowercase = en_minuscules, stop_words =  liste_stopwords, max_features = max_F )
        X = V.fit_transform(liste_tweets)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
        for nom, algo in liste_classifieurs:
            clf = algo.fit(X_train, y_train)
            expe = str([nom, min_N, max_N, enlever_stopwords, en_minuscules, max_F])
            if expe in dic_expes:
                score = dic_expes[expe]
                print("  Déjà vu :",  expe, "%.4f"%score)       
            else:
                print("  Nouvelle expérience")
                score = clf.score(X_test, y_test)
                print('  %s classifier : %.4f'%(nom, score)) 
                dic_expes[expe] = score


Stopwords : False, Minuscules : False
  Nouvelle expérience
  Perceptron classifier : 0.7077
  Nouvelle expérience
  Logistic Regression classifier : 0.7896
  Nouvelle expérience
  Decision Tree classifier : 0.6023
  Nouvelle expérience
  linear_svc classifier : 0.7876
Stopwords : False, Minuscules : True
  Nouvelle expérience
  Perceptron classifier : 0.7066
  Nouvelle expérience
  Logistic Regression classifier : 0.7891
  Nouvelle expérience
  Decision Tree classifier : 0.6064
  Nouvelle expérience
  linear_svc classifier : 0.7866
Stopwords : True, Minuscules : False
  Nouvelle expérience
  Perceptron classifier : 0.6866
  Nouvelle expérience
  Logistic Regression classifier : 0.7701
  Nouvelle expérience
  Decision Tree classifier : 0.5636
  Nouvelle expérience
  linear_svc classifier : 0.7679
Stopwords : True, Minuscules : True
  Nouvelle expérience
  Perceptron classifier : 0.6798
  Nouvelle expérience
  Logistic Regression classifier : 0.7638
  Nouvelle expérience
  Decision Tree

In [None]:
liste_classifieurs= [
    ["Logistic Regression", LogisticRegression()]]
for min_N in range(1, 3):
  for max_N in range(min_N, 4):
    #analyzer: words, char, char_wb
    V = TfidfVectorizer(ngram_range = (min_N, max_N), analyzer = "word")
    X = V.fit_transform(liste_tweets)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
    print(f"Ngram_range : ({min_N}, {max_N})")
    for nom, algo in liste_classifieurs:
            clf = algo.fit(X_train, y_train)
            score = clf.score(X_test, y_test)
            dic_expes[expe] = score
            print('%s classifier : %.4f'%(nom, score)) #13m11.5s

Ngram_range : (1, 1)
Logistic Regression classifier : 0.7896
Ngram_range : (1, 2)
Logistic Regression classifier : 0.8092
Ngram_range : (1, 3)
Logistic Regression classifier : 0.8107
Ngram_range : (2, 2)
Logistic Regression classifier : 0.7841
Ngram_range : (2, 3)
Logistic Regression classifier : 0.7818


In [None]:
liste_classifieurs= [
    ["Logistic Regression", LogisticRegression()]]
for min_N in range(4, 5):
  for max_N in range(min_N, 7):
    #analyzer: words, char, char_wb
    V = TfidfVectorizer(ngram_range = (min_N, max_N), analyzer = "char")
    X = V.fit_transform(liste_tweets)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
    print(f"Ngram_range : ({min_N}, {max_N})")
    for nom, algo in liste_classifieurs:
            clf = algo.fit(X_train, y_train)
            score = clf.score(X_test, y_test)
            dic_expes[expe] = score
            print('%s classifier : %.4f'%(nom, score)) #13m11.5s


Ngram_range : (4, 4)
Logistic Regression classifier : 0.7979
Ngram_range : (4, 5)
Logistic Regression classifier : 0.8052
Ngram_range : (4, 6)
Logistic Regression classifier : 0.8095


In [None]:
### Stockez les résultats de classification dans des fichiers Json
w = open(chemin_expes, "w")
w.write(json.dumps(dic_expes))
w.close()
print("-"*20)
print(f"Expériences stockées dans {chemin_expes}")
print("-"*20)

--------------------
Expériences stockées dans dic_expes_tweets.json
--------------------


### Mesure de l’efficacité de ces différents classifieurs: exactitude, précision, rappel, F-mesure

In [61]:
liste_Vectorizer = [
    ["param-vide",TfidfVectorizer()],
    ["max_f:20000",TfidfVectorizer(max_features = 30000)],
    ["word-ngram:1-2",TfidfVectorizer(ngram_range = (1,2),analyzer ="word")],
    ["word-ngram:1-3",TfidfVectorizer(ngram_range = (1,3),analyzer ="word")],
    ["char-ngram:1-8",TfidfVectorizer(ngram_range=(1,8), analyzer="char")]  # dico => liste   0.7s=>0.2s
]

In [None]:
# 3 variants --> les param de tfidfvectorisation ; les paroles pretraitées ou pas ; les 3 classifieurs 
# @param : list_string ==> pretraite ou pas | sortie : une liste triée (nb de vrai positif = bonne classification) avec les prama et les classifieurs choisis

def resultat_param_algo(liste_string):
    for nom_vec, V in liste_Vectorizer: 
        X = V.fit_transform(liste_string)
        y = df['label']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
        for nom, classifeur in liste_classifieurs: # on choisit les algorithmes
            classifeur.fit(X_train, y_train)
            y_pred = classifeur.predict(X_test)
            nom_classes = ["positive", "negative"]
            report = classification_report(y_test,y_pred,target_names=nom_classes)
            print(nom_vec,"+", nom)
            print(report)
resultat_param_algo(liste_tweets)   # 3 min 14.4s #20s

param-vide + Perceptron
              precision    recall  f1-score   support

    positive       0.72      0.71      0.71    231214
    negative       0.71      0.72      0.71    226804

    accuracy                           0.71    458018
   macro avg       0.71      0.71      0.71    458018
weighted avg       0.71      0.71      0.71    458018

param-vide + naive_bayes
              precision    recall  f1-score   support

    positive       0.75      0.80      0.77    231214
    negative       0.78      0.73      0.75    226804

    accuracy                           0.76    458018
   macro avg       0.76      0.76      0.76    458018
weighted avg       0.76      0.76      0.76    458018

param-vide + LinearSVC
              precision    recall  f1-score   support

    positive       0.79      0.78      0.78    231214
    negative       0.78      0.79      0.78    226804

    accuracy                           0.78    458018
   macro avg       0.78      0.78      0.78    458018
we

In [31]:
min_N, max_N = 1, 4
enlever_stopwords,en_minuscules = False,False

print(f"Stopwords : False, Minuscules : False")
#max_F = 40000
V = TfidfVectorizer(lowercase = False, stop_words = None, max_features = None, ngram_range=(1,4), analyzer = "word")
X = V.fit_transform(liste_tweets)
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
for nom, algo in liste_classifieurs:
    clf = algo.fit(X_train, y_train)
    expe = str([nom, min_N, max_N, enlever_stopwords, en_minuscules, None])
    
    if expe in dic_expes:
        score = dic_expes[expe]
            #expe.append()
        print("  Déjà vu :",  expe, "%.4f"%score)    
    else:
        print("  Nouvelle expérience")
        score = clf.score(X_test, y_test)
        print('  %s classifier : %.4f'%(nom, score))
        dic_expes[expe] = score



Stopwords : False, Minuscules : False
  Nouvelle expérience
  naive_bayes classifier : 0.7887


['Logistic Regression', 2, 3, False, False, None]: 0.781,
['Logistic Regression', 2, 2, False, False, None]: 0.784,
['Logistic Regression', 1, 2, False, False, 0]: 0.809,
['Logistic Regression', 1, 3, False, False, 0]: 0.81,
['Logistic Regression', 1, 1, False, False, 0]: 0.789,