In [34]:
import pandas as pd
from pandas.core.frame import DataFrame
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.linear_model import Perceptron


import warnings 
warnings.filterwarnings("ignore")

### Lecture du jeu de données Brown:

In [35]:
 # the return value will be an iterable object of type TextFileReader:
chunks = pd.read_csv("archive/brown.csv",delimiter=",",chunksize=10000) ## 16*chunk
df = pd.concat([chunk for chunk in chunks]) 
liste_brown = df["tokenized_text"].values.tolist() 

In [58]:
df.head()

Unnamed: 0,filename,para_id,sent_id,raw_text,tokenized_text,tokenized_pos,label
0,cd05,0,0,"Furthermore/rb ,/, as/cs an/at encouragement/n...","Furthermore , as an encouragement to revisioni...","rb , cs at nn in nn nn , pps rb bez jj to vb c...",religion
1,cd05,0,1,The/at Unitarian/jj clergy/nns were/bed an/at ...,The Unitarian clergy were an exclusive club of...,at jj nns bed at jj nn in vbn nns -- cs at nn ...,religion
2,cd05,0,2,"Ezra/np Stiles/np Gannett/np ,/, an/at honorab...","Ezra Stiles Gannett , an honorable representat...","np np np , at jj nn in at nn , vbd ppl rb in a...",religion
3,cd05,0,3,"Even/rb so/rb ,/, Gannett/np judiciously/rb ar...","Even so , Gannett judiciously argued , the Ass...","rb rb , np rb vbd , at nn-tl md rb vb cs np ``...",religion
4,cd05,0,4,We/ppss today/nr are/ber not/* entitled/vbn to...,We today are not entitled to excoriate honest ...,ppss nr ber * vbn to vb jj nns wps vbd np to b...,religion


In [57]:
# compter NB des tweets
text = df["tokenized_text"]
print("Il y a %s instances dans le jeu de données."%(len(text)))
# compter les mots
tokenizer = nltk.RegexpTokenizer("([A-Z]\.[A-Z]?\.?[0-9]?|[0-9]+[,.][0-9]+|[cdjls]'|qu'|[\w'-]+|\S)")
l = [len(tokenizer.tokenize(i)) for i in text]
nb_mots = sum(l)
print("Il y a %s mots dans le jeu de données."%(nb_mots))
# compter les phrases et les caractères
nb_ph= 0
for i in text:
    longueur_text = len(sent_tokenize(i))
    nb_ph += longueur_text
print("Il y a %s phrases dans l'ensemble du jeu de données."%(nb_ph))

nb_carac = 0
for ii in text:
    longueur_carac = len(i)
    nb_carac += longueur_carac
print("Il y a %s caractères dans le jeu de données."%(nb_carac)) 

# compter les instances de 15 genres
instances_genres = df["label"].value_counts()
print("Il y a %s classes dans l'ensemble du jeu de données."%(len(instances_genres)))
print((instances_genres))

Il y a 57340 instances dans le jeu de données.
Il y a 1175396 mots dans le jeu de données.
Il y a 61435 phrases dans l'ensemble du jeu de données.
Il y a 9518440 caractères dans le jeu de données.
Il y a 15 classes dans l'ensemble du jeu de données.
learned            7734
belles_lettres     7209
lore               4881
adventure          4637
news               4623
romance            4431
fiction            4249
hobbies            4193
mystery            3886
government         3032
editorial          2997
reviews            1751
religion           1716
humor              1053
science_fiction     948
Name: label, dtype: int64
{'belles_lettres', 'religion', 'mystery', 'reviews', 'government', 'adventure', 'humor', 'editorial', 'learned', 'lore', 'fiction', 'hobbies', 'romance', 'news', 'science_fiction'}


### Vectorisation du jeu de données

In [37]:

vec = TfidfVectorizer()
X = vec.fit_transform(liste_brown)
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) #0.7s

In [44]:
## chargement des expériences déjà faites
import json
import os
chemin_expes = "dic_expes_brown.json"
if os.path.exists(chemin_expes):
    f = open(chemin_expes)
    dic_expes = json.load(f)
    f.close()
else:
    dic_expes = {}
print("Expériences stockées : %s"%len(dic_expes))

Expériences stockées : 129


In [49]:
liste = sorted([ [score,str]for str,score in dic_expes.items()],reverse = True)
liste[:10]

[[0.5277293337983955, "['linear_svc', 1, 2, False, True, 30000]"],
 [0.5277293337983955, "['linear_svc', 1, 1, False, True, 30000]"],
 [0.5256365538890827, "['linear_svc', 1, 2, False, False, 30000]"],
 [0.5256365538890827, "['linear_svc', 1, 1, False, False, 30000]"],
 [0.5245320311591676, "['linear_svc', 1, 2, False, True, 20000]"],
 [0.5245320311591676, "['linear_svc', 1, 1, False, True, 20000]"],
 [0.5232531101034763, "['linear_svc', 1, 2, True, False, 40000]"],
 [0.5215091268457156, "['linear_svc', 1, 2, False, False, 20000]"],
 [0.5215091268457156, "['linear_svc', 1, 1, False, False, 20000]"],
 [0.5202302057900244, "['linear_svc', 1, 2, True, True, 40000]"]]

### Les paramètres max_depth, min_samples_split, max_features

In [7]:
#max_depthint, default=None
for i in range(1,20):
    DTC = DecisionTreeClassifier(max_depth=i)   
    DTC = DTC.fit(X_train,y_train)
    #y_pred = DTC.predict(X_test)
    print("max_depth = ",i )
    #report = classification_report(y_test,y_pred)
    #print(report)
    print(DTC.score(X_test,y_test))
    print("-"*20)    #max_depth =  15 0.1994535519125683

max_depth =  1
0.14126264387861875
--------------------
max_depth =  2
0.15341239390768516
--------------------
max_depth =  3
0.15666782932217185
--------------------
max_depth =  4
0.15934193698407162
--------------------
max_depth =  5
0.16718986164399488
--------------------
max_depth =  6
0.16992210208115335
--------------------
max_depth =  7
0.18061853272875247
--------------------
max_depth =  8
0.18021160330194164
--------------------
max_depth =  9
0.1879432624113475
--------------------
max_depth =  10
0.18980351121962563
--------------------
max_depth =  11
0.19143122892686898
--------------------
max_depth =  12
0.19480292989187303
--------------------
max_depth =  13
0.1949773282176491
--------------------
max_depth =  14
0.19567492152075341
--------------------
max_depth =  15
0.1994535519125683
--------------------
max_depth =  16
0.19863969305894663
--------------------
max_depth =  17
0.1975351703290315
--------------------
max_depth =  18
0.19643064759911638
--------

In [9]:
#min_samples_split int or float, default=2   没有影响
for i in range(2,10):
    DTC = DecisionTreeClassifier(max_depth=15,min_samples_split=i) 
    DTC = DTC.fit(X_train,y_train)
    print("min_samples_split = ",i )
    print(DTC.score(X_test,y_test)) # min_samples_split =  3 ou 7 ----   0.2000348796651552
    print("-"*20) 

min_samples_split =  2
0.19991861411463782
--------------------
min_samples_split =  3
0.2000348796651552
--------------------
min_samples_split =  4
0.19997674688989653
--------------------
min_samples_split =  5
0.19986048133937914
--------------------
min_samples_split =  6
0.19991861411463782
--------------------
min_samples_split =  7
0.2000348796651552
--------------------
min_samples_split =  8
0.19951168468782698
--------------------
min_samples_split =  9
0.19927915358679224
--------------------


In [32]:
liste_classifieurs= [
    ["Perceptron", Perceptron(eta0=0.1, random_state=0)],
    ["Logistic Regression", LogisticRegression()],
    ["Decision Tree", DecisionTreeClassifier(max_depth=15,min_samples_split=3)],
    ["linear_svc", LinearSVC()],
    ["naive_bayes",MultinomialNB()]]

### TfidfVectoriser : Les paramètres lowercase, stop_words, max_features, analyzer = "char" ou "word"

In [40]:
from sklearn.feature_extraction import _stop_words
min_N, max_N = 1, 2

for enlever_stopwords in [False,True]:
  liste_stopwords = None
  if enlever_stopwords==True:
    liste_stopwords = _stop_words.ENGLISH_STOP_WORDS
    
  for en_minuscules in [False, True]:
    print(f"Stopwords : {enlever_stopwords}, Minuscules : {en_minuscules}")
    for max_F in [40000]:
        V = TfidfVectorizer(lowercase = en_minuscules, stop_words =  liste_stopwords, max_features = max_F,ngram_range=(min_N, max_N),)
        X = V.fit_transform(liste_brown)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
        for nom, algo in liste_classifieurs:
            clf = algo.fit(X_train, y_train)
            expe = str([nom, min_N, max_N, enlever_stopwords, en_minuscules, max_F])
            if expe in dic_expes:
                score = dic_expes[expe]
                #expe.append()
                print("  Déjà vu :",  expe, "%.4f"%score)    
                
            else:

                print("  Nouvelle expérience")
                score = clf.score(X_test, y_test)
                print('  %s classifier : %.4f'%(nom, score))
                
                dic_expes[expe] = score


Stopwords : False, Minuscules : False
  Nouvelle expérience
  Perceptron classifier : 0.4402
  Nouvelle expérience
  Logistic Regression classifier : 0.4611
  Nouvelle expérience
  Decision Tree classifier : 0.1907
  Nouvelle expérience
  linear_svc classifier : 0.5023
  Nouvelle expérience
  naive_bayes classifier : 0.3835
Stopwords : False, Minuscules : True
  Nouvelle expérience
  Perceptron classifier : 0.4474
  Nouvelle expérience
  Logistic Regression classifier : 0.4645
  Nouvelle expérience
  Decision Tree classifier : 0.1953
  Nouvelle expérience
  linear_svc classifier : 0.5027
  Nouvelle expérience
  naive_bayes classifier : 0.3848
Stopwords : True, Minuscules : False
  Nouvelle expérience
  Perceptron classifier : 0.4603
  Nouvelle expérience
  Logistic Regression classifier : 0.4870
  Nouvelle expérience
  Decision Tree classifier : 0.1683
  Nouvelle expérience
  linear_svc classifier : 0.5233
  Nouvelle expérience
  naive_bayes classifier : 0.4262
Stopwords : True, Minusc

In [41]:
from sklearn.feature_extraction import _stop_words

min_N, max_N = 1, 3

for enlever_stopwords in [False, True]:
  liste_stopwords = None
  if enlever_stopwords==True:
    liste_stopwords = _stop_words.ENGLISH_STOP_WORDS
    
  for en_minuscules in [False, True]:
    print(f"Stopwords {enlever_stopwords}, Minuscules : {en_minuscules}")
    for max_F in [30000]:
        V = TfidfVectorizer(lowercase = en_minuscules, stop_words =  liste_stopwords, max_features = max_F,ngram_range=(min_N, max_N), )
        X = V.fit_transform(liste_brown)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
        for nom, algo in liste_classifieurs:
            clf = algo.fit(X_train, y_train)
            expe = str([nom, min_N, max_N, enlever_stopwords, en_minuscules, max_F])
            if expe in dic_expes:
                print("  Déjà vu :",  expe, score)
                score = dic_expes[expe]
            else:
                print("  Nouvelle expérience")
                print('  %s classifier : %.4f'%(nom, score))
                score = clf.score(X_test, y_test)
                dic_expes[expe] = score

Stopwords False, Minuscules : False
  Déjà vu : ['Perceptron', 1, 3, False, False, 30000] 0.42849668643181027
  Déjà vu : ['Logistic Regression', 1, 3, False, False, 30000] 0.4188466457388676
  Déjà vu : ['Decision Tree', 1, 3, False, False, 30000] 0.45273805371468434
  Déjà vu : ['linear_svc', 1, 3, False, False, 30000] 0.19608185094756422
  Déjà vu : ['naive_bayes', 1, 3, False, False, 30000] 0.4802929891873038
Stopwords False, Minuscules : True
  Nouvelle expérience
  Perceptron classifier : 0.3870
  Nouvelle expérience
  Logistic Regression classifier : 0.4226
  Nouvelle expérience
  Decision Tree classifier : 0.4574
  Nouvelle expérience
  linear_svc classifier : 0.1948
  Nouvelle expérience
  naive_bayes classifier : 0.4835
Stopwords True, Minuscules : False
  Nouvelle expérience
  Perceptron classifier : 0.3901
  Nouvelle expérience
  Logistic Regression classifier : 0.4532
  Nouvelle expérience
  Decision Tree classifier : 0.4859
  Nouvelle expérience
  linear_svc classifier : 

In [42]:
min_N, max_N = 1,3 
enlever_stopwords,en_minuscules = False,True

print(f"Stopwords : False, Minuscules : True")
max_F = 30000
V = TfidfVectorizer(lowercase = True, stop_words = None, max_features = max_F, ngram_range=(min_N, max_N), analyzer = "word")
X = V.fit_transform(liste_brown)
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
for nom, algo in liste_classifieurs:
    clf = algo.fit(X_train, y_train)
    expe = str([nom, min_N, max_N, enlever_stopwords, en_minuscules, max_F])
    
    if expe in dic_expes:
        score = dic_expes[expe]
            #expe.append()
        print("  Déjà vu :",  expe, "%.4f"%score)    
    else:
        print("  Nouvelle expérience")
        score = clf.score(X_test, y_test)
        print('  %s classifier : %.4f'%(nom, score))
        dic_expes[expe] = score



Stopwords : False, Minuscules : True
  Déjà vu : ['Perceptron', 1, 3, False, True, 30000] 0.4226
  Déjà vu : ['Logistic Regression', 1, 3, False, True, 30000] 0.4574
  Déjà vu : ['Decision Tree', 1, 3, False, True, 30000] 0.1948
  Déjà vu : ['linear_svc', 1, 3, False, True, 30000] 0.4835
  Déjà vu : ['naive_bayes', 1, 3, False, True, 30000] 0.3901


In [43]:
w = open(chemin_expes, "w")
w.write(json.dumps(dic_expes))
w.close()
print("-"*20)
print(f"Expériences stockées dans {chemin_expes}")
print("-"*20)

--------------------
Expériences stockées dans dic_expes_brown.json
--------------------
