In [1]:
import pandas as pd
import numpy as np
import string
import itertools
import collections
from gensim.models import Word2Vec
from nltk.corpus import stopwords
import statsmodels.api as sm
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
with open('d:/ressources/fr_stopwords.txt') as f: # stopwords perso tirés du web
    perso_stopwords = set(f.read().split('\n'))
nltkStopWords = set(stopwords.words('french')) # stopwords en nltk

In [3]:
myStopwords = nltkStopWords | perso_stopwords # fusion des 2

In [4]:
df = pd.read_pickle(r'd:/git/coursAlice/final1000.pickle')

## I. Ajout des annotations

### 1. taille du texte

In [5]:
def document_vector(text,model):
    """Create document vectors by averaging word vectors. Remove out-of-vocabulary words."""
    doc = [word for word in text if word in model.wv.vocab]
    vector = np.mean(model[doc], axis=0)
    return vector

In [6]:
model = Word2Vec.load("d:/git/coursAlice/model_8200_500dim_50cutoff.model")

In [7]:
df['tokens'] = df['text'].apply(lambda x:
            [mot for mot in x.replace("\n"," ").split(" ") if mot] # if mot pour enlever les "" mots vides
                       )

In [8]:
df["doc_vector"] = df["tokens"].apply(lambda x: 
          document_vector(x, model))

  after removing the cwd from sys.path.


In [9]:
df['taille'] = df["tokens"].apply(lambda x: len(x))

### 2. Distance cosinus du texte du vecteur barycentre de la classe 

In [10]:
def get_most_frequent(df, label, stopwords, n=10):
    mini_df = df[df['label'] == label] # sépare le label traité
    token_list = list(mini_df['tokens']) # the lists of lists of tokens as listed in the original df
    token_list = [token for liste in token_list for token in liste] # flat list where 1 item of list = 1 token
    dic = {}
    for token in token_list:
        if token not in stopwords: # not taking stopwords
            if token in dic:
                dic[token]+=1 # creating a dictionary with frequencies
            else:
                dic[token]=0
    dic_ord = {k: v for k, v in sorted(dic.items(), key=lambda item: item[1], reverse=True)} # order from most frequent
    dic_ord = collections.OrderedDict(dic_ord) # thanks to collections get an ordered dict
    dic_ord = itertools.islice(dic_ord.items(), 0, n)
    return [key for (key,value)in dic_ord] # take the items from the slice iterator

In [11]:
freq_eco = get_most_frequent(df, 'economie', myStopwords, 10)
freq_soc = get_most_frequent(df, 'societe', myStopwords, 10)
freq_pol = get_most_frequent(df, 'politique', myStopwords, 10)

In [12]:
pol_barycentre = document_vector(freq_pol, model)
soc_barycentre = document_vector(freq_soc, model)
eco_barycentre = document_vector(freq_eco, model)

  after removing the cwd from sys.path.


In [13]:
df["dist_cos_pol"] = df["doc_vector"].apply(lambda x: cosine_similarity(pol_barycentre.reshape(1,-1,), x.reshape(1,-1,))[0][0])

In [14]:
df["dist_cos_eco"] = df["doc_vector"].apply(lambda x: cosine_similarity(eco_barycentre.reshape(1,-1,), x.reshape(1,-1,))[0][0])

In [15]:
df["dist_cos_soc"] = df["doc_vector"].apply(lambda x: cosine_similarity(soc_barycentre.reshape(1,-1,), x.reshape(1,-1,))[0][0])

### 2. Présence ou non des mots les plus fréquents de chaque des classes

In [16]:
freq_words = list(set(freq_pol) | set(freq_eco) | set(freq_soc))

In [17]:
def check_presence(text, word):
    if word in text:
        return 1
    else:
        return 0

In [18]:
for i in range(len(freq_words)):
    df[freq_words[i]] = df['tokens'].apply(lambda x:
                check_presence(x, freq_words[i]))

In [19]:
df['label'].unique()

array(['economie', 'societe', 'politique'], dtype=object)

In [20]:
df.to_csv('features_expert.csv', sep=";")

In [21]:
df.head(3)

Unnamed: 0,label,text,tokens,doc_vector,taille,dist_cos_pol,dist_cos_eco,dist_cos_soc,président,euro,...,politique,national,million,UMP,France,ministre,marché,%,gouvernement,prendre
0,economie,syndicat patronat entendre sauver retraite com...,"[syndicat, patronat, entendre, sauver, retrait...","[0.08976937, -0.23544203, 0.1646332, -0.377167...",350,0.327722,0.536379,0.30275,1,1,...,0,0,0,0,0,0,0,1,1,1
1,economie,premier centrale nucléaire flottant monde rout...,"[premier, centrale, nucléaire, flottant, monde...","[0.098420516, 0.0009738469, 0.023506144, -0.18...",245,-0.012974,0.346578,0.029931,0,0,...,0,0,0,0,1,0,0,0,0,1
2,economie,Google Apple sceller rupture \n\nGoogle Apple ...,"[Google, Apple, sceller, rupture, Google, Appl...","[-0.025282267, -0.16704282, 0.1874238, -0.1612...",236,0.087018,0.3278,0.065241,0,0,...,0,0,0,0,0,0,0,0,1,1


## II. Regression logistique

In [22]:
X = df.iloc[:,4:].to_numpy() # on prend uniquement les features expert

In [23]:
X[0]

array([3.50000000e+02, 3.27722192e-01, 5.36378980e-01, 3.02749902e-01,
       1.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 1.00000000e+00, 1.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 1.00000000e+00, 1.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
       1.00000000e+00, 1.00000000e+00])

In [24]:
y = df['label']
X = sm.add_constant(X) # Ajout d'une dimension 1 qui permet à l'algorithme de savoir où commencent les features

# y = [1 if i == "politique" else 0 for i in y] # je regarde une classe en comparant avec les autres
# y = [1 if i == "societe" else 0 for i in y] 
y = [1 if i == "economie" else 0 for i in y] 


# Configuration du modèle
model = sm.Logit(y, X)
# result = model.fit(method='bfgs')
# result = model.fit(method='basinhopping')
result = model.fit(method='ncg')
result.pred_table()

Optimization terminated successfully.
         Current function value: 0.539544
         Iterations: 4
         Function evaluations: 8
         Gradient evaluations: 8
         Hessian evaluations: 4


array([[1967.,   33.],
       [ 703.,  297.]])

In [25]:
result.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,3000.0
Model:,Logit,Df Residuals:,2973.0
Method:,MLE,Df Model:,26.0
Date:,"Mon, 25 Jan 2021",Pseudo R-squ.:,0.1523
Time:,16:09:01,Log-Likelihood:,-1618.6
converged:,True,LL-Null:,-1909.5
Covariance Type:,nonrobust,LLR p-value:,3.649e-106

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.0763,0.158,-0.484,0.628,-0.385,0.233
x1,-0.0018,0.000,-4.957,0.000,-0.002,-0.001
x2,-0.1040,0.609,-0.171,0.864,-1.298,1.090
x3,0.1544,0.329,0.470,0.639,-0.490,0.799
x4,-0.0545,0.974,-0.056,0.955,-1.963,1.854
x5,-0.0865,0.092,-0.941,0.346,-0.267,0.094
x6,0.1550,0.109,1.429,0.153,-0.058,0.368
x7,-0.0555,0.092,-0.602,0.547,-0.236,0.125
x8,-0.0295,0.086,-0.344,0.730,-0.197,0.138


In [27]:
df.drop('Paris', axis=1)
df.drop('monsieur', axis=1)
df.drop('mettre', axis=1)

Unnamed: 0,label,text,tokens,doc_vector,taille,dist_cos_pol,dist_cos_eco,dist_cos_soc,président,euro,...,politique,national,million,UMP,France,ministre,marché,%,gouvernement,prendre
0,economie,syndicat patronat entendre sauver retraite com...,"[syndicat, patronat, entendre, sauver, retrait...","[0.08976937, -0.23544203, 0.1646332, -0.377167...",350,0.327722,0.536379,0.302750,1,1,...,0,0,0,0,0,0,0,1,1,1
1,economie,premier centrale nucléaire flottant monde rout...,"[premier, centrale, nucléaire, flottant, monde...","[0.098420516, 0.0009738469, 0.023506144, -0.18...",245,-0.012974,0.346578,0.029931,0,0,...,0,0,0,0,1,0,0,0,0,1
2,economie,Google Apple sceller rupture \n\nGoogle Apple ...,"[Google, Apple, sceller, rupture, Google, Appl...","[-0.025282267, -0.16704282, 0.1874238, -0.1612...",236,0.087018,0.327800,0.065241,0,0,...,0,0,0,0,0,0,0,0,1,1
3,economie,année exécrable marché emploi cadre \n\nmarché...,"[année, exécrable, marché, emploi, cadre, marc...","[-0.051098973, -0.06978898, -0.011798844, -0.2...",234,0.173263,0.689741,0.300586,1,0,...,0,0,0,0,0,0,1,1,0,0
4,economie,SoLocal ex-PagesJaunes conflit durcivre \n\nsi...,"[SoLocal, ex-PagesJaunes, conflit, durcivre, s...","[0.06475365, -0.16913302, 0.1664923, -0.161894...",315,0.312311,0.530769,0.299020,1,1,...,0,0,1,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19395,politique,François Bayrou implacable procureur Nicolas S...,"[François, Bayrou, implacable, procureur, Nico...","[0.00589556, -0.08423726, 0.0105079245, -0.210...",227,0.600908,0.062756,0.359545,1,0,...,1,1,0,0,0,0,0,1,0,0
19396,politique,François Hollande plaider faveur nouveau instr...,"[François, Hollande, plaider, faveur, nouveau,...","[0.14843817, -0.24743783, 0.32968348, -0.25446...",154,0.371472,0.432073,0.247631,1,1,...,0,0,0,0,0,0,0,0,0,0
19397,politique,Canard enchaîner Georges Tron louer bien appar...,"[Canard, enchaîner, Georges, Tron, louer, bien...","[-0.059412897, -0.016744621, 0.14009875, 0.006...",170,0.097784,0.451088,0.207147,0,1,...,0,0,0,1,0,0,0,1,0,1
19398,politique,sortie réussie Nicolas Sarkozy \n\nregarder fa...,"[sortie, réussie, Nicolas, Sarkozy, regarder, ...","[-0.00282282, -0.05974114, 0.08763091, -0.1142...",219,0.402374,0.094014,0.322189,1,0,...,0,0,0,1,0,1,0,0,0,1


In [28]:
df.to_csv('filtered_features_expert.csv', sep=";")