<a href="https://colab.research.google.com/github/yassine8enaddi/NLP_TP/blob/main/TP3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

In [55]:
df = pd.read_csv('movie_review.csv')
df

Unnamed: 0,fold_id,cv_tag,html_id,sent_id,text,tag
0,0,cv000,29590,0,films adapted from comic books have had plenty...,pos
1,0,cv000,29590,1,"for starters , it was created by alan moore ( ...",pos
2,0,cv000,29590,2,to say moore and campbell thoroughly researche...,pos
3,0,cv000,29590,3,"the book ( or "" graphic novel , "" if you will ...",pos
4,0,cv000,29590,4,"in other words , don't dismiss this film becau...",pos
...,...,...,...,...,...,...
64715,9,cv999,14636,20,that lack of inspiration can be traced back to...,neg
64716,9,cv999,14636,21,like too many of the skits on the current inca...,neg
64717,9,cv999,14636,22,"after watching one of the "" roxbury "" skits on...",neg
64718,9,cv999,14636,23,"bump unsuspecting women , and . . . that's all .",neg


In [56]:
df.drop(columns=["fold_id", "cv_tag", "html_id", "sent_id"], inplace=True)

In [57]:
df

Unnamed: 0,text,tag
0,films adapted from comic books have had plenty...,pos
1,"for starters , it was created by alan moore ( ...",pos
2,to say moore and campbell thoroughly researche...,pos
3,"the book ( or "" graphic novel , "" if you will ...",pos
4,"in other words , don't dismiss this film becau...",pos
...,...,...
64715,that lack of inspiration can be traced back to...,neg
64716,like too many of the skits on the current inca...,neg
64717,"after watching one of the "" roxbury "" skits on...",neg
64718,"bump unsuspecting women , and . . . that's all .",neg


# **Pre-processing des données textuelles**

In [27]:
import nltk

In [33]:
from nltk.corpus import stopwords

In [32]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [34]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

In [37]:
import string
PUNCT_TO_REMOVE = string.punctuation

In [38]:
def remove_punctuation(text):
  return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

In [63]:
df["text"] = df["text"].str.lower()\
.apply(lambda text: remove_stopwords(text))\
.apply(lambda text: remove_punctuation(text))

In [65]:
df["text"]

0        films adapted comic books plenty success  whet...
1        starters  created alan moore  eddie campbell  ...
2        say moore campbell thoroughly researched subje...
3        book   graphic novel    500 pages long include...
4                              words  dismiss film source 
                               ...                        
64715     lack inspiration traced back insipid characters 
64716    like many skits current incarnation saturdayni...
64717    watching one  roxbury  skits snl  come away ch...
64718                   bump unsuspecting women     thats 
64719           watching anightattheroxbury  left exactly 
Name: text, Length: 64720, dtype: object

# **Entraînement du modèle Word2Vec**

In [75]:
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [76]:
tokenized_text = [word_tokenize(text) for text in df["text"]]

In [77]:
word2vec_model = Word2Vec(sentences=tokenized_text, vector_size=100, window=5, min_count=1, workers=4)

# Vectorisation des reviews de **movies**

In [122]:
import numpy as np

review_embeddings = []
for review in tokenized_text:
    review_embedding = []
    for token in review:
        if token in word2vec_model.wv:
            embedding = word2vec_model.wv[token]
            review_embedding.append(embedding)
    if review_embedding:
        average_embedding = np.mean(review_embedding, axis=0)
    else:
        average_embedding = np.zeros(word2vec_model.vector_size)
    review_embeddings.append(average_embedding)

In [83]:
review_embeddings[0]

array([-0.4366832 ,  0.445935  ,  0.2876816 , -0.00441311,  0.06068467,
       -0.81582326,  0.2846321 ,  1.136826  , -0.13174546, -0.48931032,
       -0.2129914 , -0.9154563 , -0.15683824,  0.05902815,  0.35515797,
       -0.3956693 ,  0.1389147 , -0.6295987 , -0.08212687, -0.98089993,
        0.3457688 ,  0.5198359 ,  0.45921028, -0.19481567, -0.02203831,
        0.26690558, -0.22550742, -0.10526925, -0.51244265,  0.03633578,
        0.46138117,  0.08604437,  0.3137935 , -0.37778842, -0.2959046 ,
        0.37294927,  0.27079436, -0.44553438, -0.24670556, -0.92562246,
        0.04359349, -0.5753262 , -0.14434649,  0.25879303,  0.4720411 ,
       -0.41634652, -0.27993682, -0.13648677,  0.02099169,  0.62386805,
        0.19734696, -0.36885735, -0.331928  , -0.28612953, -0.4215896 ,
        0.3787788 ,  0.25340205,  0.09929857, -0.3008415 ,  0.1792139 ,
       -0.21712738,  0.26329386,  0.03005627,  0.12663586, -0.5257291 ,
        0.4622703 ,  0.3116731 ,  0.56160754, -0.6127003 ,  0.45

In [89]:
df['text'] = review_embeddings

# **Division des données**

In [84]:
from sklearn import preprocessing

In [90]:
le = preprocessing.LabelEncoder()
df['tag'] = le.fit_transform(df['tag'])
df.head()

Unnamed: 0,text,tag
0,"[-0.4366832, 0.445935, 0.2876816, -0.004413106...",1
1,"[-0.26715586, 0.20385613, 0.29668295, -0.19282...",1
2,"[-0.41774717, 0.5030935, 0.27413383, -0.144148...",1
3,"[-0.26386932, 0.23620021, 0.25452435, -0.04814...",1
4,"[-0.4335088, 0.35403326, 0.36222118, 0.2431679...",1


In [88]:
from sklearn.model_selection import train_test_split

In [102]:
X = np.array(df["text"].tolist())
y = df.tag

In [103]:
X_train, X_test, y_train, y_test = train_test_split(X, df['tag'], test_size=0.3, random_state=42)

# Construction d&#39;un **classificateur**

Logistic regression model

In [94]:
from sklearn.linear_model import LogisticRegression

In [95]:
clf = LogisticRegression()

In [105]:
clf.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [114]:
y_pred = clf.predict(X_test)

# Évaluation du **modèle**

In [112]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [120]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [121]:
print("Accuracy : ", accuracy)
print("Precision : ", precision)
print("Recall : ", recall)
print("F1 score : ", f1)

Accuracy :  0.569324268644417
Precision :  0.5625940163797426
Recall :  0.6827586206896552
F1 score :  0.6168789517089711
