In [2]:
# Import required libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Nouvelle section

In [4]:
df_reviews = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/df_reviews.csv')


In [5]:
# Preprocess text data
df_reviews.loc[:, 'processed_text'] = df_reviews['tokens'].apply(
    lambda tokens: ' '.join(eval(tokens)) if isinstance(tokens, str) else ' '.join(tokens)
)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=2)
X_tfidf = tfidf_vectorizer.fit_transform(df_reviews['processed_text'])

# Assuming 'label' column contains target labels
y = df_reviews['sentiment']

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Handle class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)


In [7]:
# Entraîner le modèle
lr_model = LogisticRegression(class_weight='balanced', max_iter=1000)
lr_model.fit(X_train, y_train)

# Prédictions
y_pred = lr_model.predict(X_test)
y_pred_prob = lr_model.predict_proba(X_test)

# Rapport de classification
print("\nLogistic Regression Classification Report:")
print(classification_report(y_test, y_pred))

# Score ROC-AUC
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class='ovr')
print(f"Logistic Regression ROC-AUC Score: {roc_auc}")


Logistic Regression Classification Report:
              precision    recall  f1-score   support

    negative       0.75      0.78      0.77      1877
     neutral       0.35      0.57      0.43      1130
    positive       0.95      0.84      0.89      6993

    accuracy                           0.80     10000
   macro avg       0.68      0.73      0.70     10000
weighted avg       0.84      0.80      0.82     10000

Logistic Regression ROC-AUC Score: 0.9021178732924898


In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Entraîner le modèle
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)

# Prédictions
y_pred = dt_model.predict(X_test)

# Vérifier si le modèle peut prédire les probabilités
if hasattr(dt_model, "predict_proba"):
    y_pred_prob = dt_model.predict_proba(X_test)
else:
    y_pred_prob = None

# Rapport de classification
print("\nDecision Tree Classification Report:")
print(classification_report(y_test, y_pred))

# Score ROC-AUC (calculable uniquement si les probabilités sont disponibles)
if y_pred_prob is not None:
    roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class='ovr')
    print(f"Decision Tree ROC-AUC Score: {roc_auc}")
else:
    print("ROC-AUC Score not calculable for Decision Tree (predict_proba not available).")



Decision Tree Classification Report:
              precision    recall  f1-score   support

    negative       0.58      0.56      0.57      1877
     neutral       0.22      0.20      0.21      1130
    positive       0.83      0.85      0.84      6993

    accuracy                           0.72     10000
   macro avg       0.54      0.54      0.54     10000
weighted avg       0.71      0.72      0.72     10000

Decision Tree ROC-AUC Score: 0.6701783773528063


In [11]:
from sklearn.ensemble import RandomForestClassifier

# Entraîner le modèle
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

# Prédictions
y_pred = rf_model.predict(X_test)
y_pred_prob = rf_model.predict_proba(X_test)

# Rapport de classification
print("\nRandom Forest Classification Report:")
print(classification_report(y_test, y_pred))

# Score ROC-AUC
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class='ovr')
print(f"Random Forest ROC-AUC Score: {roc_auc}")



Random Forest Classification Report:
              precision    recall  f1-score   support

    negative       0.86      0.53      0.66      1877
     neutral       0.54      0.01      0.02      1130
    positive       0.79      0.99      0.88      6993

    accuracy                           0.80     10000
   macro avg       0.73      0.51      0.52     10000
weighted avg       0.77      0.80      0.74     10000

Random Forest ROC-AUC Score: 0.8752527887488183


In [12]:
from sklearn.neighbors import KNeighborsClassifier

# Entraîner le modèle
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)

# Prédictions
y_pred = knn_model.predict(X_test)
y_pred_prob = knn_model.predict_proba(X_test)

# Rapport de classification
print("\nK-Nearest Neighbors Classification Report:")
print(classification_report(y_test, y_pred))

# Score ROC-AUC
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class='ovr')
print(f"K-Nearest Neighbors ROC-AUC Score: {roc_auc}")



K-Nearest Neighbors Classification Report:
              precision    recall  f1-score   support

    negative       0.62      0.45      0.52      1877
     neutral       0.22      0.08      0.12      1130
    positive       0.78      0.92      0.85      6993

    accuracy                           0.74     10000
   macro avg       0.54      0.48      0.50     10000
weighted avg       0.69      0.74      0.70     10000

K-Nearest Neighbors ROC-AUC Score: 0.7091870224212816


In [13]:
from sklearn.naive_bayes import MultinomialNB

# Entraîner le modèle
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Prédictions
y_pred = nb_model.predict(X_test)
y_pred_prob = nb_model.predict_proba(X_test)

# Rapport de classification
print("\nNaive Bayes Classification Report:")
print(classification_report(y_test, y_pred))

# Score ROC-AUC
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class='ovr')
print(f"Naive Bayes ROC-AUC Score: {roc_auc}")



Naive Bayes Classification Report:
              precision    recall  f1-score   support

    negative       0.91      0.39      0.54      1877
     neutral       0.00      0.00      0.00      1130
    positive       0.76      1.00      0.86      6993

    accuracy                           0.77     10000
   macro avg       0.55      0.46      0.47     10000
weighted avg       0.70      0.77      0.70     10000

Naive Bayes ROC-AUC Score: 0.8411334744702584


In [14]:
from sklearn.svm import SVC

# Entraîner le modèle
svm_model = SVC(probability=True)
svm_model.fit(X_train, y_train)

# Prédictions
y_pred = svm_model.predict(X_test)
y_pred_prob = svm_model.predict_proba(X_test)

# Rapport de classification
print("\nSVM Classification Report:")
print(classification_report(y_test, y_pred))

# Score ROC-AUC
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class='ovr')
print(f"SVM ROC-AUC Score: {roc_auc}")



SVM Classification Report:
              precision    recall  f1-score   support

    negative       0.81      0.76      0.78      1877
     neutral       0.57      0.15      0.23      1130
    positive       0.86      0.98      0.91      6993

    accuracy                           0.84     10000
   macro avg       0.74      0.63      0.64     10000
weighted avg       0.82      0.84      0.81     10000

SVM ROC-AUC Score: 0.910689567012962


In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_auc_score

# Entraîner le modèle
svm_model = SVC(probability=True)
svm_model.fit(X_train, y_train)

# Prédictions
y_pred = svm_model.predict(X_test)
y_pred_prob = svm_model.predict_proba(X_test)

# Rapport de classification
print("\nSVM Classification Report:")
print(classification_report(y_test, y_pred))

# Score ROC-AUC
roc_auc = roc_auc_score(y_test, y_pred_prob, multi_class='ovr')
print(f"SVM ROC-AUC Score: {roc_auc}")


In [None]:
# Define models
models = {
    "Logistic Regression": LogisticRegression(class_weight='balanced', max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(probability=True),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Naive Bayes": MultinomialNB()
}