In [6]:
import pandas as pd
import re
import nltk
import joblib
import time
from datetime import datetime
import os
import pyarrow
import requests
import tarfile
from tqdm import tqdm

# ML
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.metrics import accuracy_score, f1_score, classification_report


#kaggle
import kagglehub

environnement recuperation dataset

In [None]:
RAW_DATA_PATH = "../data/raw/"
JSON_FILE_PATH= os.path.join(RAW_DATA_PATH, "yelp_academic_dataset_review.json")
PROCESSED_PATH = "../data/processed/"
PROCESSED_FILE_PATH = os.path.join(PROCESSED_PATH, "data_cleaned01.parquet")

if os.path.exists(JSON_FILE_PATH):
    print(f"Le fichier '{os.path.basename(JSON_FILE_PATH)}' la dataset est disponible")
    if os.path.exists(PROCESSED_FILE_PATH):
        print(f"Chargement des données nettoyées depuis '{PROCESSED_FILE_PATH}'...")
        df = pd.read_parquet(PROCESSED_FILE_PATH)
      
        print(df.columns)
      
    else:
        print('aucune version  de fichier  recente donc recuperation du fichier brute(raw)...')
        df = pd.read_json(JSON_FILE_PATH,lines=True,nrows=200000)
        # df=df[["user_id","text","useful","funny","cool","stars"]]
        print(df.columns)
        pass    
else:
      print(f"Le fichier '{os.path.basename(JSON_FILE_PATH)}' n'existe pas")

Le fichier 'yelp_academic_dataset_review.json' la dataset est disponible
Chargement des données nettoyées depuis '../data/processed/data_cleaned01.parquet'...
Index(['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny',
       'cool', 'text', 'date'],
      dtype='object')


Traitement avec pandas

In [None]:
#creation du label sentiment 
#j'isole les avis neutres avant de le faire
df=  df[df['stars']!=3].copy()

df['sentiment_label']= df['stars'].apply(lambda x:1 if x>3 else 0)
print(df['sentiment_label'].value_counts())



sentiment_label
1    139314
0     38038
Name: count, dtype: int64


netooyage du texte

In [9]:
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    text = text.lower()
    re.sub(r'[^a-z\s]', '', text) # Garder uniquement les lettres et espaces
    # je etire les stopwords
    return ' '.join([word for word in text.split() if word not in stop_words])


df["cleaned_text"] = df['text'].apply(preprocess_text)
print('nettoyage terminé')

df = df[['cleaned_text', 'sentiment_label']]
print(f"Sauvegarde des données nettoyées dans '{PROCESSED_FILE_PATH}'...")
df.to_parquet(PROCESSED_FILE_PATH)

nettoyage terminé
Sauvegarde des données nettoyées dans '../data/processed/data_cleaned01.parquet'...


Vectorisation et Séparation des Données

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'],
    df['sentiment_label'],
    test_size=0.2, 
    random_state=42, 
    stratify=df['sentiment_label'] #pour les imbalanced classe
)

vectorizer = TfidfVectorizer(max_features=10000) #10.000 mots les plus frequents
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf =vectorizer.transform(X_test)
print('vectorisation terminé')

vectorisation terminé


entrainement des modèles et benchmark

In [11]:
models = {
    "Régression Logistique": LogisticRegression(max_iter=1000, random_state=42),
    "LightGBM": lgb.LGBMClassifier(random_state=42)
}
results = {}
for name,model in models.items():
    print(f'entrainement du modèle {name}')
    start_time = time.time()
    model.fit(X_train_tfidf, y_train)
    training_time = time.time() - start_time

    #evaluation
    y_pred = model.predict(X_test_tfidf)
    proba = model.predict_proba(X_test_tfidf).max()
    print(proba)
    f1 = f1_score(y_test, y_pred)

    results[name]={
        "F1-Score(test)":f1,
        "Temps d'entrainement":f1_score(y_test, y_pred)
    }
    print(f"Entraînement terminé en {training_time:.2f} secondes.")

benchmark_df = pd.DataFrame(results).T.sort_values(by="F1-Score(test)", ascending=False)
print("Benchmark ")
print(benchmark_df)

entrainement du modèle Régression Logistique
0.9999999240736397
Entraînement terminé en 2.80 secondes.
entrainement du modèle LightGBM
[LightGBM] [Info] Number of positive: 111451, number of negative: 30430
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 4.776735 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 825326
[LightGBM] [Info] Number of data points in the train set: 141881, number of used features: 9994
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.785524 -> initscore=1.298156
[LightGBM] [Info] Start training from score 1.298156




0.9993594563283212
Entraînement terminé en 66.88 secondes.
Benchmark 
                       F1-Score(test)  Temps d'entrainement
Régression Logistique        0.971008              0.971008
LightGBM                     0.958849              0.958849


In [12]:
best_model_name = benchmark_df.index[0]
best_model = models[best_model_name]
best_model

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,1000


Sauvegarde du modèle et du vectorizer

In [13]:
today_str = datetime.now().strftime("%Y%m%d")
os.makedirs('../models', exist_ok=True)
model_filename = f'sentiment_model_yelp_RegLog_{today_str}.joblib'
vectorizer_filename = f'tfidf_vectorizer_yelp_{today_str}.joblib'

joblib.dump(best_model, f'../models/{model_filename}')
joblib.dump(vectorizer, f'../models/{vectorizer_filename}')
print("modèle et vectorizer sauvegardé!")

modèle et vectorizer sauvegardé!
