In [4]:
import os
import re

import mlflow
import nltk
import pandas as pd
from mlflow import MlflowClient
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources this notebook relies on:
#   - 'punkt_tab'  : tokenizer models used by nltk.word_tokenize
#   - 'stopwords'  : word lists read by stopwords.words('english')
#   - 'wordnet'    : corpus required by WordNetLemmatizer (without it the
#                    first lemmatize() call raises LookupError on a fresh machine)
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import pickle

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\mariu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mariu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Exploration des données

In [5]:
# Directory holding the raw CSV files, relative to the working directory.
data_path = "data"

# Column names are passed explicitly through `names`, so the first line of
# the file is read as data rather than as a header.
df = pd.read_csv(
    f"{data_path}/twitter_training.csv",
    names=["index_category", "game_category", "sentiment_category", "Tweet"],
)

In [6]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,index_category,game_category,sentiment_category,Tweet
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


### Preprocessing

In [7]:
# Remove the `index_category` column, then discard rows that are exact
# duplicates on the remaining columns.
df = df.drop(columns=['index_category']).drop_duplicates()

In [8]:
# Keep only the rows whose sentiment label differs from 'Irrelevant'.
df = df.loc[df['sentiment_category'].ne('Irrelevant')]

In [9]:
# NLTK English stop words, materialised once as a set; clean_text tests
# every token for membership in it.
english_stopwords = set(stopwords.words('english'))

def remove_emoji():
  """Build the compiled regex used to strip emoji and pictographic symbols.

  Returns:
      A compiled pattern that matches one or more consecutive characters
      belonging to any of the code-point ranges listed below.
  """
  code_point_ranges = (
      u"\U0001F600-\U0001F64F",  # emoticons
      u"\U0001F300-\U0001F5FF",  # symbols & pictographs
      u"\U0001F680-\U0001F6FF",  # transport & map symbols
      u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
      u"\U00002702-\U000027B0",  # dingbats
      u"\U00002702-\U000027B0",  # same range again (kept so the pattern text is unchanged)
      u"\U000024C2-\U0001F251",  # wide span; already contains several ranges in this list
      u"\U0001f926-\U0001f937",
      u"\U00010000-\U0010ffff",  # every supplementary-plane code point
      u"\u2640-\u2642",
      u"\u2600-\u2B55",
      u"\u200d",                 # zero-width joiner
      u"\u23cf",
      u"\u23e9",
      u"\u231a",
      u"\ufe0f",                 # variation selector-16
      u"\u3030",
  )
  character_class = "[" + "".join(code_point_ranges) + "]+"
  return re.compile(character_class, flags=re.UNICODE)

def lematize(text):
  """Lemmatize every whitespace-separated token of ``text``.

  Each token goes through ``WordNetLemmatizer.lemmatize`` with its default
  arguments; the results are joined back with single spaces.
  """
  wordnet_lemmatizer = WordNetLemmatizer()
  lemmas = []
  for token in text.split():
    lemmas.append(wordnet_lemmatizer.lemmatize(token))
  return " ".join(lemmas)


# (pattern, replacement) pairs, applied in order while apostrophes are still
# present in the text. Irregular forms come first; the generic "n't" rule then
# covers don't / shouldn't / hasn't / weren't / didn't and similar.
_CONTRACTIONS = (
    (r"won't", "will not"),
    (r"can't", "can not"),
    (r"\bim\b", "i am"),   # whole word only, so "time" or "him" are left intact
    (r"n't", " not"),
    (r"'re", " are"),
    (r"'s", " is"),
    (r"'d", " would"),
    (r"'ll", " will"),
    (r"'t", " not"),
    (r"'ve", " have"),
    (r"'m", " am"),
)


def clean_text(text):
    """Normalise a raw tweet into a space-separated string of content words.

    Steps, in order:
      1. lowercase;
      2. drop '{link}' / '[video]' placeholders, HTML entities, mentions,
         hashtags, digits and URLs;
      3. expand English contractions (must happen before punctuation is
         stripped, otherwise the apostrophes the rules rely on are gone);
      4. replace punctuation with spaces and strip emoji;
      5. lemmatize;
      6. tokenize and remove English stop words.

    Args:
        text: raw tweet text (str).

    Returns:
        The cleaned text as a single string; may be empty.
    """
    # --- 1/2: remove content that carries no usable signal -----------------
    text = text.lower()
    text = re.sub(r'{link}', '', text)        # literal "{link}" placeholders
    text = re.sub(r"\[video\]", '', text)     # literal "[video]" placeholders
    text = re.sub(r'&[a-z]+;', '', text)      # HTML entities such as &amp;

    text = re.sub(r'@\w+', '', text)          # mentions
    text = re.sub(r'#\w+', '', text)          # hashtags

    text = re.sub(r'\d+', '', text)           # digits
    text = re.sub(r'http\S+', '', text)       # URLs starting with http
    text = re.sub(r'www\S+', '', text)        # URLs starting with www

    # --- 3: expand contractions while apostrophes still exist --------------
    # Typographic apostrophes are mapped to ASCII so the rules also match them.
    text = text.replace("\u2019", "'")
    for pattern, replacement in _CONTRACTIONS:
        text = re.sub(pattern, replacement, text)

    # --- 4: punctuation, whitespace, emoji ---------------------------------
    text = re.sub(r'[^\w\s]+', ' ', text)     # punctuation and leftover apostrophes
    text = re.sub(r'\s+', ' ', text)          # collapse newlines / repeated spaces
    text = re.sub(remove_emoji(), '', text)   # emoji and pictographic symbols

    # --- 5: lemmatization ---------------------------------------------------
    # NOTE(review): lemmatizing before stop-word removal can turn stop words
    # into non-stop-word stems that survive the filter below - confirm whether
    # the order should be swapped.
    text = lematize(text)

    # --- 6: tokenize and drop stop words -----------------------------------
    tokens = nltk.word_tokenize(text)
    filtered_tokens = [word for word in tokens if word.lower() not in english_stopwords]

    return " ".join(filtered_tokens)

In [10]:
# Replace missing tweets with empty strings so clean_text always receives a
# str, then store the cleaned version in a new column.
df['Tweet'] = df['Tweet'].fillna("")
df['tweet_clean'] = df['Tweet'].apply(clean_text)

In [11]:
# Compare raw and cleaned tweets side by side on the first rows.
df.head()

Unnamed: 0,game_category,sentiment_category,Tweet,tweet_clean
0,Borderlands,Positive,im getting on borderlands and i will murder yo...,getting borderland murder
1,Borderlands,Positive,I am coming to the borders and I will kill you...,coming border kill
2,Borderlands,Positive,im getting on borderlands and i will kill you ...,getting borderland kill
3,Borderlands,Positive,im coming on borderlands and i will murder you...,coming borderland murder
4,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,getting borderland murder


### Feature Engineering TF-IDF et séparation de données

In [12]:
# Features are the cleaned tweets; targets are the sentiment labels.
X = df['tweet_clean']
y = df['sentiment_category']

# Hold out 20% of the rows for validation (fixed seed for a repeatable split).
# NOTE(review): the raw data appears to contain near-identical variants of the
# same tweet; a purely random split may put variants on both sides - confirm.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# Extra tokens excluded from the TF-IDF vocabulary.
# NOTE(review): entries such as 'ti', 'ame', 'wa' and 'ha' are not English
# words - presumably artifacts of the cleaning / lemmatization step; confirm.
words_to_remove = [
    'dead', 'com', 'wa', 'pic', 'get', 'unk', 'ti', 'red', 'redemption',
    'borderland', 'ame', 'one', 'gta', 'creed', 'assassin', 'go',
    'look', 'tv', 'ha', 'call', 'duty', 'twitter', 'fifa', 'pubg', 'player',
    'ban', 'battlefield', 'see', 'league', 'legend', 'twitch', 'rhandlerr',
    'still',
]

vectorizer = TfidfVectorizer(
    max_features=1000,
    stop_words=words_to_remove,
    max_df=0.8,
    min_df=5,
)

# Fit the vocabulary on the training part only, then reuse it for validation.
X_train_vec = vectorizer.fit_transform(X_train)  # training set (80%)
X_val_vec = vectorizer.transform(X_val)          # validation set (20%)

### Encodage des valeurs Y

In [13]:
# Encode the sentiment categories as integers and keep them in a new column.
# NOTE(review): `y` is rebound here after train_test_split has already
# produced y_train / y_val in the cell above, so the models fitted later in
# this notebook are trained on the original string labels, not on these
# encoded values - confirm whether the encoding should happen before the split.
sentiments = LabelEncoder()
df['sentiment_category_encoded'] = sentiments.fit_transform(df['sentiment_category'])
y = df['sentiment_category_encoded']

### Application MLFlow

Avant d'exécuter les cellules suivantes, démarrez le serveur MLflow depuis un terminal avec la commande suivante :<br>
mlflow server --host 127.0.0.1 --port 8080

In [14]:
# Client bound to the local MLflow tracking server (same host and port as in
# the launch command shown above).
client = MlflowClient(tracking_uri="http://127.0.0.1:8080")

### Création de l'expérience

In [15]:
# Free-text description, stored on the experiment through the
# "mlflow.note.content" tag below.
experiment_description = (
    "This is the sentiments-analysis-tweets project. "
    "This experiment contains the produce models for Tweets"
)

# Tags attached to the experiment at creation time.
experiment_tags = {
    "project_name": "sentiment-analysis",
    "store_dept": "produce",
    "team": "stores-ml",
    "project_quarter": "Q3-2025",
    "mlflow.note.content": experiment_description,
}

# Name under which the experiment is registered.
experiment_name = "Tweets_model"

# Look the experiment up first so that re-running this cell reuses it
# instead of trying to create it a second time.
existing_experiment = client.get_experiment_by_name(experiment_name)

if not existing_experiment:
    # Not found: create it with the tags defined above.
    experiment_id = client.create_experiment(name=experiment_name, tags=experiment_tags)
    print(f"L'expérience '{experiment_name}' a été créée avec l'ID {experiment_id}.")
else:
    experiment_id = existing_experiment.experiment_id
    print(f"L'expérience '{experiment_name}' existe déjà avec l'ID {existing_experiment.experiment_id}.")


L'expérience 'Tweets_model' a été créée avec l'ID 129569839661631736.


In [16]:
# Search experiments by their `project_name` tag and print the attributes of
# the first match (raises IndexError if the search returns nothing).
tweets_experiment = client.search_experiments(
    filter_string="tags.`project_name` = 'sentiment-analysis'"
)

print(vars(tweets_experiment[0]))

{'_experiment_id': '129569839661631736', '_name': 'Tweets_model', '_artifact_location': 'mlflow-artifacts:/129569839661631736', '_lifecycle_stage': 'active', '_tags': {'mlflow.note.content': 'This is the sentiments-analysis-tweets project. This experiment contains the produce models for Tweets', 'project_name': 'sentiment-analysis', 'project_quarter': 'Q3-2025', 'store_dept': 'produce', 'team': 'stores-ml'}, '_creation_time': 1742334520442, '_last_update_time': 1742334520442}


### Première connexion à MLflow

In [48]:
# Point the module-level mlflow API (start_run, log_* ...) at the same local
# tracking server the client above uses.
mlflow.set_tracking_uri("http://127.0.0.1:8080")

In [52]:
# Make "Tweets_model" the active experiment for the runs started below.
# NOTE: this rebinds `tweets_experiment`, which an earlier cell bound to the
# list returned by client.search_experiments.
tweets_experiment = mlflow.set_experiment("Tweets_model")

# Define a run name for this iteration of training.
# If this is not set, a unique name will be auto-generated for your run.
run_name_RF = "test_2_random_forest"
run_name_LR = "test_2_logistic_regression"

# Define an artifact path that the model will be saved to.
artifact_path = "artefact_model_tweets"

In [53]:
# Hyper-parameters passed to RandomForestClassifier and logged to MLflow.
params_randomforest = {
    'n_estimators': 200,      # number of trees in the forest
    'max_depth': None,        # no limit on tree depth
    'min_samples_split': 4,   # minimum samples required to split an internal node
    'min_samples_leaf': 3,    # minimum samples required at a leaf node
    'random_state': 42        # fixed seed so the run can be reproduced
}

# Hyper-parameters passed to LogisticRegression and logged to MLflow.
params_logisticregression = {
    'penalty': 'l2',            # regularisation type ('l2' is the default)
    'C': 1.0,                   # inverse regularisation strength (larger C = weaker regularisation)
    'solver': 'lbfgs',          # optimiser; supports the multinomial loss
    'max_iter': 1200,           # maximum number of iterations allowed for convergence
    # NOTE(review): recent scikit-learn releases reportedly deprecate
    # `multi_class` - confirm against the installed version.
    'multi_class': 'multinomial',  # one multinomial model instead of one-vs-rest
    'random_state': 44          # fixed seed for reproducibility
}

In [54]:
# Random forest: fit on the TF-IDF training matrix, score on the validation
# matrix, then record parameters, metrics and the fitted model in MLflow.
model_RFC = RandomForestClassifier(**params_randomforest)
model_RFC.fit(X_train_vec, y_train)

# Predict on the validation set
y_pred = model_RFC.predict(X_val_vec)

# Precision and recall use macro averaging (unweighted mean over classes).
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred, average="macro")
recall = recall_score(y_val, y_pred, average="macro")

metrics_RFC = {"accuracy": accuracy, "precision": precision, "recall": recall}

with mlflow.start_run(run_name=run_name_RF) as run:
    # Log the parameters used for the model fit
    mlflow.log_params(params_randomforest)

    # Log the scores that were calculated during validation
    mlflow.log_metrics(metrics_RFC)

    # Log an instance of the trained model for later use.
    # Only the first rows are passed as the input example: it is meant to be
    # a small sample stored next to the model, not the whole validation set.
    mlflow.sklearn.log_model(
        sk_model=model_RFC, input_example=X_val_vec[:5], artifact_path=artifact_path
    )

Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 56.02it/s]  


🏃 View run test_2_random_forest at: http://127.0.0.1:8080/#/experiments/129569839661631736/runs/2f94905055f24374bdafceaaf23a97a5
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/129569839661631736


In [None]:
# Logistic regression: fit on the TF-IDF training matrix, score on the
# validation matrix, then record parameters, metrics and the fitted model.
model_LR = LogisticRegression(**params_logisticregression)
model_LR.fit(X_train_vec, y_train)

# Predict on the validation set
y_pred = model_LR.predict(X_val_vec)

# Precision and recall use macro averaging (unweighted mean over classes).
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred, average="macro")
recall = recall_score(y_val, y_pred, average="macro")

# Kept in its own dict so the random-forest metrics are not overwritten.
metrics_LR = {"accuracy": accuracy, "precision": precision, "recall": recall}

with mlflow.start_run(run_name=run_name_LR) as run:
    # Log the parameters used for the model fit
    mlflow.log_params(params_logisticregression)

    # Log the scores that were calculated during validation
    mlflow.log_metrics(metrics_LR)

    # Log an instance of the trained model for later use.
    # Only the first rows are passed as the input example: it is meant to be
    # a small sample stored next to the model, not the whole validation set.
    mlflow.sklearn.log_model(
        sk_model=model_LR, input_example=X_val_vec[:5], artifact_path=artifact_path
    )

Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 81.93it/s]


🏃 View run test_2_logistic_regression at: http://127.0.0.1:8080/#/experiments/129569839661631736/runs/0502eebd41654c07b255488bacde3a55
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/129569839661631736


### Sauvegarder le meilleur modèle et le vectoriseur

In [22]:
# Directory that receives the pickled artefacts.
path_models = "models"

# open(..., "wb") does not create missing directories, so make sure the
# target exists before writing.
os.makedirs(path_models, exist_ok=True)

# NOTE(review): the file name refers to a random forest but the object
# written is `model_LR` (logistic regression) - confirm which model is meant
# to be saved and align the name or the object accordingly.
with open(f"{path_models}/RandomClassifierForest.pkl", "wb") as file:
    pickle.dump(model_LR, file)

# The fitted vectorizer is saved alongside the model so that new text can be
# transformed with the same vocabulary.
with open(f"{path_models}/tfidf_vectorizer.pkl", "wb") as file:
    pickle.dump(vectorizer, file)