In [630]:
import pandas as pd

In [631]:
df = pd.read_csv("twitter_training.csv",names=["index_category", "game_category","sentiment_category","Tweet"])

In [632]:
df.head()

Unnamed: 0,index_category,game_category,sentiment_category,Tweet
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [633]:
print("Taille de données restant:", df.shape)

Taille de données restant: (74682, 4)


In [634]:
df = df.drop(columns='index_category',axis=1)
df.head()

Unnamed: 0,game_category,sentiment_category,Tweet
0,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,Borderlands,Positive,I am coming to the borders and I will kill you...
2,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,Borderlands,Positive,im coming on borderlands and i will murder you...
4,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [635]:
df.to_csv("new_twitter_training.csv",index=False)

In [636]:
df = df.drop_duplicates()
print("Taille de données restant:", df.shape)


Taille de données restant: (71061, 3)


In [637]:
exclude_categories = ['johnson&johnson', 'Amazon','Nvidia','PlayStation5(PS5)','Xbox(Xseries)','HomeDepot','Verizon','Facebook','Google','Microsoft']

df = df[~df['game_category'].isin(exclude_categories)]

In [638]:
df['sentiment_category'] = df['sentiment_category'].replace('Irrelevant', 'Neutral')

In [639]:
print("Taille de données restant:", df.shape)

Taille de données restant: (48799, 3)


In [640]:
df.head()

Unnamed: 0,game_category,sentiment_category,Tweet
0,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,Borderlands,Positive,I am coming to the borders and I will kill you...
2,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,Borderlands,Positive,im coming on borderlands and i will murder you...
4,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


## Data exploration

### Liste des noms de jeux et entreprises

In [641]:
all_games_category = df['game_category'].unique()
print(all_games_category)

['Borderlands' 'CallOfDutyBlackopsColdWar' 'Overwatch' 'NBA2K' 'Dota2'
 'WorldOfCraft' 'CS-GO' 'AssassinsCreed' 'ApexLegends' 'LeagueOfLegends'
 'Fortnite' 'Hearthstone' 'Battlefield'
 'PlayerUnknownsBattlegrounds(PUBG)' 'FIFA' 'RedDeadRedemption(RDR)'
 'CallOfDuty' 'TomClancysRainbowSix' 'GrandTheftAuto(GTA)' 'MaddenNFL'
 'Cyberpunk2077' 'TomClancysGhostRecon']


In [642]:
all_sentiments = df['sentiment_category'].unique()
print(all_sentiments)

['Positive' 'Neutral' 'Negative']


In [643]:
counts_games = df['game_category'].value_counts().sort_index()
print(counts_games)

game_category
ApexLegends                          2249
AssassinsCreed                       2147
Battlefield                          2240
Borderlands                          2194
CS-GO                                2173
CallOfDuty                           2307
CallOfDutyBlackopsColdWar            2237
Cyberpunk2077                        2150
Dota2                                2221
FIFA                                 2226
Fortnite                             2166
GrandTheftAuto(GTA)                  2204
Hearthstone                          2204
LeagueOfLegends                      2231
MaddenNFL                            2296
NBA2K                                2292
Overwatch                            2212
PlayerUnknownsBattlegrounds(PUBG)    2116
RedDeadRedemption(RDR)               2127
TomClancysGhostRecon                 2266
TomClancysRainbowSix                 2290
WorldOfCraft                         2251
Name: count, dtype: int64


In [644]:
counts_sentiments = df['sentiment_category'].value_counts().sort_index()
print(counts_sentiments)

sentiment_category
Negative    15007
Neutral     19453
Positive    14339
Name: count, dtype: int64


In [645]:
# import plotly.express as px
# import plotly.io as pio
# pio.templates.default = "plotly_white"

# # Calculate the percentage of each sentiment class
# sentiment_percentage = df['sentiment_category'].value_counts(normalize=True) * 100

# # Convert the calculated percentages into DataFrame for plotting
# percentage_df = pd.DataFrame({'Sentiment': sentiment_percentage.index, 'Percentage': sentiment_percentage.values})

# # Plot the bar plot with percentages visible
# fig = px.bar(percentage_df, x='Sentiment', y='Percentage', text='Percentage',
#              title="Percentage of Each Sentiment Class", labels={'Sentiment': 'Sentiment Class', 'Percentage': 'Percentage (%)'})
# fig.update_traces(texttemplate='%{text:.2f}%', textposition='outside')
# fig.update_layout(width=600, height=500)
# fig.show()


### Data preprocessing

In [646]:
import numpy as np
import re

import nltk
from nltk.corpus import stopwords
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Vincent\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vincent\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [647]:
english_stopwords = set(stopwords.words('english'))

def clean_text(text):
    if not isinstance(text, str):
        return ""  # Retournez une chaîne vide si ce n'est pas une chaîne

    # Replace all non-alphabetic characters with a space
    cleaned_text = re.sub(r'[^a-zA-Z]', ' ', text)

    # Convert text to lowercase
    cleaned_text = cleaned_text.lower()

    # Tokenize the text
    tokens = nltk.word_tokenize(cleaned_text)

    # Remove stopwords from the tokenized text
    filtered_tokens = [word for word in tokens if word.lower() not in english_stopwords]

    # Join the filtered tokens back into a single string
    filtered_text = ' '.join(filtered_tokens)

    return filtered_text

df['tweet_clean'] = df['Tweet'].apply(clean_text)

In [648]:
df.head()

Unnamed: 0,game_category,sentiment_category,Tweet,tweet_clean
0,Borderlands,Positive,im getting on borderlands and i will murder yo...,im getting borderlands murder
1,Borderlands,Positive,I am coming to the borders and I will kill you...,coming borders kill
2,Borderlands,Positive,im getting on borderlands and i will kill you ...,im getting borderlands kill
3,Borderlands,Positive,im coming on borderlands and i will murder you...,im coming borderlands murder
4,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,im getting borderlands murder


In [649]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df['tweet_clean'])

# Encodage des catégories
sentiments = LabelEncoder()
df['sentiment_category_encoded'] = sentiments.fit_transform(df['sentiment_category'])

y = df['sentiment_category_encoded']

# Division des données
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [650]:
# from sklearn.model_selection import GridSearchCV

# # Définir les hyperparamètres à tester
# param_grid = {
#     'n_estimators': [100, 200],  # Nombre d'arbres
#     'max_depth': [None, 6],  # Profondeur maximale de l'arbre
#     'min_samples_split': [2, 5],  # Nombre minimal d'échantillons pour diviser un nœud
#     'min_samples_leaf': [1, 2]     # Nombre minimal d'échantillons dans une feuille
# }

# # Instanciation du modèle
# rf_model = RandomForestClassifier()

# # GridSearchCV
# grid_search = GridSearchCV(estimator=rf_model, 
#                            param_grid=param_grid, 
#                            scoring='accuracy',  # Vous pouvez utiliser une métrique adaptée à vos données
#                            cv=5,                  # Nombre de folds pour la validation croisée
#                            verbose=1)            

# grid_search.fit(X_train, y_train)

# # Get the best parameters and score
# best_params = grid_search.best_params_
# best_score = grid_search.best_score_

# print("Best Parameters:", best_params)
# print("Best Score:", best_score)

In [651]:
# Modèle
#model = RandomForestClassifier(**param_grid)
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Prédictions
y_pred = model.predict(X_test)

# Rapport de classification
print(classification_report(y_test, y_pred, target_names=sentiments.classes_))

              precision    recall  f1-score   support

    Negative       0.90      0.86      0.88      4588
     Neutral       0.82      0.90      0.86      5759
    Positive       0.88      0.81      0.84      4293

    accuracy                           0.86     14640
   macro avg       0.87      0.85      0.86     14640
weighted avg       0.86      0.86      0.86     14640



In [652]:
# Exemple de tweets à prédire
new_tweets = [
    # Positive
    "I love playing Call of Duty, the action is incredible!",
    "Borderlands 3 is such an amazing game, highly recommended!",
    # Negative
    "Dota2's community is so toxic, it's hard to enjoy the game sometimes.",
    "Cyberpunk2077 still has bugs even after all these patches.",
    # Neutral
    "AssassinsCreed's open worlds are impressive, but they can feel overwhelming.",
    "Thinking about trying out Cyberpunk 2077; heard mixed reviews. #Cyberpunk2077",
    "Did you know Overwatch lore? But I'm not into gaming much.",
    "Fortnite is creative with its events, but I'm not into building mechanics."
]

# Vectoriser les nouveaux tweets
new_tweets_tfidf = vectorizer.transform(new_tweets)

# Faire les prédictions
predicted_sentiments = model.predict(new_tweets_tfidf)

# Afficher les résultats
for tweet, sentiment in zip(new_tweets, predicted_sentiments):
    sentiment_class = sentiments.inverse_transform([sentiment])[0]
    print(f"Tweet: {tweet}")
    print(f"Predicted Sentiment: {sentiment} = {sentiment_class}")

Tweet: I love playing Call of Duty, the action is incredible!
Predicted Sentiment: 2 = Positive
Tweet: Borderlands 3 is such an amazing game, highly recommended!
Predicted Sentiment: 2 = Positive
Tweet: Dota2's community is so toxic, it's hard to enjoy the game sometimes.
Predicted Sentiment: 0 = Negative
Tweet: Cyberpunk2077 still has bugs even after all these patches.
Predicted Sentiment: 0 = Negative
Tweet: AssassinsCreed's open worlds are impressive, but they can feel overwhelming.
Predicted Sentiment: 1 = Neutral
Tweet: Thinking about trying out Cyberpunk 2077; heard mixed reviews. #Cyberpunk2077
Predicted Sentiment: 1 = Neutral
Tweet: Did you know Overwatch lore? But I'm not into gaming much.
Predicted Sentiment: 1 = Neutral
Tweet: Fortnite is creative with its events, but I'm not into building mechanics.
Predicted Sentiment: 1 = Neutral
