In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.regularizers import l2
import pickle
from google.colab import drive


In [3]:
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
data_path = '/content/drive/My Drive/Machine_Learning_Data/classification.csv'
df = pd.read_csv(data_path)

In [5]:
print(df.head())

                                                text  emotion
0                              feel incredibly weepy        0
1                      feeling contented wife mother        1
2  pick novels feel like dropping luggage signing...        1
3  little extra interest right feeling like husba...        1
4  feel become even lot vital high eighty five we...        1


In [6]:
# Séparation des données en entrée (textes) et étiquettes (sentiments)
texts = df['text'].values  # tous les textes sont stockés dans le tableau texts
labels = df['emotion'].values  # toutes les étiquettes sont stockées dans le tableau labels

In [7]:
# Tokenisation des textes
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Padding des séquences
padded_sequences = pad_sequences(sequences, maxlen=150)

In [8]:
# Division des données en ensembles d'entraînement, de validation et de test
X_train, X_temp, y_train, y_temp = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Afficher la taille de chaque ensemble
print(f"Taille de l'ensemble d'entraînement: {len(X_train)}")
print(f"Taille de l'ensemble de validation: {len(X_val)}")
print(f"Taille de l'ensemble de test: {len(X_test)}")

Taille de l'ensemble d'entraînement: 22626
Taille de l'ensemble de validation: 2828
Taille de l'ensemble de test: 2829


In [9]:
#  Construction du modèle avec régularisation
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=128, input_length=150))  # Augmentation de la dimension de l'embedding
model.add(GRU(units=32, return_sequences=True, kernel_regularizer=l2(0.01), bias_regularizer=l2(0.01)))  # Réduction des unités avec régularisation L2
model.add(GRU(units=16, kernel_regularizer=l2(0.01), bias_regularizer=l2(0.01)))
model.add(Dropout(0.4))  # Augmentation du taux de dropout
model.add(Dense(1, activation='sigmoid', kernel_regularizer=l2(0.01), bias_regularizer=l2(0.01)))

# Compilation du modèle
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [None]:
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_val, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Loss: {loss}, Accuracy: {accuracy}')

Loss: 0.2167314887046814, Accuracy: 0.9508660435676575


In [None]:
# Après l'entraînement
model.save('/content/drive/My Drive/Machine_Learning_Data/sentiment_analysis_model.h5')


# Sauvegarde du tokenizer
with open('/content/drive/My Drive/Machine_Learning_Data/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

  saving_api.save_model(
