In [None]:
import polars as pl
from sklearn.model_selection import train_test_split

# Load the preprocessed tweets.
df = pl.read_parquet("dataframes/preprocessed.parquet")

# Normalise the text column in place: lowercase, turn every character that is
# neither a word character nor whitespace into a space, then collapse runs of
# whitespace into a single space.
df = df.with_columns(
	clean_content=pl.col("clean_content")
	.str.to_lowercase()
	.str.replace_all(r'[^\w\s]', ' ')
	.str.replace_all(r'\s+', ' ')
)

# 70/30 train/test split on the two columns used downstream (fixed seed).
train_df, test_df = train_test_split(
	df.select("clean_content", "target"),
	test_size=0.3,
	random_state=42,
)

# (text, target) pairs for each split.
train_data = list(zip(train_df["clean_content"], train_df["target"]))
test_data = list(zip(test_df["clean_content"], test_df["target"]))

# Whitespace-tokenised training texts: the corpus fed to the embedding models.
data = [text.split() for text, _ in train_data]
data[:5]

In [None]:
import os

from gensim.models import Word2Vec

# Dimensionality of the word vectors (shared with the FastText model and the
# Keras embedding layer).
EMBEDDING_DIM = 300

# Skip-gram Word2Vec trained on the tokenised training texts only.
# NOTE: gensim does not support `workers=-1` (it is not a "use all cores"
# shortcut as in scikit-learn): no worker thread is started and the vectors
# are never trained. Use the real CPU count instead.
w2v_model = Word2Vec(
	sentences=data,
	vector_size=EMBEDDING_DIM,
	window=5,
	min_count=5,
	workers=os.cpu_count() or 1,
	sg=1
)

# Token -> integer id. Ids 0 and 1 are reserved for the special tokens, so the
# model vocabulary starts at 2.
W2V_VOCAB_DICT: dict[str, int] = {str(word): i + 2 for i, word in enumerate(w2v_model.wv.index_to_key)}

W2V_VOCAB_DICT['<PAD>'] = 0
W2V_VOCAB_DICT['<UNK>'] = 1

W2V_VOCAB_SIZE = len(W2V_VOCAB_DICT)

print(f"Taille finale du vocabulaire (incluant <PAD>/<UNK>): {W2V_VOCAB_SIZE}")

In [None]:
import os

from gensim.models import FastText

# Skip-gram FastText trained on the same tokenised training texts.
# NOTE: gensim does not support `workers=-1` (it is not a "use all cores"
# shortcut as in scikit-learn): no worker thread is started and the vectors
# are never trained. Use the real CPU count instead.
ft_model = FastText(
	sentences=data,
	vector_size=EMBEDDING_DIM,
	window=5,
	min_count=5,
	workers=os.cpu_count() or 1,
	sg=1
)

# Token -> integer id. Ids 0 and 1 are reserved for the special tokens, so the
# model vocabulary starts at 2.
FT_VOCAB_DICT: dict[str, int] = {str(word): i + 2 for i, word in enumerate(ft_model.wv.index_to_key)}

FT_VOCAB_DICT['<PAD>'] = 0
FT_VOCAB_DICT['<UNK>'] = 1

FT_VOCAB_SIZE = len(FT_VOCAB_DICT)

print(f"Taille finale du vocabulaire (incluant <PAD>/<UNK>): {FT_VOCAB_SIZE}")

In [None]:
# Fixed length (in tokens) of every encoded sequence.
MAX_SEQ_LEN = 15

def text_to_sequence(text, vocab_dict, max_len=None):
	"""Encode a text as a fixed-length list of vocabulary ids.

	The text is tokenised on runs of whitespace (same tokenisation as the
	corpus used to train the embedding models), so leading, trailing or
	repeated spaces never produce empty tokens.

	Args:
		text: the string to encode.
		vocab_dict: mapping token -> id; must contain '<UNK>' and '<PAD>'.
		max_len: output length; defaults to the module-level MAX_SEQ_LEN.

	Returns:
		A list of exactly `max_len` ids: unknown tokens map to the '<UNK>' id,
		short texts are right-padded with the '<PAD>' id, long texts are
		truncated.
	"""
	if max_len is None:
		max_len = MAX_SEQ_LEN

	unk_id = vocab_dict['<UNK>']
	pad_id = vocab_dict['<PAD>']

	# `split()` (no argument) drops empty strings, unlike `split(" ")`.
	tokens = text.split()[:max_len]

	sequence = [vocab_dict.get(token, unk_id) for token in tokens]
	sequence += [pad_id] * (max_len - len(sequence))
	return sequence

In [None]:
import numpy as np

# Encode both splits with the Word2Vec vocabulary: one fixed-length row of
# token ids per tweet.
X_train_w2v, X_test_w2v = (
	np.array([text_to_sequence(tweet, W2V_VOCAB_DICT) for tweet in split["clean_content"]])
	for split in (train_df, test_df)
)

# Targets as float32 arrays.
y_train_w2v, y_test_w2v = (
	np.array(split["target"], dtype=np.float32)
	for split in (train_df, test_df)
)

print(f"X_train_w2v shape: {X_train_w2v.shape}, y_train shape_w2v: {y_train_w2v.shape}")

In [None]:
# Encode both splits with the FastText vocabulary: one fixed-length row of
# token ids per tweet, plus the targets as float32 arrays.
X_train_ft = np.array([text_to_sequence(tweet, FT_VOCAB_DICT) for tweet in train_df["clean_content"]])
y_train_ft = np.array(train_df["target"], dtype=np.float32)
X_test_ft = np.array([text_to_sequence(tweet, FT_VOCAB_DICT) for tweet in test_df["clean_content"]])
y_test_ft = np.array(test_df["target"], dtype=np.float32)

# Label fixed: this reports the FastText arrays, not the Word2Vec ones.
print(f"X_train_ft shape: {X_train_ft.shape}, y_train_ft shape: {y_train_ft.shape}")

In [None]:
# Embedding matrix for the Word2Vec vocabulary: row `i` holds the vector of the
# token whose id is `i`. Rows start at zero.
w2v_embedding_matrix = np.zeros((W2V_VOCAB_SIZE, EMBEDDING_DIM))

for word, i in W2V_VOCAB_DICT.items():
	if word in w2v_model.wv:
		# Token known to the model: copy its vector.
		w2v_embedding_matrix[i] = w2v_model.wv[word]
		continue

	if word in ['<UNK>', '<PAD>']:
		# Special tokens keep their all-zero row.
		continue

	# Any other token missing from the model gets a random uniform vector.
	w2v_embedding_matrix[i] = np.random.uniform(low=-0.6, high=0.6, size=(EMBEDDING_DIM,))

In [None]:
# Embedding matrix for the FastText vocabulary: row `i` holds the vector of the
# token whose id is `i`. Rows start at zero.
ft_embedding_matrix = np.zeros((FT_VOCAB_SIZE, EMBEDDING_DIM))

for word, i in FT_VOCAB_DICT.items():
	# Test membership against the real vocabulary. With character n-grams
	# enabled, `word in ft_model.wv` is True for any string, which would give
	# '<PAD>' and '<UNK>' a vector synthesised from the n-grams of their
	# literal names instead of leaving their rows at zero (as in the Word2Vec
	# matrix).
	if word in ft_model.wv.key_to_index:
		ft_embedding_matrix[i] = ft_model.wv[word]

In [None]:
# Select the PyTorch backend for Keras. The environment variable is set
# before `import keras` on purpose: keep this statement order.
import os
os.environ["KERAS_BACKEND"] = "torch"
import keras
import torch

# Torch device: CUDA when available, CPU otherwise.
# NOTE(review): DEVICE is not referenced by any other visible cell.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def create_lstm_model(hidden_dim, n_layers, dropout, learning_rate, vocab_size, embedding_matrix, trainable):
	"""Build and compile a stacked bidirectional-LSTM binary classifier.

	Architecture: Embedding (initialised from `embedding_matrix`, id 0 masked)
	-> Dropout -> `n_layers` Bidirectional LSTM layers -> Dropout ->
	Dense(1, sigmoid).

	Args:
		hidden_dim: number of units of each LSTM (per direction).
		n_layers: number of stacked bidirectional LSTM layers; at least one
			layer is always created.
		dropout: rate used by the Dropout layers and by each LSTM.
		learning_rate: learning rate of the Adam optimizer.
		vocab_size: number of rows of the embedding table.
		embedding_matrix: initial embedding weights, passed to the layer as
			`weights=[embedding_matrix]`.
		trainable: whether the embedding weights are updated during training.

	Returns:
		A `keras.Model` compiled with binary cross-entropy loss and the
		accuracy metric, taking int32 inputs of shape (MAX_SEQ_LEN,).
	"""
	token_ids = keras.Input(shape=(MAX_SEQ_LEN,), dtype="int32")

	features = keras.layers.Embedding(
		input_dim=vocab_size,
		output_dim=EMBEDDING_DIM,
		weights=[embedding_matrix],
		trainable=trainable,
		mask_zero=True
	)(token_ids)
	features = keras.layers.Dropout(dropout)(features)

	# Every recurrent layer but the last one returns the full sequence so the
	# next layer can consume it; the last one returns only its final output.
	sequence_flags = [True] * (n_layers - 1) + [False]
	for keep_sequence in sequence_flags:
		recurrent = keras.layers.LSTM(hidden_dim, return_sequences=keep_sequence, dropout=dropout)
		features = keras.layers.Bidirectional(recurrent)(features)

	features = keras.layers.Dropout(dropout)(features)
	probability = keras.layers.Dense(1, activation="sigmoid")(features)

	classifier = keras.Model(inputs=token_ids, outputs=probability)
	classifier.compile(
		optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
		loss="binary_crossentropy",
		metrics=["accuracy"]
	)
	return classifier

In [None]:
import mlflow
# Metrics, parameters and models are logged by hand: switch autologging off.
mlflow.autolog(disable=True)

# Every run of this notebook is recorded under this experiment.
experiment_name = "Réalisez une analyse de sentiments grâce au Deep Learning"
mlflow.set_experiment(experiment_name=experiment_name)

# Hyper-parameters of a single LSTM training run.
HIDDEN_DIM, N_LAYERS = 256, 2
DROPOUT, LEARNING_RATE = 0.2, 1e-3
N_EPOCHS, BATCH_SIZE = 3, 4096


# w2v_lstm_model = create_lstm_model(HIDDEN_DIM, N_LAYERS, DROPOUT, LEARNING_RATE, W2V_VOCAB_SIZE, w2v_embedding_matrix)

# with mlflow.start_run(run_name="Word2Vec Test run") as run:

# 	history = w2v_lstm_model.fit(
# 		X_train_w2v,
# 		y_train_w2v,
# 		epochs=N_EPOCHS,
# 		batch_size=BATCH_SIZE,
# 		validation_data=(X_test_w2v, y_test_w2v),
# 		verbose=1
# 	)

# 	history_dict = history.history

# 	for epoch in range(N_EPOCHS):

# 		if 'loss' in history_dict:
# 			mlflow.log_metric("train_loss", history_dict['loss'][epoch], step=epoch + 1)
# 		if 'accuracy' in history_dict:
# 			mlflow.log_metric("train_acc", history_dict['accuracy'][epoch], step=epoch + 1)
			
# 		if 'val_loss' in history_dict:
# 			mlflow.log_metric("valid_loss", history_dict['val_loss'][epoch], step=epoch + 1)
# 		if 'val_accuracy' in history_dict:
# 			mlflow.log_metric("valid_acc", history_dict['val_accuracy'][epoch], step=epoch + 1)
	
# 	loss, acc = w2v_lstm_model.evaluate(X_test_w2v, y_test_w2v, batch_size=BATCH_SIZE, verbose=0)

# 	mlflow.log_metric("final_test_loss", loss)
# 	mlflow.log_metric("final_test_acc", acc)

# 	input_example = X_test_w2v[:1]
# 	output_example = keras.ops.convert_to_numpy(w2v_lstm_model.predict(input_example, verbose=0))

# 	mlflow.keras.log_model(
# 		w2v_lstm_model,
# 		name="lstm",
# 		signature=mlflow.models.signature.infer_signature(input_example, output_example),
# 	)

# 	params = {
# 		"model_type": "Keras_LSTM",
# 		"keras_backend": keras.backend.backend(),
# 		"hidden_dim": HIDDEN_DIM,
# 		"n_layers": N_LAYERS,
# 		"dropout": DROPOUT,
# 		"learning_rate": LEARNING_RATE,
# 		"n_epochs": N_EPOCHS,
# 		"batch_size": BATCH_SIZE,
# 		"embedding_dim": EMBEDDING_DIM,
# 		"vocab_size": W2V_VOCAB_SIZE,
# 		"trainable_embeddings": w2v_lstm_model.layers[1].trainable,
# 	}
# 	mlflow.log_params(params)

# 	print(f'Test Loss: {loss:.3f} | Test Acc: {acc*100:.2f}%')
# 	training = 0

In [None]:
# ft_lstm_model = create_lstm_model(HIDDEN_DIM, N_LAYERS, DROPOUT, LEARNING_RATE, FT_VOCAB_SIZE, ft_embedding_matrix)

# with mlflow.start_run(run_name="FastText Test run") as run:

# 	history = ft_lstm_model.fit(
# 		X_train_ft,
# 		y_train_ft,
# 		epochs=N_EPOCHS,
# 		batch_size=BATCH_SIZE,
# 		validation_data=(X_test_ft, y_test_ft),
# 		verbose=1
# 	)

# 	history_dict = history.history

# 	for epoch in range(N_EPOCHS):

# 		if 'loss' in history_dict:
# 			mlflow.log_metric("train_loss", history_dict['loss'][epoch], step=epoch + 1)
# 		if 'accuracy' in history_dict:
# 			mlflow.log_metric("train_acc", history_dict['accuracy'][epoch], step=epoch + 1)
			
# 		if 'val_loss' in history_dict:
# 			mlflow.log_metric("valid_loss", history_dict['val_loss'][epoch], step=epoch + 1)
# 		if 'val_accuracy' in history_dict:
# 			mlflow.log_metric("valid_acc", history_dict['val_accuracy'][epoch], step=epoch + 1)
	
# 	loss, acc = ft_lstm_model.evaluate(X_test_ft, y_test_ft, batch_size=BATCH_SIZE, verbose=0)

# 	mlflow.log_metric("final_test_loss", loss)
# 	mlflow.log_metric("final_test_acc", acc)

# 	input_example = X_test_ft[:1]
# 	output_example = keras.ops.convert_to_numpy(ft_lstm_model.predict(input_example, verbose=0))

# 	mlflow.keras.log_model(
# 		ft_lstm_model,
# 		name="lstm",
# 		signature=mlflow.models.signature.infer_signature(input_example, output_example),
# 	)

# 	params = {
# 		"model_type": "Keras_LSTM",
# 		"keras_backend": keras.backend.backend(),
# 		"hidden_dim": HIDDEN_DIM,
# 		"n_layers": N_LAYERS,
# 		"dropout": DROPOUT,
# 		"learning_rate": LEARNING_RATE,
# 		"n_epochs": N_EPOCHS,
# 		"batch_size": BATCH_SIZE,
# 		"embedding_dim": EMBEDDING_DIM,
# 		"vocab_size": FT_VOCAB_SIZE,
# 		"trainable_embeddings": ft_lstm_model.layers[1].trainable,
# 	}
# 	mlflow.log_params(params)

# 	print(f'Test Loss: {loss:.3f} | Test Acc: {acc*100:.2f}%')
# 	training = 0

In [None]:
import gc
from itertools import product
import tensorflow as tf

# --- Word2Vec grid search ---------------------------------------------------
# One nested MLflow run per hyper-parameter combination. If a RuntimeError
# (typically a CUDA out-of-memory error) is raised, the failed run is deleted
# and the combination is retried with half the batch size, down to
# MIN_BATCH_SIZE.
#
# NOTE: the names below reuse the names of the single-run hyper-parameters;
# here each one holds the list of values to explore.
N_EPOCHS = 5
MIN_BATCH_SIZE = 64

HIDDEN_DIM = [1024]
N_LAYERS = [5]
DROPOUT = [0.2]
LEARNING_RATE = [1e-3, 1e-2]
# Real booleans: the strings "False" and "True" are both truthy, so the
# embedding layer would be trainable in every run.
TRAINABLE = [False, True]

all_params = list(product(HIDDEN_DIM, N_LAYERS, DROPOUT, LEARNING_RATE, TRAINABLE))
max_runs = len(all_params)
counter = 0

# history key -> name of the metric logged to MLflow
HISTORY_METRIC_NAMES = {
	"loss": "train_loss",
	"accuracy": "train_acc",
	"val_loss": "valid_loss",
	"val_accuracy": "valid_acc",
}

model = None
history = None

# NOTE(review): `run_id` resumes an existing parent run; this id only exists in
# the tracking store it was created in. Drop `run_id` to start a new parent run.
with mlflow.start_run(run_name="Word2Vec", run_id="140227e789ff49378183cabff82bac58") as parent_run:

	for hd, nl, dp, lr, tr in all_params:

		counter += 1
		print(f"Run numéro {counter}/{max_runs} -> HD:{hd}-NL:{nl}-DP:{dp}-LR:{lr}-TRAINABLE:{tr}")

		BATCH_SIZE = 128

		training = 1
		while training:
			# Reset for each attempt so the error handler can tell whether a
			# nested run was actually started by THIS attempt.
			run = None
			try:
				# Release the previous model before building a new one (the
				# History object returned by fit() references it as well).
				model = None
				history = None
				keras.backend.clear_session()
				gc.collect()
				torch.cuda.empty_cache()

				model = create_lstm_model(hd, nl, dp, lr, W2V_VOCAB_SIZE, w2v_embedding_matrix, tr)
				run_name = f"HD:{hd}-NL:{nl}-DP:{dp}-LR:{lr}-TRAINABLE:{tr}"

				with mlflow.start_run(nested=True, run_name=run_name) as run:

					train_dataset = tf.data.Dataset.from_tensor_slices((X_train_w2v, y_train_w2v))
					train_dataset = train_dataset.shuffle(buffer_size=1024).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

					test_dataset = tf.data.Dataset.from_tensor_slices((X_test_w2v, y_test_w2v))
					test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

					# The datasets are already batched: `batch_size` must not
					# be passed to fit() as well.
					history = model.fit(
						train_dataset,
						epochs=N_EPOCHS,
						validation_data=test_dataset,
						verbose=1
					)

					# Log the per-epoch curves (steps are 1-based).
					history_dict = history.history
					for history_key, metric_name in HISTORY_METRIC_NAMES.items():
						for step, value in enumerate(history_dict.get(history_key, []), start=1):
							mlflow.log_metric(metric_name, value, step=step)

					loss, acc = model.evaluate(X_test_w2v, y_test_w2v, batch_size=BATCH_SIZE, verbose=0)

					mlflow.log_metric("final_test_loss", loss)
					mlflow.log_metric("final_test_acc", acc)

					# One-row example used to infer the model signature.
					input_example = X_test_w2v[:1]
					output_example = keras.ops.convert_to_numpy(model.predict(input_example, verbose=0))

					mlflow.keras.log_model(
						model,
						name="word2vec",
						signature=mlflow.models.signature.infer_signature(input_example, output_example),
					)

					params = {
						"model_type": "Keras_LSTM_Word2Vec",
						"keras_backend": keras.backend.backend(),
						"hidden_dim": hd,
						"n_layers": nl,
						"dropout": dp,
						"learning_rate": lr,
						"n_epochs": N_EPOCHS,
						"batch_size": BATCH_SIZE,
						"embedding_dim": EMBEDDING_DIM,
						"vocab_size": W2V_VOCAB_SIZE,
						# layers[0] is the input layer, layers[1] the embedding
						"trainable_embeddings": model.layers[1].trainable,
					}
					mlflow.log_params(params)

					print(f'Test Loss: {loss:.3f} | Test Acc: {acc*100:.2f}%')
					training = 0

			except RuntimeError as err:

				# Show the real error: not every RuntimeError is an OOM.
				print(f"\nOOM/MemoryError à BATCH_SIZE={BATCH_SIZE}: {err}")

				# Only delete the run opened by this attempt. If the failure
				# happened before the nested run started, `run` is None and
				# no previously completed run is touched.
				if run is not None:
					mlflow.delete_run(run.info.run_id)

				if BATCH_SIZE <= MIN_BATCH_SIZE:
					print(f"\nAbandon de la configuration : BATCH_SIZE minimum ({MIN_BATCH_SIZE}) atteint.")
					training = 0
				else:
					BATCH_SIZE //= 2