In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [None]:
import polars as pl

df = pl.read_parquet("dataframes/eda.parquet")
df

In [None]:
class TextClassificationDataset(Dataset):
	"""Map-style dataset that tokenizes one text per item for classification.

	Args:
		texts: Indexable collection of raw strings.
		labels: Indexable collection of class ids, aligned with ``texts``.
		tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt',
			max_length=..., padding='max_length', truncation=True)`` and returning
			a mapping with ``'input_ids'`` and ``'attention_mask'`` tensors
			(the notebook passes a Hugging Face ``BertTokenizer``).
		max_length: Length every sequence is padded / truncated to.

	Raises:
		ValueError: If ``texts`` and ``labels`` do not have the same length.
	"""

	def __init__(self, texts, labels, tokenizer, max_length):
		if len(texts) != len(labels):
			raise ValueError(
				f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
			)
		self.texts = texts
		self.labels = labels
		self.tokenizer = tokenizer
		self.max_length = max_length

	def __len__(self):
		"""Return the number of samples."""
		return len(self.texts)

	def __getitem__(self, idx):
		"""Tokenize sample ``idx`` and return its tensors.

		Returns:
			dict with ``'input_ids'`` and ``'attention_mask'`` (each flattened to
			1-D so the DataLoader can stack them into a batch) and ``'target'``,
			a scalar ``torch.long`` tensor holding the class id.
		"""
		text = self.texts[idx]
		label = self.labels[idx]
		encoding = self.tokenizer(
			text,
			return_tensors='pt',
			max_length=self.max_length,
			padding='max_length',
			truncation=True,
		)
		return {
			'input_ids': encoding['input_ids'].flatten(),
			'attention_mask': encoding['attention_mask'].flatten(),
			# Class-index targets must be int64 for nn.CrossEntropyLoss; letting
			# torch.tensor() infer the dtype would yield a float target whenever
			# the label column happens to hold floats (e.g. 0.0 / 1.0).
			'target': torch.tensor(int(label), dtype=torch.long),
		}

In [None]:
class BERTClassifier(nn.Module):
	"""Pretrained BERT encoder topped with dropout and a linear classification head."""

	def __init__(self, bert_model_name, num_classes):
		"""Load the pretrained encoder and build the head.

		Args:
			bert_model_name: Identifier forwarded to ``BertModel.from_pretrained``.
			num_classes: Number of output logits produced by the linear head.
		"""
		super().__init__()
		encoder = BertModel.from_pretrained(bert_model_name)
		# Attribute names double as state_dict key prefixes, so they are part of
		# the on-disk checkpoint format.
		self.bert = encoder
		self.dropout = nn.Dropout(p=0.1)
		self.fc = nn.Linear(in_features=encoder.config.hidden_size, out_features=num_classes)

	def forward(self, input_ids, attention_mask):
		"""Return the raw (un-normalized) class logits for a batch.

		The encoder's ``pooler_output`` is passed through dropout and then the
		linear head.
		"""
		encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
		return self.fc(self.dropout(encoded.pooler_output))

In [None]:
from tqdm import tqdm

def train(model, data_loader, optimizer, scheduler, device):
	"""Run one training epoch under autocast mixed precision.

	Args:
		model: Module called as ``model(input_ids=..., attention_mask=...)`` and
			returning class logits.
		data_loader: Iterable of batches with ``"input_ids"``,
			``"attention_mask"`` and ``"target"`` tensors.
		optimizer: Optimizer stepped once per batch.
		scheduler: LR scheduler stepped once per batch, after the optimizer.
		device: ``torch.device`` or device string the batch tensors are moved to.
	"""
	# autocast expects the bare backend name ("cuda", "cpu"). str(device) yields
	# e.g. "cuda:0" for an indexed device, which autocast rejects.
	device_type = torch.device(device).type
	# The loss module is stateless, so build it once instead of once per batch.
	criterion = nn.CrossEntropyLoss()

	model.train()

	for batch in tqdm(data_loader):
		optimizer.zero_grad()
		input_ids = batch["input_ids"].to(device)
		attention_mask = batch["attention_mask"].to(device)
		labels = batch["target"].to(device)

		# NOTE(review): no GradScaler is used here. On CUDA autocast defaults to
		# float16, where small gradients can underflow; consider adding
		# torch.amp.GradScaler if training turns out to be unstable.
		with torch.amp.autocast(device_type=device_type):
			outputs = model(input_ids=input_ids, attention_mask=attention_mask)
			loss = criterion(outputs, labels)

		loss.backward()
		optimizer.step()
		scheduler.step()

def evaluate(model, data_loader, device):
	"""Score ``model`` on every batch of ``data_loader`` without tracking gradients.

	Args:
		model: Module called as ``model(input_ids=..., attention_mask=...)`` and
			returning class logits.
		data_loader: Iterable of batches with ``"input_ids"``,
			``"attention_mask"`` and ``"target"`` tensors.
		device: Device the batch tensors are moved to.

	Returns:
		Tuple ``(accuracy, report, avg_loss)``: the ``accuracy_score`` result, the
		``classification_report`` result, and the per-sample mean cross-entropy.

	Raises:
		ValueError: If ``data_loader`` yields no samples.
	"""
	criterion = nn.CrossEntropyLoss()

	model.eval()
	predictions = []
	actual_labels = []
	total_loss = 0.0
	# Count the samples actually seen rather than trusting len(data_loader.dataset):
	# the two differ with drop_last=True or a subset sampler, and a plain
	# iterable of batches has no .dataset attribute at all.
	num_samples = 0

	with torch.no_grad():
		for batch in data_loader:
			input_ids = batch["input_ids"].to(device)
			attention_mask = batch["attention_mask"].to(device)
			labels = batch["target"].to(device)

			outputs = model(input_ids=input_ids, attention_mask=attention_mask)
			_, preds = torch.max(outputs, dim=1)

			predictions.extend(preds.cpu().tolist())
			actual_labels.extend(labels.cpu().tolist())

			# CrossEntropyLoss returns the batch mean; re-weight by the batch
			# size so a smaller final batch does not skew the overall mean.
			n_in_batch = labels.size(0)
			total_loss += criterion(outputs, labels).item() * n_in_batch
			num_samples += n_in_batch

	if num_samples == 0:
		raise ValueError("evaluate() received a data_loader that yielded no samples")

	avg_loss = total_loss / num_samples
	return accuracy_score(actual_labels, predictions), classification_report(actual_labels, predictions), avg_loss

In [None]:
def predict_sentiment(text, model, tokenizer, device, max_length=128):
	"""Classify a single text as ``"positive"`` or ``"negative"``.

	Args:
		text: Input handed to ``tokenizer``.
		model: Module called as ``model(input_ids=..., attention_mask=...)`` and
			returning logits; it is switched to eval mode.
		tokenizer: Callable returning ``"input_ids"`` and ``"attention_mask"``
			tensors when invoked with ``return_tensors="pt"``.
		device: Device the encoded tensors are moved to.
		max_length: Padding / truncation length passed to the tokenizer.

	Returns:
		``"positive"`` when the highest logit is at index 1, else ``"negative"``.
	"""
	model.eval()
	tokenized = tokenizer(
		text,
		return_tensors="pt",
		max_length=max_length,
		padding="max_length",
		truncation=True,
	)

	with torch.no_grad():
		logits = model(
			input_ids=tokenized["input_ids"].to(device),
			attention_mask=tokenized["attention_mask"].to(device),
		)

	# .item() requires a single prediction, i.e. a batch of exactly one text.
	predicted_class = logits.argmax(dim=1).item()
	if predicted_class == 1:
		return "positive"
	return "negative"

In [None]:
# --- Hyper-parameters -------------------------------------------------------
bert_model_name = 'bert-base-uncased'
num_classes = 2
max_length = 64
batch_size = 32
num_epochs = 1
learning_rate = 2e-5
SEED = 42

# Seed torch's global RNG so the classifier-head initialisation, dropout masks
# and DataLoader shuffling are repeatable from one run of the notebook to the next.
torch.manual_seed(SEED)

# --- Train / validation split ----------------------------------------------
texts = df["content"]
labels = df["target"]

train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=SEED)

# Hand plain Python lists to the Dataset: texts and labels are then the same
# kind of container, and per-item indexing no longer goes through the
# dataframe library's Series object.
train_texts = train_texts.to_list()
val_texts = val_texts.to_list()
train_labels = train_labels.to_list()
val_labels = val_labels.to_list()

# --- Datasets and loaders ---------------------------------------------------
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

# --- Model, optimizer, LR schedule ------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier(bert_model_name, num_classes).to(device)

optimizer = AdamW(model.parameters(), lr=learning_rate)
# The scheduler is stepped once per batch, so the horizon is counted in batches.
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [None]:
import mlflow

# Checkpoint location and early-stopping bookkeeping. Within this notebook these
# names are only referenced by the commented-out training loop below.
MODEL_PATH = "models/best_bert_model.pth"
PATIENCE = 1
patience_counter = 0
best_loss_accuracy = 0
best_loss = float("inf")

# Select the MLflow experiment that subsequent runs are recorded under.
experiment_name = "Réalisez une analyse de sentiments grâce au Deep Learning"
mlflow.set_experiment(experiment_name=experiment_name)

# with mlflow.start_run(run_name="BERT_base_uncased"):

# 	loss = 0
# 	accuracy = 0

# 	for epoch in range(num_epochs):

# 		print(f"Epoch {epoch + 1}/{num_epochs}")
# 		train(model, train_dataloader, optimizer, scheduler, device)

# 		accuracy, report, loss = evaluate(model, val_dataloader, device)
# 		print(f"Validation Accuracy: {accuracy:.4f}")
# 		print(f" Validation Loss: {loss:.4f}")
# 		print(report)

# 		mlflow.log_metric(f"test_loss", loss, step=epoch+1)
# 		mlflow.log_metric(f"test_acc", float(accuracy), step=epoch+1)

# 		if loss < best_loss:
# 			best_loss = loss
# 			best_loss_accuracy = accuracy
# 			patience_counter = 0

# 			torch.save(model.state_dict(), MODEL_PATH)
		
# 		else:
# 			patience_counter += 1
			
# 			if patience_counter >= PATIENCE:
# 				print("Early Stopping Trigered")
# 				break

# 	model.load_state_dict(torch.load(MODEL_PATH))

# 	mlflow.log_metric("final_test_loss", best_loss)
# 	mlflow.log_metric("final_test_acc", float(best_loss_accuracy))

# 	input_example = train_texts[1]
# 	output_example = predict_sentiment(input_example, model, tokenizer, device, max_length)

# 	mlflow.pytorch.log_model(
# 		model,
# 		name="BERT",
# 		signature=mlflow.models.signature.infer_signature(input_example, output_example),
# 	)

# 	params = {
# 		"model_type": "Torch_BERT",
# 		"bert_model_name" : bert_model_name,
# 		"num_classes" : num_classes,
# 		"max_length" : max_length,
# 		"batch_size" : batch_size,
# 		"num_epochs" : num_epochs,
# 		"learning_rate" : learning_rate
# 	}

# 	mlflow.log_params(params)

In [None]:
# from torch.onnx import export

# model.load_state_dict(torch.load(MODEL_PATH))
# model.to('cpu')
# model.eval()

# DTYPE = torch.long

# dummy_input_ids = torch.randint(0, 10000, (1, max_length), dtype=DTYPE, device='cpu') 

# dummy_attention_mask = torch.ones((1, max_length), dtype=DTYPE, device='cpu') 
# dummy_args = (dummy_input_ids,)

# dummy_kwargs = {"attention_mask": dummy_attention_mask}

# dynamic_shapes = {
#     "input_ids": {0: "batch_size"}, 
#     "attention_mask": {0: "batch_size"}
# }

# input_names = ["input_ids", "attention_mask"]

# export(
#     model,
#     dummy_args,
#     "models/bert_reduced_model.onnx",
#     kwargs=dummy_kwargs,
#     input_names=input_names,
#     output_names=["output"],
#     dynamic_shapes=dynamic_shapes
# )

In [None]:
import onnxruntime as rt
import numpy as np


# model.load_state_dict(torch.load(MODEL_PATH))
# model.eval()


# model_res = []

# for line in tqdm(val_dataloader.dataset):

# 	pred = model(
# 		line["input_ids"].unsqueeze(0).to(device),
# 		line["attention_mask"].unsqueeze(0).to(device)
# 	)

# 	model_pred = pred.cpu().detach().numpy()[0]
# 	model_res.append(0 if model_pred[0] > model_pred[1] else 1)


# onnx_model = rt.InferenceSession("models/bert_reduced_model.onnx", providers=["CUDAExecutionProvider"])
# onnx_res = []

# for line in tqdm(val_dataloader.dataset):

# 	onnx_input = {
# 		"input_ids":np.array([line["input_ids"]]),
# 		"attention_mask":np.array([line["attention_mask"]])
# 	}

# 	onnx_pred = onnx_model.run(["output"], onnx_input)[0][0]

# 	onnx_res.append(0 if onnx_pred[0] > onnx_pred[1] else 1)


# onnx_cpu_model = rt.InferenceSession("models/bert_reduced_model.onnx", providers=["CPUExecutionProvider"])
# onnx_cpu_res = []

# for line in tqdm(val_dataloader.dataset):

# 	onnx_cpu_input = {
# 		"input_ids":np.array([line["input_ids"]]),
# 		"attention_mask":np.array([line["attention_mask"]])
# 	}

# 	onnx_cpu_pred = onnx_cpu_model.run(["output"], onnx_cpu_input)[0][0]

# 	onnx_cpu_res.append(0 if onnx_cpu_pred[0] > onnx_cpu_pred[1] else 1)

In [None]:
# print("Accuracy score du modèle BERT : ", accuracy_score(val_labels, model_res))
# print("Accuracy score du modèle ONNX : ", accuracy_score(val_labels, onnx_res))
# print("Accuracy score du modèle ONNX sur CPU : ", accuracy_score(val_labels, onnx_cpu_res))

In [None]:
import onnx

# The ONNX file is produced by the (currently commented-out) export cell above;
# it must already exist on disk for this cell to run.
ONNX_MODEL_PATH = "models/bert_reduced_model.onnx"

# Prefer the GPU but list the CPU provider as an explicit fallback so the cell
# also runs on machines without a CUDA-enabled onnxruntime build.
onnx_model = rt.InferenceSession(ONNX_MODEL_PATH, providers=["CUDAExecutionProvider", "CPUExecutionProvider"])
onnx_res = []

# First (inputs, outputs) pair that goes through the session. It is reused as
# the example from which the MLflow signature is inferred.
input_example = None
output_example = None

with mlflow.start_run(run_name="ONNX_model"):
	for line in tqdm(val_dataloader.dataset):

		# Add the batch dimension expected by the exported graph and hand
		# onnxruntime real numpy arrays instead of a list wrapping a tensor.
		onnx_input = {
			"input_ids": line["input_ids"].unsqueeze(0).numpy(),
			"attention_mask": line["attention_mask"].unsqueeze(0).numpy(),
		}

		onnx_output = onnx_model.run(["output"], onnx_input)[0]
		onnx_pred = onnx_output[0]

		onnx_res.append(0 if onnx_pred[0] > onnx_pred[1] else 1)

		if input_example is None:
			input_example = onnx_input
			output_example = {"output": onnx_output}

	mlflow.log_metric("final_test_acc", float(accuracy_score(val_labels, onnx_res)))

	# The logged ONNX graph consumes token tensors, not raw text (the tokenizer is
	# stored separately below), so the signature must describe
	# input_ids / attention_mask and the batched logits - not a string example.
	signature = None
	if input_example is not None:
		signature = mlflow.models.signature.infer_signature(input_example, output_example)

	onnx_model_proto = onnx.load(ONNX_MODEL_PATH)
	mlflow.onnx.log_model(
		onnx_model_proto,
		name="BERT_ONNX",
		signature=signature,
	)

	# Ship the tokenizer with the run so consumers can reproduce the preprocessing.
	tokenizer.save_pretrained("tokenizer_artifacts")
	mlflow.log_artifacts("tokenizer_artifacts", artifact_path="tokenizer")

	params = {
		"model_type": "Torch_BERT_ONNX",
		"bert_model_name" : bert_model_name,
		"num_classes" : num_classes,
		"max_length" : max_length,
		"batch_size" : batch_size,
		"num_epochs" : num_epochs,
		"learning_rate" : learning_rate
	}

	mlflow.log_params(params)