In [1]:
# Importation des librairies nécessaires
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import time
import pickle
import tensorflow as tf
import gensim
import nltk

# Importation de Scikit-learn pour les modèles et métriques
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import (accuracy_score, recall_score, f1_score, roc_auc_score, 
                             confusion_matrix, roc_curve)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Importation des modules de traitement de texte NLTK
from nltk.corpus import words
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Importation de TensorFlow et Keras pour le Deep Learning
from tensorflow.keras import backend as K
from tensorflow.keras import utils, layers, metrics as kmetrics
from tensorflow.keras.utils import to_categorical

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (Dense, Dropout, Embedding, LSTM, Bidirectional, 
                                     TimeDistributed, Flatten, GlobalAveragePooling1D)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

# Importation de Gensim pour le traitement des modèles Word2Vec
from gensim.models import Word2Vec

# Importation de XGBoost
from xgboost import XGBClassifier

# Définition du chemin des données
path_data = '/Users/chretien/OpenClassroom/Openclassroom7/'


In [2]:

df = pd.read_csv("training.1600000.processed.noemoticon.csv", sep=',', encoding='ISO-8859-1', header=None,names=['target', 'id', 'date', 'flag', 'user', 'text'])
df.head(5)

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [3]:
# Garder uniquement colonnes target et text 
df = df[['target', 'text']]

# Remplacer target 4 par 1
df["target"] = df["target"].replace(4, 1)

# Sample
df_sample = df.groupby('target', as_index=False).apply(lambda x : x.sample(frac=0.001))
df_sample

Unnamed: 0,Unnamed: 1,target,text
0,719538,0,Vlad &amp; kevin are gone now i miss them a b...
0,385516,0,Kawasaki asplode. New motor time.
0,116912,0,I really don't want to log onto work tonight b...
0,8582,0,my boyfriend is going out of town for 2 days a...
0,591471,0,@markusbriggz call me ASAP... please
...,...,...,...
1,1137915,1,@UncleRUSH I do--and it surely does put you in...
1,1330331,1,@LinziMG Hope it goes/is going/has gone well.
1,985869,1,is quite confident and inspired at the mo
1,1072160,1,@bcuban OMG ... the Hitler Techno Bar is right...


In [4]:
# Tokenizer

def tokenizer_fct(sentence) :
    word_tokens = word_tokenize(sentence)
    # print(word_tokens)
    return word_tokens


# Tokenizer split

def tokenizer_split_fct(sentence) :
    word_tokens = sentence.split(' ')
    # print(word_tokens)
    return word_tokens

# Stop words
from nltk.corpus import stopwords
stop_w = list(set(stopwords.words('english')))

def stop_word_filter_fct(list_words) :
    filtered_w = [w for w in list_words if not w in stop_w]
    # print(filtered_w)    
    return filtered_w

# lower case et alpha (not "@")
def lower_alpha_fct(list_words) :
    fw = [w.lower() for w in list_words if w.isalpha()]
    # print(fw)
    return fw

# lower case et alpha (not "@")
def lower_not_user_fct(list_words) :
    fw = [w.lower() for w in list_words if not w.startswith("@")]
    # print(fw
    return fw




#------------------------------Lemmatizer-----------------------------------


def lemma_fct(list_words) :
    lemmatizer = WordNetLemmatizer()
    lem_w = [lemmatizer.lemmatize(w) for w in list_words]
    return lem_w
    
#------------------------------Stemming-----------------------------------


def stemma_fct(list_words) :
    stemming = PorterStemmer()
    stemma_w = [stemming.stem(w) for w in list_words]
    return stemma_w


#-------------------# Fonction de préparation des tweets----------------------------


# Fonction de préparation des questions
def transform_text(text) :
    word_tokens = tokenizer_split_fct(text)
    f_w = stop_word_filter_fct(word_tokens)
    lw = lower_not_user_fct(f_w)
    #lem_w = lemma_fct(lw)
    filtered_w = stop_word_filter_fct(lw)
    # print(filtered_w)
    trans_sentence = ' '.join(filtered_w)
    
    return trans_sentence


# Fonction de préparation des questions
def transform_text_lem(text) :
    word_tokens = tokenizer_split_fct(text)
    f_w = stop_word_filter_fct(word_tokens)
    lw = lower_not_user_fct(f_w)
    lem_w = lemma_fct(lw)
    filtered_w = stop_word_filter_fct(lem_w)
    # print(filtered_w)
    trans_sentence = ' '.join(filtered_w)
    
    return trans_sentence


# Fonction de préparation des questions
def transform_text_stemma(text) :
    word_tokens = tokenizer_split_fct(text)
    f_w = stop_word_filter_fct(word_tokens)
    lw = lower_not_user_fct(f_w)
    stemma_w = stemma_fct(lw)
    filtered_w = stop_word_filter_fct(stemma_w)
    # print(filtered_w)
    trans_sentence = ' '.join(filtered_w)
    
    return trans_sentence



In [5]:
# afficher DataFrame clean 

tweets = pd.DataFrame()
tweets['target'] = df_sample['target']
tweets['text_base'] = df_sample['text'].apply(lambda x : transform_text(x))
tweets['text_lemma'] = df_sample['text'].apply(lambda x : transform_text_lem(x))
tweets['text_stem'] = df_sample['text'].apply(lambda x : transform_text_stemma(x))
tweets

Unnamed: 0,Unnamed: 1,target,text_base,text_lemma,text_stem
0,719538,0,vlad &amp; kevin gone miss bunch,vlad &amp; kevin gone miss bunch,vlad &amp; kevin gone miss bunch
0,385516,0,kawasaki asplode. new motor time.,kawasaki asplode. new motor time.,kawasaki asplode. new motor time.
0,116912,0,really want log onto work tonight promised thi...,really want log onto work tonight promised thi...,realli want log onto work tonight promis thing...
0,8582,0,"boyfriend going town 2 days believe not, i'm f...","boyfriend going town 2 day believe not, i'm fe...","boyfriend go town 2 day believ not, i'm feel l..."
0,591471,0,call asap... please,call asap... please,call asap... pleas
...,...,...,...,...,...
1,1137915,1,do--and surely put whole different stratospher...,do--and surely put whole different stratospher...,do--and sure put whole differ stratosphere. o...
1,1330331,1,hope goes/is going/has gone well.,hope goes/is going/has gone well.,hope goes/i going/ha gone well.
1,985869,1,quite confident inspired mo,quite confident inspired mo,quit confid inspir mo
1,1072160,1,omg ... hitler techno bar right top 7-11!,omg ... hitler techno bar right top 7-11!,omg ... hitler techno bar right top 7-11!


# Bert

In [6]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split

In [7]:
import mlflow
import mlflow.pytorch
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import train_test_split

# 1. Chargement des données et division en train/validation
train_texts, val_texts, train_labels, val_labels = train_test_split(tweets['text_base'], tweets['target'], test_size=0.2)

# 2. Chargement du tokenizer BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# 3. Tokenization des données
def tokenize_texts(texts):
    return tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=512)

train_encodings = tokenize_texts(train_texts.tolist())
val_encodings = tokenize_texts(val_texts.tolist())

# 4. Création d'une classe Dataset pour PyTorch
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# 5. Création des DataLoaders
train_dataset = TextDataset(train_encodings, train_labels.tolist())
val_dataset = TextDataset(val_encodings, val_labels.tolist())

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# 6. Chargement du modèle BERT pour la classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # 2 pour classification binaire

# 7. Configuration de l'optimiseur
optimizer = AdamW(model.parameters(), lr=5e-5)

# Configuration de MLflow
mlflow.set_tracking_uri("http://localhost:5003")
mlflow.set_experiment("BERT_Classification_Experiment")

# 8. Fonction d'entraînement
def train(model, dataloader):
    model.train()
    total_loss = 0

    for batch in dataloader:
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    return total_loss / len(dataloader)

# 9. Fonction de validation
def validate(model, dataloader):
    model.eval()
    correct_preds = 0
    total_preds = 0
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            outputs = model(**batch)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            correct_preds += (preds == batch['labels']).sum().item()
            total_preds += len(preds)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch['labels'].cpu().numpy())
    
    accuracy = correct_preds / total_preds
    auc = roc_auc_score(all_labels, all_preds)
    return accuracy, auc, all_preds, all_labels

# 10. Entraînement du modèle
epochs = 3
with mlflow.start_run(run_name="BERT_Training_Run"):
    for epoch in range(epochs):
        train_loss = train(model, train_loader)
        val_acc, val_auc, val_preds, val_labels = validate(model, val_loader)
        
        print(f"Epoch {epoch + 1}/{epochs}")
        print(f"Train Loss: {train_loss}")
        print(f"Validation Accuracy: {val_acc}, Validation AUC: {val_auc}")
        print(classification_report(val_labels, val_preds))

        # Loguer uniquement les métriques de validation dans MLflow
        mlflow.log_metric("Validation Accuracy", val_acc)
        mlflow.log_metric("Validation AUC", val_auc)

    # Enregistrer le modèle avec MLflow
    mlflow.pytorch.log_model(model, "bert_model")

print("Training complete and model logged to MLflow.")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2024/10/29 18:06:25 INFO mlflow.tracking.fluent: Experiment with name 'BERT_Classification_Experiment' does not exist. Creating a new experiment.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1/3
Train Loss: 0.6107588492333889
Validation Accuracy: 0.753125, Validation AUC: 0.7500489831106234
              precision    recall  f1-score   support

           0       0.76      0.70      0.73       151
           1       0.75      0.80      0.77       169

    accuracy                           0.75       320
   macro avg       0.75      0.75      0.75       320
weighted avg       0.75      0.75      0.75       320



  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 2/3
Train Loss: 0.36632076408714054
Validation Accuracy: 0.76875, Validation AUC: 0.7676633096908188
              precision    recall  f1-score   support

           0       0.76      0.75      0.75       151
           1       0.78      0.79      0.78       169

    accuracy                           0.77       320
   macro avg       0.77      0.77      0.77       320
weighted avg       0.77      0.77      0.77       320



  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 3/3
Train Loss: 0.15257935313275084
Validation Accuracy: 0.68125, Validation AUC: 0.6696579019554059
              precision    recall  f1-score   support

           0       0.77      0.46      0.58       151
           1       0.65      0.88      0.74       169

    accuracy                           0.68       320
   macro avg       0.71      0.67      0.66       320
weighted avg       0.70      0.68      0.67       320





Training complete and model logged to MLflow.
