# Modèle sur mesure avancé

In [4]:
import re
import os
import string
import matplotlib.pyplot as plt
import seaborn as sns

# Maths modules
from scipy.stats import f_oneway
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
# precision, recall, f1-score,
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score

# Vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer

# Tokenizers, Stemmers and Lemmatizers
import nltk
from nltk.corpus import stopwords
import spacy
# Download resources
nltk.download("stopwords")
stopwords = set(stopwords.words("english"))
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

from gensim.models import Word2Vec
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Flatten, Embedding, Bidirectional
from tensorflow.keras.layers import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
from keras.metrics import AUC

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\doly9\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def analyse_performance_model(model,X,y,title_dataset):
    y_pred_proba = model.predict(X)
    y_pred = np.where(y_pred_proba> 0.5, 1, 0)
    cf = confusion_matrix(y, y_pred)
    plt.figure()
    make_confusion_matrix(cf, categories=['NEGATIVE', 'POSITIVE'], title="Performance du modèle sur le "+title_dataset)
    plt.figure()
    plot_roc_curve(y_pred_proba,y,title='Courbe ROC sur le ' + title_dataset)
    
def plot_roc_curve(y_pred_proba,y_true,title=None):
    #define metrics
    auc = roc_auc_score(y_true, y_pred_proba)
    fpr, tpr, _ = roc_curve(y_true,  y_pred_proba)
    #create ROC curve
    #create ROC curve
    plt.plot(fpr,tpr,label="AUC="+str(auc))
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.legend(loc=4)
    plt.title(title)

def make_confusion_matrix(cf,
                          group_names=None,
                          categories='auto',
                          count=True,
                          percent=True,
                          cbar=True,
                          xyticks=True,
                          xyplotlabels=True,
                          sum_stats=True,
                          figsize=None,
                          cmap='Blues',
                          title=None):
    '''
    This function will make a pretty plot of an sklearn Confusion Matrix cm using a Seaborn heatmap visualization.
    Arguments
    ---------
    cf:            confusion matrix to be passed in
    group_names:   List of strings that represent the labels row by row to be shown in each square.
    categories:    List of strings containing the categories to be displayed on the x,y axis. Default is 'auto'
    count:         If True, show the raw number in the confusion matrix. Default is True.
    normalize:     If True, show the proportions for each category. Default is True.
    cbar:          If True, show the color bar. The cbar values are based off the values in the confusion matrix.
                   Default is True.
    xyticks:       If True, show x and y ticks. Default is True.
    xyplotlabels:  If True, show 'True Label' and 'Predicted Label' on the figure. Default is True.
    sum_stats:     If True, display summary statistics below the figure. Default is True.
    figsize:       Tuple representing the figure size. Default will be the matplotlib rcParams value.
    cmap:          Colormap of the values displayed from matplotlib.pyplot.cm. Default is 'Blues'
                   See http://matplotlib.org/examples/color/colormaps_reference.html
                   
    title:         Title for the heatmap. Default is None.
    '''


    # CODE TO GENERATE TEXT INSIDE EACH SQUARE
    blanks = ['' for i in range(cf.size)]

    if group_names and len(group_names)==cf.size:
        group_labels = ["{}\n".format(value) for value in group_names]
    else:
        group_labels = blanks

    if count:
        group_counts = ["{0:0.0f}\n".format(value) for value in cf.flatten()]
    else:
        group_counts = blanks

    if percent:
        group_percentages = ["{0:.2%}".format(value) for value in cf.flatten()/np.sum(cf)]
    else:
        group_percentages = blanks

    box_labels = [f"{v1}{v2}{v3}".strip() for v1, v2, v3 in zip(group_labels,group_counts,group_percentages)]
    box_labels = np.asarray(box_labels).reshape(cf.shape[0],cf.shape[1])


    # CODE TO GENERATE SUMMARY STATISTICS & TEXT FOR SUMMARY STATS
    if sum_stats:
        #Accuracy is sum of diagonal divided by total observations
        accuracy  = np.trace(cf) / float(np.sum(cf))

        #if it is a binary confusion matrix, show some more stats
        if len(cf)==2:
            #Metrics for Binary Confusion Matrices
            precision = cf[1,1] / sum(cf[:,1])
            recall    = cf[1,1] / sum(cf[1,:])
            f1_score  = 2*precision*recall / (precision + recall)
            stats_text = "\n\nAccuracy={:0.3f}\nPrecision={:0.3f}\nRecall={:0.3f}\nF1 Score={:0.3f}".format(
                accuracy,precision,recall,f1_score)
        else:
            stats_text = "\n\nAccuracy={:0.3f}".format(accuracy)
    else:
        stats_text = ""


    # SET FIGURE PARAMETERS ACCORDING TO OTHER ARGUMENTS
    if figsize==None:
        #Get default figure size if not set
        figsize = plt.rcParams.get('figure.figsize')

    if xyticks==False:
        #Do not show categories if xyticks is False
        categories=False


    # MAKE THE HEATMAP VISUALIZATION
    plt.figure(figsize=figsize)
    sns.heatmap(cf,annot=box_labels,fmt="",cmap=cmap,cbar=cbar,xticklabels=categories,yticklabels=categories)

    if xyplotlabels:
        plt.ylabel('True label')
        plt.xlabel('Predicted label' + stats_text)
    else:
        plt.xlabel(stats_text)
    
    if title:
        plt.title(title)

In [6]:
df = pd.read_csv('D:/openclassroom/projet7/input/df_cleaned_docs.csv',encoding='ISO-8859-1')

In [7]:
# split text column into lists of words
df["tokenized_tweet"] = df['clean_text'].str.split()

In [8]:
df.head()

Unnamed: 0,target,text,clean_text,tokenized_tweet
0,NEGATIVE,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot httptwitpiccom awww s bummer should...,"[switchfoot, httptwitpiccom, awww, s, bummer, ..."
1,NEGATIVE,is upset that he can't update his Facebook by ...,upset not update facebook texte cry result sch...,"[upset, not, update, facebook, texte, cry, res..."
2,NEGATIVE,@Kenichan I dived many times for the ball. Man...,kenichan dive time ball manage save rest bound,"[kenichan, dive, time, ball, manage, save, res..."
3,NEGATIVE,my whole body feels itchy and like its on fire,body feel itchy like fire,"[body, feel, itchy, like, fire]"
4,NEGATIVE,"@nationwideclass no, it's not behaving at all....",nationwideclass behave m mad not,"[nationwideclass, behave, m, mad, not]"


In [9]:
sampled_data = df.sample(n=1000000, random_state=42)

**Encoding of the text data using BERT Tokenizer and obtaining the input_ids and attentions masks to feed into the model.**

In [10]:
from transformers import *
from transformers import BertTokenizer, TFBertModel, BertConfig
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=2)

  from .autonotebook import tqdm as notebook_tqdm
loading file vocab.txt from cache at C:\Users\doly9/.cache\huggingface\hub\models--bert-base-uncased\snapshots\0a6aa9128b6194f4f3c4db429b6cb4891cdb421b\vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at C:\Users\doly9/.cache\huggingface\hub\models--bert-base-uncased\snapshots\0a6aa9128b6194f4f3c4db429b6cb4891cdb421b\tokenizer_config.json
loading configuration file config.json from cache at C:\Users\doly9/.cache\huggingface\hub\models--bert-base-uncased\snapshots\0a6aa9128b6194f4f3c4db429b6cb4891cdb421b\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range":

In [8]:
#Load the sentences into the BERT Tokenizer.

In [11]:
input_ids=[]
attention_masks=[]
sentences = sampled_data['clean_text']
for sent in sentences:
    bert_inp=bert_tokenizer.encode_plus(sent,add_special_tokens = True,max_length =50,pad_to_max_length = True,return_attention_mask = True)
    input_ids.append(bert_inp['input_ids'])
    attention_masks.append(bert_inp['attention_mask'])

input_ids=np.asarray(input_ids)
attention_masks=np.array(attention_masks)
labels=sampled_data['target'].apply(lambda x: 1 if x == 'POSITIVE' else 0)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [29]:
#Saving and loading the data into the pickle files
import pickle
results_data_path = os.path.join("..", "outputs")
pickle_inp_path=os.path.join(results_data_path,'bert_inp.pkl')
pickle_mask_path=os.path.join(results_data_path,'bert_mask.pkl')
pickle_label_path=os.path.join(results_data_path,'bert_label.pkl')

In [30]:

print('Preparing the pickle file.....')
pickle.dump((input_ids),open(pickle_inp_path,'wb'))
pickle.dump((attention_masks),open(pickle_mask_path,'wb'))
pickle.dump((np.array(labels)),open(pickle_label_path,'wb'))

print('Pickle files saved as ',pickle_inp_path,pickle_mask_path,pickle_label_path)

Preparing the pickle file.....
Pickle files saved as  ..\outputs\bert_inp.pkl ..\outputs\bert_mask.pkl ..\outputs\bert_label.pkl


In [31]:
print('Loading the saved pickle files..')

input_ids=pickle.load(open(pickle_inp_path, 'rb'))
attention_masks=pickle.load(open(pickle_mask_path, 'rb'))
labels=pickle.load(open(pickle_label_path, 'rb'))

print('Input shape {} Attention mask shape {} Input label shape {}'.format(input_ids.shape,attention_masks.shape,labels.shape))

Loading the saved pickle files..
Input shape (1000000, 50) Attention mask shape (1000000, 50) Input label shape (1000000,)


In [53]:
#Spitting into train and validation set

In [12]:
train_inp,test_inp,train_label,test_label,train_mask,test_mask=train_test_split(input_ids,labels,attention_masks,test_size=0.2,random_state=42)
val_inp, test_inp, val_label, test_label, val_mask, test_mask = train_test_split(test_inp,test_label,test_mask, test_size=0.33, random_state=42)

# Vérifier la taille des jeux de données
print("Taille du jeu d'entraînement :", train_inp.shape)
print("Taille du jeu de validation :", val_inp.shape)
print("Taille du jeu de test :", test_inp.shape)

Taille du jeu d'entraînement : (800000, 50)
Taille du jeu de validation : (134000, 50)
Taille du jeu de test : (66000, 50)


In [None]:
# Update a BERT model with trained weights from a .h5 file

In [13]:
# Load the trained weights from the .h5 file
bert_model.load_weights(os.path.join("..", "saved_models",'bert_weights.h5'))

# Compile the model
bert_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', AUC(curve="ROC", name="ROC_AUC")])

In [26]:
y_pred_proba = bert_model.predict([test_inp,test_mask])




In [30]:
y_pred_proba[0]

array([[0.93472844, 0.86206394],
       [0.10232023, 0.09827419],
       [0.35728437, 0.33339134],
       ...,
       [0.6686935 , 0.63677424],
       [0.16889217, 0.15504488],
       [0.6481481 , 0.6150536 ]], dtype=float32)

In [33]:
y_pred = np.argmax(y_pred_proba[0], axis=1)  # Get the class with the highest probability
y_pred 

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [31]:

cf = confusion_matrix(y, y_pred)

ValueError: Found input variables with inconsistent numbers of samples: [3, 66000]

In [None]:
analyse_performance_model(bert_model,[test_inp,test_mask],test_label,"training")

In [33]:
import tensorflow as tf
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import BinaryAccuracy


In [34]:
# Define the log directory for TensorBoard
#logdir = os.path.join("..", "logs/",'tb_bert')
#model_save_path=os.path.join(results_data_path,'bert_model.h5')
#tensorboard_callback = TensorBoard(log_dir=logdir)
#callbacks = [tensorboard_callback]
metric = ['accuracy', AUC(curve="ROC", name="ROC_AUC")]
print('\nBert Model',bert_model.summary())

bert_model.compile(loss=BinaryCrossentropy(), optimizer=Adam(learning_rate=2e-5), metrics=BinaryAccuracy())

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________

Bert Model None


In [35]:
checkpoint_callback = ModelCheckpoint(filepath=os.path.join("..", "saved_models/bert")+'/'+'_epoch_{epoch:02d}.h5', 
                                      save_freq=1, save_weights_only=True)
tensorboard_callback = TensorBoard(log_dir=os.path.join("..", "logs")+'/bert')
callbacks=[checkpoint_callback,tensorboard_callback]


history=bert_model.fit([train_inp,train_mask],train_label,batch_size=32,epochs=5,validation_data=([val_inp,val_mask],val_label),callbacks=callbacks)


Epoch 1/5
  133/25000 [..............................] - ETA: 35:06:19 - loss: 0.7706 - binary_accuracy: 0.5740

KeyboardInterrupt: 

In [16]:
tf.config.experimental.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [15]:
import tensorflow as tf
print(tf.__version__)

2.10.0


In [22]:
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint


#checkpoint_callback = ModelCheckpoint(filepath=os.path.join("..", "saved_models/bert")+'/'+'_epoch_{epoch:02d}.h5', save_freq=1)
#tensorboard_callback = TensorBoard(log_dir=os.path.join("..", "logs")+'/bert'+model.name)
# callbacks=[EarlyStopping(monitor="val_loss", patience=3),],
history=bert_model.fit([train_inp,train_mask],train_label,batch_size=64,epochs=10,
                       validation_data=([val_inp,val_mask],val_label),
                        workers=4,
                        use_multiprocessing=True,)


Epoch 1/10
  270/19969 [..............................] - ETA: 1353:25:10 - loss: 0.6683 - binary_accuracy: 0.6398

KeyboardInterrupt: 

In [None]:
# fit NN model
print("Fitting model...")
model.fit(
    [input_ids_train, attention_mask_train, token_type_ids_train],
    labels_train,
    epochs=10,
    batch_size=8,
    validation_split=0.2,
    callbacks=[
        EarlyStopping(monitor="val_loss", patience=3),
    ],
    workers=4,
    use_multiprocessing=True,
)

print(model.summary())

In [None]:
# you can save the fine-tuning model by this line.
model3.save_weights('bert_weights.h5')

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
import transformers

# Set the GPU device to use
tf.config.experimental.set_visible_devices(tf.config.list_physical_devices('GPU')[0], 'GPU')

# Load the BERT tokenizer and model
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
model = transformers.TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

# Load the training data
train_data, info = tfds.load('imdb_reviews', split='train', with_info=True, as_supervised=True)

# Preprocess the data and convert to a TensorFlow Dataset
train_data = train_data.map(lambda x, y: (tokenizer(x['text'], padding=True, truncation=True, max_length=512)['input_ids'], y))
train_data = train_data.batch(32)

# Define the optimizer and loss function
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Define the metrics to track during training
metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]

# Compile the model
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# Train the model
history = model.fit(train_data, epochs=3)


In [None]:
#Evaluating the performance of the model

In [None]:
model_save_path='./bert_model.h5'

trained_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=2)
trained_model.compile(loss=loss,optimizer=optimizer, metrics=metric)
trained_model.load_weights(model_save_path)

In [None]:
analyse_performance_model(trained_model,[train_inp,train_mask],train_label,"training")