# Modèle sur mesure avancé

### Sommaire
1. [Encodage des données textuelles à l'aide du tokenizer BERT](#paragraph1)</p> 
2. [Entrainement du modèle Bert](#paragraph2)</p>
3. [Mettre à jour un modèle BERT avec les poids entraînés](#paragraph3)</p>

In [1]:
import os
# Maths modules
from scipy.stats import f_oneway
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# precision, recall, f1-score,
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score

# Vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer

# Tokenizers, Stemmers and Lemmatizers
import nltk
from nltk.corpus import stopwords
import spacy
# Download resources
nltk.download("stopwords")
stopwords = set(stopwords.words("english"))
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

from gensim.models import Word2Vec
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Flatten, Embedding, Bidirectional
from tensorflow.keras.layers import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam
from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
from keras.metrics import AUC

from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import BinaryAccuracy

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\doly9\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_csv('D:/openclassroom/projet7/input/df_cleaned_docs.csv',encoding='ISO-8859-1')

In [3]:
df.head()

Unnamed: 0,target,text,clean_text
0,NEGATIVE,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot httptwitpiccom awww s bummer should...
1,NEGATIVE,is upset that he can't update his Facebook by ...,upset not update facebook texte cry result sch...
2,NEGATIVE,@Kenichan I dived many times for the ball. Man...,kenichan dive time ball manage save rest bound
3,NEGATIVE,my whole body feels itchy and like its on fire,body feel itchy like fire
4,NEGATIVE,"@nationwideclass no, it's not behaving at all....",nationwideclass behave m mad not


In [4]:
sampled_data = df.sample(n=1000000, random_state=42)

## 1. Encodage des données textuelles à l'aide du tokenizer BERT<a class="anchor" id="paragraph1"></a>

In [5]:
from transformers import *
from transformers import BertTokenizer, TFBertModel, BertConfig
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=2)

  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 993kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 5.05kB/s]
loading file vocab.txt from cache at C:\Users\doly9/.cache\huggingface\hub\models--bert-base-uncased\snapshots\a265f773a47193eed794233aa2a0f0bb6d3eaa63\vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at C:\Users\doly9/.cache\huggingface\hub\models--bert-base-uncased\snapshots\a265f773a47193eed794233aa2a0f0bb6d3eaa63\tokenizer_config.json
Downloading (…)lve/main/config.json: 100%|██████████| 57

In [6]:
#Load the sentences into the BERT Tokenizer.

In [11]:
input_ids=[]
attention_masks=[]
sentences = sampled_data['clean_text']
for sent in sentences:
    bert_inp=bert_tokenizer.encode_plus(sent,add_special_tokens = True,max_length =50,pad_to_max_length = True,return_attention_mask = True)
    input_ids.append(bert_inp['input_ids'])
    attention_masks.append(bert_inp['attention_mask'])

input_ids=np.asarray(input_ids)
attention_masks=np.array(attention_masks)
labels=sampled_data['target'].apply(lambda x: 1 if x == 'POSITIVE' else 0)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [12]:
#Saving and loading the data into the pickle files
import pickle
results_data_path = os.path.join("..", "outputs")
pickle_inp_path=os.path.join(results_data_path,'bert_inp.pkl')
pickle_mask_path=os.path.join(results_data_path,'bert_mask.pkl')
pickle_label_path=os.path.join(results_data_path,'bert_label.pkl')

In [13]:

print('Preparing the pickle file.....')
pickle.dump((input_ids),open(pickle_inp_path,'wb'))
pickle.dump((attention_masks),open(pickle_mask_path,'wb'))
pickle.dump((np.array(labels)),open(pickle_label_path,'wb'))

print('Pickle files saved as ',pickle_inp_path,pickle_mask_path,pickle_label_path)

Preparing the pickle file.....
Pickle files saved as  ..\outputs\bert_inp.pkl ..\outputs\bert_mask.pkl ..\outputs\bert_label.pkl


In [14]:
print('Loading the saved pickle files..')

input_ids=pickle.load(open(pickle_inp_path, 'rb'))
attention_masks=pickle.load(open(pickle_mask_path, 'rb'))
labels=pickle.load(open(pickle_label_path, 'rb'))

print('Input shape {} Attention mask shape {} Input label shape {}'.format(input_ids.shape,attention_masks.shape,labels.shape))

Loading the saved pickle files..
Input shape (1000000, 50) Attention mask shape (1000000, 50) Input label shape (1000000,)


In [53]:
#Spitting into train and validation set

In [15]:
train_inp,test_inp,train_label,test_label,train_mask,test_mask=train_test_split(input_ids,labels,attention_masks,test_size=0.2,random_state=42)
val_inp, test_inp, val_label, test_label, val_mask, test_mask = train_test_split(test_inp,test_label,test_mask, test_size=0.33, random_state=42)

# Vérifier la taille des jeux de données
print("Taille du jeu d'entraînement :", train_inp.shape)
print("Taille du jeu de validation :", val_inp.shape)
print("Taille du jeu de test :", test_inp.shape)

Taille du jeu d'entraînement : (800000, 50)
Taille du jeu de validation : (134000, 50)
Taille du jeu de test : (66000, 50)


## 2. Entrainement du modèle Bert <a class="anchor" id="paragraph2"></a>

In [34]:
print('\nBert Model',bert_model.summary())

bert_model.compile(loss=BinaryCrossentropy(), optimizer=Adam(learning_rate=2e-5), metrics=BinaryAccuracy())

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________

Bert Model None


In [35]:
checkpoint_callback = ModelCheckpoint(filepath=os.path.join("..", "saved_models/bert")+'/'+'_epoch_{epoch:02d}.h5', 
                                      save_freq=1, save_weights_only=True)
tensorboard_callback = TensorBoard(log_dir=os.path.join("..", "logs")+'/bert')
callbacks=[checkpoint_callback,tensorboard_callback]

history=bert_model.fit([train_inp,train_mask],train_label,batch_size=32,epochs=5,validation_data=([val_inp,val_mask],val_label),callbacks=callbacks)


Epoch 1/5
  133/25000 [..............................] - ETA: 35:06:19 - loss: 0.7706 - binary_accuracy: 0.5740

KeyboardInterrupt: 

In [22]:
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint

history=bert_model.fit([train_inp,train_mask],train_label,batch_size=64,epochs=10,
                       validation_data=([val_inp,val_mask],val_label),
                        workers=4,
                        use_multiprocessing=True,)


Epoch 1/10
  270/19969 [..............................] - ETA: 1353:25:10 - loss: 0.6683 - binary_accuracy: 0.6398

KeyboardInterrupt: 

## 3.Mettre à jour un modèle BERT avec les poids entraînés<a class="anchor" id="paragraph3"></a>

In [16]:
# Load the trained weights from the .h5 file
bert_model.load_weights(os.path.join("..", "saved_models",'bert_weights.h5'))

# Compile the model
bert_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', AUC(curve="ROC", name="ROC_AUC")])

In [None]:
y_pred_proba = bert_model.predict([test_inp,test_mask])

In [None]:
y_pred = np.argmax(y_pred_proba[0], axis=1)  # Get the class with the highest probability

cf = confusion_matrix(y, y_pred)

## 4. Gestion des expérimentations des modèles avec MLFlow<a class="anchor" id="paragraph4"></a>

In [None]:
import mlflow.keras as mlk
class BertModel():
    def __init__(self):
        # Load the trained weights from the .h5 file
        self.bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=2)
        self.bert_model = load_weights(os.path.join("..", "saved_models",'bert_weights.h5'))
    def build(self):
        self.bert_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', AUC(curve="ROC", name="ROC_AUC")])
    def mlflow_run(self, epochs, batch_size):
        with mlflow.start_run(run_name=self.name_model) as run:
            # Automatically capture the model's parameters, metrics, artifacts,
            # and source code with the autolog() function
            mlk.autolog()
            # Enregistrez les paramètres
            mlflow.log_param("units_lstm", self.units_lstm)
            mlflow.log_param("units_dense", self.units_dense)
            self.bert_model.fit([train_inp,train_mask],train_label,batch_size=32,epochs=5,validation_data=([val_inp,val_mask],val_label),callbacks=callbacks)
        return run.info.run_id

In [None]:
import mlflow

if __name__ == "__main__":
    
    mlflow.set_experiment(experiment_name = "p7-advanced model")
    mlflow.autolog()
    # Use sqlite:///mlruns.db as the local store for tracking and registery
    mlflow.set_tracking_uri("sqlite:///mlruns.db")

    name_model = "glove_bilstm"
    type_model ="BidirectionalLSTM"
    units_lstm = 100
    units_dense = 20
    opt = "adam"
    epoch = 30
    batch_size = 1024

    model = AdvancedModel(name_model,type_model,word2vec_layer,units_lstm,units_dense, X_train, y_train)
    model.build()
    run_id = model.mlflow_run(epoch,batch_size)
    print("MLflow run_id={}".format(run_id))
