# Python Projet 5 - Prédiction des tags

## Multilabel classification avec la méthode One-vs-Rest

Dans le cadre du cours OpenClassrooms Machine Learning et du projet d'évaluation #5, ce notebook présente les différents modèles de prédiction des tags ainsi qu'une comparaison avec l'analyse non supervisée obtenue par LDA.

Author : Vincent Arrigoni, 04/2023

Regexp : https://www.debuggex.com/cheatsheet/regex/python

Tout savoir :https://datascientest.com/regex-tout-savoir

Test regexp : https://regex101.com/

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# import of libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import precision_score, recall_score, jaccard_score, classification_report
from gensim.models import Word2Vec
from scipy.stats import uniform
import re
from tqdm import tqdm
from time import time
from sklearn.utils import shuffle

# import tensorflow as tf
# import tensorflow_hub as hub
# import tensorflow_text as text
# from official.nlp import optimization  # to create AdamW optimizer

In [4]:
import warnings
from sklearn.exceptions import ConvergenceWarning

# Suppress specific warnings
warnings.filterwarnings('ignore', category=ConvergenceWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

In [5]:
data = pd.read_csv('/content/drive/MyDrive/OCS/Projet5/msg_preprocessed.csv')
data = pd.read_csv('/content/drive/MyDrive/OCS/Projet5/msg_preprocessed_reduced.csv')
data.head(10)

Unnamed: 0.1,Unnamed: 0,Text,Tags,Token,Tags_lst,Tags_lst_new,Token_lst_new
0,587,Sql Server string to date conversion<p>I want ...,<sql-server><tsql><datetime><sql-server-2005><...,"['sql', 'server', 'string', 'date', 'convers',...","['sql-server', 'tsql', 'datetime', 'sql-server...",['sql-server'],"['sql', 'server', 'string', 'date', 'convers',..."
1,1625,Instantiating a python class in C#<p>I've writ...,<c#><.net><python><ironpython><cross-language>,"['instanti', 'python', 'class', 'c#', 'written...","['c#', '.net', 'python', 'ironpython', 'cross-...","['c#', '.net', 'python']","['instanti', 'python', 'class', 'c#', 'written..."
2,2730,pyserial with multiprocessing gives me a ctype...,<python><python-3.x><multiprocessing><pickle><...,"['pyseri', 'multiprocess', 'give', 'ctype', 'e...","['python', 'python-3.x', 'multiprocessing', 'p...",['python'],"['pyseri', 'multiprocess', 'give', 'ctype', 'e..."
3,4700,How do I install SciPy on 64 bit Windows?<p>Ho...,<python><windows><64-bit><numpy><scipy>,"['instal', 'scipi', 'bit', 'window', 'instal',...","['python', 'windows', '64-bit', 'numpy', 'scipy']","['python', 'windows']","['instal', 'scipi', 'bit', 'window', 'instal',..."
4,3497,MSBuild doesn't copy references (DLL files) if...,<visual-studio><msbuild><reference><dependenci...,"['msbuild', 'copi', 'refer', 'dll', 'file', 'u...","['visual-studio', 'msbuild', 'reference', 'dep...",[],"['msbuild', 'copi', 'refer', 'dll', 'file', 'u..."
5,4509,How to set view model on ViewResult in request...,<c#><asp.net-mvc><model><filter><viewresult>,"['set', 'view', 'model', 'viewresult', 'reques...","['c#', 'asp.net-mvc', 'model', 'filter', 'view...",['c#'],"['set', 'view', 'model', 'viewresult', 'reques..."
6,1372,Custom C# data transfer objects from javascrip...,<asp.net><javascript><ajax><json><asp.net-ajax>,"['custom', 'c#', 'data', 'transfer', 'object',...","['asp.net', 'javascript', 'ajax', 'json', 'asp...","['asp.net', 'javascript']","['custom', 'c#', 'data', 'transfer', 'object',..."
7,3576,Print a specific PDF page using command line<p...,<windows><shell><pdf><command-line><printing>,"['print', 'specif', 'pdf', 'page', 'use', 'com...","['windows', 'shell', 'pdf', 'command-line', 'p...",['windows'],"['print', 'specif', 'pdf', 'page', 'use', 'com..."
8,1809,How do I properly sanitize data received from ...,<php><mysql><html><forms><sanitization>,"['proper', 'sanit', 'data', 'receiv', 'text', ...","['php', 'mysql', 'html', 'forms', 'sanitization']","['php', 'html']","['proper', 'sanit', 'data', 'receiv', 'text', ..."
9,73,Insert into ... values ( SELECT ... FROM ... )...,<sql><database><syntax><database-agnostic><ans...,"['insert', 'valu', 'select', 'tri', 'insert', ...","['sql', 'database', 'syntax', 'database-agnost...",['sql'],"['insert', 'valu', 'select', 'tri', 'insert', ..."


In [6]:
def regexp_Token(x):
  return(re.findall(r'(?:C\+\+)|(?:c\+\+)|(?:c\#)|(?:C\#)|(?:\.net)|(?:\.NET)|\w{2,}', x['Token_lst_new']))

def regexp_Tags(x):
  return(re.findall(r'(?:C\+\+)|(?:c\+\+)|(?:c\#)|(?:C\#)|(?:\.net)|(?:\.NET)|\w{2,}', x['Tags_lst_new']))

ds = pd.DataFrame({'Token' : data[:5000].apply(regexp_Token, axis=1), 'Tags' : data[:5000].apply(regexp_Tags, axis=1)})
ds = shuffle(ds)
ds.reset_index(inplace=True)
ds.head(10)

Unnamed: 0,index,Token,Tags
0,3771,"[automat, generat, python, api, document, pych...",[python]
1,2887,"[differ, clang, clang, std, c++, erron, use, c...",[c++]
2,1238,"[use, deflat, instead, gzip, text, file, serv,...",[]
3,3696,"[c++, serial, perform, build, distribut, c++, ...","[c++, performance]"
4,4818,"[extract, float, point, number, delimit, strin...",[php]
5,202,"[bash, script, set, group, new, file, creat, b...",[linux]
6,752,"[chang, scheme, http, https, drf_yasg, use, dr...",[python]
7,2593,"[differ, classic, integr, pipelin, mode, iis7,...","[asp, .net]"
8,3097,"[fix, input, hidden, tensor, devic, pytorch, w...",[python]
9,4763,"[add, new, scaffold, use, visual, studio, scaf...",[]


In [7]:
# Classifier choice
# Choose Logistic Regression
# Else is RandomForestClassifier
Clf_choice = True

## Approche Bag Of Words pour la classification par Régression Logistique

In [8]:
def dummy(doc):
  return doc
  # https://stackoverflow.com/questions/35867484/pass-tokens-to-countvectorizer
  # pas besoin que countvectorizer fasse un preprocessing ou une tokenization des documents

vectorizer = CountVectorizer(tokenizer=dummy, preprocessor=dummy, binary=True)
X = vectorizer.fit_transform(ds['Token'])
X = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

LabelBinarizer = MultiLabelBinarizer()
y = LabelBinarizer.fit_transform(ds['Tags'])
y = pd.DataFrame(y, columns=LabelBinarizer.classes_)

In [9]:
# Train-Test split
X_train, X_test, y_train, y_test = train_test_split(X, y)

params_LR = {'C': 0.5, 'max_iter': 500, 'penalty': 'none', 'solver': 'lbfgs'}
params_RF = {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 100, 'bootstrap': False}

# Build the model
if Clf_choice:
  model = MultiOutputClassifier(LogisticRegression(**params_LR))
  # https://scikit-learn.org/stable/modules/multiclass.html
else:
  model = MultiOutputClassifier(RandomForestClassifier(max_depth=100))

In [10]:
# search = GridSearchCV(
#     estimator = model,
#     param_grid={'estimator__C':np.logspace(-3, 0, 4)},
#     scoring = ['accuracy', 'precision_micro', 'recall_micro'],
#     refit = 'precision_micro',
#     cv = 5
# ).fit(X_train, y_train)
# # https://datascience.stackexchange.com/questions/107867/how-to-train-multioutput-classification-with-hyperparameter-tuning-in-sklearn
# # https://stackoverflow.com/questions/41899132/invalid-parameter-for-sklearn-estimator-pipeline
# # https://scikit-learn.org/stable/modules/model_evaluation.html#from-binary-to-multiclass-and-multilabel

# # error : Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

# print(search.best_estimator_)
# print(search.best_params_)
# clf_BOW = search.best_estimator_

In [11]:
t = time()
model.fit(X_train, y_train)
print(time()-t)
clf_BOW = model

119.13631963729858


In [12]:
t = time()
y_pred = clf_BOW.predict(X_test)
print(time()-t)
print("Classifier report:\n", classification_report(y_test, y_pred, target_names=LabelBinarizer.classes_, zero_division=0))
# Calculate the Jaccard score
score = jaccard_score(y_test, y_pred, average='micro')
print(f"Jaccard score: {score:.4f}")

8.668025732040405
Classifier report:
               precision    recall  f1-score   support

        .net       0.53      0.50      0.52       148
     android       0.94      0.67      0.78        45
         asp       0.50      0.29      0.37        34
          c#       0.72      0.65      0.68       192
         c++       0.82      0.72      0.77       104
         css       0.80      0.40      0.53        30
        html       0.50      0.42      0.46        43
         ios       0.74      0.62      0.67        81
      iphone       0.62      0.21      0.32        47
        java       0.74      0.57      0.64       139
  javascript       0.68      0.48      0.56       102
      jquery       0.90      0.54      0.68        35
       linux       0.53      0.23      0.32        43
   objective       0.50      0.33      0.40        49
 performance       0.44      0.24      0.31        29
         php       0.84      0.62      0.71        52
      python       0.84      0.70      0.76

In [13]:
# Optimization des paramètres du modèle considéré

parameters = {'estimator__C':[0.005,0.001,0.05,0.1,0.5,1], 'estimator__penalty':['l2', 'l1','none']}

logistic = MultiOutputClassifier(LogisticRegression())

opti_clf = GridSearchCV(logistic, parameters)
t = time()
search = opti_clf.fit(X_train, y_train)
print(time()-t)
print(search.best_params_)

y_pred_opti = search.predict(X_test)
print("Classifier report:\n", classification_report(y_test, y_pred_opti, target_names=LabelBinarizer.classes_, zero_division=0))
# Calculate the Jaccard score
score = jaccard_score(y_test, y_pred_opti, average='micro')
print(f"Jaccard score: {score:.4f}")

30 fits failed out of a total of 90.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/multioutput.py", line 450, in fit
    super().fit(X, Y, sample_weight, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/multioutput.py", line 216, in fit
    self.estimators_ = Parallel(n_jobs=self.n_jobs)(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/parallel.py", line 63, in __call__
    return super().__call__(iterable_w

6073.886451244354
{'estimator__C': 0.005, 'estimator__penalty': 'none'}
Classifier report:
               precision    recall  f1-score   support

        .net       0.53      0.50      0.52       148
     android       0.94      0.67      0.78        45
         asp       0.50      0.29      0.37        34
          c#       0.72      0.65      0.68       192
         c++       0.82      0.72      0.77       104
         css       0.80      0.40      0.53        30
        html       0.50      0.42      0.46        43
         ios       0.74      0.62      0.67        81
      iphone       0.62      0.21      0.32        47
        java       0.74      0.57      0.64       139
  javascript       0.68      0.48      0.56       102
      jquery       0.90      0.54      0.68        35
       linux       0.53      0.23      0.32        43
   objective       0.50      0.33      0.40        49
 performance       0.44      0.24      0.31        29
         php       0.84      0.62      0.71

In [14]:
import joblib

with open('/content/drive/MyDrive/OCS/Projet5/BOW_clf_optim.pkl', 'wb') as f:
  joblib.dump((LabelBinarizer, search), f)

## Approche TF-IDF pour la classification par Régression Logistique

In [None]:
def dummy(doc):
  return doc

# Binarized before Train/Test split
LabelBinarizer = MultiLabelBinarizer()
y_tfidf = LabelBinarizer.fit_transform(ds['Tags'])
y_tfidf = pd.DataFrame(y_tfidf, columns=LabelBinarizer.classes_)

# Train-Test split
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(ds['Token'], y_tfidf)

# Binarized after Train/Test split as the frequency depends on the sample size
# Perform tf-idf on all dataset could induce data leak
tfidf_vectorizer = TfidfVectorizer(preprocessor=dummy, tokenizer=dummy)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_tfidf)
X_test_tfidf = tfidf_vectorizer.transform(X_test_tfidf)
# https://www.kaggle.com/code/neerajmohan/nlp-text-classification-using-tf-idf-features

params_LR = {'C': 0.5, 'max_iter': 500, 'penalty': 'none', 'solver': 'lbfgs'}
params_RF = {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': None, 'bootstrap': False}

# Build the model
if Clf_choice:
  model = MultiOutputClassifier(LogisticRegression(**params_LR))
  # https://scikit-learn.org/stable/modules/multiclass.html
else:
  model = MultiOutputClassifier(RandomForestClassifier(max_depth=100))

In [None]:
# search_tfidf = GridSearchCV(
#     estimator = model,
#     param_grid={'estimator__C':np.logspace(-3, 0, 4)},
#     scoring = ['accuracy', 'precision_micro', 'recall_micro'],
#     refit = 'precision_micro',
#     cv = 5
# ).fit(X_train_tfidf, y_train_tfidf)

# print(search_tfidf.best_estimator_)
# print(search_tfidf.best_params_)
# clf_TFIDF = search_tfidf.best_estimator_

In [None]:
t = time()
model.fit(X_train_tfidf, y_train_tfidf)
print(time()-t)
clf_TFIDF = model

11.077659368515015


In [None]:
y_pred_tfidf = clf_TFIDF.predict(X_test_tfidf)
print("Classifier report:\n", classification_report(y_test_tfidf, y_pred_tfidf, target_names=LabelBinarizer.classes_, zero_division=0))
# Calculate the Jaccard score
score = jaccard_score(y_test_tfidf, y_pred_tfidf, average='micro')
print(f"Jaccard score: {score:.4f}")

Classifier report:
               precision    recall  f1-score   support

        .net       0.56      0.42      0.48       151
     android       0.92      0.62      0.74        58
         asp       0.53      0.26      0.35        38
          c#       0.74      0.59      0.65       189
         c++       0.84      0.67      0.74        96
         css       0.70      0.24      0.36        29
        html       0.53      0.38      0.44        45
         ios       0.78      0.64      0.70        66
      iphone       0.35      0.21      0.26        33
        java       0.80      0.56      0.66       142
  javascript       0.70      0.59      0.64        97
      jquery       0.85      0.61      0.71        36
       linux       0.70      0.17      0.27        41
   objective       0.80      0.47      0.59        43
 performance       0.75      0.29      0.42        31
         php       0.88      0.56      0.68        52
      python       0.81      0.68      0.74       130
      s

## Word Embeddings with Word2Vec and Logistic Regression

In [None]:
# Word2Vec

# Binarized before Train/Test split
LabelBinarizer = MultiLabelBinarizer()
y_wrd2vec = LabelBinarizer.fit_transform(ds['Tags'])
# y_wrd2vec = pd.DataFrame(y_wrd2vec, columns=LabelBinarizer.classes_)

# Train-Test split
X_train_wrd2vec, X_test_wrd2vec, y_train_wrd2vec, y_test_wrd2vec = train_test_split(ds['Token'], y_wrd2vec)

# Train the word2vec model
w2v_model = Word2Vec(X_train_wrd2vec,
                     vector_size=100,
                     window=5,
                     min_count=5)

In [None]:
len(w2v_model.wv.index_to_key) # all the words that appear at least twice in the document

7368

In [None]:
w2v_model.wv[1] # embedding of the second word of the first sentence

array([-0.9082942 ,  0.28290892, -0.02818973, -0.27904075, -0.34566316,
       -1.056297  ,  0.27239338,  0.52423674,  0.61830163, -0.98657066,
       -0.16263108, -0.0101722 , -0.06468078,  0.21490379,  0.13522781,
        0.41400838,  0.5701941 ,  0.14336807,  0.10659837,  0.03016821,
        0.6045584 ,  0.1198278 ,  0.582128  ,  0.49496642,  0.31167856,
        0.8397371 ,  0.02211393, -0.15342037,  0.47438166,  0.1658942 ,
       -0.1240719 , -0.6770868 ,  0.02637972, -0.58936983,  0.17494851,
       -0.15399049,  0.61241233, -0.30545798, -0.44502062,  0.77877825,
        0.17071669, -0.24501291, -0.4382072 ,  0.55295104,  0.01959483,
       -0.31758267,  0.05758564, -0.53637433,  0.6583569 , -0.9756117 ,
        0.6151211 , -0.3520466 ,  0.77424514, -0.5543617 , -0.81301224,
        0.83001786, -0.25902337, -0.51293194,  0.78515714,  1.1166841 ,
        0.36998808, -0.48448548, -0.09710752,  0.0970918 ,  0.53041565,
        1.3639127 , -0.42906052, -0.73319954,  0.3556762 , -0.32

In [None]:
w2v_model.wv.most_similar(positive=["python"])

[('setup', 0.8927351832389832),
 ('pip', 0.8870863914489746),
 ('chai', 0.8836931586265564),
 ('packag', 0.8791005611419678),
 ('instal', 0.8679782748222351),
 ('easy_instal', 0.8545506000518799),
 ('nuget', 0.8489009737968445),
 ('fortran', 0.8476963043212891),
 ('jupyt', 0.8456833362579346),
 ('modul', 0.8439409732818604)]

In [None]:
words = set(w2v_model.wv.index_to_key) # vocabulary correspondance
# https://medium.com/@dilip.voleti/classification-using-word2vec-b1d79d375381

X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_train_wrd2vec]) # concatenate word embeddings of the same sentence in the same element of X_train_vect

X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])  for ls in X_test_wrd2vec])

In [None]:
# Compute sentence embeddings by averaging the word embeddings for the words contained in the sentence
X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))

X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))

In [None]:
params_LR = {'C': 0.5, 'max_iter': 500, 'penalty': 'none', 'solver': 'lbfgs'}
params_RF = {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': None, 'bootstrap': False}

# Build the model
if Clf_choice:
  model_WRD2VEC = MultiOutputClassifier(LogisticRegression(**params_LR))
  # https://scikit-learn.org/stable/modules/multiclass.html
else:
  model_WRD2VEC = MultiOutputClassifier(RandomForestClassifier(max_depth=100))

t = time()
model_WRD2VEC.fit(X_train_vect_avg, y_train_wrd2vec)
print(time()-t)

9.251000881195068


In [None]:
t = time()
y_pred_wrd2vec = model_WRD2VEC.predict(X_test_vect_avg)
print(time()-t)
print("Classifier report:\n", classification_report(y_test_wrd2vec, y_pred_wrd2vec, target_names=LabelBinarizer.classes_, zero_division=0))
# Calculate the Jaccard score
score = jaccard_score(y_test_wrd2vec, y_pred_wrd2vec, average='samples')
print(f"Jaccard score: {score:.4f}")

0.02974224090576172
Classifier report:
               precision    recall  f1-score   support

        .net       0.62      0.32      0.42       146
     android       0.88      0.75      0.81        77
         asp       0.64      0.17      0.27        41
          c#       0.65      0.33      0.44       198
         c++       0.70      0.42      0.53        83
         css       0.71      0.43      0.53        28
        html       0.52      0.28      0.36        43
         ios       0.71      0.44      0.54        73
      iphone       0.17      0.07      0.10        30
        java       0.71      0.39      0.50       143
  javascript       0.60      0.34      0.43       112
      jquery       0.53      0.36      0.43        45
       linux       0.42      0.18      0.25        28
   objective       0.47      0.15      0.23        46
 performance       0.14      0.03      0.05        31
         php       0.55      0.37      0.44        63
      python       0.83      0.57      0.

## BERT NLP algorithm

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from transformers import BertTokenizer, TFBertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', truncation=True, padding=True)
model_BERT = TFBertModel.from_pretrained("bert-base-uncased")

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [None]:
# ID ENCODING BY HAND
def bert_encode(token_lst, tokenizer, max_len=64):
  all_tokens = []
  all_masks = []
  all_segments = []

  for text in token_lst:

    text = text[:max_len-2] # How to deal with truncation ?
    text = ["[CLS]"] + text + ["[SEP]"] # special tokens
    pad_len = max_len - len(text) # necesary PAD length

    token_id = tokenizer.convert_tokens_to_ids(text) + [0] * pad_len
    pad_masks = [1] * len(text) + [0] * pad_len
    segment_ids = [0] * max_len

    all_tokens.append(token_id)
    all_masks.append(pad_masks)
    all_segments.append(segment_ids)

  return {'input_ids' : np.array(all_tokens),
          'token_type_ids' : np.array(all_masks),
          'attention_mask' : np.array(all_segments)}

emb_BERT = np.array([])
batch_size=500
for i in tqdm(range(len(ds)//batch_size)):
  # Batch it because otherwise too big for the model to train on
  encoded_tokens = bert_encode(ds.loc[i*batch_size:(i+1)*batch_size-1,'Token'], tokenizer)
  # print(encoded_tokens)
  # print(len(encoded_tokens))
  embeddings = model_BERT(encoded_tokens)
  if len(emb_BERT)==0:
    emb_BERT = np.array(embeddings['pooler_output'])
  else:
    emb_BERT = np.vstack((emb_BERT, np.array(embeddings['pooler_output'])))
    # De-batching it to usual format

100%|██████████| 10/10 [00:06<00:00,  1.59it/s]


In [None]:
print(len(embeddings['pooler_output']))

500


In [None]:
x = emb_BERT
y = ds['Tags']

# Initialize MultiLabelBinarizer
LabelBinarizer = MultiLabelBinarizer()

y = LabelBinarizer.fit_transform(y)

X_train_BERT, X_test_BERT, y_train_BERT, y_test_BERT = train_test_split(x, y, test_size=0.2, random_state=42)

params_LR = {'C': 0.5, 'max_iter': 500, 'penalty': 'none', 'solver': 'lbfgs'}
params_RF = {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': None, 'bootstrap': False}

# Build the model
if Clf_choice:
  clf_BERT = MultiOutputClassifier(LogisticRegression(**params_LR))
  # https://scikit-learn.org/stable/modules/multiclass.html
else:
  clf_BERT = MultiOutputClassifier(RandomForestClassifier(max_depth=100))

In [None]:
t = time()
clf_BERT.fit(X_train_BERT, y_train_BERT)
print(time()-t)

51.30465054512024


In [None]:
# Make predictions on the testing data
y_pred_BERT = clf_BERT.predict(X_test_BERT)

# Evaluate the classifier using classification_report
print("Classifier report:\n", classification_report(y_test_BERT, y_pred_BERT, target_names=LabelBinarizer.classes_, zero_division=0))

# Calculate the Jaccard score
score = jaccard_score(y_test_BERT, y_pred_BERT, average='samples')
print(f"Jaccard score: {score:.4f}")

# Reset warnings to default
# warnings.resetwarnings()

Classifier report:
               precision    recall  f1-score   support

        .net       0.19      0.07      0.11       121
     android       0.51      0.49      0.50        43
         asp       0.10      0.09      0.10        32
          c#       0.26      0.17      0.20       138
         c++       0.29      0.20      0.24        76
         css       0.18      0.21      0.19        24
        html       0.17      0.20      0.19        49
         ios       0.35      0.34      0.34        62
      iphone       0.04      0.04      0.04        27
        java       0.55      0.34      0.42       123
  javascript       0.25      0.18      0.21        74
      jquery       0.18      0.27      0.22        30
       linux       0.15      0.20      0.17        30
   objective       0.14      0.11      0.12        36
 performance       0.03      0.05      0.04        21
         php       0.34      0.27      0.30        44
      python       0.67      0.50      0.57       104
      s

In [None]:
# OVERALL ENCODING
def BERT_encode(batch):
  batch = list(map(lambda x : ' '.join(x) ,batch))
  encoded_input = tokenizer(batch, return_tensors='tf', truncation=True, padding=True)
  # grouped_inputs = {'input_ids':tf.cast(encoded_input.input_ids, tf.int16),
  #                   'token_type_ids' : tf.cast(encoded_input.token_type_ids, tf.int16),
  #                   'attention_mask' : tf.cast(encoded_input.attention_mask, tf.int16)}
  # print(tf.cast(encoded_input.input_ids, tf.int16))
  # # print(encoded_input.astype(int16))
  # print(len(encoded_input))
  return model_BERT(encoded_input)

emb_BERT = np.array([])
# Batch the dataset to compute embeddings more easily
batch_size = 100
max_len = 64 # max number of word to avoid Out Of Memory issue
nb_batches = len(ds)//batch_size
for i in tqdm(range(len(ds)//batch_size)):
  # Batch it because otherwise too big for the model to train on
  embeddings = BERT_encode(ds.loc[i*batch_size:(i+1)*batch_size-1,'Token'].str.slice(0,max_len))
  # embeddings = model_USE(encoded_tokens)
  if emb_BERT.size==0:
    emb_BERT = np.array(embeddings['pooler_output'])
    # print(emb_BERT.size)
  else:
    emb_BERT = np.vstack((emb_BERT, np.array(embeddings['pooler_output'])))
    # print('merging')
    # print(i)
    # print(emb_BERT.size)
    # De-batching it to usual format

100%|██████████| 50/50 [00:24<00:00,  2.06it/s]


In [None]:
x = emb_BERT
y = ds['Tags']

# Initialize MultiLabelBinarizer
LabelBinarizer = MultiLabelBinarizer()

y = LabelBinarizer.fit_transform(y)

X_train_BERT, X_test_BERT, y_train_BERT, y_test_BERT = train_test_split(x, y, test_size=0.2, random_state=42)

params_LR = {'C': 0.5, 'max_iter': 500, 'penalty': 'none', 'solver': 'lbfgs'}
params_RF = {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': None, 'bootstrap': False}

# Build the model
if Clf_choice:
  clf_BERT2 = MultiOutputClassifier(LogisticRegression(**params_LR))
  # https://scikit-learn.org/stable/modules/multiclass.html
else:
  clf_BERT2 = MultiOutputClassifier(RandomForestClassifier(max_depth=100))

In [None]:
t = time()
clf_BERT2.fit(X_train_BERT, y_train_BERT)
print(time()-t)

25.04637837409973


In [None]:
# Make predictions on the testing data
y_pred_BERT2 = clf_BERT2.predict(X_test_BERT)

# Evaluate the classifier using classification_report
print("Classifier report:\n", classification_report(y_test_BERT, y_pred_BERT2, target_names=LabelBinarizer.classes_, zero_division=0))

# Calculate the Jaccard score
score = jaccard_score(y_test_BERT, y_pred_BERT, average='samples')
print(f"Jaccard score: {score:.4f}")

# Reset warnings to default
# warnings.resetwarnings()

Classifier report:
               precision    recall  f1-score   support

        .net       0.42      0.36      0.39       121
     android       0.43      0.49      0.46        43
         asp       0.22      0.38      0.28        32
          c#       0.40      0.37      0.39       138
         c++       0.45      0.46      0.46        76
         css       0.31      0.42      0.36        24
        html       0.21      0.18      0.20        49
         ios       0.46      0.35      0.40        62
      iphone       0.22      0.22      0.22        27
        java       0.51      0.33      0.40       123
  javascript       0.35      0.31      0.33        74
      jquery       0.17      0.27      0.21        30
       linux       0.08      0.10      0.09        30
   objective       0.31      0.36      0.33        36
 performance       0.11      0.19      0.14        21
         php       0.39      0.50      0.44        44
      python       0.50      0.52      0.51       104
      s

## USE NLP Algorithm

In [None]:
import tensorflow_hub as hub

model_USE = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')

In [None]:
def USE_encode(batch):
  batch = list(map(lambda x : ' '.join(x) ,batch))
  return model_USE(batch).numpy()

emb_USE = np.array([])
# Batch the dataset to compute embeddings more easily
batch_size = 200
nb_batches = len(ds)//batch_size
for i in range(len(ds)//batch_size):
  # Batch it because otherwise too big for the model to train on
  embeddings = USE_encode(ds.loc[i*batch_size:(i+1)*batch_size-1,'Token'])
  # embeddings = model_USE(encoded_tokens)
  if len(emb_USE)==0:
    emb_USE = np.array(embeddings)
  else:
    emb_USE = np.vstack((emb_USE, np.array(embeddings)))
    # De-batching it to usual format

In [None]:
embeddings.shape

(200, 512)

In [None]:
emb_USE.shape

(5000, 512)

In [None]:
x = emb_USE
y = ds['Tags']

# Initialize MultiLabelBinarizer
LabelBinarizer = MultiLabelBinarizer()

y = LabelBinarizer.fit_transform(y)

X_train_USE, X_test_USE, y_train_USE, y_test_USE = train_test_split(x, y, test_size=0.2, random_state=42)

params_LR = {'C': 0.5, 'max_iter': 500, 'penalty': 'none', 'solver': 'lbfgs'}
params_RF = {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': None, 'bootstrap': False}

# Build the model
if Clf_choice:
  USE_clf = MultiOutputClassifier(LogisticRegression(**params_LR))
  # https://scikit-learn.org/stable/modules/multiclass.html
else:
  USE_clf = MultiOutputClassifier(RandomForestClassifier(max_depth=100))

In [None]:
t = time()
USE_clf.fit(X_train_USE, y_train_USE)
print(time()-t)

3.3793904781341553


In [None]:
# Make predictions on the testing data
y_pred_USE = USE_clf.predict(X_test_USE)

# Evaluate the classifier using classification_report
print("Classifier report:\n", classification_report(y_test_USE, y_pred_USE, target_names=LabelBinarizer.classes_, zero_division=0))

# Calculate the Jaccard score
score = jaccard_score(y_test_USE, y_pred_USE, average='samples')
print(f"Jaccard score: {score:.4f}")

# Reset warnings to default
warnings.resetwarnings()

Classifier report:
               precision    recall  f1-score   support

        .net       0.43      0.50      0.46       121
     android       0.75      0.70      0.72        43
         asp       0.37      0.41      0.39        32
          c#       0.52      0.64      0.57       138
         c++       0.53      0.61      0.57        76
         css       0.55      0.46      0.50        24
        html       0.46      0.39      0.42        49
         ios       0.63      0.61      0.62        62
      iphone       0.39      0.44      0.41        27
        java       0.57      0.67      0.62       123
  javascript       0.50      0.62      0.55        74
      jquery       0.39      0.43      0.41        30
       linux       0.42      0.53      0.47        30
   objective       0.33      0.39      0.35        36
 performance       0.23      0.29      0.26        21
         php       0.66      0.61      0.64        44
      python       0.71      0.73      0.72       104
      s

In [None]:
# Optimization des paramètres du modèle considéré

parameters = {'estimator__C':[0.005,0.001,0.05,0.1,0.5,1], 'estimator__penalty':['l2', 'l1','none']}

logistic = MultiOutputClassifier(LogisticRegression())

opti_clf = GridSearchCV(logistic, parameters)
t = time()
search = opti_clf.fit(X_train_USE, y_train_USE)
print(time()-t)
print(search.best_params_)

y_pred_opti = search.predict(X_test_USE)
print("Classifier report:\n", classification_report(y_test_USE, y_pred_opti, target_names=LabelBinarizer.classes_, zero_division=0))
# Calculate the Jaccard score
score = jaccard_score(y_test_USE, y_pred_opti, average='micro')
print(f"Jaccard score: {score:.4f}")

In [1]:
import joblib

with open('/content/drive/MyDrive/OCS/Projet5/USE_clf_optim.pkl', 'wb') as f:
  joblib.dump((LabelBinarizer, search), f)

In [None]:
# import joblib
# yolo, lolilol = joblib.load('USE_clf.pkl')