# Python Projet 5 - Prédiction des tags 

## Multilabel classification avec la méthode One-vs-Rest

Dans le cadre du cours OpenClassrooms Machine Learning et du projet d'évaluation #5, ce notebook présente les différents modèles de prédiction des tags ainsi qu'une comparaison avec l'analyse non supervisée obtenue par LDA. 

Author : Vincent Arrigoni, 04/2023 

Regexp : https://www.debuggex.com/cheatsheet/regex/python

Tout savoir :https://datascientest.com/regex-tout-savoir

Test regexp : https://regex101.com/

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# import of libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer 
from sklearn.multioutput import MultiOutputClassifier 
from sklearn.metrics import precision_score, recall_score, jaccard_score, classification_report
from gensim.models import Word2Vec
import re
from tqdm import tqdm
from time import time
from sklearn.utils import shuffle

# import tensorflow as tf
# import tensorflow_hub as hub
# import tensorflow_text as text
# from official.nlp import optimization  # to create AdamW optimizer

In [3]:
data = pd.read_csv('/content/drive/MyDrive/OCS/Projet5/msg_preprocessed.csv')
data.head(10)

Unnamed: 0.1,Unnamed: 0,Text,Tags,Token,Tags_lst,Tags_lst_new,Token_lst_new
0,0,how do I calculate a rolling idxmax<p>consider...,<python><pandas><numpy><dataframe><series>,"['calcul', 'roll', 'idxmax', 'consid', 'pd', '...","['python', 'pandas', 'numpy', 'dataframe', 'se...",['python'],"['calcul', 'roll', 'idxmax', 'consid', 'pd', '..."
1,1,Object-Oriented Callbacks for C++?<p>Is there ...,<c++><oop><callback><pointer-to-member><eiffel>,"['object', 'orient', 'callback', 'c++', 'libra...","['c++', 'oop', 'callback', 'pointer-to-member'...",['c++'],"['object', 'orient', 'callback', 'c++', 'libra..."
2,2,TDD and ADO.NET Entity Framework<p>I've been p...,<unit-testing><entity-framework><ado.net><tdd>...,"['tdd', 'ado', '.net', 'entiti', 'framework', ...","['unit-testing', 'entity-framework', 'ado.net'...",['unit-testing'],"['tdd', 'ado', '.net', 'entiti', 'framework', ..."
3,3,Better techniques for trimming leading zeros i...,<sql><sql-server><sql-server-2005><tsql><string>,"['better', 'techniqu', 'trim', 'lead', 'zero',...","['sql', 'sql-server', 'sql-server-2005', 'tsql...","['sql', 'sql-server', 'string']","['better', 'techniqu', 'trim', 'lead', 'zero',..."
4,4,"""No X11 DISPLAY variable"" - what does it mean?...",<java><linux><variables><x11><headless>,"['x11', 'display', 'variabl', 'mean', 'tri', '...","['java', 'linux', 'variables', 'x11', 'headless']","['java', 'linux']","['x11', 'display', 'variabl', 'mean', 'tri', '..."
5,5,Why doesn't .NET find the OpenSSL.NET dll?<p><...,<c#><.net><dll><dllimport><dllnotfoundexception>,"['.net', 'find', 'openssl', '.net', 'dll', 'ed...","['c#', '.net', 'dll', 'dllimport', 'dllnotfoun...","['c#', '.net']","['.net', 'find', 'openssl', '.net', 'dll', 'ed..."
6,6,Iterating through/Parsing JSON Object via Java...,<javascript><json><jquery><parsing><loops>,"['iter', 'pars', 'json', 'object', 'via', 'jav...","['javascript', 'json', 'jquery', 'parsing', 'l...","['javascript', 'json', 'jquery']","['iter', 'pars', 'json', 'object', 'via', 'jav..."
7,7,Class inherited from class without default con...,<c++><class><inheritance><constructor><default...,"['class', 'inherit', 'class', 'without', 'defa...","['c++', 'class', 'inheritance', 'constructor',...",['c++'],"['class', 'inherit', 'class', 'without', 'defa..."
8,8,libxml2 vs expat for an XMPP server<p>I'm tryi...,<c><go><xmpp><libxml2><expat-parser>,"['libxml2', 'vs', 'expat', 'xmpp', 'server', '...","['c', 'go', 'xmpp', 'libxml2', 'expat-parser']",['c'],"['libxml2', 'vs', 'expat', 'xmpp', 'server', '..."
9,9,Implications of deploying a Debug build of an ...,<c#><security><debugging><deployment><compiler...,"['implic', 'deploy', 'debug', 'build', 'applic...","['c#', 'security', 'debugging', 'deployment', ...","['c#', 'security', 'debugging']","['implic', 'deploy', 'debug', 'build', 'applic..."


In [4]:
def regexp_Token(x):
  return(re.findall(r'(?:C\+\+)|(?:c\+\+)|(?:c\#)|(?:C\#)|(?:\.net)|(?:\.NET)|\w{2,}', x['Token_lst_new']))
         
def regexp_Tags(x):
  return(re.findall(r'(?:C\+\+)|(?:c\+\+)|(?:c\#)|(?:C\#)|(?:\.net)|(?:\.NET)|\w{2,}', x['Tags_lst_new']))

ds = pd.DataFrame({'Token' : data[:5000].apply(regexp_Token, axis=1), 'Tags' : data[:5000].apply(regexp_Tags, axis=1)})
ds = shuffle(ds)
ds.reset_index(inplace=True)
ds.head(10)

Unnamed: 0,index,Token,Tags
0,2685,"[convert, give, invalid, cast, error, creat, c...","[c#, wpf]"
1,1249,"[upgrad, winform, app, wpf, side, project, wro...","[.net, wpf, winforms]"
2,1410,"[get, base, class, instanc, deriv, class, know...","[c#, .net]"
3,2438,"[null, object, differ, null, undefin, null, co...",[javascript]
4,2281,"[boost, filter_iter, would, stl, pass, iter, p...",[c++]
5,4137,"[good, graph, travers, algorithm, abstract, pr...","[python, performance, algorithm]"
6,2074,"[json, pars, io, creat, app, exist, websit, cu...","[ios, objective, json]"
7,2934,"[creat, subarray, nsarray, use, nsrang, array,...","[ios, objective, arrays]"
8,1239,"[http, authent, logout, via, php, correct, way...",[php]
9,358,"[use, replicatem, solv, eight, queen, problem,...",[]


## Approche Bag Of Words pour la classification par Régression Logistique 

In [None]:
def dummy(doc):
  return doc
  # https://stackoverflow.com/questions/35867484/pass-tokens-to-countvectorizer
  # pas besoin que countvectorizer fasse un preprocessing ou une tokenization des documents

vectorizer = CountVectorizer(tokenizer=dummy, preprocessor=dummy, binary=True)
X = vectorizer.fit_transform(ds['Token'])
X = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

LabelBinarizer = MultiLabelBinarizer()
y = LabelBinarizer.fit_transform(ds['Tags'])
y = pd.DataFrame(y, columns=LabelBinarizer.classes_)



In [None]:
# Train-Test split 
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Build the model
model = MultiOutputClassifier(LogisticRegression(verbose=1))
# https://scikit-learn.org/stable/modules/multiclass.html

In [None]:
search = GridSearchCV(
    estimator = model,
    param_grid={'estimator__C':np.logspace(-3, 0, 4)},
    scoring = ['accuracy', 'precision_micro', 'recall_micro'],  
    refit = 'precision_micro',
    cv = 5
).fit(X_train, y_train)
# https://datascience.stackexchange.com/questions/107867/how-to-train-multioutput-classification-with-hyperparameter-tuning-in-sklearn
# https://stackoverflow.com/questions/41899132/invalid-parameter-for-sklearn-estimator-pipeline
# https://scikit-learn.org/stable/modules/model_evaluation.html#from-binary-to-multiclass-and-multilabel

# error : Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

print(search.best_estimator_)
model_BOW = search.best_estimator_ 

In [None]:
model.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   13.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_j

In [None]:
y_pred = model.predict(X_test)
print("Classifier report:\n", classification_report(y_test, y_pred, target_names=LabelBinarizer.classes_, zero_division=0))
# Calculate the Jaccard score
score = jaccard_score(y_test, y_pred, average='samples')
print(f"Jaccard score: {score:.4f}")

Classifier report:
                 precision    recall  f1-score   support

          .net       0.58      0.39      0.46       150
          ajax       0.86      0.35      0.50        17
     algorithm       1.00      0.10      0.17        21
       android       0.98      0.62      0.76        64
        arrays       0.50      0.11      0.18        18
           asp       0.80      0.33      0.47        48
          bash       0.33      0.11      0.17        18
            c#       0.80      0.57      0.67       185
           c++       0.86      0.63      0.72        94
         cocoa       0.75      0.18      0.29        17
           css       0.62      0.29      0.39        28
      database       0.31      0.17      0.22        24
     debugging       1.00      0.14      0.25        14
        django       1.00      0.55      0.71        20
           gcc       0.33      0.05      0.09        19
     hibernate       1.00      0.19      0.32        16
          html       0.55  

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
y_pred = model_BOW.predict(X_test)
precision = precision_score(y_test, y_pred, average='micro')
recall = recall_score(y_test, y_pred, average='micro')
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Precision: 0.842 / Recall: 0.147 / Accuracy: .net              0.868
ajax              0.992
algorithm         0.988
android           0.968
arrays            0.992
asp               0.956
bash              0.988
c#                0.896
c++               0.932
cocoa             0.996
css               0.976
database          0.988
debugging         0.984
django            0.996
gcc               0.992
hibernate         0.992
html              0.976
image             0.992
ios               0.952
ios7              0.960
iphone            0.976
java              0.928
javascript        0.912
jquery            0.976
js                0.988
json              0.984
linq              0.980
linux             0.956
macos             0.984
multithreading    0.996
mvc               0.988
mysql             0.996
node              0.988
objective         0.956
on                0.988
optimization      0.992
performance       0.980
php               0.956
python            0.944
rails             0

## Approche TF-IDF pour la classification par Régression Logistique

In [None]:
def dummy(doc):
  return doc

# Binarized before Train/Test split
LabelBinarizer = MultiLabelBinarizer()
y_tfidf = LabelBinarizer.fit_transform(ds['Tags'])
y_tfidf = pd.DataFrame(y_tfidf, columns=LabelBinarizer.classes_)

# Train-Test split 
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(ds['Token'], y_tfidf)

# Binarized after Train/Test split as the frequency depends on the sample size
# Perform tf-idf on all dataset could induce data leak
tfidf_vectorizer = TfidfVectorizer(preprocessor=dummy, tokenizer=dummy)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_tfidf)
X_test_tfidf = tfidf_vectorizer.transform(X_test_tfidf)
# https://www.kaggle.com/code/neerajmohan/nlp-text-classification-using-tf-idf-features

# Build the model
model_TFIDF = MultiOutputClassifier(LogisticRegression())



In [None]:
model_TFIDF.fit(X_train_tfidf, y_train_tfidf)

In [None]:
search_tfidf = GridSearchCV(
    estimator = model,
    param_grid={'estimator__C':np.logspace(-3, 3, 10)},
    scoring = ['accuracy', 'precision_micro', 'recall_micro'],
    refit = 'precision_micro',
    cv = 5
).fit(X_train_tfidf, y_train_tfidf)

print(search_tfidf.best_estimator_)
model_TFIDF = search_tfidf.best_estimator_

In [None]:
y_pred = model_TFIDF.predict(X_test_tfidf)
precision = precision_score(y_test_tfidf, y_pred, average='micro')
recall = recall_score(y_test_tfidf, y_pred, average='micro')
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test_tfidf).sum()/len(y_pred), 3)))

Precision: 0.846 / Recall: 0.123 / Accuracy: .net              0.884
ajax              0.986
algorithm         0.987
android           0.965
arrays            0.986
asp               0.957
bash              0.985
c#                0.873
c++               0.951
cocoa             0.985
css               0.978
database          0.984
debugging         0.992
django            0.983
gcc               0.989
hibernate         0.989
html              0.957
image             0.990
ios               0.944
ios7              0.990
iphone            0.977
java              0.913
javascript        0.933
jquery            0.974
js                0.991
json              0.987
linq              0.990
linux             0.968
macos             0.986
multithreading    0.982
mvc               0.981
mysql             0.983
node              0.991
objective         0.968
on                0.979
optimization      0.990
performance       0.979
php               0.970
python            0.927
rails             0

In [None]:
y_pred_tfidf = model_TFIDF.predict(X_test_tfidf)
print("Classifier report:\n", classification_report(y_test_tfidf, y_pred_tfidf, target_names=LabelBinarizer.classes_, zero_division=0))
# Calculate the Jaccard score
score = jaccard_score(y_test_tfidf, y_pred_tfidf, average='samples')
print(f"Jaccard score: {score:.4f}")

Classifier report:
                 precision    recall  f1-score   support

          .net       0.88      0.17      0.28       169
          ajax       0.00      0.00      0.00        18
     algorithm       0.00      0.00      0.00        16
       android       1.00      0.20      0.33        55
        arrays       0.00      0.00      0.00        18
           asp       1.00      0.07      0.13        58
          bash       0.00      0.00      0.00        19
            c#       0.78      0.23      0.35       190
           c++       0.92      0.38      0.53        93
         cocoa       0.00      0.00      0.00        19
           css       1.00      0.03      0.07        29
      database       0.00      0.00      0.00        20
     debugging       0.00      0.00      0.00        10
        django       1.00      0.12      0.22        24
           gcc       0.00      0.00      0.00        14
     hibernate       0.00      0.00      0.00        14
          html       0.00  

  _warn_prf(average, modifier, msg_start, len(result))


## Word Embeddings with Word2Vec and Logistic Regression

In [None]:
# Word2Vec

# Binarized before Train/Test split
LabelBinarizer = MultiLabelBinarizer()
y_wrd2vec = LabelBinarizer.fit_transform(ds['Tags'])
# y_wrd2vec = pd.DataFrame(y_wrd2vec, columns=LabelBinarizer.classes_)

# Train-Test split 
X_train_wrd2vec, X_test_wrd2vec, y_train_wrd2vec, y_test_wrd2vec = train_test_split(ds['Token'], y_wrd2vec)

# Train the word2vec model 
w2v_model = Word2Vec(X_train_wrd2vec, 
                     vector_size=100,
                     window=5, 
                     min_count=2)

In [None]:
len(w2v_model.wv.index_to_key) # all the words that appear at least twice in the document

17607

In [None]:
w2v_model.wv[1] # embedding of the second word of the first sentence

array([-0.8560293 ,  2.514168  ,  0.03624365, -0.5484349 , -0.82519555,
       -1.6372861 ,  0.9489383 ,  0.7850049 ,  0.07324732, -1.7028004 ,
       -1.2190976 , -1.7493724 , -0.3584944 ,  0.91182494,  0.45971295,
        1.5882945 ,  1.7925165 ,  0.03817501, -0.9298287 , -0.7946958 ,
        1.3778473 , -0.06816832,  1.492833  ,  1.0859311 ,  1.0520694 ,
        0.73363197, -1.3254396 , -0.5543015 ,  0.05527653,  0.972904  ,
        2.3859074 ,  0.92625445,  0.23584762, -0.44052994,  1.1579409 ,
       -0.44179767,  2.2616148 ,  0.25743628,  0.8323377 , -0.2539642 ,
       -0.13018885, -1.2192273 , -0.17191643,  0.44603714, -0.48378727,
       -0.506642  ,  0.24188894, -0.4173854 ,  0.8292513 ,  0.9648731 ,
        0.68008757, -1.8085318 , -0.19980508, -1.663434  , -0.41592735,
        0.9151131 ,  0.9023225 ,  0.5139946 , -1.7894812 ,  0.5037689 ,
        0.34894034, -0.05896798, -0.7465126 ,  0.05092788, -0.1348537 ,
        2.539671  ,  1.1685464 , -0.02742696, -0.01833059,  0.70

In [None]:
w2v_model.wv.most_similar(positive=["python"]) 

[('fortran', 0.9249147772789001),
 ('_crypt', 0.9232932925224304),
 ('flycheck', 0.9231752157211304),
 ('nuget', 0.9117671847343445),
 ('superpack', 0.9050602912902832),
 ('setup', 0.9047500491142273),
 ('instal', 0.9010030031204224),
 ('f2pi', 0.8960736989974976),
 ('rubi', 0.8932390213012695),
 ('pip', 0.8924426436424255)]

In [None]:
words = set(w2v_model.wv.index_to_key) # vocabulary correspondance
# https://medium.com/@dilip.voleti/classification-using-word2vec-b1d79d375381

X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_train_wrd2vec]) # concatenate word embeddings of the same sentence in the same element of X_train_vect

X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])  for ls in X_test_wrd2vec])

  X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_train_wrd2vec]) # concatenate word embeddings of the same sentence in the same element of X_train_vect
  X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])  for ls in X_test_wrd2vec])


In [None]:
# Compute sentence embeddings by averaging the word embeddings for the words contained in the sentence
X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))

In [None]:
# Build the model
model_WRD2VEC = MultiOutputClassifier(LogisticRegression(solver='newton-cholesky',  ))
model_WRD2VEC.fit(X_train_vect_avg, y_train_wrd2vec)

In [None]:
# Instantiate and fit a basic Random Forest model on top of the vectors
from sklearn.ensemble import RandomForestClassifier
rf = MultiOutputClassifier(RandomForestClassifier())
rf_model = rf.fit(X_train_vect_avg, y_train_wrd2vec)

In [None]:
# Use the trained model to make predictions on the test data
y_pred = rf_model.predict(X_test_vect_avg)

In [None]:
from sklearn.metrics import precision_score, recall_score
precision = precision_score(y_test_wrd2vec, y_pred, average='micro')
recall = recall_score(y_test_wrd2vec, y_pred, average='micro')
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test_wrd2vec).sum()/len(y_pred), 3)))

Precision: 0.695 / Recall: 0.127 / Accuracy: 55.397


In [None]:
y_pred_wrd2vec = model_WRD2VEC.predict(X_test_vect_avg)
print("Classifier report:\n", classification_report(y_test_wrd2vec, y_pred_wrd2vec, target_names=LabelBinarizer.classes_, zero_division=0))
# Calculate the Jaccard score
score = jaccard_score(y_test_wrd2vec, y_pred_wrd2vec, average='samples')
print(f"Jaccard score: {score:.4f}")

Classifier report:
                 precision    recall  f1-score   support

          .net       0.74      0.29      0.41       174
          ajax       0.00      0.00      0.00        13
     algorithm       0.33      0.06      0.11        16
       android       0.89      0.38      0.53        64
        arrays       0.25      0.10      0.14        21
           asp       0.83      0.19      0.31        53
          bash       0.00      0.00      0.00        12
            c#       0.63      0.23      0.33       194
           c++       0.64      0.43      0.51       101
         cocoa       0.00      0.00      0.00        19
           css       0.11      0.05      0.06        22
      database       0.00      0.00      0.00        23
     debugging       1.00      0.07      0.13        14
        django       0.67      0.11      0.18        19
           gcc       0.00      0.00      0.00        15
     hibernate       1.00      0.11      0.19        19
          html       0.41  

  _warn_prf(average, modifier, msg_start, len(result))


## BERT NLP algorithm

In [58]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [72]:
from transformers import BertTokenizer, TFBertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', truncation=True, padding=True)
model = TFBertModel.from_pretrained("bert-base-uncased")
# text = "R:eplace me by any text you'd like."
# encoded_input = tokenizer(text, return_tensors='tf')
# output = model(encoded_input)
# print(output)

  and should_run_async(code)
Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [73]:
# from transformers import AutoModel # For BERTs
# # from transformers import AutoModeForSequenceClassification # For models fine-tuned on MNLI
# from transformers import AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-small") # v1 and v2
# model = AutoModel.from_pretrained("prajjwal1/bert-small") # v1 and v2

In [8]:
# ID ENCODING BY HAND 
def bert_encode(token_lst, tokenizer, max_len=64):
  all_tokens = []
  all_masks = []
  all_segments = []

  for text in token_lst:

    text = text[:max_len-2] # How to deal with truncation ? 
    text = ["[CLS]"] + text + ["[SEP]"] # special tokens
    pad_len = max_len - len(text) # necesary PAD length 

    token_id = tokenizer.convert_tokens_to_ids(text) + [0] * pad_len 
    pad_masks = [1] * len(text) + [0] * pad_len
    segment_ids = [0] * max_len
        
    all_tokens.append(token_id)
    all_masks.append(pad_masks)
    all_segments.append(segment_ids)
  
  return {'input_ids' : np.array(all_tokens),
          'token_type_ids' : np.array(all_masks),
          'attention_mask' : np.array(all_segments)}

emb_BERT = np.array([])
batch_size=500
for i in range(len(ds)//batch_size):
  # Batch it because otherwise too big for the model to train on 
  encoded_tokens = bert_encode(ds.loc[i*batch_size:(i+1)*batch_size-1,'Token'], tokenizer)
  embeddings = model(encoded_tokens)
  if len(emb_BERT)==0:
    emb_BERT = np.array(embeddings['pooler_output'])
  else:
    emb_BERT = np.vstack((emb_BERT, np.array(embeddings['pooler_output'])))
    # De-batching it to usual format 

In [9]:
print(len(embeddings['pooler_output']))

500


In [85]:
# OVERALL MODEL 
def BERT_encode(batch):
  batch = list(map(lambda x : ' '.join(x) ,batch))
  print(batch)
  print(len(batch))
  tokens = tokenizer.encode(batch, return_tensors='tf')
  print(tokens)
  return model(tokens)

emb_BERT = np.array([])
# Batch the dataset to compute embeddings more easily 
batch_size = 200
nb_batches = len(ds)//batch_size
for i in range(len(ds)//batch_size):
  # Batch it because otherwise too big for the model to train on 
  embeddings = BERT_encode(ds.loc[i*batch_size:(i+1)*batch_size-1,'Token'])
  print('this is embeddings')
  print(embeddings)
  # embeddings = model_USE(encoded_tokens)
  if len(emb_BERT)==0:
    emb_USE = np.array(embeddings)
  else:
    emb_BERT = np.vstack((emb_BERT, np.array(embeddings)))
    # De-batching it to usual format

Output hidden; open in https://colab.research.google.com to view.

In [81]:
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='tf')
output = model(encoded_input)

In [83]:
encoded_input

  and should_run_async(code)


{'input_ids': <tf.Tensor: shape=(1, 12), dtype=int32, numpy=
array([[ 101, 5672, 2033, 2011, 2151, 3793, 2017, 1005, 1040, 2066, 1012,
         102]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(1, 12), dtype=int32, numpy=array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 12), dtype=int32, numpy=array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int32)>}

In [82]:
output

  and should_run_async(code)


TFBaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=<tf.Tensor: shape=(1, 12, 768), dtype=float32, numpy=
array([[[ 0.13862701,  0.15826888, -0.29666466, ..., -0.2708498 ,
         -0.284363  ,  0.45808432],
        [ 0.536364  , -0.23269653,  0.17541912, ...,  0.55402553,
          0.4980719 , -0.00240718],
        [ 0.30023742, -0.34751198,  0.12084411, ..., -0.4562491 ,
          0.32880187,  0.87728155],
        ...,
        [ 0.37985852,  0.120288  ,  0.8282937 , ..., -0.8623719 ,
         -0.5956966 ,  0.04711508],
        [-0.0252418 , -0.7176743 , -0.69504786, ...,  0.07574203,
         -0.6667816 , -0.34007484],
        [ 0.7535387 ,  0.23910922,  0.07174388, ...,  0.2467154 ,
         -0.6458064 , -0.32129768]]], dtype=float32)>, pooler_output=<tf.Tensor: shape=(1, 768), dtype=float32, numpy=
array([[-0.93767875, -0.5042589 , -0.979893  ,  0.90304404,  0.93293256,
        -0.24377501,  0.89257544,  0.22880587, -0.9531208 , -0.9999953 ,
        -0.88623035,  0.990

In [78]:
emb_BERT

  and should_run_async(code)


array([], dtype=float64)

In [66]:
ds.loc[i*batch_size:(i+1)*batch_size-1,'Token']

1000    [ipython, pylab, matplotlib, instal, initi, er...
1001    [static, analysi, tool, python, rubi, sql, cob...
1002    [jqueri, detect, bootstrap, state, bootstrap, ...
1003    [deseri, client, side, ajax, json, date, given...
1004    [method, angularj, that, equal, getjson, newbi...
                              ...                        
1195    [access, app, config, set, class, librari, cal...
1196    [import, qtquick, control, work, qqmlapplicati...
1197    [mysqldump, error, access, deni, despit, corre...
1198    [unabl, chang, background, color, static, tabl...
1199    [prevent, long, queri, pdo, way, make, pdo, ob...
Name: Token, Length: 200, dtype: object

In [None]:
x = emb_BERT
y = ds['Tags']

# Initialize MultiLabelBinarizer
LabelBinarizer = MultiLabelBinarizer()

y = LabelBinarizer.fit_transform(y)

X_train_BERT, X_test_BERT, y_train_BERT, y_test_BERT = train_test_split(x, y, test_size=0.2, random_state=42)

import warnings
from sklearn.exceptions import ConvergenceWarning

# Suppress specific warnings
warnings.filterwarnings('ignore', category=ConvergenceWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

best_params_SB = {'estimator__C': 0.005, 'estimator__max_iter': 500, 'estimator__penalty': 'none', 'estimator__solver': 'lbfgs'}

# Remove the 'estimator__' prefix from the keys of best_params
best_params_cleaned_SB = {key.replace('estimator__', ''): value for key, value in best_params_SB.items()}

# Train the OneVsRestClassifier with the best parameters on the entire training data
model_BERT = MultiOutputClassifier(LogisticRegression(**best_params_cleaned_SB, random_state=42))

In [11]:
model_BERT.fit(X_train_BERT, y_train_BERT)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [12]:
# Make predictions on the testing data
y_pred_BERT = model_BERT.predict(X_test_BERT)

# Evaluate the classifier using classification_report
print("Classifier report:\n", classification_report(y_test_BERT, y_pred_BERT, target_names=LabelBinarizer.classes_, zero_division=0))

# Calculate the Jaccard score
score = jaccard_score(y_test_BERT, y_pred_BERT, average='samples')
print(f"Jaccard score: {score:.4f}")

# Reset warnings to default
warnings.resetwarnings()

Classifier report:
                 precision    recall  f1-score   support

          .net       0.34      0.16      0.22       131
          ajax       0.29      0.33      0.31         6
     algorithm       0.10      0.15      0.12        13
       android       0.55      0.60      0.57        50
        arrays       0.38      0.69      0.49        13
           asp       0.05      0.05      0.05        41
          bash       0.33      0.22      0.27        18
            c#       0.35      0.21      0.26       155
           c++       0.35      0.18      0.24        98
         cocoa       0.11      0.06      0.08        16
           css       0.12      0.19      0.15        16
      database       0.05      0.05      0.05        22
     debugging       0.06      0.08      0.07        13
        django       0.12      0.11      0.11        19
           gcc       0.09      0.08      0.09        12
     hibernate       0.00      0.00      0.00         4
          html       0.26  

  _warn_prf(average, modifier, msg_start, len(result))


## USE NLP Algorithm

In [50]:
import tensorflow_hub as hub 

model_USE = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')

In [52]:
def USE_encode(batch):
  batch = list(map(lambda x : ' '.join(x) ,batch))
  return model_USE(batch).numpy()

emb_USE = np.array([])
# Batch the dataset to compute embeddings more easily 
batch_size = 200
nb_batches = len(ds)//batch_size
for i in range(len(ds)//batch_size):
  # Batch it because otherwise too big for the model to train on 
  embeddings = USE_encode(ds.loc[i*batch_size:(i+1)*batch_size-1,'Token'])
  # embeddings = model_USE(encoded_tokens)
  if len(emb_USE)==0:
    emb_USE = np.array(embeddings)
  else:
    emb_USE = np.vstack((emb_USE, np.array(embeddings)))
    # De-batching it to usual format

In [53]:
embeddings.shape

(200, 512)

In [54]:
emb_USE.shape

(5000, 512)

In [55]:
x = emb_USE
y = ds['Tags']

# Initialize MultiLabelBinarizer
LabelBinarizer = MultiLabelBinarizer()

y = LabelBinarizer.fit_transform(y)

X_train_USE, X_test_USE, y_train_USE, y_test_USE = train_test_split(x, y, test_size=0.2, random_state=42)

import warnings
from sklearn.exceptions import ConvergenceWarning

# Suppress specific warnings
warnings.filterwarnings('ignore', category=ConvergenceWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

best_params_SB = {'estimator__C': 0.005, 'estimator__max_iter': 500, 'estimator__penalty': 'none', 'estimator__solver': 'lbfgs'}

# Remove the 'estimator__' prefix from the keys of best_params
best_params_cleaned_SB = {key.replace('estimator__', ''): value for key, value in best_params_SB.items()}

# Train the OneVsRestClassifier with the best parameters on the entire training data
USE_clf = MultiOutputClassifier(LogisticRegression(**best_params_cleaned_SB, random_state=42))

  and should_run_async(code)


In [56]:
USE_clf.fit(X_train_USE, y_train_USE)

  and should_run_async(code)


In [57]:
# Make predictions on the testing data
y_pred_USE = USE_clf.predict(X_test_USE)

# Evaluate the classifier using classification_report
print("Classifier report:\n", classification_report(y_test_USE, y_pred_USE, target_names=LabelBinarizer.classes_, zero_division=0))

# Calculate the Jaccard score
score = jaccard_score(y_test_USE, y_pred_USE, average='samples')
print(f"Jaccard score: {score:.4f}")

# Reset warnings to default
warnings.resetwarnings()

Classifier report:
                 precision    recall  f1-score   support

          .net       0.54      0.56      0.55       131
          ajax       0.22      0.33      0.27         6
     algorithm       0.29      0.38      0.33        13
       android       0.84      0.72      0.77        50
        arrays       0.39      0.54      0.45        13
           asp       0.43      0.44      0.43        41
          bash       0.64      0.39      0.48        18
            c#       0.55      0.55      0.55       155
           c++       0.71      0.61      0.66        98
         cocoa       0.33      0.19      0.24        16
           css       0.36      0.56      0.44        16
      database       0.30      0.27      0.29        22
     debugging       0.36      0.38      0.37        13
        django       0.85      0.58      0.69        19
           gcc       0.30      0.25      0.27        12
     hibernate       0.25      0.50      0.33         4
          html       0.48  

  and should_run_async(code)
