In [1]:
import os
import time
import joblib
import numpy as np
import pandas as pd
from ast import literal_eval
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import set_config
import sklearn.metrics as metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
# Define path to data
prefix = "/Users/tatia/Developer/tag-generator/core"
date = "2024-01-02/19"
time_str= "193120"
output_data_dir = f"{prefix}/data/final/{date}"

print(output_data_dir)
data_file_name = f"{time_str}_final_so_questions_2008_2023.csv"
print(f"{output_data_dir}/{data_file_name}")
data = pd.read_csv(f"{output_data_dir}/{data_file_name}", index_col=0,
                   converters={"Title": literal_eval,
                               "Body": literal_eval,
                               "Tags": literal_eval})
data.head(3)


/Users/tatia/Developer/tag-generator/core/data/final/2024-01-02/19
/Users/tatia/Developer/tag-generator/core/data/final/2024-01-02/19/193120_final_so_questions_2008_2023.csv


Unnamed: 0,Title,Body,All_Tags,Tags
0,"[convert, decimal, double]","[want, assign, decimal, variable, trans, doubl...","['c#', 'floating-point', 'type-conversion', 'd...",[c#]
1,"[width, collapse, percentage, width, child, el...","[absolutely, positioned, containing, several, ...","['html', 'css', 'internet-explorer-7']","[html, css]"
2,"[calculate, age, based, datetime, type, birthday]","[given, representing, person, birthday, calcul...","['c#', '.net', 'datetime']","[c#, .net]"


In [3]:
data.shape

(676651, 4)

In [4]:
data["Full_doc"] = data["Title"] + data["Body"]
data["Full_doc"].head(3)

0    [convert, decimal, double, want, assign, decim...
1    [width, collapse, percentage, width, child, el...
2    [calculate, age, based, datetime, type, birthd...
Name: Full_doc, dtype: object

In [5]:
def metrics_score(model, df, y_true, y_pred):
    """Compilation function of metrics specific to multi-label
    classification problems in a Pandas DataFrame.
    This dataFrame will have 1 row per metric
    and 1 column per model tested. 

    Parameters
    ----------------------------------------
    model : string
        Name of the tested model
    df : DataFrame 
        DataFrame to extend. 
        If None : Create DataFrame.
    y_true : array
        Array of true values to test
    y_pred : array
        Array of predicted values to test
    ----------------------------------------
    """
    if(df is not None):
        temp_df = df
    else:
        temp_df = pd.DataFrame(index=["Accuracy", "F1",
                                      "Jaccard", "Recall",
                                      "Precision"],
                               columns=[model])
        
    scores = []
    scores.append(metrics.accuracy_score(y_true, 
                                         y_pred))
    scores.append(metrics.f1_score(y_pred, 
                                   y_true, 
                                   average='weighted'))
    scores.append(metrics.jaccard_score(y_true, 
                                        y_pred, 
                                        average='weighted'))
    scores.append(metrics.recall_score(y_true, 
                                       y_pred, 
                                       average='weighted'))
    scores.append(metrics.precision_score(y_true, 
                                          y_pred, 
                                          average='weighted'))
    temp_df[model] = scores
    
    return temp_df

In [6]:
# Define X and y
X = data["Full_doc"]
y = data["Tags"]

# Initialize the "CountVectorizer" TFIDF for Full_doc
vectorizer = TfidfVectorizer(analyzer="word",
                             max_df=.6,
                             min_df=0.005,
                             tokenizer=None,
                             preprocessor=' '.join,
                             stop_words=None,
                             lowercase=False)

vectorizer.fit(X)
X_tfidf = vectorizer.transform(X)

print("Shape of X for Full_doc: {}".format(X_tfidf.shape))

# Multilabel binarizer for targets
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(y)
y_binarized = multilabel_binarizer.transform(y)

print("Shape of y: {}".format(y_binarized.shape))

Shape of X for Full_doc: (676651, 1134)
Shape of y: (676651, 100)


In [7]:
# Create train and test split (30%)
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_binarized,
                                                    test_size=0.3, random_state=8)
print("X_train shape : {}".format(X_train.shape))
print("X_test shape : {}".format(X_test.shape))
print("y_train shape : {}".format(y_train.shape))
print("y_test shape : {}".format(y_test.shape))

X_train shape : (473655, 1134)
X_test shape : (202996, 1134)
y_train shape : (473655, 100)
y_test shape : (202996, 100)


In [8]:
from sklearn.model_selection import GridSearchCV
# Initialize Logistic Regression with OneVsRest

start = time.perf_counter()
param_lr = {"estimator__C": [100, 10, 1.0, 0.1],
               "estimator__penalty": ["l1", "l2"],
               "estimator__dual": [False],
               "estimator__solver": ["liblinear"]}

multiclass_lr_cv = GridSearchCV(OneVsRestClassifier(LogisticRegression()),
                              param_grid=param_lr,
                              n_jobs=-1,
                              cv=5,
                              scoring="f1_weighted",
                              return_train_score = True,
                              refit=True)

multiclass_lr_cv.fit(X_train, y_train)
end = time.perf_counter()
print(f"It tooks {end - start}(s)") #1h approx

It tooks -4233.229224875002(s)


In [9]:
lr_cv_results = pd.DataFrame.from_dict(multiclass_lr_cv.cv_results_)
print("-"*50)
print("Best params for Logistic Regression")
print("-" * 50)
lr_best_params = multiclass_lr_cv.best_params_
print(lr_best_params)

--------------------------------------------------
Best params for Logistic Regression
--------------------------------------------------
{'estimator__C': 100, 'estimator__dual': False, 'estimator__penalty': 'l1', 'estimator__solver': 'liblinear'}


In [10]:
lr_cv_results[lr_cv_results["params"]==lr_best_params]


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_estimator__C,param_estimator__dual,param_estimator__penalty,param_estimator__solver,params,split0_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,2664.185185,10.264316,1.23594,0.028223,100,False,l1,liblinear,"{'estimator__C': 100, 'estimator__dual': False...",0.525614,...,0.525274,0.001384,1,0.543155,0.542944,0.542777,0.544172,0.543198,0.543249,0.000486


In [11]:
# Predict
y_test_predicted_labels_tfidf = multiclass_lr_cv.predict(X_test)

# Inverse transform
y_test_pred_inversed = multilabel_binarizer.inverse_transform(y_test_predicted_labels_tfidf)
y_test_inversed = multilabel_binarizer.inverse_transform(y_test)

print("-"*50)
print("Print 5 first predicted Tags vs true Tags")
print("-" * 50)
print("Predicted:", y_test_pred_inversed[0:5])
print("True:", y_test_inversed[0:5])

--------------------------------------------------
Print 5 first predicted Tags vs true Tags
--------------------------------------------------
Predicted: [('c++',), (), ('mysql',), ('php',), ('python',)]
True: [('c++',), ('.net', 'c#'), ('mysql',), ('php',), ('python',)]


In [12]:
df_metrics_compare = metrics_score("Logit", 
                                   df=None, 
                                   y_true = y_test,
                                   y_pred = y_test_predicted_labels_tfidf)
df_metrics_compare

Unnamed: 0,Logit
Accuracy,0.255276
F1,0.587398
Jaccard,0.377973
Recall,0.423507
Precision,0.724488


In [26]:
from pathlib import Path
prefix = "/Users/tatia/Developer/tag-generator/core/"
#date = "2024-01-02/19"
#time_str= "193120"
hr = time_str[0:2]
output_model_dir = f"{prefix}/models/{date}"

print(output_model_dir)

Path(output_model_dir).mkdir(parents=True, exist_ok=True)
data_file_name = Path(data_file_name).stem
joblib.dump(multiclass_lr_cv,f"{output_model_dir}/{data_file_name}_logit_nlp_model.pkl")
joblib.dump(vectorizer,f"{output_model_dir}/{data_file_name}_tfidf_vectorizer.pkl")
joblib.dump(multilabel_binarizer,f"{output_model_dir}/{data_file_name}_multilabel_binarizer.pkl")

/Users/tatia/Developer/tag-generator/core//models/2024-01-02/19


['/Users/tatia/Developer/tag-generator/core//models/2024-01-02/19/193120_final_so_questions_2008_2023_multilabel_binarizer.pkl']

In [None]:
version = "1.0.0"
artifacts = {
    "model": multiclass_lr_cv,
    "binarizer": multilabel_binarizer,
    "vectorizer": vectorizer,
    "metadata" : {
        "name": "OvR Logistic Regression",
        "version": version,
        "description": "One-vs-the-rest (OvR) multiclass Logistic Regression classifier.",
        "training_data": "2024-01-02, 19:31:20'"
    }
}
joblib.dump(artifacts, f"{output_model_dir}/{data_file_name}artifacts_{version}.pkl")