In [1]:
#imports
import joblib
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

In [2]:
from joblib import load
import os

# Folder that holds the serialized feature matrices and label vectors.
input_dir = '../../output/embeddings'

# File names in the same order as the variables they are unpacked into below.
artifact_files = (
    'X_train_emb.joblib',
    'X_test_emb.joblib',
    'y_train.joblib',
    'y_test.joblib',
)

# Read every artifact in order: train features, test features, train labels,
# test labels.
X_train_emb, X_test_emb, y_train, y_test = (
    load(os.path.join(input_dir, file_name)) for file_name in artifact_files
)

print("Embeddings and labels loaded successfully!")

Embeddings and labels loaded successfully!


In [3]:
# Hyperparameter search space for the logistic-regression classifier.
# Two sub-grids are used so each solver is only paired with the penalties
# listed for it: liblinear with both l1 and l2, saga with l2 only.
param_grid = [
    {
        'penalty': ['l1', 'l2'],
        'C': [0.01, 0.1, 1, 10],
        'solver': ['liblinear'],
        'max_iter': [500]
    },
    {
        'penalty': ['l2'],
        'C': [0.01, 0.1, 1, 10],
        'solver': ['saga'],
        'max_iter': [1000]
    }
]

# 10-fold cross-validation; shuffling with a fixed seed keeps the fold
# assignment identical between runs.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    # Seed the estimator as well: both liblinear and saga shuffle the data
    # internally, so without random_state repeated runs of this cell could
    # report different CV scores and pick a different best model.
    estimator=LogisticRegression(random_state=42),
    param_grid=param_grid,
    cv=kfold,
    scoring='accuracy',
    # NOTE(review): worker count is hard-coded; adjust to the cores available
    # on the machine running the notebook.
    n_jobs=16
)
grid_search.fit(X_train_emb, y_train)

# With GridSearchCV's default refit=True this is the best parameter
# combination refitted on the full training set.
best_model = grid_search.best_estimator_

In [6]:
# Predict labels for the test features with the tuned model, then print the
# per-class precision / recall / F1 summary.
y_pred = best_model.predict(X_test_emb)
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.95      0.95      0.95      7843
           1       0.95      0.95      0.95      8568

    accuracy                           0.95     16411
   macro avg       0.95      0.95      0.95     16411
weighted avg       0.95      0.95      0.95     16411



In [7]:
# Persist the tuned classifier. The target directory is created first so a
# missing folder cannot make the save fail after the grid search has already
# been run.
model_path = '../../output/models/LogisticRegression_SBERT.joblib'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(best_model, model_path)

['../../output/models/LogisticRegression_SBERT.joblib']