# read data

In [1]:
# read data

import polars as pl

corpus = pl.read_parquet('../data/outputs_similarity_matches/corpus.parquet')
print(len(corpus))

# define x and y

x = list(corpus['title'] + ' ' + corpus['abstract'])
y = list(corpus['label'])
print(len(x)), print(len(y))

# calculate weights

from sklearn.utils.class_weight import compute_class_weight
import numpy as np

classes = np.unique(y)
weights = compute_class_weight('balanced', classes=classes, y=y)
class_weight_dict = dict(zip(classes, weights))
class_weight_dict

376318
376318
376318


{0: 0.5061371064276203, 1: 41.23580977427131}

In [None]:
x = [
        "The new album from the popular band topped the music charts.",
        "The television series received critical acclaim for its storytelling.",
        "The actor won an award for their outstanding performance in the film.",
        "The music festival attracted thousands of fans from around the world.",
        "The art exhibition features works from renowned contemporary artists.",
        "The comedian's stand-up special is now available on streaming platforms.",
        "The video game has become a global phenomenon with millions of players.",
        "The celebrity announced their engagement on social media.",
        "The theme park is launching a new ride for the summer season."]

y = [0,1,0,1,0,0,0,0,0]

# calculate weights

from sklearn.utils.class_weight import compute_class_weight
import numpy as np

classes = np.unique(y)
weights = compute_class_weight('balanced', classes=classes, y=y)
class_weight_dict = dict(zip(classes, weights))
class_weight_dict

In [2]:
from imblearn.pipeline import make_pipeline 
from embetter.text import SentenceEncoder
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.metrics import confusion_matrix

from tqdm import tqdm
import numpy as np
import time

def train_eval_model(x_train, y_train, x_test, y_test, kfold, embedding_model):
    
    """"
    Train and test model   
    Parameters:
    -----------
    x_train, x_test: list of strings -> raw text to create embeddings
    y_train_y_test: array -> binary labels
    kfold: int -> indicating number of folds for cross validation
    embedding_model: string - > instantiation of embedding model
    random_state: int -> seed
    C: int -> strength of regularisation parametre

    Returns:
    --------
    Model object.
    Dictionary with scores.
    Aray of predictions and predicted probabilities.
    Log losss values for test and train set.
    
    """
    pipeline = make_pipeline(
        SentenceEncoder(embedding_model),
        SGDClassifier(loss='hinge',         # svm
                      max_iter=5,           # epochs
                      learning_rate='optimal',
                      n_iter_no_change=2,   # number of iterations with no improvement to wait before stopping fitting
                      class_weight=class_weight_dict,
                      random_state=42,
                      n_jobs=6,             # number of CPUs to use
                      verbose=1 
            )
    )
    
    # Perform cross-validation with time tracking
    print("Performing cross-validation")
    skf = StratifiedKFold(n_splits=kfold)

    y_train_pred_cv = np.zeros(len(y_train))

    for fold, (train_idx, val_idx) in enumerate(skf.split(x_train, y_train), 1):
        print(f"Starting fold {fold}")
        start_time = time.time()
        
        # Fit on training fold
        pipeline.fit([x_train[i] for i in train_idx], [y_train[i] for i in train_idx])
        
        # Predict on validation fold
        y_train_pred_cv[val_idx] = pipeline.predict([x_train[i] for i in val_idx])
        
        end_time = time.time()
        duration = end_time - start_time
        print(f"Finished fold {fold} in {duration:.2f} seconds")
    
    print("Fitting the final model")
    pipeline.fit(x_train, y_train)
    
    # make predictions
    print("Making predictions")
    y_train_pred = list(tqdm(pipeline.predict(x_train), desc="Predicting on training set"))
    y_test_pred = list(tqdm(pipeline.predict(x_test), desc="Predicting on test set"))
    
    # model object -svm / logreg-
    model = pipeline.steps[-1][1]  

    # get model scores 
    scores = {
        'CV': kfold,
        'F1_tr_cv': round(f1_score(y_train, y_train_pred_cv), 3),
        'F1_tr': round(f1_score(y_train, y_train_pred), 3),
        'F1_ts': round(f1_score(y_test, y_test_pred), 3),
        'Precision_tr_cv': round(precision_score(y_train, y_train_pred_cv), 3),
        'Precision_tr': round(precision_score(y_train, y_train_pred), 3),
        'Precision_ts': round(precision_score(y_test, y_test_pred), 3),
        'Recall_tr_cv': round(recall_score(y_train, y_train_pred_cv), 3),
        'Recall_tr': round(recall_score(y_train, y_train_pred), 3),
        'Recall_ts': round(recall_score(y_test, y_test_pred), 3)
        }
    
    print(f'Confusion matrix train set:\n{confusion_matrix(y_train, pipeline.predict(x_train))}')
    print(f'Confusion matrix test set:\n{confusion_matrix(y_test, pipeline.predict(x_test))}') 
 
    return scores, y_train_pred, y_test_pred, model

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42, stratify=y)
len(x_train), len(x_test)

(301054, 75264)

In [4]:
scores, y_train_pred, y_test_pred, model = train_eval_model(x_train, y_train, x_test, y_test, 5, 'paraphrase-multilingual-mpnet-base-v2')

Performing cross-validation
Starting fold 1
-- Epoch 1
Norm: 48.80, NNZs: 768, Bias: -23.461583, T: 240843, Avg. loss: 4.364508
Total training time: 0.25 seconds.
-- Epoch 2
Norm: 33.95, NNZs: 768, Bias: -12.626407, T: 481686, Avg. loss: 0.826260
Total training time: 0.50 seconds.
-- Epoch 3
Norm: 28.89, NNZs: 768, Bias: -9.825129, T: 722529, Avg. loss: 0.631117
Total training time: 0.75 seconds.
-- Epoch 4
Norm: 26.55, NNZs: 768, Bias: -8.115728, T: 963372, Avg. loss: 0.547474
Total training time: 1.01 seconds.
-- Epoch 5
Norm: 24.89, NNZs: 768, Bias: -7.077434, T: 1204215, Avg. loss: 0.478720
Total training time: 1.27 seconds.




Finished fold 1 in 1730.79 seconds
Starting fold 2
-- Epoch 1
Norm: 51.05, NNZs: 768, Bias: -24.606095, T: 240843, Avg. loss: 4.295647
Total training time: 0.24 seconds.
-- Epoch 2
Norm: 33.08, NNZs: 768, Bias: -14.045523, T: 481686, Avg. loss: 0.801592
Total training time: 0.49 seconds.
-- Epoch 3
Norm: 28.58, NNZs: 768, Bias: -9.735281, T: 722529, Avg. loss: 0.615612
Total training time: 0.73 seconds.
-- Epoch 4
Norm: 26.52, NNZs: 768, Bias: -7.588107, T: 963372, Avg. loss: 0.521560
Total training time: 0.98 seconds.
-- Epoch 5
Norm: 25.06, NNZs: 768, Bias: -7.402012, T: 1204215, Avg. loss: 0.489844
Total training time: 1.22 seconds.




Finished fold 2 in 3863.17 seconds
Starting fold 3
-- Epoch 1
Norm: 49.31, NNZs: 768, Bias: -23.511032, T: 240843, Avg. loss: 4.120446
Total training time: 0.26 seconds.
-- Epoch 2
Norm: 34.09, NNZs: 768, Bias: -13.417673, T: 481686, Avg. loss: 0.816942
Total training time: 0.51 seconds.
-- Epoch 3
Norm: 28.82, NNZs: 768, Bias: -9.828206, T: 722529, Avg. loss: 0.603001
Total training time: 0.77 seconds.
-- Epoch 4
Norm: 26.30, NNZs: 768, Bias: -8.183595, T: 963372, Avg. loss: 0.523062
Total training time: 1.03 seconds.
-- Epoch 5
Norm: 25.08, NNZs: 768, Bias: -7.157042, T: 1204215, Avg. loss: 0.492730
Total training time: 1.29 seconds.




Finished fold 3 in 2235.77 seconds
Starting fold 4
-- Epoch 1
Norm: 50.16, NNZs: 768, Bias: -23.320334, T: 240843, Avg. loss: 4.054760
Total training time: 0.24 seconds.
-- Epoch 2
Norm: 34.29, NNZs: 768, Bias: -14.296457, T: 481686, Avg. loss: 0.827502
Total training time: 0.49 seconds.
-- Epoch 3
Norm: 29.09, NNZs: 768, Bias: -9.711394, T: 722529, Avg. loss: 0.623602
Total training time: 0.74 seconds.
-- Epoch 4
Norm: 26.40, NNZs: 768, Bias: -8.310311, T: 963372, Avg. loss: 0.530295
Total training time: 1.00 seconds.
-- Epoch 5
Norm: 24.99, NNZs: 768, Bias: -7.421527, T: 1204215, Avg. loss: 0.493675
Total training time: 1.26 seconds.




Finished fold 4 in 1841.80 seconds
Starting fold 5
-- Epoch 1
Norm: 51.59, NNZs: 768, Bias: -24.933665, T: 240844, Avg. loss: 4.585689
Total training time: 0.25 seconds.
-- Epoch 2
Norm: 34.49, NNZs: 768, Bias: -14.069610, T: 481688, Avg. loss: 0.844360
Total training time: 0.51 seconds.
-- Epoch 3
Norm: 29.45, NNZs: 768, Bias: -10.590814, T: 722532, Avg. loss: 0.621153
Total training time: 0.77 seconds.
-- Epoch 4
Norm: 26.64, NNZs: 768, Bias: -8.312561, T: 963376, Avg. loss: 0.525965
Total training time: 1.02 seconds.
-- Epoch 5
Norm: 24.98, NNZs: 768, Bias: -7.121520, T: 1204220, Avg. loss: 0.476215
Total training time: 1.27 seconds.




Finished fold 5 in 1728.62 seconds
Fitting the final model
-- Epoch 1
Norm: 43.60, NNZs: 768, Bias: -19.490344, T: 301054, Avg. loss: 3.654130
Total training time: 0.30 seconds.
-- Epoch 2
Norm: 30.15, NNZs: 768, Bias: -12.329048, T: 602108, Avg. loss: 0.713228
Total training time: 0.61 seconds.
-- Epoch 3
Norm: 26.66, NNZs: 768, Bias: -8.310700, T: 903162, Avg. loss: 0.554633
Total training time: 0.93 seconds.
-- Epoch 4
Norm: 24.02, NNZs: 768, Bias: -7.712512, T: 1204216, Avg. loss: 0.492060
Total training time: 1.25 seconds.
-- Epoch 5
Norm: 22.74, NNZs: 768, Bias: -6.514829, T: 1505270, Avg. loss: 0.452196
Total training time: 1.58 seconds.
Making predictions


Predicting on training set: 100%|██████████| 301054/301054 [00:00<00:00, 1384614.87it/s]
Predicting on test set: 100%|██████████| 75264/75264 [00:00<00:00, 1151066.90it/s]


Confusion matrix train set:
[[265147  32257]
 [   710   2940]]
Confusion matrix test set:
[[66342  8009]
 [  193   720]]


In [8]:
scores


{'CV': 5,
 'F1_tr_cv': 0.09,
 'F1_tr': 0.151,
 'F1_ts': 0.149,
 'Precision_tr_cv': 0.048,
 'Precision_tr': 0.084,
 'Precision_ts': 0.082,
 'Recall_tr_cv': 0.908,
 'Recall_tr': 0.805,
 'Recall_ts': 0.789}

In [9]:
print(np.mean(y_test_pred == y_test))

0.0


In [10]:
# save to disk

import mpu

mpu.io.write('../results/model_baseline____.pickle', model)

pl.DataFrame(scores).write_csv('../results/model_baseline___.csv')

pl.DataFrame(y_train_pred).write_csv('../results/logreg_y_train_pred___.csv')
pl.DataFrame(y_test_pred).write_csv('../results/logreg_y_test_pred___.csv')
# pl.DataFrame(cm_tr).write_csv('../results/logreg_cm_tr.csv')
# pl.DataFrame(cm_ts).write_csv('../results/logreg_cm_ts.csv')

In [None]:
import mpu

model = mpu.io.read('../results/model_baseline____.pickle')

In [None]:
import polars as pl

# import spanish data

spa_dat = pl.read

In [None]:
from sklearn.linear_model import SGDClassifier

model.predict()