In [None]:
# April 2025
# Train classification model using Logistic Regression
# Violeta Berdejo-Espinola

In [None]:
# %pip install polars pyarrow scikit-learn embetter imblearn tqdm mpu

In [1]:
# read data

import polars as pl

corpus = pl.read_parquet('../data/outputs_similarity_matches/corpus.parquet')
print(len(corpus))

# define x and y

x = list(corpus['title'] + ' ' + corpus['abstract'])
y = list(corpus['label'])
print(len(x)), print(len(y))

# calculate weights

from sklearn.utils.class_weight import compute_class_weight
import numpy as np

classes = np.unique(y)
weights = compute_class_weight('balanced', classes=classes, y=y)
class_weight_dict = dict(zip(classes, weights))
class_weight_dict

376318
376318
376318


{0: 0.5061371064276203, 1: 41.23580977427131}

In [4]:
corpus.describe()

statistic,index,title,abstract,journal,year,authors,language,label,abstract_length
str,f64,str,str,str,f64,str,str,f64,f64
"""count""",376318.0,"""376318""","""376318""","""376318""",376318.0,"""376318""","""376318""",376318.0,376318.0
"""null_count""",0.0,"""0""","""0""","""0""",0.0,"""0""","""0""",0.0,0.0
"""mean""",259407.224302,,,,2006.047173,,,0.012125,1470.370384
"""std""",149486.554872,,,,11.279078,,,0.109446,501.950977
"""min""",250.0,"""&iteretmochelys imbricata&it s…","""! An oligotrophic coastal fres…","""Acta chiropterologica""",1936.0,"""A'Bear, AD; Boddy, L; Jones, T…","""en""",0.0,300.0
"""25%""",134267.0,,,,2000.0,,,0.0,1128.0
"""50%""",256447.0,,,,2008.0,,,0.0,1448.0
"""75%""",390643.0,,,,2014.0,,,0.0,1771.0
"""max""",524432.0,"""Δ15n variation in ulva lactuca…","""{en} Over the past decades, mu…","""Zoo biology""",2025.0,"""Üzüm, N; Olgun, K""","""en""",1.0,7214.0


In [5]:
print(type(y[9]))
x[:5]

<class 'int'>


['The influence of the nasonov pheromone on the recognition of house bees and foragers by varroa-jacobsoni Simultaneous choice tests proved that Varroa jacobsoni is able to distinguish house bees and foragers by means of the age-dependent Nasonov pheromone production of the bees. The secretion of one or 10 Nasonov glands, respectively, of foragers showed a repellent effect upon the mite equal to one or 10 bee equivalents, respectively, of geraniol, the main component of the pheromone. One hundred bee equivalents of geraniol caused the strongest repellent effect. Specimen gathered from 3 to 7 day old house bees did not show any influence on the mite in comparison to controls. The results show that Varroa jacobsoni is capable of olfactorial recognition of house bees as ideal host.',
 'Efficacy and accuracy of portable pit-antennae when locating fish in ice-covered streams Active tracking of passive integrated transponder (PIT)-tags using portable antennae is becoming an increasingly comm

In [9]:
# split data

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42, stratify=y)

In [10]:
print(len(x_train), len(y_train), len(x_test), len(y_test))

301054 301054 75264 75264


In [11]:
from collections import Counter

counter = Counter()

for _ in y_test:
    counter[_] +=1

print(counter)

Counter({0: 74351, 1: 913})


In [13]:
from imblearn.pipeline import make_pipeline 

from embetter.text import SentenceEncoder

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_predict, StratifiedKFold

from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix

from tqdm import tqdm

def train_eval_model(x_train, y_train, x_test, y_test, kfold, embedding_model):
    
    """"
    Train and test model   
    Parameters:
    -----------
    x_train, x_test: list of strings -> raw text to create embeddings
    y_train_y_test: array -> binary labels
    kfold: int -> indicating number of folds for cross validation
    embedding_model: string - > instantiation of embedding model
    random_state: int -> seed

    Returns:
    --------
    Model object.
    Dictionary with scores.
    Aray of predictions and predicted probabilities.
    Log losss values for test and train set.
    
    """
    pipeline = make_pipeline(
        SentenceEncoder(embedding_model),
        LogisticRegression(solver='liblinear', 
                            class_weight=class_weight_dict, 
                            random_state=42)
    )
    
    #train model
    print("performing cross-validation")
    y_train_pred_cv = cross_val_predict(pipeline, 
                                        x_train, 
                                        y_train, 
                                        cv=StratifiedKFold(kfold), 
                                        method='predict') 
    print("fitting the model")
    pipeline.fit(x_train, y_train)
    
    # make predictions
    print("making predictions")
    # y_train_pred = pipeline.predict(x_train)

    # y_test_pred = pipeline.predict(x_test)
    
    # y_train_pred_proba = pipeline.predict_proba(x_train) 
    
    # y_test_pred_proba = pipeline.predict_proba(x_test)  
    
    y_train_pred = list(tqdm(pipeline.predict(x_train), desc="predicting on training set"))
    y_test_pred = list(tqdm(pipeline.predict(x_test), desc="predicting on test set"))
    
    y_train_pred_proba = np.array(list(tqdm(pipeline.predict_proba(x_train), desc="predicting probabilities on training set")))
    y_test_pred_proba = np.array(list(tqdm(pipeline.predict_proba(x_test), desc="predicting probabilities on test set")))
    
    # log loss 
    logloss_tr = log_loss(y_train, y_train_pred_proba) 
    
    logloss_ts = log_loss(y_test, y_test_pred_proba)
    
    # model object -svm / logreg-
    model = pipeline.steps[-1][1]
    
    # confusion matrix 
    cm_tr = confusion_matrix(y_train, pipeline.predict(x_train)) 
    cm_ts = confusion_matrix(y_test, pipeline.predict(x_test)) 
    
     # get model scores 
    scores = {
        'Embedding_model': embedding_model,
        'CV': kfold,
        'F1_tr_cv': round(f1_score(y_train, y_train_pred_cv), 3),
        'F1_tr': round(f1_score(y_train, y_train_pred), 3),
        'F1_ts': round(f1_score(y_test, y_test_pred), 3),
        'Precision_tr_cv': round(precision_score(y_train, y_train_pred_cv), 3),
        'Precision_tr': round(precision_score(y_train, y_train_pred), 3),
        'Precision_ts': round(precision_score(y_test, y_test_pred), 3),
        'Recall_tr_cv': round(recall_score(y_train, y_train_pred_cv), 3),
        'Recall_tr': round(recall_score(y_train, y_train_pred), 3),
        'Recall_ts': round(recall_score(y_test, y_test_pred), 3)
        }
    
    return model, scores, y_train_pred, y_test_pred, y_train_pred_proba, y_test_pred_proba, logloss_tr, logloss_ts, cm_tr, cm_ts

In [None]:
model_mpnet = 'paraphrase-multilingual-mpnet-base-v2'
model_distill = 'distiluse-base-multilingual-cased-v1'

In [14]:
model, scores, y_train_pred, y_test_pred, y_train_pred_proba, y_test_pred_proba, logloss_tr, logloss_ts, cm_tr, cm_ts = train_eval_model(x_train, y_train, x_test, y_test, 5, 'paraphrase-multilingual-mpnet-base-v2')

performing cross-validation
fitting the model
making predictions


predicting on training set: 100%|██████████| 301054/301054 [00:00<00:00, 2175510.53it/s]
predicting on test set: 100%|██████████| 75264/75264 [00:00<00:00, 703291.43it/s]
predicting probabilities on training set: 100%|██████████| 301054/301054 [00:00<00:00, 1394382.10it/s]
predicting probabilities on test set: 100%|██████████| 75264/75264 [00:00<00:00, 886500.30it/s]


In [15]:
scores

{'Embedding_model': 'paraphrase-multilingual-mpnet-base-v2',
 'CV': 5,
 'F1_tr_cv': 0.138,
 'F1_tr': 0.146,
 'F1_ts': 0.142,
 'Precision_tr_cv': 0.075,
 'Precision_tr': 0.079,
 'Precision_ts': 0.077,
 'Recall_tr_cv': 0.845,
 'Recall_tr': 0.901,
 'Recall_ts': 0.865}

In [20]:
cm_ts

array([[64905,  9446],
       [  123,   790]])

In [22]:
len(y_test)

75264

In [36]:
logloss_tr

0.3048401435824959

In [37]:
logloss_ts

0.3025656417300697

In [34]:
# save to disk

import mpu

mpu.io.write('../results/model_baseline.pickle', model)

pl.DataFrame(scores).write_csv('../results/model_baseline_logreg.csv')
pl.DataFrame(y_train_pred_proba).write_csv('../results/logreg_y_train_pred_proba.csv')
pl.DataFrame(y_test_pred_proba).write_csv('../results/logreg_y_test_pred_proba.csv')
pl.DataFrame(y_train_pred).write_csv('../results/logreg_y_train_pred.csv')
pl.DataFrame(y_test_pred).write_csv('../results/logreg_y_test_pred.csv')
pl.DataFrame(cm_tr).write_csv('../results/logreg_cm_tr.csv')
pl.DataFrame(cm_ts).write_csv('../results/logreg_cm_ts.csv')