In [None]:
# Model training of text classifiers using logreg, svm, mlp
# Violeta Berdejo-Espinola, Akos Hajas, Nan Ye
# November 2024

# read data

In [1]:
import mpu

# raw corpus
# Every raw document is a sequence of strings; the pieces are joined with
# single spaces so each document becomes one string.
# NOTE(review): these are .pickle files - only load them from a trusted source.

corpus_raw = mpu.io.read('../data/corpus_raw.pickle')
corpus_raw_long = mpu.io.read('../data/corpus_raw_long.pickle')

x_raw = [' '.join(doc) for doc in corpus_raw]
x_raw_long = [' '.join(doc) for doc in corpus_raw_long]

# clean corpus
# The cleaned corpora are used exactly as loaded.

corpus = mpu.io.read('../data/corpus_clean.pickle')
corpus_long = mpu.io.read('../data/corpus_clean_long.pickle')

x, x_long = corpus, corpus_long

# pos, negs
# Labels: one 1 per positive followed by one 0 per negative.
# NOTE(review): assumes every corpus lists all positives first, then all
# negatives, in the same order as pos + neg - confirm against the data prep.

pos = mpu.io.read('../data/pos.pickle')
neg = mpu.io.read('../data/neg.pickle')
y = len(pos) * [1] + len(neg) * [0]

In [2]:
# calculate weights
# Each weight is n_samples / (n_classes * n_samples_in_class) with n_classes = 2,
# so the class with fewer documents receives the larger weight.
# The counts come from the full dataset (this cell runs before the train/test
# split), not from the training split alone.
# NOTE(review): assumes len(x) == len(pos) + len(neg) - confirm.

weight_for_class_0 = len(x) / (len(neg) * 2) 
weight_for_class_1 = len(x) / (len(pos) * 2) 

# split data

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of every corpus variant. All four calls share the same label
# list and the same random_state.
# NOTE(review): the four splits are expected to select the same documents;
# this relies on all corpora having the same length and ordering - confirm.

# clean text, short
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=42
)

# clean text, long
x_train_long, x_test_long, y_train_long, y_test_long = train_test_split(
    x_long, y, test_size=0.20, random_state=42
)

# raw text, short
x_train_r, x_test_r, y_train_r, y_test_r = train_test_split(
    x_raw, y, test_size=0.20, random_state=42
)

# raw text, long
x_train_r_long, x_test_r_long, y_train_r_long, y_test_r_long = train_test_split(
    x_raw_long, y, test_size=0.20, random_state=42
)

In [None]:
from collections import Counter

# Tally how many documents of each class ended up in the held-out split.
counter = Counter(y_test)

print(counter)

# instantiate feature extractors, embedding models, resamplers, models 

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer # uses one-dim array of strings ~ shape (n,)
from sklearn.feature_extraction.text import CountVectorizer # returns arrays

# Bag-of-words feature extractors, both created with default settings.
vect_cv = CountVectorizer()
vect_tfidf = TfidfVectorizer()

# Model identifiers handed to SentenceEncoder by the embedding pipelines below.
model_mpnet = 'paraphrase-multilingual-mpnet-base-v2'
model_distill = 'distiluse-base-multilingual-cased-v1'

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import ADASYN 

# Resamplers used as a pipeline step in the resampled experiments; all are
# seeded with 42.
# rus: random undersampling with sampling_strategy=1
#      (presumably the target minority/majority ratio - confirm in imblearn docs)
# ros: random oversampling of every class except the majority ('not majority')
# ada: ADASYN with its default sampling strategy
rus = RandomUnderSampler(random_state=42, sampling_strategy=1)
ros = RandomOverSampler(random_state=42, sampling_strategy='not majority')
ada = ADASYN(random_state=42)

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

# Logistic regression: plain, and with the per-class weights computed earlier.
logreg = LogisticRegression(solver='liblinear', random_state=42)
logreg_weight = LogisticRegression(
    solver='liblinear',
    class_weight={0: weight_for_class_0, 1: weight_for_class_1},
    random_state=42,
)

# Linear SVM: plain, and class-weighted.
# NOTE(review): only the weighted variant enables probability=True, while the
# training functions in this notebook only ever call predict - confirm it is needed.
svm = SVC(kernel='linear')
svm_weight = SVC(
    kernel='linear',
    class_weight={0: weight_for_class_0, 1: weight_for_class_1},
    probability=True,
)

# mlp: no hidden layer (hidden_layer_sizes=()), Adam, constant learning rate.
mlp = MLPClassifier(
    activation='logistic',
    batch_size=16,
    hidden_layer_sizes=(),
    learning_rate='constant',
    learning_rate_init=0.001,
    solver='adam',
    random_state=42,
)

# mlp_t: one hidden layer of 5 units, SGD with inverse-scaling learning rate.
mlp_t = MLPClassifier(
    activation='logistic',
    batch_size=16,
    hidden_layer_sizes=(5,),
    learning_rate='invscaling',
    learning_rate_init=1,
    solver='sgd',
    random_state=42,
    max_iter=400,
)

# function to train eval models

In [None]:
from imblearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline 

from embetter.text import SentenceEncoder

from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import f1_score, precision_score, recall_score

import pandas as pd

In [None]:
# TF TFIDF baseline and weighted

def train_eval_tf_tfidf(x_train, y_train, x_test, y_test, text_length, kfold):
    """Train and score TF / TF-IDF pipelines, with and without class weights.

    Each vectorizer in (vect_cv, vect_tfidf) is paired with logreg and svm
    (reported with weighting 'None') and then with logreg_weight and
    svm_weight (reported as 'Weighted'). These objects are read from the
    notebook's global namespace.

    For every pairing, cross-validated predictions are produced on the
    training data with StratifiedKFold(kfold); the pipeline is then fitted on
    the whole training set and used to predict both the training and the
    test set.

    Parameters
    ----------
    x_train, y_train : training documents and labels.
    x_test, y_test : held-out documents and labels.
    text_length : label copied into the 'Text_length' field of every row.
    kfold : number of folds given to StratifiedKFold; also stored under 'CV'.

    Returns
    -------
    list of dict
        One row per pairing holding the classifier, the vectorizer, the
        weighting label, 'CV', 'Text_length' and F1 / precision / recall
        (rounded to 3 decimals) for the cross-validated training predictions
        ('_tr_cv'), the training predictions ('_tr') and the test
        predictions ('_ts').
    """
    metric_table = (
        ('F1', f1_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
    )

    def run_estimator(estimator, feature_extractor, balanced):
        model = Pipeline([
            ("vectorizer", feature_extractor),
            ("estimator", estimator),
        ])

        # cross_val_predict splits the data first and applies the pipeline
        # steps within each fold
        folds = StratifiedKFold(kfold)
        cv_pred = cross_val_predict(model, x_train, y_train, cv=folds, method='predict')

        model.fit(x_train, y_train)
        train_pred = model.predict(x_train)
        test_pred = model.predict(x_test)

        row = {
            'Classifier': model['estimator'],
            'Feature_extraction': model['vectorizer'],
            'Weighting': 'Weighted' if balanced else 'None',
            'CV': kfold,
            'Text_length': text_length,
        }

        # Metric columns are added grouped by metric, then by evaluation set,
        # which fixes the column order of the resulting table.
        evaluations = (
            ('tr_cv', y_train, cv_pred),
            ('tr', y_train, train_pred),
            ('ts', y_test, test_pred),
        )
        for metric_name, metric_fn in metric_table:
            for suffix, truth, predicted in evaluations:
                row[f'{metric_name}_{suffix}'] = round(metric_fn(truth, predicted), 3)

        return row

    # (estimator, is_class_weighted) in the order the rows are produced
    configurations = [
        (logreg, False),
        (svm, False),
        (logreg_weight, True),
        (svm_weight, True),
    ]

    return [
        run_estimator(estimator, feature_extractor, balanced)
        for feature_extractor in (vect_cv, vect_tfidf)
        for estimator, balanced in configurations
    ]

In [None]:
# embeddings baseline and weighted

def train_eval_embeddings(x_train, y_train, x_test, y_test, text_length, kfold):
    """Train and score sentence-embedding pipelines, with and without class weights.

    For each embedding model name in (model_mpnet, model_distill) a
    SentenceEncoder is combined with logreg, svm, mlp and mlp_t (weighting
    reported as 'None') and then with logreg_weight and svm_weight (reported
    as 'Weighted'). These objects are read from the notebook's global
    namespace.

    For every combination, cross-validated predictions are produced on the
    training data with StratifiedKFold(kfold); the pipeline is then fitted on
    the whole training set and used to predict the training and test sets.

    Parameters
    ----------
    x_train, y_train : training documents and labels.
    x_test, y_test : held-out documents and labels.
    text_length : label copied into the 'Text_length' field of every row.
    kfold : number of folds given to StratifiedKFold; also stored under 'CV'.

    Returns
    -------
    list of dict
        One row per combination holding the classifier, the embedding model
        name, the weighting label, 'CV', 'Text_length' and F1 / precision /
        recall (rounded to 3 decimals) for the cross-validated training
        predictions ('_tr_cv'), the training predictions ('_tr') and the test
        predictions ('_ts').
    """

    def run_estimator(estimator, embedding_model, balanced):
        # The embedding model is an explicit argument so this helper does not
        # depend on the loop variable of the enclosing function.
        pipeline = make_pipeline(
            SentenceEncoder(embedding_model),
            estimator
        )

        y_train_pred_cv = cross_val_predict(pipeline, x_train, y_train, cv=StratifiedKFold(kfold), method='predict')

        pipeline.fit(x_train, y_train)

        y_train_pred = pipeline.predict(x_train)

        y_test_pred = pipeline.predict(x_test)

        return {
            'Classifier': estimator,
            'Feature_extraction': embedding_model,
            # The string 'None' (not the None object) matches the label that
            # train_eval_tf_tfidf writes, so the combined results table uses
            # a single value for unweighted runs.
            'Weighting': 'Weighted' if balanced else 'None',
            'CV': kfold,
            'Text_length': text_length,
            'F1_tr_cv': round(f1_score(y_train, y_train_pred_cv), 3),
            'F1_tr': round(f1_score(y_train, y_train_pred), 3),
            'F1_ts': round(f1_score(y_test, y_test_pred), 3),
            'Precision_tr_cv': round(precision_score(y_train, y_train_pred_cv), 3),
            'Precision_tr': round(precision_score(y_train, y_train_pred), 3),
            'Precision_ts': round(precision_score(y_test, y_test_pred), 3),
            'Recall_tr_cv': round(recall_score(y_train, y_train_pred_cv), 3),
            'Recall_tr': round(recall_score(y_train, y_train_pred), 3),
            'Recall_ts': round(recall_score(y_test, y_test_pred), 3)
        }

    all_scores = []

    for embedding_model in [model_mpnet, model_distill]:

        for estimator in [logreg, svm, mlp, mlp_t]:
            all_scores.append(run_estimator(estimator, embedding_model, False))

        for estimator in [logreg_weight, svm_weight]:
            all_scores.append(run_estimator(estimator, embedding_model, True))

    return all_scores

In [None]:
# TF TFIDF resampled

def train_eval_tf_tfidf_resampled(x_train, y_train, x_test, y_test, text_length, kfold):
    """Train and score TF / TF-IDF pipelines that resample before fitting.

    Every combination of vectorizer (vect_cv, vect_tfidf), resampler
    (rus, ros, ada) and estimator (logreg, svm, mlp) is assembled as a
    vectorizer -> resampler -> estimator pipeline. These objects are read
    from the notebook's global namespace.

    For every combination, cross-validated predictions are produced on the
    training data with StratifiedKFold(kfold); the pipeline is then fitted on
    the whole training set and used to predict the training and test sets.

    Parameters
    ----------
    x_train, y_train : training documents and labels.
    x_test, y_test : held-out documents and labels.
    text_length : label copied into the 'Text_length' field of every row.
    kfold : number of folds given to StratifiedKFold; also stored under 'CV'.

    Returns
    -------
    list of dict
        One row per combination. 'Weighting' holds the resampler object; the
        remaining fields are the classifier, the vectorizer, 'CV',
        'Text_length' and F1 / precision / recall rounded to 3 decimals for
        the '_tr_cv', '_tr' and '_ts' predictions.
    """

    def rounded(metric, truth, predicted):
        # metric value rounded to three decimals
        return round(metric(truth, predicted), 3)

    def evaluate(feature_extractor, resampler, estimator):
        model = Pipeline([
            ("vectorizer", feature_extractor),
            ("resampler", resampler),
            ("estimator", estimator),
        ])

        folds = StratifiedKFold(kfold)
        cv_pred = cross_val_predict(model, x_train, y_train, cv=folds, method='predict')

        model.fit(x_train, y_train)
        train_pred = model.predict(x_train)
        test_pred = model.predict(x_test)

        return {
            'Classifier': model['estimator'],
            'Feature_extraction': model['vectorizer'],
            'Weighting': model['resampler'],
            'CV': kfold,
            'Text_length': text_length,
            'F1_tr_cv': rounded(f1_score, y_train, cv_pred),
            'F1_tr': rounded(f1_score, y_train, train_pred),
            'F1_ts': rounded(f1_score, y_test, test_pred),
            'Precision_tr_cv': rounded(precision_score, y_train, cv_pred),
            'Precision_tr': rounded(precision_score, y_train, train_pred),
            'Precision_ts': rounded(precision_score, y_test, test_pred),
            'Recall_tr_cv': rounded(recall_score, y_train, cv_pred),
            'Recall_tr': rounded(recall_score, y_train, train_pred),
            'Recall_ts': rounded(recall_score, y_test, test_pred),
        }

    # vectorizer is the outermost loop, estimator the innermost
    return [
        evaluate(feature_extractor, resampler, estimator)
        for feature_extractor in (vect_cv, vect_tfidf)
        for resampler in (rus, ros, ada)
        for estimator in (logreg, svm, mlp)
    ]

In [None]:
# embeddings resampled

def train_eval_embedding_resampled(x_train, y_train, x_test, y_test, text_length, kfold, embed_model):
    """Train and score one embedding -> oversampling -> MLP pipeline.

    The pipeline encodes the documents with SentenceEncoder(embed_model),
    oversamples with RandomOverSampler (sampling_strategy='not majority') and
    fits an MLPClassifier without hidden layers. Cross-validated predictions
    are produced on the training data with StratifiedKFold(kfold); the
    pipeline is then fitted on the whole training set and used to predict the
    training and test sets.

    Parameters
    ----------
    x_train, y_train : training documents and labels.
    x_test, y_test : held-out documents and labels.
    text_length : label copied into the 'Text_length' field of the row.
    kfold : number of folds given to StratifiedKFold; also stored under 'CV'.
    embed_model : model name passed to SentenceEncoder and stored under
        'Feature_extraction'.

    Returns
    -------
    list of dict
        A single-element list. The row holds the classifier and resampler
        that were actually used in the pipeline, the embedding model name,
        'CV', 'Text_length' and F1 / precision / recall rounded to 3 decimals
        for the '_tr_cv', '_tr' and '_ts' predictions.
    """
    # Build the steps locally and report these same objects in the result
    # row, so the labels always describe what was trained and the function
    # does not depend on notebook-level estimator/resampler variables.
    resampler = RandomOverSampler(random_state=42, sampling_strategy='not majority')
    classifier = MLPClassifier(
        activation='logistic',
        batch_size=16,
        hidden_layer_sizes=(),
        learning_rate='constant',
        learning_rate_init=0.001,
        solver='adam',
        random_state=42,
    )

    pipeline = make_pipeline(
        SentenceEncoder(embed_model),
        resampler,
        classifier,
    )

    y_train_pred_cv = cross_val_predict(pipeline, x_train, y_train, cv=StratifiedKFold(kfold), method='predict')

    pipeline.fit(x_train, y_train)

    y_train_pred = pipeline.predict(x_train)

    y_test_pred = pipeline.predict(x_test)

    scores = {
        'Classifier': classifier,
        'Feature_extraction': embed_model,
        'Weighting': resampler,
        'CV': kfold,
        'Text_length': text_length,
        'F1_tr_cv': round(f1_score(y_train, y_train_pred_cv), 3),
        'F1_tr': round(f1_score(y_train, y_train_pred), 3),
        'F1_ts': round(f1_score(y_test, y_test_pred), 3),
        'Precision_tr_cv': round(precision_score(y_train, y_train_pred_cv), 3),
        'Precision_tr': round(precision_score(y_train, y_train_pred), 3),
        'Precision_ts': round(precision_score(y_test, y_test_pred), 3),
        'Recall_tr_cv': round(recall_score(y_train, y_train_pred_cv), 3),
        'Recall_tr': round(recall_score(y_train, y_train_pred), 3),
        'Recall_ts': round(recall_score(y_test, y_test_pred), 3)
    }

    return [scores]

# train eval models

In [None]:
# TF TFIDF baseline and weighted

# df1: clean short texts, df2: clean long texts; both use 2-fold CV
df1 = pd.DataFrame(
    train_eval_tf_tfidf(
        x_train, y_train, x_test, y_test,
        text_length='Title_Abstract', kfold=2,
    )
)
df2 = pd.DataFrame(
    train_eval_tf_tfidf(
        x_train_long, y_train_long, x_test_long, y_test_long,
        text_length='Title_Abstract_Main', kfold=2,
    )
)

In [None]:
# embeddings baseline and weighted

# df3: raw short texts, df4: raw long texts; both use 2-fold CV
df3 = pd.DataFrame(
    train_eval_embeddings(
        x_train_r, y_train_r, x_test_r, y_test_r,
        text_length='Title_Abstract', kfold=2,
    )
)
df4 = pd.DataFrame(
    train_eval_embeddings(
        x_train_r_long, y_train_r_long, x_test_r_long, y_test_r_long,
        text_length='Title_Abstract_Main', kfold=2,
    )
)

In [None]:
# TF TFIDF resampled

# df5: clean short texts, df6: clean long texts; both use 2-fold CV
df5 = pd.DataFrame(
    train_eval_tf_tfidf_resampled(
        x_train, y_train, x_test, y_test,
        text_length='Title_Abstract', kfold=2,
    )
)
df6 = pd.DataFrame(
    train_eval_tf_tfidf_resampled(
        x_train_long, y_train_long, x_test_long, y_test_long,
        text_length='Title_Abstract_Main', kfold=2,
    )
)

In [None]:
# embeddings resampled

# mpnet embeddings - df7: raw short texts, df8: raw long texts
df7 = pd.DataFrame(
    train_eval_embedding_resampled(
        x_train_r, y_train_r, x_test_r, y_test_r,
        text_length='Title_Abstract', kfold=2, embed_model=model_mpnet,
    )
)
df8 = pd.DataFrame(
    train_eval_embedding_resampled(
        x_train_r_long, y_train_r_long, x_test_r_long, y_test_r_long,
        text_length='Title_Abstract_Main', kfold=2, embed_model=model_mpnet,
    )
)

# distiluse embeddings - df9: raw short texts, df10: raw long texts
df9 = pd.DataFrame(
    train_eval_embedding_resampled(
        x_train_r, y_train_r, x_test_r, y_test_r,
        text_length='Title_Abstract', kfold=2, embed_model=model_distill,
    )
)
df10 = pd.DataFrame(
    train_eval_embedding_resampled(
        x_train_r_long, y_train_r_long, x_test_r_long, y_test_r_long,
        text_length='Title_Abstract_Main', kfold=2, embed_model=model_distill,
    )
)

# concatenate and save model results

In [None]:
# Stack all result tables, rank by test recall and then by cross-validated
# training F1 (both descending), and renumber the rows.
res = (
    pd.concat([df1, df2, df3, df4, df5, df6, df7, df8, df9, df10])
    .sort_values(by=['Recall_ts', 'F1_tr_cv'], ascending=[False, False])
    .reset_index(drop=True)
)
res

In [None]:
# save model scores

# DataFrame.to_csv does not create missing parent directories; make sure the
# output folder exists so the results of the training run are not lost to an
# OSError at the very last step.
import os

os.makedirs('../results/model_scores', exist_ok=True)
res.to_csv('../results/model_scores/model_scores.csv')