In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

# Data import

In [2]:
chembl_32 = pd.read_excel('ChEMBL_32_documents.xlsx', sheet_name='Sheet1').drop_duplicates(subset='document_chembl_id')

In [3]:
data_chembl = pd.read_csv('ChEMBL_reference.csv', sep=',')
data_chembl = data_chembl.drop_duplicates(subset='document_chembl_id').reset_index(drop=True)

In [4]:
chembl_32_merged = pd.merge(chembl_32, data_chembl, how='left')
data_chembl_core = chembl_32_merged.dropna(subset='PUBMED_TYPE')

In [5]:
core_journals = ['ACS Med Chem Lett', 'Bioorg Med Chem', 'Bioorg Med Chem Lett', 'Eur J Med Chem', 'J Med Chem',
                 'J Nat Prod', 'Medchemcomm']
data_chembl_core = data_chembl_core[data_chembl_core['journal'].isin(core_journals)].reset_index(drop=True)
data_chembl_unclass = pd.concat([chembl_32_merged, data_chembl_core], axis=0).drop_duplicates(keep=False).reset_index(
    drop=True)

In [6]:
data_from_pubmed = pd.read_excel('data_from_pubmed.xlsx', sheet_name='core journals').drop_duplicates(
    subset='PMID').reset_index(drop=True)
data_from_pubmed_refs = pd.read_csv('pubmed_reference.csv', sep=',')
data_from_pubmed = pd.concat([data_from_pubmed, data_from_pubmed_refs], axis=1)

# Data preprocessing (pubmed data)

In [7]:
abstracts_pubmed = pd.read_excel('abstracts_pubmed.xlsx', sheet_name='abstracts')
abstracts_pubmed = abstracts_pubmed.iloc[:, 0]
data_from_pubmed['abstract'] = abstracts_pubmed

In [8]:
data_from_pubmed_main = data_from_pubmed[
    ['PMID', 'Publication Year', 'Authors', 'page1', 'page2', 'Journal/Book', 'reference', 'Title', 'abstract',
     'pubtype ']]

unique_journals = data_from_pubmed_main['Journal/Book'].unique()
for journal in unique_journals:
    data_from_pubmed_main['journal_' + str(journal)] = data_from_pubmed_main['Journal/Book'] == journal

data_from_pubmed_main.replace(True, 1, inplace=True)
data_from_pubmed_main.replace(False, 0, inplace=True)

In [9]:
data_from_pubmed_main['pages'] = data_from_pubmed_main['page2'] - data_from_pubmed_main['page1']

auth_num = []
auth = data_from_pubmed_main['Authors'].values

for string in auth:
    auth_num.append(str(string).count(',') + 1)
data_from_pubmed_main['auth_num'] = np.array(auth_num)

In [10]:
pubmed_data = {'pubmed_id': data_from_pubmed_main['PMID'].values,
               'pages': np.log(data_from_pubmed_main.pages.values + 1),
               'year': data_from_pubmed_main['Publication Year'].values,
               'auth_num': np.log(data_from_pubmed_main.auth_num.values + 1),
               'reference': np.log(data_from_pubmed_main.reference.values + 1),
               'journal_Medchemcomm': data_from_pubmed_main['journal_Medchemcomm'].values,
               'journal_J Nat Prod': data_from_pubmed_main['journal_J Nat Prod'].values,
               'journal_J Med Chem': data_from_pubmed_main['journal_J Med Chem'].values,
               'journal_Eur J Med Chem': data_from_pubmed_main['journal_Eur J Med Chem'].values,
               'journal_Bioorg Med Chem Lett': data_from_pubmed_main['journal_Bioorg Med Chem Lett'].values,
               'journal_Bioorg Med Chem': data_from_pubmed_main['journal_Bioorg Med Chem'].values,
               'journal_ACS Med Chem Lett': data_from_pubmed_main['journal_ACS Med Chem Lett'].values,}

pubmed_df = pd.DataFrame(pubmed_data)

In [11]:
data_chembl_core_main = data_chembl_core[
    ['pubmed_id', 'year', 'authors', 'first_page', 'last_page', 'journal', 'references', 'title', 'abstract',
     'PUBMED_TYPE']]
data_chembl_unclass_main = data_chembl_unclass[
    ['pubmed_id', 'year', 'authors', 'first_page', 'last_page', 'journal', 'references', 'title', 'abstract',
     'PUBMED_TYPE']]

In [12]:
for journal in unique_journals:
    data_chembl_core_main['journal_' + str(journal)] = data_chembl_core_main['journal'] == journal
    data_chembl_unclass_main['journal_' + str(journal)] = data_chembl_unclass_main['journal'] == journal
data_chembl_core_main.replace(True, 1, inplace=True)
data_chembl_core_main.replace(False, 0, inplace=True)
data_chembl_unclass_main.replace(True, 1, inplace=True)
data_chembl_unclass_main.replace(False, 0, inplace=True)

In [13]:
data_chembl_core_main['pages'] = data_chembl_core_main['last_page'] - data_chembl_core_main['first_page']

auth_num = []
auth = data_chembl_core_main['authors'].values

for string in auth:
    auth_num.append(str(string).count(',') + 1)
data_chembl_core_main['auth_num'] = np.array(auth_num)

In [14]:
data_chembl_unclass_main['pages'] = data_chembl_unclass_main['last_page'] - data_chembl_unclass_main['first_page']

auth_num = []
auth = data_chembl_unclass_main['authors'].values

for string in auth:
    auth_num.append(str(string).count(',') + 1)
data_chembl_unclass_main['auth_num'] = np.array(auth_num)

In [15]:
chembl_core_data = {'pubmed_id': data_chembl_core_main['pubmed_id'].values,
                    'pages': np.log(data_chembl_core_main.pages.values + 1),
                    'year': data_chembl_core_main['year'].values,
                    'auth_num': np.log(data_chembl_core_main.auth_num.values + 1),
                    'reference': np.log(data_chembl_core_main.references.values + 1),
                    'journal_Medchemcomm': data_chembl_core_main['journal_Medchemcomm'].values,
                    'journal_J Nat Prod': data_chembl_core_main['journal_J Nat Prod'].values,
                    'journal_J Med Chem': data_chembl_core_main['journal_J Med Chem'].values,
                    'journal_Eur J Med Chem': data_chembl_core_main['journal_Eur J Med Chem'].values,
                    'journal_Bioorg Med Chem Lett': data_chembl_core_main['journal_Bioorg Med Chem Lett'].values,
                    'journal_Bioorg Med Chem': data_chembl_core_main['journal_Bioorg Med Chem'].values,
                    'journal_ACS Med Chem Lett': data_chembl_core_main['journal_ACS Med Chem Lett'].values}

chembl_core_df = pd.DataFrame(chembl_core_data)

In [16]:
chembl_unclass_data = {'pubmed_id': data_chembl_unclass_main['pubmed_id'].values,
                       'pages': np.log(data_chembl_unclass_main.pages.values + 1),
                       'year': data_chembl_unclass_main['year'].values,
                       'auth_num': np.log(data_chembl_unclass_main.auth_num.values + 1),
                       'reference': np.log(data_chembl_unclass_main.references.values + 1),
                       'journal_Medchemcomm': data_chembl_unclass_main['journal_Medchemcomm'].values,
                       'journal_J Nat Prod': data_chembl_unclass_main['journal_J Nat Prod'].values,
                       'journal_J Med Chem': data_chembl_unclass_main['journal_J Med Chem'].values,
                       'journal_Eur J Med Chem': data_chembl_unclass_main['journal_Eur J Med Chem'].values,
                       'journal_Bioorg Med Chem Lett': data_chembl_unclass_main['journal_Bioorg Med Chem Lett'].values,
                       'journal_Bioorg Med Chem': data_chembl_unclass_main['journal_Bioorg Med Chem'].values,
                       'journal_ACS Med Chem Lett': data_chembl_unclass_main['journal_ACS Med Chem Lett'].values}

chembl_unclass_df = pd.DataFrame(chembl_unclass_data)

In [17]:
pubmed_df['class'] = data_from_pubmed_main['pubtype '].values
chembl_core_df['class'] = data_chembl_core_main.PUBMED_TYPE.values
chembl_unclass_df['class'] = data_chembl_unclass_main.PUBMED_TYPE.values

pubmed_df['title'] = data_from_pubmed_main.Title.values
chembl_core_df['title'] = data_chembl_core_main.title.values
chembl_unclass_df['title'] = data_chembl_unclass_main.title.values

pubmed_df['abstract'] = data_from_pubmed_main.abstract.values
chembl_core_df['abstract'] = data_chembl_core_main.abstract.values
chembl_unclass_df['abstract'] = data_chembl_unclass_main.abstract.values

In [18]:
classified_df = pd.concat([pubmed_df, chembl_core_df], axis=0).drop_duplicates(subset='pubmed_id').reset_index(
    drop=True)

unique_class = classified_df['class'].unique()
for cl in unique_class:
    if cl == 'Review':
        classified_df['class'].replace(cl, 1, inplace=True)
    else:
        classified_df['class'].replace(cl, 0, inplace=True)

In [19]:
array_of_mistakes = np.array([26985286, 35059113, 35059117, 35178165, 35178167, 34606998, 35059114, 
                              35178166, 35059115, 35059116, 35178168, 35300084, 35300085])

chembl_unclass_df.loc[chembl_unclass_df['pubmed_id'].isin(array_of_mistakes), 'abstract'] = np.nan

# BERT vectorisation

In [20]:
classified_df['abstract'].fillna('[MASK]', inplace=True)
chembl_unclass_df['abstract'].fillna('[MASK]', inplace=True)

In [21]:
abstracts_X = list(classified_df['abstract'].values)
titles_X = list(classified_df['title'].values)
texts_X = [f"{t1}. {t2}" for t1, t2 in zip(titles_X, abstracts_X)]
pd.DataFrame(texts_X).to_csv('title_abstract_texts_X.csv', index=False)

In [22]:
all_abstracts = chembl_unclass_df.abstract.values
all_titles = chembl_unclass_df.title.values
all_texts = [f"{t1}. {t2}" for t1, t2 in zip(all_titles, all_abstracts)]
pd.DataFrame(all_texts).to_csv('title_abstract_texts_all.csv', index=False)

In [23]:
cls_X_matrix_pubmed = pd.read_csv('cls_X_matrix_pubmedbert.csv', sep=',')
cls_all_matrix_pubmed = pd.read_csv('cls_all_matrix_pubmedbert.csv', sep=',')

In [24]:
cls_X_matrix_biobert = pd.read_csv('cls_X_matrix_biobert.csv', sep=',')
cls_all_matrix_biobert = pd.read_csv('cls_all_matrix_biobert.csv', sep=',')

In [25]:
cls_X_matrix_biomed_roberta = pd.read_csv('cls_X_matrix_biomed_roberta.csv', sep=',')
cls_all_matrix_biomed_roberta = pd.read_csv('cls_all_matrix_biomed_roberta.csv', sep=',')

In [26]:
cls_X_matrix_distilbert = pd.read_csv('cls_X_matrix_distilbert.csv', sep=',')
cls_all_matrix_distilbert = pd.read_csv('cls_all_matrix_distilbert.csv', sep=',')

In [27]:
classified_df_vect_pubmed = pd.concat([classified_df, cls_X_matrix_pubmed], axis=1)
classified_df_vect_pubmed = classified_df_vect_pubmed.drop(columns=['title', 'abstract'])
classified_df_vect_pubmed.pages.replace(0, None, inplace=True)
classified_df_vect_pubmed.reference.replace(0, None, inplace=True)

In [28]:
unclassified_df_vect_pubmed = pd.concat([chembl_unclass_df, cls_all_matrix_pubmed], axis=1)
unclassified_df_vect_pubmed = unclassified_df_vect_pubmed.drop(columns=['title', 'abstract'])
unclassified_df_vect_pubmed.pages.replace(0, None, inplace=True)
unclassified_df_vect_pubmed.reference.replace(0, None, inplace=True)

In [29]:
classified_df_vect_biobert = pd.concat([classified_df, cls_X_matrix_biobert], axis = 1)
classified_df_vect_biobert = classified_df_vect_biobert.drop(columns = ['title', 'abstract'])
classified_df_vect_biobert.pages.replace(0, None, inplace = True)
classified_df_vect_biobert.reference.replace(0, None, inplace = True)

In [30]:
unclassified_df_vect_biobert = pd.concat([chembl_unclass_df, cls_all_matrix_biobert], axis = 1)
unclassified_df_vect_biobert = unclassified_df_vect_biobert.drop(columns = ['title', 'abstract'])
unclassified_df_vect_biobert.pages.replace(0, None, inplace = True)
unclassified_df_vect_biobert.reference.replace(0, None, inplace = True)

In [31]:
classified_df_vect_biomed_roberta = pd.concat([classified_df, cls_X_matrix_biomed_roberta], axis = 1)
classified_df_vect_biomed_roberta = classified_df_vect_biomed_roberta.drop(columns = ['title', 'abstract'])
classified_df_vect_biomed_roberta.pages.replace(0, None, inplace = True)
classified_df_vect_biomed_roberta.reference.replace(0, None, inplace = True)

In [32]:
unclassified_df_vect_biomed_roberta = pd.concat([chembl_unclass_df, cls_all_matrix_biomed_roberta], axis = 1)
unclassified_df_vect_biomed_roberta = unclassified_df_vect_biomed_roberta.drop(columns = ['title', 'abstract'])
unclassified_df_vect_biomed_roberta.pages.replace(0, None, inplace = True)
unclassified_df_vect_biomed_roberta.reference.replace(0, None, inplace = True)

In [33]:
classified_df_vect_distilbert = pd.concat([classified_df, cls_X_matrix_distilbert], axis = 1)
classified_df_vect_distilbert = classified_df_vect_distilbert.drop(columns = ['title', 'abstract'])
classified_df_vect_distilbert.pages.replace(0, None, inplace = True)
classified_df_vect_distilbert.reference.replace(0, None, inplace = True)

In [34]:
unclassified_df_vect_distilbert = pd.concat([chembl_unclass_df, cls_all_matrix_distilbert], axis = 1)
unclassified_df_vect_distilbert = unclassified_df_vect_distilbert.drop(columns = ['title', 'abstract'])
unclassified_df_vect_distilbert.pages.replace(0, None, inplace = True)
unclassified_df_vect_distilbert.reference.replace(0, None, inplace = True)

# Model training

# XGB_alldata_pubmedbert

In [35]:
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.model_selection import RandomizedSearchCV, train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt

In [36]:
X_train_pubmed, X_test_pubmed, y_train_pubmed, y_test_pubmed = train_test_split(
    classified_df_vect_pubmed.drop(columns=['class', 'pubmed_id']), classified_df_vect_pubmed['class'], test_size=0.2,
    random_state=42, stratify=classified_df_vect_pubmed['class'])


In [37]:
X_train_pubmed, X_test_pubmed, y_train_pubmed, y_test_pubmed = train_test_split(
    classified_df_vect_pubmed.drop(columns=['class', 'pubmed_id']), classified_df_vect_pubmed['class'], test_size=0.2,
    random_state=42, stratify=classified_df_vect_pubmed['class'])

import random
np.random.seed(42)

xgb_all_pubmedbert = xgb.XGBClassifier()

param_grid_xgb_all_pubmedbert = {
    'max_depth': stats.randint(3, 10),
    'learning_rate': stats.uniform(0.001, 0.1),
    'subsample': stats.uniform(0.3, 0.7),
    'n_estimators': stats.randint(50, 200)
}

random_search_xgb_all_pubmedbert = RandomizedSearchCV(xgb_all_pubmedbert, param_grid_xgb_all_pubmedbert, cv=5, random_state=42, n_iter=10, scoring='f1', verbose=10)
random_search_xgb_all_pubmedbert.fit(X_train_pubmed.astype(np.float64), y_train_pubmed)

print("Best set of hyperparameters: ", random_search_xgb_all_pubmedbert.best_params_)
print("Best score: ", random_search_xgb_all_pubmedbert.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836
[CV 1/5; 1/10] END learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836;, score=0.968 total time=   6.8s
[CV 2/5; 1/10] START learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836
[CV 2/5; 1/10] END learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836;, score=0.968 total time=   6.8s
[CV 3/5; 1/10] START learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836
[CV 3/5; 1/10] END learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836;, score=0.959 total time=   6.8s
[CV 4/5; 1/10] START learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836
[CV 4/5; 1/10] END learning_rate=0.038454

[CV 2/5; 7/10] END learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925;, score=0.970 total time=   3.7s
[CV 3/5; 7/10] START learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925
[CV 3/5; 7/10] END learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925;, score=0.965 total time=   3.7s
[CV 4/5; 7/10] START learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925
[CV 4/5; 7/10] END learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925;, score=0.967 total time=   3.7s
[CV 5/5; 7/10] START learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925
[CV 5/5; 7/10] END learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925;, score=0.969 total time=   3.7s
[CV 1/5; 8/10] START learning_rate=0.07951759613930137, max_depth=5, n_estimat

In [38]:
model_xgb_all_pubmedbert = xgb.XGBClassifier(learning_rate=0.07951759613930137, max_depth=5, n_estimators=157,
                                subsample=0.6599641068895281)
model_xgb_all_pubmedbert.fit(X_train_pubmed.astype(np.float64), y_train_pubmed)

print(f1_score(y_test_pubmed, model_xgb_all_pubmedbert.predict(X_test_pubmed.astype(np.float64))))
print(accuracy_score(y_test_pubmed, model_xgb_all_pubmedbert.predict(X_test_pubmed.astype(np.float64))))
print(precision_score(y_test_pubmed, model_xgb_all_pubmedbert.predict(X_test_pubmed.astype(np.float64))))
print(recall_score(y_test_pubmed, model_xgb_all_pubmedbert.predict(X_test_pubmed.astype(np.float64))))
print(confusion_matrix(y_test_pubmed, model_xgb_all_pubmedbert.predict(X_test_pubmed.astype(np.float64))))

0.9800148038490007
0.9844470046082949
0.992503748125937
0.9678362573099415
[[1047    5]
 [  22  662]]


# RF_alldata_pubmedbert

In [39]:
np.random.seed(42)

rf_all_pubmedbert = RandomForestClassifier()

param_grid_rf_all_pubmedbert = {
    'n_estimators': stats.randint(50, 250),
    'max_depth': stats.randint(5, 100),
    'criterion': ['gini', 'entropy'],
    'max_samples': stats.uniform(0.3, 0.7)
}

random_search_rf_all_pubmedbert = RandomizedSearchCV(rf_all_pubmedbert, param_grid_rf_all_pubmedbert, cv=5, random_state=42, n_iter=10, scoring='f1',
                                             verbose=10)
random_search_rf_all_pubmedbert.fit(X_train_pubmed.astype(np.float64), y_train_pubmed)

print("Best set of hyperparameters: ", random_search_rf_all_pubmedbert.best_params_)
print("Best score: ", random_search_rf_all_pubmedbert.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156
[CV 1/5; 1/10] END criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156;, score=0.961 total time=  21.3s
[CV 2/5; 1/10] START criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156
[CV 2/5; 1/10] END criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156;, score=0.965 total time=  21.6s
[CV 3/5; 1/10] START criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156
[CV 3/5; 1/10] END criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156;, score=0.951 total time=  21.8s
[CV 4/5; 1/10] START criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156
[CV 4/5; 1/10] END criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156;, score=0.955 total time=  21.0s
[CV 5/5; 1/10] 

[CV 1/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219;, score=0.958 total time=  23.5s
[CV 2/5; 8/10] START criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219
[CV 2/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219;, score=0.970 total time=  23.9s
[CV 3/5; 8/10] START criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219
[CV 3/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219;, score=0.945 total time=  23.5s
[CV 4/5; 8/10] START criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219
[CV 4/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219;, score=0.951 total time=  23.7s
[CV 5/5; 8/10] START criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219
[CV 5/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056

In [40]:
best_model_rf_all_pubmedbert = random_search_rf_all_pubmedbert.best_estimator_

print(f1_score(y_test_pubmed, best_model_rf_all_pubmedbert.predict(X_test_pubmed.astype(np.float64))))
print(accuracy_score(y_test_pubmed, best_model_rf_all_pubmedbert.predict(X_test_pubmed.astype(np.float64))))
print(precision_score(y_test_pubmed, best_model_rf_all_pubmedbert.predict(X_test_pubmed.astype(np.float64))))
print(recall_score(y_test_pubmed, best_model_rf_all_pubmedbert.predict(X_test_pubmed.astype(np.float64))))
print(confusion_matrix(y_test_pubmed, best_model_rf_all_pubmedbert.predict(X_test_pubmed.astype(np.float64))))

0.9554043839758125
0.9660138248847926
0.9890453834115805
0.9239766081871345
[[1045    7]
 [  52  632]]


# SVM_alldata_pubmedbert

In [41]:
np.random.seed(42)

pipeline_svm_all_pubmedbert = Pipeline([
    ('imputer', KNNImputer(weights='distance', metric='nan_euclidean', add_indicator=True)),
    ('scaler', StandardScaler()),
    ('svm', SVC())
])

param_grid_svm_all_pubmedbert = {
    'imputer__n_neighbors': [2, 3, 5, 7],
    'svm__C': stats.uniform(0.1, 10),
    'svm__gamma': stats.uniform(0.01, 1),
    'svm__kernel': ['linear', 'rbf', 'poly', 'sigmoid']
}

random_search_svm_all_pubmedbert = RandomizedSearchCV(pipeline_svm_all_pubmedbert,
                                                          param_grid_svm_all_pubmedbert, cv=5, random_state=42,
                                                          n_iter=10, scoring='f1', verbose=10)
random_search_svm_all_pubmedbert.fit(X_train_pubmed.values, y_train_pubmed)

print("Best set of hyperparameters: ", random_search_svm_all_pubmedbert.best_params_)
print("Best score: ", random_search_svm_all_pubmedbert.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START imputer__n_neighbors=5, svm__C=8.065429868602328, svm__gamma=0.1934347898661638, svm__kernel=sigmoid
[CV 1/5; 1/10] END imputer__n_neighbors=5, svm__C=8.065429868602328, svm__gamma=0.1934347898661638, svm__kernel=sigmoid;, score=0.819 total time=  11.1s
[CV 2/5; 1/10] START imputer__n_neighbors=5, svm__C=8.065429868602328, svm__gamma=0.1934347898661638, svm__kernel=sigmoid
[CV 2/5; 1/10] END imputer__n_neighbors=5, svm__C=8.065429868602328, svm__gamma=0.1934347898661638, svm__kernel=sigmoid;, score=0.775 total time=  11.0s
[CV 3/5; 1/10] START imputer__n_neighbors=5, svm__C=8.065429868602328, svm__gamma=0.1934347898661638, svm__kernel=sigmoid
[CV 3/5; 1/10] END imputer__n_neighbors=5, svm__C=8.065429868602328, svm__gamma=0.1934347898661638, svm__kernel=sigmoid;, score=0.779 total time=  11.0s
[CV 4/5; 1/10] START imputer__n_neighbors=5, svm__C=8.065429868602328, svm__gamma=0.1934347898661638, svm__kernel=

[CV 1/5; 7/10] END imputer__n_neighbors=7, svm__C=0.33062425041415755, svm__gamma=0.5347746602583892, svm__kernel=poly;, score=0.959 total time=  12.5s
[CV 2/5; 7/10] START imputer__n_neighbors=7, svm__C=0.33062425041415755, svm__gamma=0.5347746602583892, svm__kernel=poly
[CV 2/5; 7/10] END imputer__n_neighbors=7, svm__C=0.33062425041415755, svm__gamma=0.5347746602583892, svm__kernel=poly;, score=0.967 total time=  12.5s
[CV 3/5; 7/10] START imputer__n_neighbors=7, svm__C=0.33062425041415755, svm__gamma=0.5347746602583892, svm__kernel=poly
[CV 3/5; 7/10] END imputer__n_neighbors=7, svm__C=0.33062425041415755, svm__gamma=0.5347746602583892, svm__kernel=poly;, score=0.955 total time=  12.4s
[CV 4/5; 7/10] START imputer__n_neighbors=7, svm__C=0.33062425041415755, svm__gamma=0.5347746602583892, svm__kernel=poly
[CV 4/5; 7/10] END imputer__n_neighbors=7, svm__C=0.33062425041415755, svm__gamma=0.5347746602583892, svm__kernel=poly;, score=0.960 total time=  12.5s
[CV 5/5; 7/10] START imputer_

In [42]:
model_svm_all_pubmedbert = Pipeline([
    ('imputer', KNNImputer(n_neighbors=7, weights='distance', metric='nan_euclidean', add_indicator=True)),
    ('scaler', StandardScaler()),
    ('svm', SVC(C=0.33062425041415755, gamma=0.5347746602583892, kernel='poly'))
])
model_svm_all_pubmedbert.fit(X_train_pubmed.astype(np.float64), y_train_pubmed)

print(f1_score(y_test_pubmed, model_svm_all_pubmedbert.predict(X_test_pubmed.astype(np.float64))))
print(accuracy_score(y_test_pubmed, model_svm_all_pubmedbert.predict(X_test_pubmed.astype(np.float64))))
print(precision_score(y_test_pubmed, model_svm_all_pubmedbert.predict(X_test_pubmed.astype(np.float64))))
print(recall_score(y_test_pubmed, model_svm_all_pubmedbert.predict(X_test_pubmed.astype(np.float64))))
print(confusion_matrix(y_test_pubmed, model_svm_all_pubmedbert.predict(X_test_pubmed.astype(np.float64))))

0.9643387815750372
0.9723502304147466
0.9803625377643505
0.9488304093567251
[[1039   13]
 [  35  649]]


# XGB_text_pubmedbert

In [43]:
X_train_text_pubmed, X_test_text_pubmed, y_train_text_pubmed, y_test_text_pubmed = train_test_split(cls_X_matrix_pubmed,
    classified_df_vect_pubmed[
    'class'],
    test_size = 0.2,
    random_state = 42,
    stratify =
    classified_df_vect_pubmed[
    'class'])

np.random.seed(42)

xgb_text_pubmedbert = xgb.XGBClassifier()

param_grid_xgb_text_pubmedbert = {
    'max_depth': stats.randint(3, 10),
    'learning_rate': stats.uniform(0.001, 0.1),
    'subsample': stats.uniform(0.3, 0.7),
    'n_estimators': stats.randint(50, 200)
}

random_search_xgb_text_pubmedbert = RandomizedSearchCV(xgb_text_pubmedbert, param_grid_xgb_text_pubmedbert, cv=5,
                                                       random_state=42, n_iter=10, scoring='f1', verbose=10)
random_search_xgb_text_pubmedbert.fit(X_train_text_pubmed.astype(np.float64), y_train_text_pubmed)

print("Best set of hyperparameters: ", random_search_xgb_text_pubmedbert.best_params_)
print("Best score: ", random_search_xgb_text_pubmedbert.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836
[CV 1/5; 1/10] END learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836;, score=0.913 total time=   6.5s
[CV 2/5; 1/10] START learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836
[CV 2/5; 1/10] END learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836;, score=0.926 total time=   6.4s
[CV 3/5; 1/10] START learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836
[CV 3/5; 1/10] END learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836;, score=0.916 total time=   6.1s
[CV 4/5; 1/10] START learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836
[CV 4/5; 1/10] END learning_rate=0.038454

[CV 2/5; 7/10] END learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925;, score=0.927 total time=   2.4s
[CV 3/5; 7/10] START learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925
[CV 3/5; 7/10] END learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925;, score=0.914 total time=   2.4s
[CV 4/5; 7/10] START learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925
[CV 4/5; 7/10] END learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925;, score=0.908 total time=   2.5s
[CV 5/5; 7/10] START learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925
[CV 5/5; 7/10] END learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925;, score=0.903 total time=   2.4s
[CV 1/5; 8/10] START learning_rate=0.07951759613930137, max_depth=5, n_estimat

In [44]:
model_xgb_text_pubmedbert = xgb.XGBClassifier(learning_rate=0.06086584841970366, max_depth=9, n_estimators=171,
                                              subsample=0.40919616423534183)
model_xgb_text_pubmedbert.fit(X_train_text_pubmed.astype(np.float64), y_train_text_pubmed)

print(f1_score(y_test_text_pubmed, model_xgb_text_pubmedbert.predict(X_test_text_pubmed.astype(np.float64))))
print(accuracy_score(y_test_text_pubmed, model_xgb_text_pubmedbert.predict(X_test_text_pubmed.astype(np.float64))))
print(precision_score(y_test_text_pubmed, model_xgb_text_pubmedbert.predict(X_test_text_pubmed.astype(np.float64))))
print(recall_score(y_test_text_pubmed, model_xgb_text_pubmedbert.predict(X_test_text_pubmed.astype(np.float64))))
print(confusion_matrix(y_test_text_pubmed, model_xgb_text_pubmedbert.predict(X_test_text_pubmed.astype(np.float64))))

0.9389600602863603
0.9533410138248848
0.968895800933126
0.9108187134502924
[[1032   20]
 [  61  623]]


# RF_text_pubmedbert

In [45]:
np.random.seed(42)

rf_text_pubmedbert = RandomForestClassifier()

param_grid_rf_text_pubmedbert = {
    'n_estimators': stats.randint(50, 250),
    'max_depth': stats.randint(5, 100),
    'criterion': ['gini', 'entropy'],
    'max_samples': stats.uniform(0.3, 0.7)
}

random_search_rf_text_pubmedbert = RandomizedSearchCV(rf_text_pubmedbert, param_grid_rf_text_pubmedbert, cv=5,
                                                      random_state=42, n_iter=10, scoring='f1', verbose=10)
random_search_rf_text_pubmedbert.fit(X_train_text_pubmed.astype(np.float64), y_train_text_pubmed)

print("Best set of hyperparameters: ", random_search_rf_text_pubmedbert.best_params_)
print("Best score: ", random_search_rf_text_pubmedbert.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156
[CV 1/5; 1/10] END criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156;, score=0.918 total time=  23.3s
[CV 2/5; 1/10] START criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156
[CV 2/5; 1/10] END criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156;, score=0.923 total time=  23.4s
[CV 3/5; 1/10] START criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156
[CV 3/5; 1/10] END criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156;, score=0.911 total time=  22.8s
[CV 4/5; 1/10] START criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156
[CV 4/5; 1/10] END criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156;, score=0.905 total time=  23.6s
[CV 5/5; 1/10] 

[CV 1/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219;, score=0.918 total time=  25.3s
[CV 2/5; 8/10] START criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219
[CV 2/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219;, score=0.919 total time=  25.4s
[CV 3/5; 8/10] START criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219
[CV 3/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219;, score=0.910 total time=  25.2s
[CV 4/5; 8/10] START criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219
[CV 4/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219;, score=0.904 total time=  25.3s
[CV 5/5; 8/10] START criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219
[CV 5/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056

In [46]:
best_model_rf_text_pubmedbert = random_search_rf_text_pubmedbert.best_estimator_

print(f1_score(y_test_text_pubmed, best_model_rf_text_pubmedbert.predict(X_test_text_pubmed.astype(np.float64))))
print(accuracy_score(y_test_text_pubmed, best_model_rf_text_pubmedbert.predict(X_test_text_pubmed.astype(np.float64))))
print(precision_score(y_test_text_pubmed, best_model_rf_text_pubmedbert.predict(X_test_text_pubmed.astype(np.float64))))
print(recall_score(y_test_text_pubmed, best_model_rf_text_pubmedbert.predict(X_test_text_pubmed.astype(np.float64))))
print(confusion_matrix(y_test_text_pubmed, best_model_rf_text_pubmedbert.predict(X_test_text_pubmed.astype(np.float64))))

0.9201228878648233
0.9400921658986175
0.9692556634304207
0.8757309941520468
[[1033   19]
 [  85  599]]


# SVM_text_pubmedbert

In [47]:
np.random.seed(42)

pipeline_svm_text_pubmedbert = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC())
])

param_grid_svm_text_pubmedbert = {
    'svm__C': stats.uniform(0.1, 10),
    'svm__gamma': stats.uniform(0.01, 1),
    'svm__kernel': ['linear', 'rbf', 'poly', 'sigmoid']
}

random_search_svm_text_pubmedbert = RandomizedSearchCV(pipeline_svm_text_pubmedbert, param_grid_svm_text_pubmedbert,
                                                       cv=5, random_state=42, n_iter=10, scoring='f1', verbose=10)
random_search_svm_text_pubmedbert.fit(X_train_text_pubmed.astype(np.float64), y_train_text_pubmed)

print("Best set of hyperparameters: ", random_search_svm_text_pubmedbert.best_params_)
print("Best score: ", random_search_svm_text_pubmedbert.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START svm__C=3.845401188473625, svm__gamma=0.9607143064099162, svm__kernel=poly
[CV 1/5; 1/10] END svm__C=3.845401188473625, svm__gamma=0.9607143064099162, svm__kernel=poly;, score=0.936 total time=   4.5s
[CV 2/5; 1/10] START svm__C=3.845401188473625, svm__gamma=0.9607143064099162, svm__kernel=poly
[CV 2/5; 1/10] END svm__C=3.845401188473625, svm__gamma=0.9607143064099162, svm__kernel=poly;, score=0.940 total time=   4.6s
[CV 3/5; 1/10] START svm__C=3.845401188473625, svm__gamma=0.9607143064099162, svm__kernel=poly
[CV 3/5; 1/10] END svm__C=3.845401188473625, svm__gamma=0.9607143064099162, svm__kernel=poly;, score=0.931 total time=   4.5s
[CV 4/5; 1/10] START svm__C=3.845401188473625, svm__gamma=0.9607143064099162, svm__kernel=poly
[CV 4/5; 1/10] END svm__C=3.845401188473625, svm__gamma=0.9607143064099162, svm__kernel=poly;, score=0.932 total time=   4.6s
[CV 5/5; 1/10] START svm__C=3.845401188473625, svm__gam

[CV 2/5; 8/10] END svm__C=0.17066305219717406, svm__gamma=0.03306242504141576, svm__kernel=poly;, score=0.940 total time=   4.6s
[CV 3/5; 8/10] START svm__C=0.17066305219717406, svm__gamma=0.03306242504141576, svm__kernel=poly
[CV 3/5; 8/10] END svm__C=0.17066305219717406, svm__gamma=0.03306242504141576, svm__kernel=poly;, score=0.931 total time=   4.6s
[CV 4/5; 8/10] START svm__C=0.17066305219717406, svm__gamma=0.03306242504141576, svm__kernel=poly
[CV 4/5; 8/10] END svm__C=0.17066305219717406, svm__gamma=0.03306242504141576, svm__kernel=poly;, score=0.932 total time=   4.6s
[CV 5/5; 8/10] START svm__C=0.17066305219717406, svm__gamma=0.03306242504141576, svm__kernel=poly
[CV 5/5; 8/10] END svm__C=0.17066305219717406, svm__gamma=0.03306242504141576, svm__kernel=poly;, score=0.918 total time=   4.5s
[CV 1/5; 9/10] START svm__C=6.218528947223795, svm__gamma=0.14949386065204184, svm__kernel=sigmoid
[CV 1/5; 9/10] END svm__C=6.218528947223795, svm__gamma=0.14949386065204184, svm__kernel=si

In [48]:
model_svm_text_pubmedbert = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(C=3.845401188473625, gamma=0.9607143064099162, kernel='poly'))
])
model_svm_text_pubmedbert.fit(X_train_text_pubmed.astype(np.float64), y_train_text_pubmed)

print(f1_score(y_test_text_pubmed, model_svm_text_pubmedbert.predict(X_test_text_pubmed.astype(np.float64))))
print(accuracy_score(y_test_text_pubmed, model_svm_text_pubmedbert.predict(X_test_text_pubmed.astype(np.float64))))
print(precision_score(y_test_text_pubmed, model_svm_text_pubmedbert.predict(X_test_text_pubmed.astype(np.float64))))
print(recall_score(y_test_text_pubmed, model_svm_text_pubmedbert.predict(X_test_text_pubmed.astype(np.float64))))
print(confusion_matrix(y_test_text_pubmed, model_svm_text_pubmedbert.predict(X_test_text_pubmed.astype(np.float64))))

0.9368029739776952
0.951036866359447
0.9531013615733737
0.9210526315789473
[[1021   31]
 [  54  630]]


# XGB_alldata_biobert

In [49]:
X_train_all_biobert, X_test_all_biobert, y_train_all_biobert, y_test_all_biobert = train_test_split(
    classified_df_vect_biobert.drop(columns=['class', 'pubmed_id']), classified_df_vect_biobert['class'], test_size=0.2,
    random_state=42, stratify=classified_df_vect_biobert['class'])

np.random.seed(42)

xgb_all_biobert = xgb.XGBClassifier()

param_grid_xgb_all_biobert = {
    'max_depth': stats.randint(3, 10),
    'learning_rate': stats.uniform(0.001, 0.1),
    'subsample': stats.uniform(0.3, 0.7),
    'n_estimators': stats.randint(50, 200)
}

random_search_xgb_all_biobert = RandomizedSearchCV(xgb_all_biobert, param_grid_xgb_all_biobert, cv=5, random_state=42,
                                                   n_iter=10, scoring='f1', verbose=10)
random_search_xgb_all_biobert.fit(X_train_all_biobert.astype(np.float64), y_train_all_biobert)

print("Best set of hyperparameters: ", random_search_xgb_all_biobert.best_params_)
print("Best score: ", random_search_xgb_all_biobert.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836
[CV 1/5; 1/10] END learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836;, score=0.970 total time=   6.2s
[CV 2/5; 1/10] START learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836
[CV 2/5; 1/10] END learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836;, score=0.970 total time=   6.4s
[CV 3/5; 1/10] START learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836
[CV 3/5; 1/10] END learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836;, score=0.971 total time=   6.3s
[CV 4/5; 1/10] START learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836
[CV 4/5; 1/10] END learning_rate=0.038454

[CV 2/5; 7/10] END learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925;, score=0.971 total time=   3.7s
[CV 3/5; 7/10] START learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925
[CV 3/5; 7/10] END learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925;, score=0.971 total time=   3.7s
[CV 4/5; 7/10] START learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925
[CV 4/5; 7/10] END learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925;, score=0.970 total time=   3.7s
[CV 5/5; 7/10] START learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925
[CV 5/5; 7/10] END learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925;, score=0.975 total time=   3.7s
[CV 1/5; 8/10] START learning_rate=0.07951759613930137, max_depth=5, n_estimat

In [50]:
model_xgb_all_biobert = xgb.XGBClassifier(learning_rate=0.07951759613930137, max_depth=5, n_estimators=157,
                                          subsample=0.6599641068895281)
model_xgb_all_biobert.fit(X_train_all_biobert.astype(np.float64), y_train_all_biobert)

print(f1_score(y_test_all_biobert, model_xgb_all_biobert.predict(X_test_all_biobert.astype(np.float64))))
print(accuracy_score(y_test_all_biobert, model_xgb_all_biobert.predict(X_test_all_biobert.astype(np.float64))))
print(precision_score(y_test_all_biobert, model_xgb_all_biobert.predict(X_test_all_biobert.astype(np.float64))))
print(recall_score(y_test_all_biobert, model_xgb_all_biobert.predict(X_test_all_biobert.astype(np.float64))))
print(confusion_matrix(y_test_all_biobert, model_xgb_all_biobert.predict(X_test_all_biobert.astype(np.float64))))

0.9778761061946902
0.9827188940092166
0.9866071428571429
0.9692982456140351
[[1043    9]
 [  21  663]]


# RF_alldata_biobert

In [51]:
np.random.seed(42)

rf_all_biobert = RandomForestClassifier()

param_grid_rf_all_biobert = {
    'n_estimators': stats.randint(50, 250),
    'max_depth': stats.randint(5, 100),
    'criterion': ['gini', 'entropy'],
    'max_samples': stats.uniform(0.3, 0.7)
}

random_search_rf_all_biobert = RandomizedSearchCV(rf_all_biobert, param_grid_rf_all_biobert, cv=5, random_state=42,
                                                  n_iter=10, scoring='f1',
                                                  verbose=10)
random_search_rf_all_biobert.fit(X_train_all_biobert.astype(np.float64), y_train_all_biobert)

print("Best set of hyperparameters: ", random_search_rf_all_biobert.best_params_)
print("Best score: ", random_search_rf_all_biobert.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156
[CV 1/5; 1/10] END criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156;, score=0.958 total time=  21.3s
[CV 2/5; 1/10] START criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156
[CV 2/5; 1/10] END criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156;, score=0.962 total time=  21.3s
[CV 3/5; 1/10] START criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156
[CV 3/5; 1/10] END criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156;, score=0.955 total time=  21.6s
[CV 4/5; 1/10] START criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156
[CV 4/5; 1/10] END criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156;, score=0.965 total time=  22.0s
[CV 5/5; 1/10] 

[CV 1/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219;, score=0.957 total time=  23.2s
[CV 2/5; 8/10] START criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219
[CV 2/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219;, score=0.963 total time=  23.7s
[CV 3/5; 8/10] START criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219
[CV 3/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219;, score=0.954 total time=  23.2s
[CV 4/5; 8/10] START criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219
[CV 4/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219;, score=0.964 total time=  23.6s
[CV 5/5; 8/10] START criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219
[CV 5/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056

In [52]:
best_model_rf_all_biobert = random_search_rf_all_biobert.best_estimator_

print(f1_score(y_test_all_biobert, best_model_rf_all_biobert.predict(X_test_all_biobert.astype(np.float64))))
print(accuracy_score(y_test_all_biobert, best_model_rf_all_biobert.predict(X_test_all_biobert.astype(np.float64))))
print(precision_score(y_test_all_biobert, best_model_rf_all_biobert.predict(X_test_all_biobert.astype(np.float64))))
print(recall_score(y_test_all_biobert, best_model_rf_all_biobert.predict(X_test_all_biobert.astype(np.float64))))
confusion_matrix(y_test_all_biobert, best_model_rf_all_biobert.predict(X_test_all_biobert.astype(np.float64)))

0.9616252821670429
0.9706221198156681
0.9906976744186047
0.9342105263157895


array([[1046,    6],
       [  45,  639]], dtype=int64)

# SVM_alldata_biobert

In [53]:
np.random.seed(42)

pipeline_svm_all_biobert = Pipeline([
    ('imputer', KNNImputer(weights='distance', metric='nan_euclidean', add_indicator=True)),
    ('scaler', StandardScaler()),
    ('svm', SVC())
])

param_grid_svm_all_biobert = {
    'imputer__n_neighbors': [2, 3, 5, 7],
    'svm__C': stats.uniform(0.1, 10),
    'svm__gamma': stats.uniform(0.01, 1),
    'svm__kernel': ['linear', 'rbf', 'poly', 'sigmoid']
}

random_search_svm_all_biobert = RandomizedSearchCV(pipeline_svm_all_biobert,
                                                   param_grid_svm_all_biobert, cv=5, random_state=42,
                                                   n_iter=10, scoring='f1', verbose=10)
random_search_svm_all_biobert.fit(X_train_all_biobert.astype(np.float64), y_train_all_biobert)

print("Best set of hyperparameters: ", random_search_svm_all_biobert.best_params_)
print("Best score: ", random_search_svm_all_biobert.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START imputer__n_neighbors=5, svm__C=8.065429868602328, svm__gamma=0.1934347898661638, svm__kernel=sigmoid
[CV 1/5; 1/10] END imputer__n_neighbors=5, svm__C=8.065429868602328, svm__gamma=0.1934347898661638, svm__kernel=sigmoid;, score=0.730 total time=  16.7s
[CV 2/5; 1/10] START imputer__n_neighbors=5, svm__C=8.065429868602328, svm__gamma=0.1934347898661638, svm__kernel=sigmoid
[CV 2/5; 1/10] END imputer__n_neighbors=5, svm__C=8.065429868602328, svm__gamma=0.1934347898661638, svm__kernel=sigmoid;, score=0.718 total time=  16.6s
[CV 3/5; 1/10] START imputer__n_neighbors=5, svm__C=8.065429868602328, svm__gamma=0.1934347898661638, svm__kernel=sigmoid
[CV 3/5; 1/10] END imputer__n_neighbors=5, svm__C=8.065429868602328, svm__gamma=0.1934347898661638, svm__kernel=sigmoid;, score=0.718 total time=  16.7s
[CV 4/5; 1/10] START imputer__n_neighbors=5, svm__C=8.065429868602328, svm__gamma=0.1934347898661638, svm__kernel=

[CV 1/5; 7/10] END imputer__n_neighbors=7, svm__C=0.33062425041415755, svm__gamma=0.5347746602583892, svm__kernel=poly;, score=0.968 total time=  16.1s
[CV 2/5; 7/10] START imputer__n_neighbors=7, svm__C=0.33062425041415755, svm__gamma=0.5347746602583892, svm__kernel=poly
[CV 2/5; 7/10] END imputer__n_neighbors=7, svm__C=0.33062425041415755, svm__gamma=0.5347746602583892, svm__kernel=poly;, score=0.962 total time=  16.0s
[CV 3/5; 7/10] START imputer__n_neighbors=7, svm__C=0.33062425041415755, svm__gamma=0.5347746602583892, svm__kernel=poly
[CV 3/5; 7/10] END imputer__n_neighbors=7, svm__C=0.33062425041415755, svm__gamma=0.5347746602583892, svm__kernel=poly;, score=0.963 total time=  16.1s
[CV 4/5; 7/10] START imputer__n_neighbors=7, svm__C=0.33062425041415755, svm__gamma=0.5347746602583892, svm__kernel=poly
[CV 4/5; 7/10] END imputer__n_neighbors=7, svm__C=0.33062425041415755, svm__gamma=0.5347746602583892, svm__kernel=poly;, score=0.962 total time=  16.0s
[CV 5/5; 7/10] START imputer_

In [54]:
model_svm_all_biobert = Pipeline([
    ('imputer', KNNImputer(n_neighbors=7, weights='distance', metric='nan_euclidean', add_indicator=True)),
    ('scaler', StandardScaler()),
    ('svm', SVC(C=0.33062425041415755, gamma=0.5347746602583892, kernel='poly'))
])
model_svm_all_biobert.fit(X_train_all_biobert.astype(np.float64), y_train_all_biobert)

print(f1_score(y_test_all_biobert, model_svm_all_biobert.predict(X_test_all_biobert.astype(np.float64))))
print(accuracy_score(y_test_all_biobert, model_svm_all_biobert.predict(X_test_all_biobert.astype(np.float64))))
print(precision_score(y_test_all_biobert, model_svm_all_biobert.predict(X_test_all_biobert.astype(np.float64))))
print(recall_score(y_test_all_biobert, model_svm_all_biobert.predict(X_test_all_biobert.astype(np.float64))))
print(confusion_matrix(y_test_all_biobert, model_svm_all_biobert.predict(X_test_all_biobert.astype(np.float64))))                             

0.9602356406480118
0.9688940092165899
0.9673590504451038
0.9532163742690059
[[1030   22]
 [  32  652]]


# XGB_text_biobert

In [55]:
X_train_text_biobert, X_test_text_biobert, y_train_text_biobert, y_test_text_biobert = train_test_split(
    cls_X_matrix_biobert,
    classified_df_vect_biobert[
        'class'],
    test_size=0.2,
    random_state=42,
    stratify=
    classified_df_vect_biobert[
        'class'])

np.random.seed(42)

xgb_text_biobert = xgb.XGBClassifier()

param_grid_xgb_text_biobert = {
    'max_depth': stats.randint(3, 10),
    'learning_rate': stats.uniform(0.001, 0.1),
    'subsample': stats.uniform(0.3, 0.7),
    'n_estimators': stats.randint(50, 200)
}

random_search_xgb_text_biobert = RandomizedSearchCV(xgb_text_biobert, param_grid_xgb_text_biobert, cv=5,
                                                    random_state=42, n_iter=10, scoring='f1', verbose=10)
random_search_xgb_text_biobert.fit(X_train_text_biobert.astype(np.float64), y_train_text_biobert)

print("Best set of hyperparameters: ", random_search_xgb_text_biobert.best_params_)
print("Best score: ", random_search_xgb_text_biobert.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836
[CV 1/5; 1/10] END learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836;, score=0.918 total time=   5.9s
[CV 2/5; 1/10] START learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836
[CV 2/5; 1/10] END learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836;, score=0.915 total time=   6.1s
[CV 3/5; 1/10] START learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836
[CV 3/5; 1/10] END learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836;, score=0.919 total time=   6.1s
[CV 4/5; 1/10] START learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836
[CV 4/5; 1/10] END learning_rate=0.038454

[CV 2/5; 7/10] END learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925;, score=0.920 total time=   2.3s
[CV 3/5; 7/10] START learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925
[CV 3/5; 7/10] END learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925;, score=0.920 total time=   2.3s
[CV 4/5; 7/10] START learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925
[CV 4/5; 7/10] END learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925;, score=0.911 total time=   2.3s
[CV 5/5; 7/10] START learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925
[CV 5/5; 7/10] END learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925;, score=0.904 total time=   2.3s
[CV 1/5; 8/10] START learning_rate=0.07951759613930137, max_depth=5, n_estimat

In [56]:
model_xgb_text_biobert = xgb.XGBClassifier(learning_rate=0.06086584841970366, max_depth=9, n_estimators=171,
                                           subsample=0.40919616423534183)
model_xgb_text_biobert.fit(X_train_text_biobert.astype(np.float64), y_train_text_biobert)

print(f1_score(y_test_text_biobert, model_xgb_text_biobert.predict(X_test_text_biobert.astype(np.float64))))
print(accuracy_score(y_test_text_biobert, model_xgb_text_biobert.predict(X_test_text_biobert.astype(np.float64))))
print(precision_score(y_test_text_biobert, model_xgb_text_biobert.predict(X_test_text_biobert.astype(np.float64))))
print(recall_score(y_test_text_biobert, model_xgb_text_biobert.predict(X_test_text_biobert.astype(np.float64))))
print(confusion_matrix(y_test_text_biobert, model_xgb_text_biobert.predict(X_test_text_biobert.astype(np.float64))))

0.9294294294294294
0.945852534562212
0.9552469135802469
0.9049707602339181
[[1023   29]
 [  65  619]]


# RF_text_biobert

In [57]:
np.random.seed(42)

rf_text_biobert = RandomForestClassifier()

param_grid_rf_text_biobert = {
    'n_estimators': stats.randint(50, 250),
    'max_depth': stats.randint(5, 100),
    'criterion': ['gini', 'entropy'],
    'max_samples': stats.uniform(0.3, 0.7)
}

random_search_rf_text_biobert = RandomizedSearchCV(rf_text_biobert, param_grid_rf_text_biobert, cv=5,
                                                   random_state=42, n_iter=10, scoring='f1', verbose=10)
random_search_rf_text_biobert.fit(X_train_text_biobert.astype(np.float64), y_train_text_biobert)

print("Best set of hyperparameters: ", random_search_rf_text_biobert.best_params_)
print("Best score: ", random_search_rf_text_biobert.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156
[CV 1/5; 1/10] END criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156;, score=0.910 total time=  23.6s
[CV 2/5; 1/10] START criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156
[CV 2/5; 1/10] END criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156;, score=0.914 total time=  23.5s
[CV 3/5; 1/10] START criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156
[CV 3/5; 1/10] END criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156;, score=0.917 total time=  23.9s
[CV 4/5; 1/10] START criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156
[CV 4/5; 1/10] END criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156;, score=0.903 total time=  24.4s
[CV 5/5; 1/10] 

[CV 1/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219;, score=0.910 total time=  25.9s
[CV 2/5; 8/10] START criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219
[CV 2/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219;, score=0.913 total time=  25.9s
[CV 3/5; 8/10] START criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219
[CV 3/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219;, score=0.909 total time=  25.7s
[CV 4/5; 8/10] START criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219
[CV 4/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219;, score=0.901 total time=  26.3s
[CV 5/5; 8/10] START criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219
[CV 5/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056

In [58]:
best_model_rf_text_biobert = random_search_rf_text_biobert.best_estimator_

print(f1_score(y_test_text_biobert, best_model_rf_text_biobert.predict(X_test_text_biobert.astype(np.float64))))
print(accuracy_score(y_test_text_biobert, best_model_rf_text_biobert.predict(X_test_text_biobert.astype(np.float64))))
print(precision_score(y_test_text_biobert, best_model_rf_text_biobert.predict(X_test_text_biobert.astype(np.float64))))
print(recall_score(y_test_text_biobert, best_model_rf_text_biobert.predict(X_test_text_biobert.astype(np.float64))))
print(confusion_matrix(y_test_text_biobert, best_model_rf_text_biobert.predict(X_test_text_biobert.astype(np.float64))))

0.9164750957854406
0.9372119815668203
0.9629629629629629
0.8742690058479532
[[1029   23]
 [  86  598]]


# SVM_text_biobert

In [59]:
np.random.seed(42)

pipeline_svm_text_biobert = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC())
])

param_grid_svm_text_biobert = {
    'svm__C': stats.uniform(0.1, 10),
    'svm__gamma': stats.uniform(0.01, 1),
    'svm__kernel': ['linear', 'rbf', 'poly', 'sigmoid']
}

random_search_svm_text_biobert = RandomizedSearchCV(pipeline_svm_text_biobert, param_grid_svm_text_biobert,
                                                    cv=5, random_state=42, n_iter=10, scoring='f1', verbose=10)
random_search_svm_text_biobert.fit(X_train_text_biobert.astype(np.float64), y_train_text_biobert)

print("Best set of hyperparameters: ", random_search_svm_text_biobert.best_params_)
print("Best score: ", random_search_svm_text_biobert.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START svm__C=3.845401188473625, svm__gamma=0.9607143064099162, svm__kernel=poly
[CV 1/5; 1/10] END svm__C=3.845401188473625, svm__gamma=0.9607143064099162, svm__kernel=poly;, score=0.936 total time=   3.8s
[CV 2/5; 1/10] START svm__C=3.845401188473625, svm__gamma=0.9607143064099162, svm__kernel=poly
[CV 2/5; 1/10] END svm__C=3.845401188473625, svm__gamma=0.9607143064099162, svm__kernel=poly;, score=0.938 total time=   3.7s
[CV 3/5; 1/10] START svm__C=3.845401188473625, svm__gamma=0.9607143064099162, svm__kernel=poly
[CV 3/5; 1/10] END svm__C=3.845401188473625, svm__gamma=0.9607143064099162, svm__kernel=poly;, score=0.937 total time=   3.8s
[CV 4/5; 1/10] START svm__C=3.845401188473625, svm__gamma=0.9607143064099162, svm__kernel=poly
[CV 4/5; 1/10] END svm__C=3.845401188473625, svm__gamma=0.9607143064099162, svm__kernel=poly;, score=0.936 total time=   3.8s
[CV 5/5; 1/10] START svm__C=3.845401188473625, svm__gam

[CV 2/5; 8/10] END svm__C=0.17066305219717406, svm__gamma=0.03306242504141576, svm__kernel=poly;, score=0.938 total time=   3.7s
[CV 3/5; 8/10] START svm__C=0.17066305219717406, svm__gamma=0.03306242504141576, svm__kernel=poly
[CV 3/5; 8/10] END svm__C=0.17066305219717406, svm__gamma=0.03306242504141576, svm__kernel=poly;, score=0.937 total time=   3.7s
[CV 4/5; 8/10] START svm__C=0.17066305219717406, svm__gamma=0.03306242504141576, svm__kernel=poly
[CV 4/5; 8/10] END svm__C=0.17066305219717406, svm__gamma=0.03306242504141576, svm__kernel=poly;, score=0.936 total time=   3.8s
[CV 5/5; 8/10] START svm__C=0.17066305219717406, svm__gamma=0.03306242504141576, svm__kernel=poly
[CV 5/5; 8/10] END svm__C=0.17066305219717406, svm__gamma=0.03306242504141576, svm__kernel=poly;, score=0.924 total time=   3.8s
[CV 1/5; 9/10] START svm__C=6.218528947223795, svm__gamma=0.14949386065204184, svm__kernel=sigmoid
[CV 1/5; 9/10] END svm__C=6.218528947223795, svm__gamma=0.14949386065204184, svm__kernel=si

In [60]:
model_svm_text_biobert = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(C=3.845401188473625, gamma=0.9607143064099162, kernel='poly'))
])
model_svm_text_biobert.fit(X_train_text_biobert.astype(np.float64), y_train_text_biobert)

print(f1_score(y_test_text_biobert, model_svm_text_biobert.predict(X_test_text_biobert.astype(np.float64))))
print(accuracy_score(y_test_text_biobert, model_svm_text_biobert.predict(X_test_text_biobert.astype(np.float64))))
print(precision_score(y_test_text_biobert, model_svm_text_biobert.predict(X_test_text_biobert.astype(np.float64))))
print(recall_score(y_test_text_biobert, model_svm_text_biobert.predict(X_test_text_biobert.astype(np.float64))))
print(confusion_matrix(y_test_text_biobert, model_svm_text_biobert.predict(X_test_text_biobert.astype(np.float64))))

0.9289940828402367
0.9447004608294931
0.9401197604790419
0.9181286549707602
[[1012   40]
 [  56  628]]


# XGB_alldata_biomed_roberta

In [61]:
X_train_all_biomed_roberta, X_test_all_biomed_roberta, y_train_all_biomed_roberta, y_test_all_biomed_roberta = train_test_split(
    classified_df_vect_biomed_roberta.drop(columns=['class', 'pubmed_id']), classified_df_vect_biomed_roberta['class'],
    test_size=0.2,
    random_state=42, stratify=classified_df_vect_biomed_roberta['class'])

np.random.seed(42)

xgb_all_biomed_roberta = xgb.XGBClassifier()

param_grid_xgb_all_biomed_roberta = {
    'max_depth': stats.randint(3, 10),
    'learning_rate': stats.uniform(0.001, 0.1),
    'subsample': stats.uniform(0.3, 0.7),
    'n_estimators': stats.randint(50, 200)
}

random_search_xgb_all_biomed_roberta = RandomizedSearchCV(xgb_all_biomed_roberta, param_grid_xgb_all_biomed_roberta,
                                                          cv=5, random_state=42,
                                                          n_iter=10, scoring='f1', verbose=10)
random_search_xgb_all_biomed_roberta.fit(X_train_all_biomed_roberta.astype(np.float64), y_train_all_biomed_roberta)

print("Best set of hyperparameters: ", random_search_xgb_all_biomed_roberta.best_params_)
print("Best score: ", random_search_xgb_all_biomed_roberta.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836
[CV 1/5; 1/10] END learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836;, score=0.962 total time=   6.8s
[CV 2/5; 1/10] START learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836
[CV 2/5; 1/10] END learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836;, score=0.974 total time=   6.6s
[CV 3/5; 1/10] START learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836
[CV 3/5; 1/10] END learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836;, score=0.962 total time=   6.6s
[CV 4/5; 1/10] START learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836
[CV 4/5; 1/10] END learning_rate=0.038454

[CV 2/5; 7/10] END learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925;, score=0.969 total time=   3.8s
[CV 3/5; 7/10] START learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925
[CV 3/5; 7/10] END learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925;, score=0.967 total time=   3.8s
[CV 4/5; 7/10] START learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925
[CV 4/5; 7/10] END learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925;, score=0.966 total time=   3.8s
[CV 5/5; 7/10] START learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925
[CV 5/5; 7/10] END learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925;, score=0.965 total time=   3.8s
[CV 1/5; 8/10] START learning_rate=0.07951759613930137, max_depth=5, n_estimat

In [62]:
model_xgb_all_biomed_roberta = xgb.XGBClassifier(learning_rate=0.07951759613930137, max_depth=5, n_estimators=157,
                                                 subsample=0.6599641068895281)
model_xgb_all_biomed_roberta.fit(X_train_all_biomed_roberta.astype(np.float64), y_train_all_biomed_roberta)

print(f1_score(y_test_all_biomed_roberta,
               model_xgb_all_biomed_roberta.predict(X_test_all_biomed_roberta.astype(np.float64))))
print(accuracy_score(y_test_all_biomed_roberta,
                     model_xgb_all_biomed_roberta.predict(X_test_all_biomed_roberta.astype(np.float64))))
print(precision_score(y_test_all_biomed_roberta,
                      model_xgb_all_biomed_roberta.predict(X_test_all_biomed_roberta.astype(np.float64))))
print(recall_score(y_test_all_biomed_roberta,
                   model_xgb_all_biomed_roberta.predict(X_test_all_biomed_roberta.astype(np.float64))))
print(confusion_matrix(y_test_all_biomed_roberta,
                       model_xgb_all_biomed_roberta.predict(X_test_all_biomed_roberta.astype(np.float64))))

0.9725722757598221
0.9786866359447005
0.9864661654135338
0.9590643274853801
[[1043    9]
 [  28  656]]


# RF_alldata_biomed_roberta

In [63]:
np.random.seed(42)

rf_all_biomed_roberta = RandomForestClassifier()

param_grid_rf_all_biomed_roberta = {
    'n_estimators': stats.randint(50, 250),
    'max_depth': stats.randint(5, 100),
    'criterion': ['gini', 'entropy'],
    'max_samples': stats.uniform(0.3, 0.7)
}

random_search_rf_all_biomed_roberta = RandomizedSearchCV(rf_all_biomed_roberta, param_grid_rf_all_biomed_roberta, cv=5,
                                                         random_state=42,
                                                         n_iter=10, scoring='f1',
                                                         verbose=10)
random_search_rf_all_biomed_roberta.fit(X_train_all_biomed_roberta.astype(np.float64), y_train_all_biomed_roberta)

print("Best set of hyperparameters: ", random_search_rf_all_biomed_roberta.best_params_)
print("Best score: ", random_search_rf_all_biomed_roberta.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156
[CV 1/5; 1/10] END criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156;, score=0.956 total time=  20.1s
[CV 2/5; 1/10] START criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156
[CV 2/5; 1/10] END criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156;, score=0.953 total time=  20.5s
[CV 3/5; 1/10] START criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156
[CV 3/5; 1/10] END criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156;, score=0.951 total time=  19.9s
[CV 4/5; 1/10] START criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156
[CV 4/5; 1/10] END criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156;, score=0.958 total time=  20.7s
[CV 5/5; 1/10] 

[CV 1/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219;, score=0.952 total time=  21.9s
[CV 2/5; 8/10] START criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219
[CV 2/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219;, score=0.952 total time=  22.7s
[CV 3/5; 8/10] START criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219
[CV 3/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219;, score=0.951 total time=  22.5s
[CV 4/5; 8/10] START criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219
[CV 4/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219;, score=0.954 total time=  22.1s
[CV 5/5; 8/10] START criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219
[CV 5/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056

In [64]:
best_model_rf_all_biomed_roberta = random_search_rf_all_biomed_roberta.best_estimator_

print(f1_score(y_test_all_biomed_roberta,
               best_model_rf_all_biomed_roberta.predict(X_test_all_biomed_roberta.astype(np.float64))))
print(accuracy_score(y_test_all_biomed_roberta,
                     best_model_rf_all_biomed_roberta.predict(X_test_all_biomed_roberta.astype(np.float64))))
print(precision_score(y_test_all_biomed_roberta,
                      best_model_rf_all_biomed_roberta.predict(X_test_all_biomed_roberta.astype(np.float64))))
print(recall_score(y_test_all_biomed_roberta,
                   best_model_rf_all_biomed_roberta.predict(X_test_all_biomed_roberta.astype(np.float64))))
print(confusion_matrix(y_test_all_biomed_roberta,
                       best_model_rf_all_biomed_roberta.predict(X_test_all_biomed_roberta.astype(np.float64))))

0.95423855963991
0.9648617511520737
0.9799691833590138
0.9298245614035088
[[1039   13]
 [  48  636]]


# SVM_alldata_biomed_roberta

In [65]:
np.random.seed(42)

pipeline_svm_all_biomed_roberta = Pipeline([
    ('imputer', KNNImputer(weights='distance', metric='nan_euclidean', add_indicator=True)),
    ('scaler', StandardScaler()),
    ('svm', SVC())
])

param_grid_svm_all_biomed_roberta = {
    'imputer__n_neighbors': [2, 3, 5, 7],
    'svm__C': stats.uniform(0.1, 10),
    'svm__gamma': stats.uniform(0.01, 1),
    'svm__kernel': ['linear', 'rbf', 'poly', 'sigmoid']
}

random_search_svm_all_biomed_roberta = RandomizedSearchCV(pipeline_svm_all_biomed_roberta,
                                                          param_grid_svm_all_biomed_roberta, cv=5, random_state=42,
                                                          n_iter=10, scoring='f1', verbose=10)
random_search_svm_all_biomed_roberta.fit(X_train_all_biomed_roberta.astype(np.float64), y_train_all_biomed_roberta)

print("Best set of hyperparameters: ", random_search_svm_all_biomed_roberta.best_params_)
print("Best score: ", random_search_svm_all_biomed_roberta.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START imputer__n_neighbors=5, svm__C=8.065429868602328, svm__gamma=0.1934347898661638, svm__kernel=sigmoid
[CV 1/5; 1/10] END imputer__n_neighbors=5, svm__C=8.065429868602328, svm__gamma=0.1934347898661638, svm__kernel=sigmoid;, score=0.765 total time=  17.9s
[CV 2/5; 1/10] START imputer__n_neighbors=5, svm__C=8.065429868602328, svm__gamma=0.1934347898661638, svm__kernel=sigmoid
[CV 2/5; 1/10] END imputer__n_neighbors=5, svm__C=8.065429868602328, svm__gamma=0.1934347898661638, svm__kernel=sigmoid;, score=0.774 total time=  17.9s
[CV 3/5; 1/10] START imputer__n_neighbors=5, svm__C=8.065429868602328, svm__gamma=0.1934347898661638, svm__kernel=sigmoid
[CV 3/5; 1/10] END imputer__n_neighbors=5, svm__C=8.065429868602328, svm__gamma=0.1934347898661638, svm__kernel=sigmoid;, score=0.761 total time=  17.7s
[CV 4/5; 1/10] START imputer__n_neighbors=5, svm__C=8.065429868602328, svm__gamma=0.1934347898661638, svm__kernel=

[CV 1/5; 7/10] END imputer__n_neighbors=7, svm__C=0.33062425041415755, svm__gamma=0.5347746602583892, svm__kernel=poly;, score=0.952 total time=  17.5s
[CV 2/5; 7/10] START imputer__n_neighbors=7, svm__C=0.33062425041415755, svm__gamma=0.5347746602583892, svm__kernel=poly
[CV 2/5; 7/10] END imputer__n_neighbors=7, svm__C=0.33062425041415755, svm__gamma=0.5347746602583892, svm__kernel=poly;, score=0.955 total time=  17.3s
[CV 3/5; 7/10] START imputer__n_neighbors=7, svm__C=0.33062425041415755, svm__gamma=0.5347746602583892, svm__kernel=poly
[CV 3/5; 7/10] END imputer__n_neighbors=7, svm__C=0.33062425041415755, svm__gamma=0.5347746602583892, svm__kernel=poly;, score=0.949 total time=  17.4s
[CV 4/5; 7/10] START imputer__n_neighbors=7, svm__C=0.33062425041415755, svm__gamma=0.5347746602583892, svm__kernel=poly
[CV 4/5; 7/10] END imputer__n_neighbors=7, svm__C=0.33062425041415755, svm__gamma=0.5347746602583892, svm__kernel=poly;, score=0.959 total time=  17.4s
[CV 5/5; 7/10] START imputer_

In [66]:
model_svm_all_biomed_roberta = Pipeline([
    ('imputer', KNNImputer(n_neighbors=3, weights='distance', metric='nan_euclidean', add_indicator=True)),
    ('scaler', StandardScaler()),
    ('svm', SVC(C=0.5666566321361542, gamma=0.9837555188414592, kernel='poly'))
])
model_svm_all_biomed_roberta.fit(X_train_all_biomed_roberta.astype(np.float64), y_train_all_biomed_roberta)

print(f1_score(y_test_all_biomed_roberta,
               model_svm_all_biomed_roberta.predict(X_test_all_biomed_roberta.astype(np.float64))))
print(accuracy_score(y_test_all_biomed_roberta,
                     model_svm_all_biomed_roberta.predict(X_test_all_biomed_roberta.astype(np.float64))))
print(precision_score(y_test_all_biomed_roberta,
                      model_svm_all_biomed_roberta.predict(X_test_all_biomed_roberta.astype(np.float64))))
print(recall_score(y_test_all_biomed_roberta,
                   model_svm_all_biomed_roberta.predict(X_test_all_biomed_roberta.astype(np.float64))))
print(confusion_matrix(y_test_all_biomed_roberta,
                       model_svm_all_biomed_roberta.predict(X_test_all_biomed_roberta.astype(np.float64))))

0.9595290654893304
0.9683179723502304
0.965925925925926
0.9532163742690059
[[1029   23]
 [  32  652]]


# XGB_text_biomed_roberta

In [67]:
X_train_text_biomed_roberta, X_test_text_biomed_roberta, y_train_text_biomed_roberta, y_test_text_biomed_roberta = train_test_split(
    cls_X_matrix_biomed_roberta,
    classified_df_vect_biomed_roberta[
        'class'],
    test_size=0.2,
    random_state=42,
    stratify=
    classified_df_vect_biomed_roberta[
        'class'])

np.random.seed(42)

xgb_text_biomed_roberta = xgb.XGBClassifier()

param_grid_xgb_text_biomed_roberta = {
    'max_depth': stats.randint(3, 10),
    'learning_rate': stats.uniform(0.001, 0.1),
    'subsample': stats.uniform(0.3, 0.7),
    'n_estimators': stats.randint(50, 200)
}

random_search_xgb_text_biomed_roberta = RandomizedSearchCV(xgb_text_biomed_roberta, param_grid_xgb_text_biomed_roberta,
                                                           cv=5,
                                                           random_state=42, n_iter=10, scoring='f1', verbose=10)
random_search_xgb_text_biomed_roberta.fit(X_train_text_biomed_roberta.astype(np.float64), y_train_text_biomed_roberta)

print("Best set of hyperparameters: ", random_search_xgb_text_biomed_roberta.best_params_)
print("Best score: ", random_search_xgb_text_biomed_roberta.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836
[CV 1/5; 1/10] END learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836;, score=0.916 total time=   6.5s
[CV 2/5; 1/10] START learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836
[CV 2/5; 1/10] END learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836;, score=0.919 total time=   6.6s
[CV 3/5; 1/10] START learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836
[CV 3/5; 1/10] END learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836;, score=0.923 total time=   6.4s
[CV 4/5; 1/10] START learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836
[CV 4/5; 1/10] END learning_rate=0.038454

[CV 2/5; 7/10] END learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925;, score=0.925 total time=   2.4s
[CV 3/5; 7/10] START learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925
[CV 3/5; 7/10] END learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925;, score=0.920 total time=   2.4s
[CV 4/5; 7/10] START learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925
[CV 4/5; 7/10] END learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925;, score=0.909 total time=   2.4s
[CV 5/5; 7/10] START learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925
[CV 5/5; 7/10] END learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925;, score=0.909 total time=   2.4s
[CV 1/5; 8/10] START learning_rate=0.07951759613930137, max_depth=5, n_estimat

In [68]:
model_xgb_text_biomed_roberta = xgb.XGBClassifier(learning_rate=0.07951759613930137, max_depth=5, n_estimators=157,
                                                  subsample=0.6599641068895281)
model_xgb_text_biomed_roberta.fit(X_train_text_biomed_roberta.astype(np.float64), y_train_text_biomed_roberta)

print(f1_score(y_test_text_biomed_roberta,
               model_xgb_text_biomed_roberta.predict(X_test_text_biomed_roberta.astype(np.float64))))
print(accuracy_score(y_test_text_biomed_roberta,
                     model_xgb_text_biomed_roberta.predict(X_test_text_biomed_roberta.astype(np.float64))))
print(precision_score(y_test_text_biomed_roberta,
                      model_xgb_text_biomed_roberta.predict(X_test_text_biomed_roberta.astype(np.float64))))
print(recall_score(y_test_text_biomed_roberta,
                   model_xgb_text_biomed_roberta.predict(X_test_text_biomed_roberta.astype(np.float64))))
print(confusion_matrix(y_test_text_biomed_roberta,
                       model_xgb_text_biomed_roberta.predict(X_test_text_biomed_roberta.astype(np.float64))))

0.9266467065868264
0.9435483870967742
0.9493865030674846
0.9049707602339181
[[1019   33]
 [  65  619]]


# RF_text_biomed_roberta

In [69]:
np.random.seed(42)

rf_text_biomed_roberta = RandomForestClassifier()

param_grid_rf_text_biomed_roberta = {
    'n_estimators': stats.randint(50, 250),
    'max_depth': stats.randint(5, 100),
    'criterion': ['gini', 'entropy'],
    'max_samples': stats.uniform(0.3, 0.7)
}

random_search_rf_text_biomed_roberta = RandomizedSearchCV(rf_text_biomed_roberta, param_grid_rf_text_biomed_roberta,
                                                          cv=5,
                                                          random_state=42, n_iter=10, scoring='f1', verbose=10)
random_search_rf_text_biomed_roberta.fit(X_train_text_biomed_roberta.astype(np.float64), y_train_text_biomed_roberta)

print("Best set of hyperparameters: ", random_search_rf_text_biomed_roberta.best_params_)
print("Best score: ", random_search_rf_text_biomed_roberta.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156
[CV 1/5; 1/10] END criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156;, score=0.916 total time=  21.9s
[CV 2/5; 1/10] START criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156
[CV 2/5; 1/10] END criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156;, score=0.914 total time=  22.2s
[CV 3/5; 1/10] START criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156
[CV 3/5; 1/10] END criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156;, score=0.920 total time=  21.9s
[CV 4/5; 1/10] START criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156
[CV 4/5; 1/10] END criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156;, score=0.909 total time=  22.6s
[CV 5/5; 1/10] 

[CV 1/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219;, score=0.915 total time=  24.1s
[CV 2/5; 8/10] START criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219
[CV 2/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219;, score=0.910 total time=  24.5s
[CV 3/5; 8/10] START criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219
[CV 3/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219;, score=0.915 total time=  24.5s
[CV 4/5; 8/10] START criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219
[CV 4/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219;, score=0.909 total time=  24.2s
[CV 5/5; 8/10] START criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219
[CV 5/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056

In [70]:
best_model_rf_text_biomed_roberta = random_search_rf_text_biomed_roberta.best_estimator_

print(f1_score(y_test_text_biomed_roberta,
               best_model_rf_text_biomed_roberta.predict(X_test_text_biomed_roberta.astype(np.float64))))
print(accuracy_score(y_test_text_biomed_roberta,
                     best_model_rf_text_biomed_roberta.predict(X_test_text_biomed_roberta.astype(np.float64))))
print(precision_score(y_test_text_biomed_roberta,
                      best_model_rf_text_biomed_roberta.predict(X_test_text_biomed_roberta.astype(np.float64))))
print(recall_score(y_test_text_biomed_roberta,
                   best_model_rf_text_biomed_roberta.predict(X_test_text_biomed_roberta.astype(np.float64))))
print(confusion_matrix(y_test_text_biomed_roberta,
                       best_model_rf_text_biomed_roberta.predict(X_test_text_biomed_roberta.astype(np.float64))))

0.9193302891933028
0.9389400921658986
0.9587301587301588
0.8830409356725146
[[1026   26]
 [  80  604]]


# SVM_text_biomed_roberta

In [71]:
np.random.seed(42)

pipeline_svm_text_biomed_roberta = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC())
])

param_grid_svm_text_biomed_roberta = {
    'svm__C': stats.uniform(0.1, 10),
    'svm__gamma': stats.uniform(0.01, 1),
    'svm__kernel': ['linear', 'rbf', 'poly', 'sigmoid']
}

random_search_svm_text_biomed_roberta = RandomizedSearchCV(pipeline_svm_text_biomed_roberta,
                                                           param_grid_svm_text_biomed_roberta,
                                                           cv=5, random_state=42, n_iter=10, scoring='f1', verbose=10)
random_search_svm_text_biomed_roberta.fit(X_train_text_biomed_roberta.astype(np.float64), y_train_text_biomed_roberta)

print("Best set of hyperparameters: ", random_search_svm_text_biomed_roberta.best_params_)
print("Best score: ", random_search_svm_text_biomed_roberta.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START svm__C=3.845401188473625, svm__gamma=0.9607143064099162, svm__kernel=poly
[CV 1/5; 1/10] END svm__C=3.845401188473625, svm__gamma=0.9607143064099162, svm__kernel=poly;, score=0.933 total time=   4.4s
[CV 2/5; 1/10] START svm__C=3.845401188473625, svm__gamma=0.9607143064099162, svm__kernel=poly
[CV 2/5; 1/10] END svm__C=3.845401188473625, svm__gamma=0.9607143064099162, svm__kernel=poly;, score=0.932 total time=   4.2s
[CV 3/5; 1/10] START svm__C=3.845401188473625, svm__gamma=0.9607143064099162, svm__kernel=poly
[CV 3/5; 1/10] END svm__C=3.845401188473625, svm__gamma=0.9607143064099162, svm__kernel=poly;, score=0.934 total time=   4.4s
[CV 4/5; 1/10] START svm__C=3.845401188473625, svm__gamma=0.9607143064099162, svm__kernel=poly
[CV 4/5; 1/10] END svm__C=3.845401188473625, svm__gamma=0.9607143064099162, svm__kernel=poly;, score=0.943 total time=   4.4s
[CV 5/5; 1/10] START svm__C=3.845401188473625, svm__gam

[CV 2/5; 8/10] END svm__C=0.17066305219717406, svm__gamma=0.03306242504141576, svm__kernel=poly;, score=0.932 total time=   4.4s
[CV 3/5; 8/10] START svm__C=0.17066305219717406, svm__gamma=0.03306242504141576, svm__kernel=poly
[CV 3/5; 8/10] END svm__C=0.17066305219717406, svm__gamma=0.03306242504141576, svm__kernel=poly;, score=0.934 total time=   4.3s
[CV 4/5; 8/10] START svm__C=0.17066305219717406, svm__gamma=0.03306242504141576, svm__kernel=poly
[CV 4/5; 8/10] END svm__C=0.17066305219717406, svm__gamma=0.03306242504141576, svm__kernel=poly;, score=0.943 total time=   4.4s
[CV 5/5; 8/10] START svm__C=0.17066305219717406, svm__gamma=0.03306242504141576, svm__kernel=poly
[CV 5/5; 8/10] END svm__C=0.17066305219717406, svm__gamma=0.03306242504141576, svm__kernel=poly;, score=0.938 total time=   4.7s
[CV 1/5; 9/10] START svm__C=6.218528947223795, svm__gamma=0.14949386065204184, svm__kernel=sigmoid
[CV 1/5; 9/10] END svm__C=6.218528947223795, svm__gamma=0.14949386065204184, svm__kernel=si

In [72]:
model_svm_text_biomed_roberta = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(C=3.845401188473625, gamma=0.9607143064099162, kernel='poly'))
])
model_svm_text_biomed_roberta.fit(X_train_text_biomed_roberta.astype(np.float64), y_train_text_biomed_roberta)

print(f1_score(y_test_text_biomed_roberta,
               model_svm_text_biomed_roberta.predict(X_test_text_biomed_roberta.astype(np.float64))))
print(accuracy_score(y_test_text_biomed_roberta,
                     model_svm_text_biomed_roberta.predict(X_test_text_biomed_roberta.astype(np.float64))))
print(precision_score(y_test_text_biomed_roberta,
                      model_svm_text_biomed_roberta.predict(X_test_text_biomed_roberta.astype(np.float64))))
print(recall_score(y_test_text_biomed_roberta,
                   model_svm_text_biomed_roberta.predict(X_test_text_biomed_roberta.astype(np.float64))))
print(confusion_matrix(y_test_text_biomed_roberta,
                       model_svm_text_biomed_roberta.predict(X_test_text_biomed_roberta.astype(np.float64))))

0.9367647058823529
0.9504608294930875
0.9423076923076923
0.9312865497076024
[[1013   39]
 [  47  637]]


# XGB_alldata_distilbert

In [73]:
X_train_all_distilbert, X_test_all_distilbert, y_train_all_distilbert, y_test_all_distilbert = train_test_split(
    classified_df_vect_distilbert.drop(columns=['class', 'pubmed_id']), classified_df_vect_distilbert['class'],
    test_size=0.2,
    random_state=42, stratify=classified_df_vect_distilbert['class'])

np.random.seed(42)

xgb_all_distilbert = xgb.XGBClassifier()

param_grid_xgb_all_distilbert = {
    'max_depth': stats.randint(3, 10),
    'learning_rate': stats.uniform(0.001, 0.1),
    'subsample': stats.uniform(0.3, 0.7),
    'n_estimators': stats.randint(50, 200)
}

random_search_xgb_all_distilbert = RandomizedSearchCV(xgb_all_distilbert, param_grid_xgb_all_distilbert,
                                                          cv=5, random_state=42,
                                                          n_iter=10, scoring='f1', verbose=10)
random_search_xgb_all_distilbert.fit(X_train_all_distilbert.astype(np.float64), y_train_all_distilbert)

print("Best set of hyperparameters: ", random_search_xgb_all_distilbert.best_params_)
print("Best score: ", random_search_xgb_all_distilbert.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836
[CV 1/5; 1/10] END learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836;, score=0.963 total time=   6.8s
[CV 2/5; 1/10] START learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836
[CV 2/5; 1/10] END learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836;, score=0.966 total time=   6.7s
[CV 3/5; 1/10] START learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836
[CV 3/5; 1/10] END learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836;, score=0.964 total time=   6.6s
[CV 4/5; 1/10] START learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836
[CV 4/5; 1/10] END learning_rate=0.038454

[CV 2/5; 7/10] END learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925;, score=0.971 total time=   3.8s
[CV 3/5; 7/10] START learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925
[CV 3/5; 7/10] END learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925;, score=0.970 total time=   3.7s
[CV 4/5; 7/10] START learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925
[CV 4/5; 7/10] END learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925;, score=0.966 total time=   3.8s
[CV 5/5; 7/10] START learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925
[CV 5/5; 7/10] END learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925;, score=0.964 total time=   3.8s
[CV 1/5; 8/10] START learning_rate=0.07951759613930137, max_depth=5, n_estimat

In [74]:
model_xgb_all_distilbert = xgb.XGBClassifier(learning_rate=0.07951759613930137, max_depth=5, n_estimators=157,
                                                 subsample=0.6599641068895281)
model_xgb_all_distilbert.fit(X_train_all_distilbert.astype(np.float64), y_train_all_distilbert)

print(f1_score(y_test_all_distilbert,
               model_xgb_all_distilbert.predict(X_test_all_distilbert.astype(np.float64))))
print(accuracy_score(y_test_all_distilbert,
               model_xgb_all_distilbert.predict(X_test_all_distilbert.astype(np.float64))))
print(precision_score(y_test_all_distilbert,
               model_xgb_all_distilbert.predict(X_test_all_distilbert.astype(np.float64))))
print(recall_score(y_test_all_distilbert,
               model_xgb_all_distilbert.predict(X_test_all_distilbert.astype(np.float64))))
print(confusion_matrix(y_test_all_distilbert,
               model_xgb_all_distilbert.predict(X_test_all_distilbert.astype(np.float64))))

0.9793205317577548
0.9838709677419355
0.9895522388059701
0.9692982456140351
[[1045    7]
 [  21  663]]


# RF_alldata_distilbert

In [75]:
np.random.seed(42)

rf_all_distilbert = RandomForestClassifier()

param_grid_rf_all_distilbert = {
    'n_estimators': stats.randint(50, 250),
    'max_depth': stats.randint(5, 100),
    'criterion': ['gini', 'entropy'],
    'max_samples': stats.uniform(0.3, 0.7)
}

random_search_rf_all_distilbert = RandomizedSearchCV(rf_all_distilbert, param_grid_rf_all_distilbert, cv=5,
                                                         random_state=42,
                                                         n_iter=10, scoring='f1',
                                                         verbose=10)
random_search_rf_all_distilbert.fit(X_train_all_distilbert.astype(np.float64), y_train_all_distilbert)

print("Best set of hyperparameters: ", random_search_rf_all_distilbert.best_params_)
print("Best score: ", random_search_rf_all_distilbert.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156
[CV 1/5; 1/10] END criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156;, score=0.955 total time=  21.5s
[CV 2/5; 1/10] START criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156
[CV 2/5; 1/10] END criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156;, score=0.961 total time=  21.8s
[CV 3/5; 1/10] START criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156
[CV 3/5; 1/10] END criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156;, score=0.953 total time=  21.4s
[CV 4/5; 1/10] START criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156
[CV 4/5; 1/10] END criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156;, score=0.952 total time=  21.2s
[CV 5/5; 1/10] 

[CV 1/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219;, score=0.952 total time=  23.9s
[CV 2/5; 8/10] START criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219
[CV 2/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219;, score=0.958 total time=  23.7s
[CV 3/5; 8/10] START criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219
[CV 3/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219;, score=0.944 total time=  23.8s
[CV 4/5; 8/10] START criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219
[CV 4/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219;, score=0.951 total time=  23.7s
[CV 5/5; 8/10] START criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219
[CV 5/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056

In [76]:
best_model_rf_all_distilbert = random_search_rf_all_distilbert.best_estimator_

print(f1_score(y_test_all_distilbert,
               best_model_rf_all_distilbert.predict(X_test_all_distilbert.astype(np.float64))))
print(accuracy_score(y_test_all_distilbert,
               best_model_rf_all_distilbert.predict(X_test_all_distilbert.astype(np.float64))))
print(precision_score(y_test_all_distilbert,
               best_model_rf_all_distilbert.predict(X_test_all_distilbert.astype(np.float64))))
print(recall_score(y_test_all_distilbert,
               best_model_rf_all_distilbert.predict(X_test_all_distilbert.astype(np.float64))))
print(confusion_matrix(y_test_all_distilbert,
               best_model_rf_all_distilbert.predict(X_test_all_distilbert.astype(np.float64))))

0.9474474474474475
0.9596774193548387
0.9737654320987654
0.922514619883041
[[1035   17]
 [  53  631]]


# SVM_alldata_distilbert

In [77]:
np.random.seed(42)

pipeline_svm_all_distilbert = Pipeline([
    ('imputer', KNNImputer(weights='distance', metric='nan_euclidean', add_indicator=True)),
    ('scaler', StandardScaler()),
    ('svm', SVC())
])

param_grid_svm_all_distilbert = {
    'imputer__n_neighbors': [2, 3, 5, 7],
    'svm__C': stats.uniform(0.1, 10),
    'svm__gamma': stats.uniform(0.01, 1),
    'svm__kernel': ['linear', 'rbf', 'poly', 'sigmoid']
}

random_search_svm_all_distilbert = RandomizedSearchCV(pipeline_svm_all_distilbert,
                                                          param_grid_svm_all_distilbert, cv=5, random_state=42,
                                                          n_iter=10, scoring='f1', verbose=10)
random_search_svm_all_distilbert.fit(X_train_all_distilbert.astype(np.float64), y_train_all_distilbert)

print("Best set of hyperparameters: ", random_search_svm_all_distilbert.best_params_)
print("Best score: ", random_search_svm_all_distilbert.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START imputer__n_neighbors=5, svm__C=8.065429868602328, svm__gamma=0.1934347898661638, svm__kernel=sigmoid
[CV 1/5; 1/10] END imputer__n_neighbors=5, svm__C=8.065429868602328, svm__gamma=0.1934347898661638, svm__kernel=sigmoid;, score=0.703 total time=  17.5s
[CV 2/5; 1/10] START imputer__n_neighbors=5, svm__C=8.065429868602328, svm__gamma=0.1934347898661638, svm__kernel=sigmoid
[CV 2/5; 1/10] END imputer__n_neighbors=5, svm__C=8.065429868602328, svm__gamma=0.1934347898661638, svm__kernel=sigmoid;, score=0.679 total time=  17.7s
[CV 3/5; 1/10] START imputer__n_neighbors=5, svm__C=8.065429868602328, svm__gamma=0.1934347898661638, svm__kernel=sigmoid
[CV 3/5; 1/10] END imputer__n_neighbors=5, svm__C=8.065429868602328, svm__gamma=0.1934347898661638, svm__kernel=sigmoid;, score=0.667 total time=  17.2s
[CV 4/5; 1/10] START imputer__n_neighbors=5, svm__C=8.065429868602328, svm__gamma=0.1934347898661638, svm__kernel=

[CV 1/5; 7/10] END imputer__n_neighbors=7, svm__C=0.33062425041415755, svm__gamma=0.5347746602583892, svm__kernel=poly;, score=0.957 total time=  16.7s
[CV 2/5; 7/10] START imputer__n_neighbors=7, svm__C=0.33062425041415755, svm__gamma=0.5347746602583892, svm__kernel=poly
[CV 2/5; 7/10] END imputer__n_neighbors=7, svm__C=0.33062425041415755, svm__gamma=0.5347746602583892, svm__kernel=poly;, score=0.951 total time=  16.7s
[CV 3/5; 7/10] START imputer__n_neighbors=7, svm__C=0.33062425041415755, svm__gamma=0.5347746602583892, svm__kernel=poly
[CV 3/5; 7/10] END imputer__n_neighbors=7, svm__C=0.33062425041415755, svm__gamma=0.5347746602583892, svm__kernel=poly;, score=0.946 total time=  17.3s
[CV 4/5; 7/10] START imputer__n_neighbors=7, svm__C=0.33062425041415755, svm__gamma=0.5347746602583892, svm__kernel=poly
[CV 4/5; 7/10] END imputer__n_neighbors=7, svm__C=0.33062425041415755, svm__gamma=0.5347746602583892, svm__kernel=poly;, score=0.953 total time=  17.8s
[CV 5/5; 7/10] START imputer_

In [78]:
model_svm_all_distilbert = Pipeline([
    ('imputer', KNNImputer(n_neighbors=3, weights='distance', metric='nan_euclidean', add_indicator=True)),
    ('scaler', StandardScaler()),
    ('svm', SVC(C=0.5666566321361542, gamma=0.9837555188414592, kernel='poly'))
])
model_svm_all_distilbert.fit(X_train_all_distilbert.astype(np.float64), y_train_all_distilbert)

print(f1_score(y_test_all_distilbert,
               model_svm_all_distilbert.predict(X_test_all_distilbert.astype(np.float64))))
print(accuracy_score(y_test_all_distilbert,
               model_svm_all_distilbert.predict(X_test_all_distilbert.astype(np.float64))))
print(precision_score(y_test_all_distilbert,
               model_svm_all_distilbert.predict(X_test_all_distilbert.astype(np.float64))))
print(recall_score(y_test_all_distilbert,
               model_svm_all_distilbert.predict(X_test_all_distilbert.astype(np.float64))))
print(confusion_matrix(y_test_all_distilbert,
               model_svm_all_distilbert.predict(X_test_all_distilbert.astype(np.float64))))

0.959349593495935
0.9683179723502304
0.9701046337817638
0.9488304093567251
[[1032   20]
 [  35  649]]


# XGB_text_distilbert

In [79]:
X_train_text_distilbert, X_test_text_distilbert, y_train_text_distilbert, y_test_text_distilbert = train_test_split(
    cls_X_matrix_distilbert,
    classified_df_vect_distilbert[
        'class'],
    test_size=0.2,
    random_state=42,
    stratify=
    classified_df_vect_distilbert[
        'class'])

np.random.seed(42)

xgb_text_distilbert = xgb.XGBClassifier()

param_grid_xgb_text_distilbert = {
    'max_depth': stats.randint(3, 10),
    'learning_rate': stats.uniform(0.001, 0.1),
    'subsample': stats.uniform(0.3, 0.7),
    'n_estimators': stats.randint(50, 200)
}

random_search_xgb_text_distilbert = RandomizedSearchCV(xgb_text_distilbert, param_grid_xgb_text_distilbert,
                                                       cv=5,
                                                       random_state=42, n_iter=10, scoring='f1', verbose=10)
random_search_xgb_text_distilbert.fit(X_train_text_distilbert.astype(np.float64), y_train_text_distilbert)

print("Best set of hyperparameters: ", random_search_xgb_text_distilbert.best_params_)
print("Best score: ", random_search_xgb_text_distilbert.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836
[CV 1/5; 1/10] END learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836;, score=0.916 total time=   6.4s
[CV 2/5; 1/10] START learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836
[CV 2/5; 1/10] END learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836;, score=0.914 total time=   7.1s
[CV 3/5; 1/10] START learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836
[CV 3/5; 1/10] END learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836;, score=0.911 total time=   6.8s
[CV 4/5; 1/10] START learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836
[CV 4/5; 1/10] END learning_rate=0.038454

[CV 2/5; 7/10] END learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925;, score=0.910 total time=   2.5s
[CV 3/5; 7/10] START learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925
[CV 3/5; 7/10] END learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925;, score=0.904 total time=   2.5s
[CV 4/5; 7/10] START learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925
[CV 4/5; 7/10] END learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925;, score=0.901 total time=   2.5s
[CV 5/5; 7/10] START learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925
[CV 5/5; 7/10] END learning_rate=0.06218528947223795, max_depth=4, n_estimators=64, subsample=0.619248988951925;, score=0.884 total time=   2.4s
[CV 1/5; 8/10] START learning_rate=0.07951759613930137, max_depth=5, n_estimat

In [80]:
model_xgb_text_distilbert = xgb.XGBClassifier(learning_rate=0.07951759613930137, max_depth=5, n_estimators=157,
                                              subsample=0.6599641068895281)
model_xgb_text_distilbert.fit(X_train_text_distilbert.astype(np.float64), y_train_text_distilbert)

print(f1_score(y_test_text_distilbert,
               model_xgb_text_distilbert.predict(X_test_text_distilbert.astype(np.float64))))
print(accuracy_score(y_test_text_distilbert,
                     model_xgb_text_distilbert.predict(X_test_text_distilbert.astype(np.float64))))
print(precision_score(y_test_text_distilbert,
                      model_xgb_text_distilbert.predict(X_test_text_distilbert.astype(np.float64))))
print(recall_score(y_test_text_distilbert,
                   model_xgb_text_distilbert.predict(X_test_text_distilbert.astype(np.float64))))
print(confusion_matrix(y_test_text_distilbert,
                       model_xgb_text_distilbert.predict(X_test_text_distilbert.astype(np.float64))))

0.9202087994034303
0.9383640552995391
0.939117199391172
0.902046783625731
[[1012   40]
 [  67  617]]


# RF_text_distilbert

In [81]:
np.random.seed(42)

rf_text_distilbert = RandomForestClassifier()

param_grid_rf_text_distilbert = {
    'n_estimators': stats.randint(50, 250),
    'max_depth': stats.randint(5, 100),
    'criterion': ['gini', 'entropy'],
    'max_samples': stats.uniform(0.3, 0.7)
}

random_search_rf_text_distilbert = RandomizedSearchCV(rf_text_distilbert, param_grid_rf_text_distilbert,
                                                          cv=5,
                                                          random_state=42, n_iter=10, scoring='f1', verbose=10)
random_search_rf_text_distilbert.fit(X_train_text_distilbert.astype(np.float64), y_train_text_distilbert)

print("Best set of hyperparameters: ", random_search_rf_text_distilbert.best_params_)
print("Best score: ", random_search_rf_text_distilbert.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156
[CV 1/5; 1/10] END criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156;, score=0.909 total time=  24.7s
[CV 2/5; 1/10] START criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156
[CV 2/5; 1/10] END criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156;, score=0.903 total time=  24.4s
[CV 3/5; 1/10] START criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156
[CV 3/5; 1/10] END criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156;, score=0.906 total time=  24.8s
[CV 4/5; 1/10] START criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156
[CV 4/5; 1/10] END criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156;, score=0.901 total time=  24.5s
[CV 5/5; 1/10] 

[CV 1/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219;, score=0.912 total time=  26.5s
[CV 2/5; 8/10] START criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219
[CV 2/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219;, score=0.902 total time=  26.4s
[CV 3/5; 8/10] START criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219
[CV 3/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219;, score=0.906 total time=  26.8s
[CV 4/5; 8/10] START criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219
[CV 4/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219;, score=0.899 total time=  26.9s
[CV 5/5; 8/10] START criterion=gini, max_depth=95, max_samples=0.7282970263056656, n_estimators=219
[CV 5/5; 8/10] END criterion=gini, max_depth=95, max_samples=0.7282970263056

In [82]:
best_model_rf_text_distilbert = random_search_rf_text_distilbert.best_estimator_

print(f1_score(y_test_text_distilbert,
               best_model_rf_text_distilbert.predict(X_test_text_distilbert.astype(np.float64))))
print(accuracy_score(y_test_text_distilbert,
               best_model_rf_text_distilbert.predict(X_test_text_distilbert.astype(np.float64))))
print(precision_score(y_test_text_distilbert,
               best_model_rf_text_distilbert.predict(X_test_text_distilbert.astype(np.float64))))
print(recall_score(y_test_text_distilbert,
               best_model_rf_text_distilbert.predict(X_test_text_distilbert.astype(np.float64))))
print(confusion_matrix(y_test_text_distilbert,
               best_model_rf_text_distilbert.predict(X_test_text_distilbert.astype(np.float64))))

0.9064638783269962
0.929147465437788
0.9445324881141046
0.8713450292397661
[[1017   35]
 [  88  596]]


# SVM_text_distilbert

In [83]:
np.random.seed(42)

pipeline_svm_text_distilbert = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC())
])

param_grid_svm_text_distilbert = {
    'svm__C': stats.uniform(0.1, 10),
    'svm__gamma': stats.uniform(0.01, 1),
    'svm__kernel': ['linear', 'rbf', 'poly', 'sigmoid']
}

random_search_svm_text_distilbert = RandomizedSearchCV(pipeline_svm_text_distilbert,
                                                           param_grid_svm_text_distilbert,
                                                           cv=5, random_state=42, n_iter=10, scoring='f1', verbose=10)
random_search_svm_text_distilbert.fit(X_train_text_distilbert.astype(np.float64), y_train_text_distilbert)

print("Best set of hyperparameters: ", random_search_svm_text_distilbert.best_params_)
print("Best score: ", random_search_svm_text_distilbert.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START svm__C=3.845401188473625, svm__gamma=0.9607143064099162, svm__kernel=poly
[CV 1/5; 1/10] END svm__C=3.845401188473625, svm__gamma=0.9607143064099162, svm__kernel=poly;, score=0.923 total time=   3.8s
[CV 2/5; 1/10] START svm__C=3.845401188473625, svm__gamma=0.9607143064099162, svm__kernel=poly
[CV 2/5; 1/10] END svm__C=3.845401188473625, svm__gamma=0.9607143064099162, svm__kernel=poly;, score=0.920 total time=   3.8s
[CV 3/5; 1/10] START svm__C=3.845401188473625, svm__gamma=0.9607143064099162, svm__kernel=poly
[CV 3/5; 1/10] END svm__C=3.845401188473625, svm__gamma=0.9607143064099162, svm__kernel=poly;, score=0.915 total time=   3.7s
[CV 4/5; 1/10] START svm__C=3.845401188473625, svm__gamma=0.9607143064099162, svm__kernel=poly
[CV 4/5; 1/10] END svm__C=3.845401188473625, svm__gamma=0.9607143064099162, svm__kernel=poly;, score=0.914 total time=   3.8s
[CV 5/5; 1/10] START svm__C=3.845401188473625, svm__gam

[CV 2/5; 8/10] END svm__C=0.17066305219717406, svm__gamma=0.03306242504141576, svm__kernel=poly;, score=0.920 total time=   3.8s
[CV 3/5; 8/10] START svm__C=0.17066305219717406, svm__gamma=0.03306242504141576, svm__kernel=poly
[CV 3/5; 8/10] END svm__C=0.17066305219717406, svm__gamma=0.03306242504141576, svm__kernel=poly;, score=0.915 total time=   3.7s
[CV 4/5; 8/10] START svm__C=0.17066305219717406, svm__gamma=0.03306242504141576, svm__kernel=poly
[CV 4/5; 8/10] END svm__C=0.17066305219717406, svm__gamma=0.03306242504141576, svm__kernel=poly;, score=0.914 total time=   3.8s
[CV 5/5; 8/10] START svm__C=0.17066305219717406, svm__gamma=0.03306242504141576, svm__kernel=poly
[CV 5/5; 8/10] END svm__C=0.17066305219717406, svm__gamma=0.03306242504141576, svm__kernel=poly;, score=0.915 total time=   3.9s
[CV 1/5; 9/10] START svm__C=6.218528947223795, svm__gamma=0.14949386065204184, svm__kernel=sigmoid
[CV 1/5; 9/10] END svm__C=6.218528947223795, svm__gamma=0.14949386065204184, svm__kernel=si

In [84]:
model_svm_text_distilbert = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(C=3.845401188473625, gamma=0.9607143064099162, kernel='poly'))
])
model_svm_text_distilbert.fit(X_train_text_distilbert.astype(np.float64), y_train_text_distilbert)

print(f1_score(y_test_text_distilbert,
               model_svm_text_distilbert.predict(X_test_text_distilbert.astype(np.float64))))
print(accuracy_score(y_test_text_distilbert,
               model_svm_text_distilbert.predict(X_test_text_distilbert.astype(np.float64))))
print(precision_score(y_test_text_distilbert,
               model_svm_text_distilbert.predict(X_test_text_distilbert.astype(np.float64))))
print(recall_score(y_test_text_distilbert,
               model_svm_text_distilbert.predict(X_test_text_distilbert.astype(np.float64))))
print(confusion_matrix(y_test_text_distilbert,
               model_svm_text_distilbert.predict(X_test_text_distilbert.astype(np.float64))))

0.9195230998509687
0.9377880184331797
0.9376899696048632
0.902046783625731
[[1011   41]
 [  67  617]]


# XGB_alldata_tfidf

In [85]:
!pip install unidecode
!pip install nltk
!pip install stopwords




[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [86]:
import re, unidecode
from bs4 import BeautifulSoup
import string, textblob
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nkyar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nkyar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nkyar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [87]:
def remove_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text(separator=" ")
    return stripped_text


def remove_accented_chars(text):
    text = unidecode.unidecode(text)
    return text


def remove_numbers(text):
    result = re.sub(r'\d+', '', text)
    return result


def remove_slash_with_space(text):
    return text.replace('\\', " ")


def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)


def text_lowercase(text):
    return text.lower()


def remove_whitespace(text):
    return " ".join(text.split())


def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return ' '.join(filtered_text)


def stem_words(text):
    stemmer = PorterStemmer()
    word_tokens = word_tokenize(text)
    stems = [stemmer.stem(word) for word in word_tokens]
    return ' '.join(stems)


def lemmatize_words(text):
    lemmatizer = WordNetLemmatizer()
    word_tokens = word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(word, pos='v') for word in word_tokens]
    return ' '.join(lemmas)


def perform_preprocessing(text):
    text = remove_html_tags(text)
    text = remove_accented_chars(text)
    text = remove_numbers(text)
    text = remove_stopwords(text)
    text = text_lowercase(text)
    text = remove_slash_with_space(text)
    text = remove_punctuation(text)
    text = stem_words(text)
    text = lemmatize_words(text)
    text = remove_whitespace(text)
    return text

In [88]:
def text_tokenizer(text_list):
    processed_abs = []
    for i in tqdm(range(len(text_list))):
        abst = perform_preprocessing(text_list[i])
        processed_abs.append(abst)
    return processed_abs

In [89]:
processed_abs_train = text_tokenizer(texts_X)
processed_abs_all = text_tokenizer(all_texts)

100%|█████████████████████████████████████████████████████████████████████████████| 8676/8676 [00:28<00:00, 302.60it/s]
100%|███████████████████████████████████████████████████████████████████████████| 73154/73154 [04:29<00:00, 271.88it/s]


In [90]:
pd.DataFrame(processed_abs_train).to_csv('processed_abs_train.csv', index=False)
pd.DataFrame(processed_abs_all).to_csv('processed_abs_all.csv', index=False)

In [91]:
classified_df_vect_tfidf = classified_df.copy()
classified_df_vect_tfidf.pages.replace(0, None, inplace = True)
classified_df_vect_tfidf.reference.replace(0, None, inplace = True)
classified_df_vect_tfidf['processed_abs'] = np.array(processed_abs_train)

In [92]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [93]:
X_train_all_tfidf, X_test_all_tfidf, y_train_all_tfidf, y_test_all_tfidf = train_test_split(
    classified_df_vect_tfidf.drop(columns=['title', 'abstract', 'class', 'pubmed_id']),
    classified_df_vect_tfidf['class'], test_size=0.2, random_state=42, stratify=classified_df_vect_tfidf['class'])

np.random.seed(42)

num_features = ['pages', 'year', 'auth_num', 'reference', 'journal_Medchemcomm', 'journal_J Nat Prod',
                'journal_J Med Chem', 'journal_Eur J Med Chem', 'journal_Bioorg Med Chem Lett',
                'journal_Bioorg Med Chem', 'journal_ACS Med Chem Lett']
text_features = 'processed_abs'

xgb_all_tfidf = xgb.XGBClassifier()

text_transformer = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(
        analyzer='word',
        ngram_range=(1, 2),
    ))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_transformer, text_features)
    ]
)

pipeline_xgb_all_tfidf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb_all_tfidf)
])

param_distributions_xgb_all_tfidf = {
    'preprocessor__text__tfidf__max_features': [200, 500, 700, 1000],
    'classifier__n_estimators': stats.randint(50, 200),
    'classifier__max_depth': stats.randint(3, 10),
    'classifier__learning_rate': stats.uniform(0.001, 0.1),
    'classifier__subsample': stats.uniform(0.3, 0.7)
}

random_search_xgb_all_tfidf = RandomizedSearchCV(
    estimator=pipeline_xgb_all_tfidf,
    param_distributions=param_distributions_xgb_all_tfidf,
    n_iter=10,
    random_state=42,
    scoring='f1',
    cv=5,
    verbose=10
)
random_search_xgb_all_tfidf.fit(X_train_all_tfidf, y_train_all_tfidf)

print("Best set of hyperparameters: ", random_search_xgb_all_tfidf.best_params_)
print("Best score: ", random_search_xgb_all_tfidf.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START classifier__learning_rate=0.03845401188473625, classifier__max_depth=7, classifier__n_estimators=64, classifier__subsample=0.8123957592679836, preprocessor__text__tfidf__max_features=200
[CV 1/5; 1/10] END classifier__learning_rate=0.03845401188473625, classifier__max_depth=7, classifier__n_estimators=64, classifier__subsample=0.8123957592679836, preprocessor__text__tfidf__max_features=200;, score=0.884 total time=   2.4s
[CV 2/5; 1/10] START classifier__learning_rate=0.03845401188473625, classifier__max_depth=7, classifier__n_estimators=64, classifier__subsample=0.8123957592679836, preprocessor__text__tfidf__max_features=200
[CV 2/5; 1/10] END classifier__learning_rate=0.03845401188473625, classifier__max_depth=7, classifier__n_estimators=64, classifier__subsample=0.8123957592679836, preprocessor__text__tfidf__max_features=200;, score=0.897 total time=   2.4s
[CV 3/5; 1/10] START classifier__learning_rat

[CV 4/5; 4/10] END classifier__learning_rate=0.08424426408004218, classifier__max_depth=8, classifier__n_estimators=179, classifier__subsample=0.42727747704497043, preprocessor__text__tfidf__max_features=200;, score=0.901 total time=   3.6s
[CV 5/5; 4/10] START classifier__learning_rate=0.08424426408004218, classifier__max_depth=8, classifier__n_estimators=179, classifier__subsample=0.42727747704497043, preprocessor__text__tfidf__max_features=200
[CV 5/5; 4/10] END classifier__learning_rate=0.08424426408004218, classifier__max_depth=8, classifier__n_estimators=179, classifier__subsample=0.42727747704497043, preprocessor__text__tfidf__max_features=200;, score=0.889 total time=   3.6s
[CV 1/5; 5/10] START classifier__learning_rate=0.06274815096277166, classifier__max_depth=4, classifier__n_estimators=71, classifier__subsample=0.30494641365380215, preprocessor__text__tfidf__max_features=200
[CV 1/5; 5/10] END classifier__learning_rate=0.06274815096277166, classifier__max_depth=4, classifi

[CV 3/5; 8/10] END classifier__learning_rate=0.0056450412719997725, classifier__max_depth=5, classifier__n_estimators=184, classifier__subsample=0.41936688658110405, preprocessor__text__tfidf__max_features=700;, score=0.841 total time=   5.0s
[CV 4/5; 8/10] START classifier__learning_rate=0.0056450412719997725, classifier__max_depth=5, classifier__n_estimators=184, classifier__subsample=0.41936688658110405, preprocessor__text__tfidf__max_features=700
[CV 4/5; 8/10] END classifier__learning_rate=0.0056450412719997725, classifier__max_depth=5, classifier__n_estimators=184, classifier__subsample=0.41936688658110405, preprocessor__text__tfidf__max_features=700;, score=0.847 total time=   5.0s
[CV 5/5; 8/10] START classifier__learning_rate=0.0056450412719997725, classifier__max_depth=5, classifier__n_estimators=184, classifier__subsample=0.41936688658110405, preprocessor__text__tfidf__max_features=700
[CV 5/5; 8/10] END classifier__learning_rate=0.0056450412719997725, classifier__max_depth=

In [94]:
best_model_xgb_all_tfidf = random_search_xgb_all_tfidf.best_estimator_

print(f1_score(y_test_all_tfidf, best_model_xgb_all_tfidf.predict(X_test_all_tfidf)))
print(accuracy_score(y_test_all_tfidf, best_model_xgb_all_tfidf.predict(X_test_all_tfidf)))
print(precision_score(y_test_all_tfidf, best_model_xgb_all_tfidf.predict(X_test_all_tfidf)))
print(recall_score(y_test_all_tfidf, best_model_xgb_all_tfidf.predict(X_test_all_tfidf)))
print(confusion_matrix(y_test_all_tfidf, best_model_xgb_all_tfidf.predict(X_test_all_tfidf)))

0.9226053639846743
0.9418202764976958
0.9694041867954911
0.8801169590643275
[[1033   19]
 [  82  602]]


# RF_alldata_tfidf

In [95]:
np.random.seed(42)

rf_all_tfidf = RandomForestClassifier()

text_transformer = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(
        analyzer='word',
        ngram_range=(1, 2),
    ))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_transformer, text_features)
    ]
)

pipeline_rf_all_tfidf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', rf_all_tfidf)
])

param_grid_rf_all_tfidf = {
    'preprocessor__text__tfidf__max_features': [200, 500, 700, 1000],
    'classifier__n_estimators': stats.randint(50, 250),
    'classifier__max_depth': stats.randint(5, 100),
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__max_samples': stats.uniform(0.3, 0.7)
}

random_search_rf_all_tfidf = RandomizedSearchCV(
    estimator=pipeline_rf_all_tfidf,
    param_distributions=param_grid_rf_all_tfidf,
    n_iter=10,
    random_state=42,
    scoring='f1',
    cv=5,
    verbose=10
)

random_search_rf_all_tfidf.fit(X_train_all_tfidf, y_train_all_tfidf)

print("Best set of hyperparameters: ", random_search_rf_all_tfidf.best_params_)
print("Best score: ", random_search_rf_all_tfidf.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START classifier__criterion=gini, classifier__max_depth=56, classifier__max_samples=0.9655000144869412, classifier__n_estimators=156, preprocessor__text__tfidf__max_features=1000
[CV 1/5; 1/10] END classifier__criterion=gini, classifier__max_depth=56, classifier__max_samples=0.9655000144869412, classifier__n_estimators=156, preprocessor__text__tfidf__max_features=1000;, score=0.912 total time=   5.7s
[CV 2/5; 1/10] START classifier__criterion=gini, classifier__max_depth=56, classifier__max_samples=0.9655000144869412, classifier__n_estimators=156, preprocessor__text__tfidf__max_features=1000
[CV 2/5; 1/10] END classifier__criterion=gini, classifier__max_depth=56, classifier__max_samples=0.9655000144869412, classifier__n_estimators=156, preprocessor__text__tfidf__max_features=1000;, score=0.919 total time=   5.8s
[CV 3/5; 1/10] START classifier__criterion=gini, classifier__max_depth=56, classifier__max_samples=0.

[CV 5/5; 4/10] END classifier__criterion=entropy, classifier__max_depth=34, classifier__max_samples=0.44863737747479326, classifier__n_estimators=241, preprocessor__text__tfidf__max_features=1000;, score=0.889 total time=   5.9s
[CV 1/5; 5/10] START classifier__criterion=gini, classifier__max_depth=37, classifier__max_samples=0.5129695700716763, classifier__n_estimators=71, preprocessor__text__tfidf__max_features=200
[CV 1/5; 5/10] END classifier__criterion=gini, classifier__max_depth=37, classifier__max_samples=0.5129695700716763, classifier__n_estimators=71, preprocessor__text__tfidf__max_features=200;, score=0.894 total time=   2.4s
[CV 2/5; 5/10] START classifier__criterion=gini, classifier__max_depth=37, classifier__max_samples=0.5129695700716763, classifier__n_estimators=71, preprocessor__text__tfidf__max_features=200
[CV 2/5; 5/10] END classifier__criterion=gini, classifier__max_depth=37, classifier__max_samples=0.5129695700716763, classifier__n_estimators=71, preprocessor__text

[CV 5/5; 8/10] END classifier__criterion=entropy, classifier__max_depth=51, classifier__max_samples=0.7328702065331612, classifier__n_estimators=157, preprocessor__text__tfidf__max_features=700;, score=0.889 total time=   5.1s
[CV 1/5; 9/10] START classifier__criterion=entropy, classifier__max_depth=68, classifier__max_samples=0.6267340252735859, classifier__n_estimators=100, preprocessor__text__tfidf__max_features=700
[CV 1/5; 9/10] END classifier__criterion=entropy, classifier__max_depth=68, classifier__max_samples=0.6267340252735859, classifier__n_estimators=100, preprocessor__text__tfidf__max_features=700;, score=0.900 total time=   3.5s
[CV 2/5; 9/10] START classifier__criterion=entropy, classifier__max_depth=68, classifier__max_samples=0.6267340252735859, classifier__n_estimators=100, preprocessor__text__tfidf__max_features=700
[CV 2/5; 9/10] END classifier__criterion=entropy, classifier__max_depth=68, classifier__max_samples=0.6267340252735859, classifier__n_estimators=100, prep

In [96]:
best_model_rf_all_tfidf = random_search_rf_all_tfidf.best_estimator_

print(f1_score(y_test_all_tfidf, best_model_rf_all_tfidf.predict(X_test_all_tfidf)))
print(accuracy_score(y_test_all_tfidf, best_model_rf_all_tfidf.predict(X_test_all_tfidf)))
print(precision_score(y_test_all_tfidf, best_model_rf_all_tfidf.predict(X_test_all_tfidf)))
print(recall_score(y_test_all_tfidf, best_model_rf_all_tfidf.predict(X_test_all_tfidf)))
print(confusion_matrix(y_test_all_tfidf, best_model_rf_all_tfidf.predict(X_test_all_tfidf)))

0.923896499238965
0.9423963133640553
0.9634920634920635
0.8874269005847953
[[1029   23]
 [  77  607]]


# SVM_alldata_tfidf

In [97]:
np.random.seed(42)

svm_all_tfidf = SVC()

numeric_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(weights='distance', metric='nan_euclidean', add_indicator=True)),
    ('scaler', StandardScaler()),
])

text_transformer = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(
        analyzer='word',
        ngram_range=(1, 2),
    ))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('text', text_transformer, text_features)
    ]
)

pipeline_svm_all_tfidf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', svm_all_tfidf)
])

param_grid_svm_all_tfidf = {
    'preprocessor__num__imputer__n_neighbors': [2, 3, 5, 7],
    'preprocessor__text__tfidf__max_features': [200, 500, 700, 1000],
    'classifier__C': stats.uniform(0.1, 10),
    'classifier__gamma': stats.uniform(0.01, 1),
    'classifier__kernel': ['linear', 'rbf', 'poly', 'sigmoid']
}

random_search_svm_all_tfidf = RandomizedSearchCV(
    estimator=pipeline_svm_all_tfidf,
    param_distributions=param_grid_svm_all_tfidf,
    n_iter=10,
    scoring='f1',
    random_state=42,
    cv=5,
    verbose=10
)

random_search_svm_all_tfidf.fit(X_train_all_tfidf, y_train_all_tfidf)

print("Best set of hyperparameters: ", random_search_svm_all_tfidf.best_params_)
print("Best score: ", random_search_svm_all_tfidf.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START classifier__C=3.845401188473625, classifier__gamma=0.9607143064099162, classifier__kernel=poly, preprocessor__num__imputer__n_neighbors=7, preprocessor__text__tfidf__max_features=200
[CV 1/5; 1/10] END classifier__C=3.845401188473625, classifier__gamma=0.9607143064099162, classifier__kernel=poly, preprocessor__num__imputer__n_neighbors=7, preprocessor__text__tfidf__max_features=200;, score=0.971 total time=   3.2s
[CV 2/5; 1/10] START classifier__C=3.845401188473625, classifier__gamma=0.9607143064099162, classifier__kernel=poly, preprocessor__num__imputer__n_neighbors=7, preprocessor__text__tfidf__max_features=200
[CV 2/5; 1/10] END classifier__C=3.845401188473625, classifier__gamma=0.9607143064099162, classifier__kernel=poly, preprocessor__num__imputer__n_neighbors=7, preprocessor__text__tfidf__max_features=200;, score=0.978 total time=   3.1s
[CV 3/5; 1/10] START classifier__C=3.845401188473625, classif

[CV 4/5; 4/10] END classifier__C=0.6641157902710025, classifier__gamma=0.7319987722668247, classifier__kernel=rbf, preprocessor__num__imputer__n_neighbors=3, preprocessor__text__tfidf__max_features=500;, score=0.962 total time=   7.5s
[CV 5/5; 4/10] START classifier__C=0.6641157902710025, classifier__gamma=0.7319987722668247, classifier__kernel=rbf, preprocessor__num__imputer__n_neighbors=3, preprocessor__text__tfidf__max_features=500
[CV 5/5; 4/10] END classifier__C=0.6641157902710025, classifier__gamma=0.7319987722668247, classifier__kernel=rbf, preprocessor__num__imputer__n_neighbors=3, preprocessor__text__tfidf__max_features=500;, score=0.961 total time=   6.5s
[CV 1/5; 5/10] START classifier__C=1.9182496720710063, classifier__gamma=0.19340450985343383, classifier__kernel=sigmoid, preprocessor__num__imputer__n_neighbors=3, preprocessor__text__tfidf__max_features=500
[CV 1/5; 5/10] END classifier__C=1.9182496720710063, classifier__gamma=0.19340450985343383, classifier__kernel=sigmoi

[CV 3/5; 8/10] END classifier__C=1.006064345328208, classifier__gamma=0.6283860093330873, classifier__kernel=sigmoid, preprocessor__num__imputer__n_neighbors=5, preprocessor__text__tfidf__max_features=1000;, score=0.760 total time=   5.0s
[CV 4/5; 8/10] START classifier__C=1.006064345328208, classifier__gamma=0.6283860093330873, classifier__kernel=sigmoid, preprocessor__num__imputer__n_neighbors=5, preprocessor__text__tfidf__max_features=1000
[CV 4/5; 8/10] END classifier__C=1.006064345328208, classifier__gamma=0.6283860093330873, classifier__kernel=sigmoid, preprocessor__num__imputer__n_neighbors=5, preprocessor__text__tfidf__max_features=1000;, score=0.754 total time=   5.1s
[CV 5/5; 8/10] START classifier__C=1.006064345328208, classifier__gamma=0.6283860093330873, classifier__kernel=sigmoid, preprocessor__num__imputer__n_neighbors=5, preprocessor__text__tfidf__max_features=1000
[CV 5/5; 8/10] END classifier__C=1.006064345328208, classifier__gamma=0.6283860093330873, classifier__kern

In [98]:
best_model_svm_all_tfidf = random_search_svm_all_tfidf.best_estimator_

print(f1_score(y_test_all_tfidf, best_model_svm_all_tfidf.predict(X_test_all_tfidf)))
print(accuracy_score(y_test_all_tfidf, best_model_svm_all_tfidf.predict(X_test_all_tfidf)))
print(precision_score(y_test_all_tfidf, best_model_svm_all_tfidf.predict(X_test_all_tfidf)))
print(recall_score(y_test_all_tfidf, best_model_svm_all_tfidf.predict(X_test_all_tfidf)))
print(confusion_matrix(y_test_all_tfidf, best_model_svm_all_tfidf.predict(X_test_all_tfidf)))

0.9786293294030951
0.9832949308755761
0.986627043090639
0.9707602339181286
[[1043    9]
 [  20  664]]


# XGB_text_tfidf

In [99]:
X_train_text_tfidf, X_test_text_tfidf, y_train_text_tfidf, y_test_text_tfidf = train_test_split(
    classified_df_vect_tfidf['processed_abs'], classified_df_vect_tfidf['class'], test_size=0.2, random_state=42,
    stratify=classified_df_vect_tfidf['class'])

np.random.seed(42)

xgb_text_tfidf = xgb.XGBClassifier()

pipeline_xgb_text_tfidf = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(analyzer='word', ngram_range=(1, 2))),
    ('classifier', xgb_text_tfidf)
])

param_grid_xgb_text_tfidf = {
    'vectorizer__max_features': [200, 500, 700, 1000],
    'classifier__n_estimators': stats.randint(50, 200),
    'classifier__max_depth': stats.randint(3, 10),
    'classifier__learning_rate': stats.uniform(0.001, 0.1),
    'classifier__subsample': stats.uniform(0.3, 0.7)
}

random_search_xgb_text_tfidf = RandomizedSearchCV(
    estimator=pipeline_xgb_text_tfidf,
    param_distributions=param_grid_xgb_text_tfidf,
    n_iter=10,
    random_state=42,
    scoring='f1',
    cv=5,
    verbose=10
)

random_search_xgb_text_tfidf.fit(X_train_text_tfidf, y_train_text_tfidf)

print("Best set of hyperparameters: ", random_search_xgb_text_tfidf.best_params_)
print("Best score: ", random_search_xgb_text_tfidf.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START classifier__learning_rate=0.03845401188473625, classifier__max_depth=7, classifier__n_estimators=64, classifier__subsample=0.8123957592679836, vectorizer__max_features=200
[CV 1/5; 1/10] END classifier__learning_rate=0.03845401188473625, classifier__max_depth=7, classifier__n_estimators=64, classifier__subsample=0.8123957592679836, vectorizer__max_features=200;, score=0.884 total time=   2.4s
[CV 2/5; 1/10] START classifier__learning_rate=0.03845401188473625, classifier__max_depth=7, classifier__n_estimators=64, classifier__subsample=0.8123957592679836, vectorizer__max_features=200
[CV 2/5; 1/10] END classifier__learning_rate=0.03845401188473625, classifier__max_depth=7, classifier__n_estimators=64, classifier__subsample=0.8123957592679836, vectorizer__max_features=200;, score=0.897 total time=   2.3s
[CV 3/5; 1/10] START classifier__learning_rate=0.03845401188473625, classifier__max_depth=7, classifier__

[CV 5/5; 4/10] END classifier__learning_rate=0.08424426408004218, classifier__max_depth=8, classifier__n_estimators=179, classifier__subsample=0.42727747704497043, vectorizer__max_features=200;, score=0.889 total time=   3.5s
[CV 1/5; 5/10] START classifier__learning_rate=0.06274815096277166, classifier__max_depth=4, classifier__n_estimators=71, classifier__subsample=0.30494641365380215, vectorizer__max_features=200
[CV 1/5; 5/10] END classifier__learning_rate=0.06274815096277166, classifier__max_depth=4, classifier__n_estimators=71, classifier__subsample=0.30494641365380215, vectorizer__max_features=200;, score=0.885 total time=   1.7s
[CV 2/5; 5/10] START classifier__learning_rate=0.06274815096277166, classifier__max_depth=4, classifier__n_estimators=71, classifier__subsample=0.30494641365380215, vectorizer__max_features=200
[CV 2/5; 5/10] END classifier__learning_rate=0.06274815096277166, classifier__max_depth=4, classifier__n_estimators=71, classifier__subsample=0.30494641365380215

[CV 5/5; 8/10] END classifier__learning_rate=0.0056450412719997725, classifier__max_depth=5, classifier__n_estimators=184, classifier__subsample=0.41936688658110405, vectorizer__max_features=700;, score=0.845 total time=   5.1s
[CV 1/5; 9/10] START classifier__learning_rate=0.002326496115986653, classifier__max_depth=3, classifier__n_estimators=109, classifier__subsample=0.6943017524918775, vectorizer__max_features=500
[CV 1/5; 9/10] END classifier__learning_rate=0.002326496115986653, classifier__max_depth=3, classifier__n_estimators=109, classifier__subsample=0.6943017524918775, vectorizer__max_features=500;, score=0.745 total time=   2.3s
[CV 2/5; 9/10] START classifier__learning_rate=0.002326496115986653, classifier__max_depth=3, classifier__n_estimators=109, classifier__subsample=0.6943017524918775, vectorizer__max_features=500
[CV 2/5; 9/10] END classifier__learning_rate=0.002326496115986653, classifier__max_depth=3, classifier__n_estimators=109, classifier__subsample=0.6943017524

In [100]:
best_model_xgb_text_tfidf = random_search_xgb_text_tfidf.best_estimator_

print(f1_score(y_test_text_tfidf, best_model_xgb_text_tfidf.predict(X_test_text_tfidf)))
print(accuracy_score(y_test_text_tfidf, best_model_xgb_text_tfidf.predict(X_test_text_tfidf)))
print(precision_score(y_test_text_tfidf, best_model_xgb_text_tfidf.predict(X_test_text_tfidf)))
print(recall_score(y_test_text_tfidf, best_model_xgb_text_tfidf.predict(X_test_text_tfidf)))
print(confusion_matrix(y_test_text_tfidf, best_model_xgb_text_tfidf.predict(X_test_text_tfidf)))

0.9226053639846743
0.9418202764976958
0.9694041867954911
0.8801169590643275
[[1033   19]
 [  82  602]]


# RF_text_tfidf

In [101]:
np.random.seed(42)

rf_text_tfidf = RandomForestClassifier()

pipeline_rf_text_tfidf = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(analyzer='word', ngram_range=(1, 2))),
    ('classifier', rf_text_tfidf)
])

param_grid_rf_text_tfidf = {
    'vectorizer__max_features': [200, 500, 700, 1000],
    'classifier__n_estimators': stats.randint(50, 250),
    'classifier__max_depth': stats.randint(5, 100),
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__max_samples': stats.uniform(0.3, 0.7)
}

random_search_rf_text_tfidf = RandomizedSearchCV(
    estimator=pipeline_rf_text_tfidf,
    param_distributions=param_grid_rf_text_tfidf,
    n_iter=10,
    random_state=42,
    scoring='f1',
    cv=5,
    verbose=10
)

random_search_rf_text_tfidf.fit(X_train_text_tfidf, y_train_text_tfidf)

print("Best set of hyperparameters: ", random_search_rf_text_tfidf.best_params_)
print("Best score: ", random_search_rf_text_tfidf.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START classifier__criterion=gini, classifier__max_depth=56, classifier__max_samples=0.9655000144869412, classifier__n_estimators=156, vectorizer__max_features=1000
[CV 1/5; 1/10] END classifier__criterion=gini, classifier__max_depth=56, classifier__max_samples=0.9655000144869412, classifier__n_estimators=156, vectorizer__max_features=1000;, score=0.912 total time=   5.9s
[CV 2/5; 1/10] START classifier__criterion=gini, classifier__max_depth=56, classifier__max_samples=0.9655000144869412, classifier__n_estimators=156, vectorizer__max_features=1000
[CV 2/5; 1/10] END classifier__criterion=gini, classifier__max_depth=56, classifier__max_samples=0.9655000144869412, classifier__n_estimators=156, vectorizer__max_features=1000;, score=0.919 total time=   5.8s
[CV 3/5; 1/10] START classifier__criterion=gini, classifier__max_depth=56, classifier__max_samples=0.9655000144869412, classifier__n_estimators=156, vectorizer__

[CV 2/5; 5/10] END classifier__criterion=gini, classifier__max_depth=37, classifier__max_samples=0.5129695700716763, classifier__n_estimators=71, vectorizer__max_features=200;, score=0.912 total time=   2.4s
[CV 3/5; 5/10] START classifier__criterion=gini, classifier__max_depth=37, classifier__max_samples=0.5129695700716763, classifier__n_estimators=71, vectorizer__max_features=200
[CV 3/5; 5/10] END classifier__criterion=gini, classifier__max_depth=37, classifier__max_samples=0.5129695700716763, classifier__n_estimators=71, vectorizer__max_features=200;, score=0.891 total time=   2.4s
[CV 4/5; 5/10] START classifier__criterion=gini, classifier__max_depth=37, classifier__max_samples=0.5129695700716763, classifier__n_estimators=71, vectorizer__max_features=200
[CV 4/5; 5/10] END classifier__criterion=gini, classifier__max_depth=37, classifier__max_samples=0.5129695700716763, classifier__n_estimators=71, vectorizer__max_features=200;, score=0.893 total time=   2.4s
[CV 5/5; 5/10] START c

[CV 3/5; 9/10] END classifier__criterion=entropy, classifier__max_depth=68, classifier__max_samples=0.6267340252735859, classifier__n_estimators=100, vectorizer__max_features=700;, score=0.907 total time=   3.5s
[CV 4/5; 9/10] START classifier__criterion=entropy, classifier__max_depth=68, classifier__max_samples=0.6267340252735859, classifier__n_estimators=100, vectorizer__max_features=700
[CV 4/5; 9/10] END classifier__criterion=entropy, classifier__max_depth=68, classifier__max_samples=0.6267340252735859, classifier__n_estimators=100, vectorizer__max_features=700;, score=0.902 total time=   3.6s
[CV 5/5; 9/10] START classifier__criterion=entropy, classifier__max_depth=68, classifier__max_samples=0.6267340252735859, classifier__n_estimators=100, vectorizer__max_features=700
[CV 5/5; 9/10] END classifier__criterion=entropy, classifier__max_depth=68, classifier__max_samples=0.6267340252735859, classifier__n_estimators=100, vectorizer__max_features=700;, score=0.883 total time=   3.5s
[C

In [102]:
best_model_rf_text_tfidf = random_search_rf_text_tfidf.best_estimator_

print(f1_score(y_test_text_tfidf, best_model_rf_text_tfidf.predict(X_test_text_tfidf)))
print(accuracy_score(y_test_text_tfidf, best_model_rf_text_tfidf.predict(X_test_text_tfidf)))
print(precision_score(y_test_text_tfidf, best_model_rf_text_tfidf.predict(X_test_text_tfidf)))
print(recall_score(y_test_text_tfidf, best_model_rf_text_tfidf.predict(X_test_text_tfidf)))
print(confusion_matrix(y_test_text_tfidf, best_model_rf_text_tfidf.predict(X_test_text_tfidf)))

0.923896499238965
0.9423963133640553
0.9634920634920635
0.8874269005847953
[[1029   23]
 [  77  607]]


# SVM_text_tfidf

In [103]:
np.random.seed(42)

svm_text_tfidf = SVC()

pipeline_svm_text_tfidf = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(analyzer='word', ngram_range=(1,2))),
    ('classifier', svm_text_tfidf)
])

param_grid_svm_text_tfidf = {
    'vectorizer__max_features' : [200, 500, 700, 1000],
    'classifier__C': stats.uniform(0.1, 10),
    'classifier__gamma': stats.uniform(0.01, 1),
    'classifier__kernel': ['linear', 'rbf', 'poly', 'sigmoid']
}


random_search_svm_text_tfidf = RandomizedSearchCV(
    estimator=pipeline_svm_text_tfidf,
    param_distributions=param_grid_svm_text_tfidf,
    n_iter=10,
    random_state=42,
    scoring='f1',
    cv=5,
    verbose=10
)

random_search_svm_text_tfidf.fit(X_train_text_tfidf, y_train_text_tfidf)

print("Best set of hyperparameters: ", random_search_svm_text_tfidf.best_params_)
print("Best score: ", random_search_svm_text_tfidf.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START classifier__C=3.845401188473625, classifier__gamma=0.9607143064099162, classifier__kernel=poly, vectorizer__max_features=1000
[CV 1/5; 1/10] END classifier__C=3.845401188473625, classifier__gamma=0.9607143064099162, classifier__kernel=poly, vectorizer__max_features=1000;, score=0.905 total time=  12.2s
[CV 2/5; 1/10] START classifier__C=3.845401188473625, classifier__gamma=0.9607143064099162, classifier__kernel=poly, vectorizer__max_features=1000
[CV 2/5; 1/10] END classifier__C=3.845401188473625, classifier__gamma=0.9607143064099162, classifier__kernel=poly, vectorizer__max_features=1000;, score=0.900 total time=  12.2s
[CV 3/5; 1/10] START classifier__C=3.845401188473625, classifier__gamma=0.9607143064099162, classifier__kernel=poly, vectorizer__max_features=1000
[CV 3/5; 1/10] END classifier__C=3.845401188473625, classifier__gamma=0.9607143064099162, classifier__kernel=poly, vectorizer__max_features=10

[CV 1/5; 6/10] END classifier__C=1.9340450985343383, classifier__gamma=0.31424224295953773, classifier__kernel=rbf, vectorizer__max_features=200;, score=0.907 total time=   2.8s
[CV 2/5; 6/10] START classifier__C=1.9340450985343383, classifier__gamma=0.31424224295953773, classifier__kernel=rbf, vectorizer__max_features=200
[CV 2/5; 6/10] END classifier__C=1.9340450985343383, classifier__gamma=0.31424224295953773, classifier__kernel=rbf, vectorizer__max_features=200;, score=0.917 total time=   2.8s
[CV 3/5; 6/10] START classifier__C=1.9340450985343383, classifier__gamma=0.31424224295953773, classifier__kernel=rbf, vectorizer__max_features=200
[CV 3/5; 6/10] END classifier__C=1.9340450985343383, classifier__gamma=0.31424224295953773, classifier__kernel=rbf, vectorizer__max_features=200;, score=0.903 total time=   2.8s
[CV 4/5; 6/10] START classifier__C=1.9340450985343383, classifier__gamma=0.31424224295953773, classifier__kernel=rbf, vectorizer__max_features=200
[CV 4/5; 6/10] END classi

In [104]:
best_model_svm_text_tfidf = random_search_svm_text_tfidf.best_estimator_

print(f1_score(y_test_text_tfidf, best_model_svm_text_tfidf.predict(X_test_text_tfidf)))
print(accuracy_score(y_test_text_tfidf, best_model_svm_text_tfidf.predict(X_test_text_tfidf)))
print(precision_score(y_test_text_tfidf, best_model_svm_text_tfidf.predict(X_test_text_tfidf)))
print(recall_score(y_test_text_tfidf, best_model_svm_text_tfidf.predict(X_test_text_tfidf)))
print(confusion_matrix(y_test_text_tfidf, best_model_svm_text_tfidf.predict(X_test_text_tfidf)))

0.9223813112283346
0.940668202764977
0.9517884914463453
0.8947368421052632
[[1021   31]
 [  72  612]]


# XGB_alldata_fasttext

In [105]:
train_embeddings = pd.read_csv('train_embeddings_fasttext.csv', sep=',')
test_embeddings = pd.read_csv('test_embeddings_fasttext.csv', sep=',')
val_embeddings = pd.read_csv('val_embeddings_fasttext.csv', sep=',')

In [106]:
classified_df_vect_ft = classified_df.copy()
classified_df_vect_ft.pages.replace(0, None, inplace = True)
classified_df_vect_ft.reference.replace(0, None, inplace = True)

In [107]:
X_train_val_ft, X_test_ft, y_train_val_ft, y_test_ft = train_test_split(
    classified_df_vect_ft.drop(columns=['title', 'abstract', 'class', 'pubmed_id']), classified_df_vect_ft['class'],
    test_size=0.15, random_state=42, stratify=classified_df_vect_ft['class'])
X_train_ft, X_val_ft, y_train_ft, y_val_ft = train_test_split(X_train_val_ft, y_train_val_ft, test_size=0.17647,
                                                              random_state=42, stratify=y_train_val_ft)
X_train_all_ft = pd.concat([X_train_ft.reset_index(drop=True), train_embeddings], axis=1)
X_val_all_ft = pd.concat([X_val_ft.reset_index(drop=True), val_embeddings], axis=1)
X_test_all_ft = pd.concat([X_test_ft.reset_index(drop=True), test_embeddings], axis=1)

In [108]:
np.random.seed(42)

xgb_all_fasttext = xgb.XGBClassifier()

param_grid_xgb_all_fasttext = {
    'max_depth': stats.randint(3, 10),
    'learning_rate': stats.uniform(0.001, 0.1),
    'subsample': stats.uniform(0.3, 0.7),
    'n_estimators': stats.randint(50, 200)
}

train_indices = np.arange(len(X_train_all_ft))
val_indices = np.arange(len(X_train_all_ft), len(X_train_all_ft) + len(X_val_all_ft))
cv = [(train_indices, val_indices)]

random_search_xgb_all_fasttext = RandomizedSearchCV(xgb_all_fasttext, param_grid_xgb_all_fasttext, cv=cv, n_iter=50,
                                                    scoring='f1', verbose=10)

X_combined_all_ft = np.vstack((X_train_all_ft, X_val_all_ft))
y_combined_all_ft = np.hstack((y_train_ft, y_val_ft))

random_search_xgb_all_fasttext.fit(X_combined_all_ft, y_combined_all_ft)

print("Best set of hyperparameters: ", random_search_xgb_all_fasttext.best_params_)
print("Best score: ", random_search_xgb_all_fasttext.best_score_)

Fitting 1 folds for each of 50 candidates, totalling 50 fits
[CV 1/1; 1/50] START learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836
[CV 1/1; 1/50] END learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836;, score=0.965 total time=   6.4s
[CV 1/1; 2/50] START learning_rate=0.06086584841970366, max_depth=9, n_estimators=171, subsample=0.40919616423534183
[CV 1/1; 2/50] END learning_rate=0.06086584841970366, max_depth=9, n_estimators=171, subsample=0.40919616423534183;, score=0.971 total time=  14.6s
[CV 1/1; 3/50] START learning_rate=0.006808361216819946, max_depth=7, n_estimators=149, subsample=0.40000677254535855
[CV 1/1; 3/50] END learning_rate=0.006808361216819946, max_depth=7, n_estimators=149, subsample=0.40000677254535855;, score=0.959 total time=  12.5s
[CV 1/1; 4/50] START learning_rate=0.06608884729488529, max_depth=7, n_estimators=51, subsample=0.8053991405867773
[CV 1/1; 4/50] END learning_rat

[CV 1/1; 32/50] END learning_rate=0.06343540481337932, max_depth=8, n_estimators=183, subsample=0.6195741993380371;, score=0.971 total time=  15.0s
[CV 1/1; 33/50] START learning_rate=0.02284404372168336, max_depth=6, n_estimators=79, subsample=0.9182961812432078
[CV 1/1; 33/50] END learning_rate=0.02284404372168336, max_depth=6, n_estimators=79, subsample=0.9182961812432078;, score=0.965 total time=   6.5s
[CV 1/1; 34/50] START learning_rate=0.0334345021005274, max_depth=9, n_estimators=141, subsample=0.5494084866538824
[CV 1/1; 34/50] END learning_rate=0.0334345021005274, max_depth=9, n_estimators=141, subsample=0.5494084866538824;, score=0.968 total time=  13.4s
[CV 1/1; 35/50] START learning_rate=0.09168284415457541, max_depth=3, n_estimators=170, subsample=0.7533830843789535
[CV 1/1; 35/50] END learning_rate=0.09168284415457541, max_depth=3, n_estimators=170, subsample=0.7533830843789535;, score=0.971 total time=   7.5s
[CV 1/1; 36/50] START learning_rate=0.001052037699531582, max

In [109]:
best_model_xgb_all_fasttext = random_search_xgb_all_fasttext.best_estimator_

print(f1_score(y_test_ft, best_model_xgb_all_fasttext.predict(X_test_all_ft.values)))
print(accuracy_score(y_test_ft, best_model_xgb_all_fasttext.predict(X_test_all_ft.values)))
print(precision_score(y_test_ft, best_model_xgb_all_fasttext.predict(X_test_all_ft.values)))
print(recall_score(y_test_ft, best_model_xgb_all_fasttext.predict(X_test_all_ft.values)))
print(confusion_matrix(y_test_ft, best_model_xgb_all_fasttext.predict(X_test_all_ft.values)))

0.9742574257425742
0.9800307219662059
0.9899396378269618
0.9590643274853801
[[784   5]
 [ 21 492]]


# RF_alldata_fasttext

In [110]:
np.random.seed(42)

rf_all_fasttex = RandomForestClassifier()

param_grid_rf_all_fasttex = {
    'n_estimators': stats.randint(50, 250),
    'max_depth': stats.randint(5, 100),
    'criterion': ['gini', 'entropy'],
    'max_samples': stats.uniform(0.3, 0.7)
}

random_search_rf_all_fasttex = RandomizedSearchCV(rf_all_fasttex, param_grid_rf_all_fasttex, cv=cv, n_iter=50,
                                                  scoring='f1', verbose=10)

random_search_rf_all_fasttex.fit(X_combined_all_ft, y_combined_all_ft)

print("Best set of hyperparameters: ", random_search_rf_all_fasttex.best_params_)
print("Best score: ", random_search_rf_all_fasttex.best_score_)

Fitting 1 folds for each of 50 candidates, totalling 50 fits
[CV 1/1; 1/50] START criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156
[CV 1/1; 1/50] END criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156;, score=0.949 total time=  22.4s
[CV 1/1; 2/50] START criterion=entropy, max_depth=65, max_samples=0.7177951105625409, n_estimators=171
[CV 1/1; 2/50] END criterion=entropy, max_depth=65, max_samples=0.7177951105625409, n_estimators=171;, score=0.938 total time=  17.1s
[CV 1/1; 3/50] START criterion=gini, max_depth=91, max_samples=0.3406585285177396, n_estimators=137
[CV 1/1; 3/50] END criterion=gini, max_depth=91, max_samples=0.3406585285177396, n_estimators=137;, score=0.942 total time=   7.2s
[CV 1/1; 4/50] START criterion=gini, max_depth=28, max_samples=0.755621931064197, n_estimators=102
[CV 1/1; 4/50] END criterion=gini, max_depth=28, max_samples=0.755621931064197, n_estimators=102;, score=0.944 total time=  12.0s
[CV 1/1; 5/

[CV 1/1; 36/50] END criterion=entropy, max_depth=46, max_samples=0.607535551306039, n_estimators=228;, score=0.941 total time=  19.6s
[CV 1/1; 37/50] START criterion=gini, max_depth=56, max_samples=0.6942929003834686, n_estimators=181
[CV 1/1; 37/50] END criterion=gini, max_depth=56, max_samples=0.6942929003834686, n_estimators=181;, score=0.945 total time=  19.8s
[CV 1/1; 38/50] START criterion=entropy, max_depth=27, max_samples=0.723092165494472, n_estimators=192
[CV 1/1; 38/50] END criterion=entropy, max_depth=27, max_samples=0.723092165494472, n_estimators=192;, score=0.938 total time=  19.4s
[CV 1/1; 39/50] START criterion=gini, max_depth=33, max_samples=0.4601587158441357, n_estimators=209
[CV 1/1; 39/50] END criterion=gini, max_depth=33, max_samples=0.4601587158441357, n_estimators=209;, score=0.943 total time=  15.1s
[CV 1/1; 40/50] START criterion=gini, max_depth=63, max_samples=0.7863494531277931, n_estimators=77
[CV 1/1; 40/50] END criterion=gini, max_depth=63, max_samples=0

In [111]:
best_model_rf_all_fasttext = random_search_rf_all_fasttex.best_estimator_

print(f1_score(y_test_ft, best_model_rf_all_fasttext.predict(X_test_all_ft.values)))
print(accuracy_score(y_test_ft, best_model_rf_all_fasttext.predict(X_test_all_ft.values)))
print(precision_score(y_test_ft, best_model_rf_all_fasttext.predict(X_test_all_ft.values)))
print(recall_score(y_test_ft, best_model_rf_all_fasttext.predict(X_test_all_ft.values)))
print(confusion_matrix(y_test_ft, best_model_rf_all_fasttext.predict(X_test_all_ft.values)))

0.9438877755511023
0.956989247311828
0.9711340206185567
0.9181286549707602
[[775  14]
 [ 42 471]]


# SVM_alldata_fasttext

In [112]:
np.random.seed(42)

pipeline_svm_all_fasttext = Pipeline([
    ('imputer', KNNImputer(weights='distance', metric='nan_euclidean', add_indicator=True)),
    ('scaler', StandardScaler()),
    ('svm', SVC())
])

param_distributions_svm_all_fasttext = {
    'imputer__n_neighbors': [2, 3, 5, 7],
    'svm__C': stats.uniform(0.1, 10),
    'svm__gamma': stats.uniform(0.01, 1),
    'svm__kernel': ['linear', 'rbf', 'poly', 'sigmoid']
}

random_search_svm_all_fasttext = RandomizedSearchCV(pipeline_svm_all_fasttext, param_distributions_svm_all_fasttext,
                                                    cv=cv, n_iter=50, scoring='f1', verbose=10)
random_search_svm_all_fasttext.fit(X_combined_all_ft, y_combined_all_ft)

print("Best set of hyperparameters: ", random_search_svm_all_fasttext.best_params_)
print("Best score: ", random_search_svm_all_fasttext.best_score_)

Fitting 1 folds for each of 50 candidates, totalling 50 fits
[CV 1/1; 1/50] START imputer__n_neighbors=5, svm__C=8.065429868602328, svm__gamma=0.1934347898661638, svm__kernel=sigmoid
[CV 1/1; 1/50] END imputer__n_neighbors=5, svm__C=8.065429868602328, svm__gamma=0.1934347898661638, svm__kernel=sigmoid;, score=0.732 total time=  13.2s
[CV 1/1; 2/50] START imputer__n_neighbors=2, svm__C=6.068501579464869, svm__gamma=0.45583275285359115, svm__kernel=poly
[CV 1/1; 2/50] END imputer__n_neighbors=2, svm__C=6.068501579464869, svm__gamma=0.45583275285359115, svm__kernel=poly;, score=0.928 total time=  11.6s
[CV 1/1; 3/50] START imputer__n_neighbors=5, svm__C=4.692488919658671, svm__gamma=0.34370861113902185, svm__kernel=sigmoid
[CV 1/1; 3/50] END imputer__n_neighbors=5, svm__C=4.692488919658671, svm__gamma=0.34370861113902185, svm__kernel=sigmoid;, score=0.730 total time=  13.7s
[CV 1/1; 4/50] START imputer__n_neighbors=7, svm__C=6.608884729488528, svm__gamma=0.06641157902710025, svm__kernel=s

[CV 1/1; 31/50] END imputer__n_neighbors=2, svm__C=9.817120953891036, svm__gamma=0.8589138242660839, svm__kernel=linear;, score=0.954 total time=  38.8s
[CV 1/1; 32/50] START imputer__n_neighbors=3, svm__C=2.4598491974895578, svm__gamma=0.26606832276132397, svm__kernel=poly
[CV 1/1; 32/50] END imputer__n_neighbors=3, svm__C=2.4598491974895578, svm__gamma=0.26606832276132397, svm__kernel=poly;, score=0.929 total time=  11.5s
[CV 1/1; 33/50] START imputer__n_neighbors=2, svm__C=7.206628896857874, svm__gamma=0.12089082081183132, svm__kernel=sigmoid
[CV 1/1; 33/50] END imputer__n_neighbors=2, svm__C=7.206628896857874, svm__gamma=0.12089082081183132, svm__kernel=sigmoid;, score=0.731 total time=  13.1s
[CV 1/1; 34/50] START imputer__n_neighbors=2, svm__C=2.117192023353962, svm__gamma=0.9057635956735194, svm__kernel=linear
[CV 1/1; 34/50] END imputer__n_neighbors=2, svm__C=2.117192023353962, svm__gamma=0.9057635956735194, svm__kernel=linear;, score=0.954 total time=  18.6s
[CV 1/1; 35/50] ST

In [113]:
best_model_svm_all_fasttext= random_search_svm_all_fasttext.best_estimator_

print(f1_score(y_test_ft, best_model_svm_all_fasttext.predict(X_test_all_ft.values)))
print(accuracy_score(y_test_ft, best_model_svm_all_fasttext.predict(X_test_all_ft.values)))
print(precision_score(y_test_ft, best_model_svm_all_fasttext.predict(X_test_all_ft.values)))
print(recall_score(y_test_ft, best_model_svm_all_fasttext.predict(X_test_all_ft.values)))
print(confusion_matrix(y_test_ft, best_model_svm_all_fasttext.predict(X_test_all_ft.values)))

0.9532338308457712
0.9639016897081413
0.9735772357723578
0.9337231968810916
[[776  13]
 [ 34 479]]


# XGB_text_fasttext

In [114]:
np.random.seed(42)

xgb_text_ft = xgb.XGBClassifier()

param_grid_xgb_text_ft = {
    'max_depth': stats.randint(3, 10),
    'learning_rate': stats.uniform(0.001, 0.1),
    'subsample': stats.uniform(0.3, 0.7),
    'n_estimators': stats.randint(50, 200)
}

X_combined_text_ft = np.vstack((train_embeddings, val_embeddings))
y_combined_text_ft = np.hstack((y_train_ft, y_val_ft))

random_search_xgb_text_ft = RandomizedSearchCV(xgb_text_ft, param_grid_xgb_text_ft, cv=cv, n_iter=50, scoring='f1',
                                               verbose=10)
random_search_xgb_text_ft.fit(X_combined_text_ft, y_combined_text_ft)

print("Best set of hyperparameters: ", random_search_xgb_text_ft.best_params_)
print("Best score: ", random_search_xgb_text_ft.best_score_)

Fitting 1 folds for each of 50 candidates, totalling 50 fits
[CV 1/1; 1/50] START learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836
[CV 1/1; 1/50] END learning_rate=0.03845401188473625, max_depth=7, n_estimators=64, subsample=0.8123957592679836;, score=0.905 total time=   6.1s
[CV 1/1; 2/50] START learning_rate=0.06086584841970366, max_depth=9, n_estimators=171, subsample=0.40919616423534183
[CV 1/1; 2/50] END learning_rate=0.06086584841970366, max_depth=9, n_estimators=171, subsample=0.40919616423534183;, score=0.903 total time=  13.7s
[CV 1/1; 3/50] START learning_rate=0.006808361216819946, max_depth=7, n_estimators=149, subsample=0.40000677254535855
[CV 1/1; 3/50] END learning_rate=0.006808361216819946, max_depth=7, n_estimators=149, subsample=0.40000677254535855;, score=0.896 total time=  10.9s
[CV 1/1; 4/50] START learning_rate=0.06608884729488529, max_depth=7, n_estimators=51, subsample=0.8053991405867773
[CV 1/1; 4/50] END learning_rat

[CV 1/1; 32/50] END learning_rate=0.06343540481337932, max_depth=8, n_estimators=183, subsample=0.6195741993380371;, score=0.903 total time=  14.9s
[CV 1/1; 33/50] START learning_rate=0.02284404372168336, max_depth=6, n_estimators=79, subsample=0.9182961812432078
[CV 1/1; 33/50] END learning_rate=0.02284404372168336, max_depth=6, n_estimators=79, subsample=0.9182961812432078;, score=0.903 total time=   5.3s
[CV 1/1; 34/50] START learning_rate=0.0334345021005274, max_depth=9, n_estimators=141, subsample=0.5494084866538824
[CV 1/1; 34/50] END learning_rate=0.0334345021005274, max_depth=9, n_estimators=141, subsample=0.5494084866538824;, score=0.906 total time=  15.1s
[CV 1/1; 35/50] START learning_rate=0.09168284415457541, max_depth=3, n_estimators=170, subsample=0.7533830843789535
[CV 1/1; 35/50] END learning_rate=0.09168284415457541, max_depth=3, n_estimators=170, subsample=0.7533830843789535;, score=0.903 total time=   4.0s
[CV 1/1; 36/50] START learning_rate=0.001052037699531582, max

In [115]:
best_model_xgb_text_fasttext = random_search_xgb_text_ft.best_estimator_

print(f1_score(y_test_ft, best_model_xgb_text_fasttext.predict(test_embeddings)))
print(accuracy_score(y_test_ft, best_model_xgb_text_fasttext.predict(test_embeddings)))
print(precision_score(y_test_ft, best_model_xgb_text_fasttext.predict(test_embeddings)))
print(recall_score(y_test_ft, best_model_xgb_text_fasttext.predict(test_embeddings)))
print(confusion_matrix(y_test_ft, best_model_xgb_text_fasttext.predict(test_embeddings)))

0.9212362911266201
0.9393241167434716
0.9428571428571428
0.9005847953216374
[[761  28]
 [ 51 462]]


# RF_text_fasttext

In [116]:
np.random.seed(42)

rf_text_ft = RandomForestClassifier()

param_grid_rf_text_ft = {
    'n_estimators': stats.randint(50, 250),
    'max_depth': stats.randint(5, 100),
    'criterion': ['gini', 'entropy'],
    'max_samples': stats.uniform(0.3, 0.7)
}

random_search_rf_text_ft = RandomizedSearchCV(rf_text_ft, param_grid_rf_text_ft, cv=cv, n_iter=50, scoring='f1',
                                              verbose=10)
random_search_rf_text_ft.fit(X_combined_text_ft, y_combined_text_ft)

print("Best set of hyperparameters: ", random_search_rf_text_ft.best_params_)
print("Best score: ", random_search_rf_text_ft.best_score_)

Fitting 1 folds for each of 50 candidates, totalling 50 fits
[CV 1/1; 1/50] START criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156
[CV 1/1; 1/50] END criterion=gini, max_depth=56, max_samples=0.9655000144869412, n_estimators=156;, score=0.895 total time=  25.2s
[CV 1/1; 2/50] START criterion=entropy, max_depth=65, max_samples=0.7177951105625409, n_estimators=171
[CV 1/1; 2/50] END criterion=entropy, max_depth=65, max_samples=0.7177951105625409, n_estimators=171;, score=0.908 total time=  17.9s
[CV 1/1; 3/50] START criterion=gini, max_depth=91, max_samples=0.3406585285177396, n_estimators=137
[CV 1/1; 3/50] END criterion=gini, max_depth=91, max_samples=0.3406585285177396, n_estimators=137;, score=0.904 total time=   7.6s
[CV 1/1; 4/50] START criterion=gini, max_depth=28, max_samples=0.755621931064197, n_estimators=102
[CV 1/1; 4/50] END criterion=gini, max_depth=28, max_samples=0.755621931064197, n_estimators=102;, score=0.900 total time=  13.1s
[CV 1/1; 5/

[CV 1/1; 36/50] END criterion=entropy, max_depth=46, max_samples=0.607535551306039, n_estimators=228;, score=0.903 total time=  20.6s
[CV 1/1; 37/50] START criterion=gini, max_depth=56, max_samples=0.6942929003834686, n_estimators=181
[CV 1/1; 37/50] END criterion=gini, max_depth=56, max_samples=0.6942929003834686, n_estimators=181;, score=0.900 total time=  21.6s
[CV 1/1; 38/50] START criterion=entropy, max_depth=27, max_samples=0.723092165494472, n_estimators=192
[CV 1/1; 38/50] END criterion=entropy, max_depth=27, max_samples=0.723092165494472, n_estimators=192;, score=0.906 total time=  20.4s
[CV 1/1; 39/50] START criterion=gini, max_depth=33, max_samples=0.4601587158441357, n_estimators=209
[CV 1/1; 39/50] END criterion=gini, max_depth=33, max_samples=0.4601587158441357, n_estimators=209;, score=0.897 total time=  16.5s
[CV 1/1; 40/50] START criterion=gini, max_depth=63, max_samples=0.7863494531277931, n_estimators=77
[CV 1/1; 40/50] END criterion=gini, max_depth=63, max_samples=0

In [117]:
best_model_rf_text_fasttext= random_search_rf_text_ft.best_estimator_

print(f1_score(y_test_ft, best_model_rf_text_fasttext.predict(test_embeddings)))
print(accuracy_score(y_test_ft, best_model_rf_text_fasttext.predict(test_embeddings)))
print(precision_score(y_test_ft, best_model_rf_text_fasttext.predict(test_embeddings)))
print(recall_score(y_test_ft, best_model_rf_text_fasttext.predict(test_embeddings)))
print(confusion_matrix(y_test_ft, best_model_rf_text_fasttext.predict(test_embeddings)))

0.916076845298281
0.9362519201228878
0.9516806722689075
0.8830409356725146
[[766  23]
 [ 60 453]]


# SVM_text_fasttext

In [118]:
np.random.seed(42)

pipeline_svm_text_ft = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC())
])

param_distributions_svm_text_ft = {
    'svm__C': stats.uniform(0.1, 10),
    'svm__gamma': stats.uniform(0.01, 1),
    'svm__kernel': ['linear', 'rbf', 'poly', 'sigmoid']
}

random_search_svm_text_ft = RandomizedSearchCV(pipeline_svm_text_ft, param_distributions_svm_text_ft, cv=cv, n_iter=50,
                                               scoring='f1', verbose=10)
random_search_svm_text_ft.fit(X_combined_text_ft, y_combined_text_ft)

print("Best set of hyperparameters: ", random_search_svm_text_ft.best_params_)
print("Best score: ", random_search_svm_text_ft.best_score_)

Fitting 1 folds for each of 50 candidates, totalling 50 fits
[CV 1/1; 1/50] START svm__C=3.845401188473625, svm__gamma=0.9607143064099162, svm__kernel=poly
[CV 1/1; 1/50] END svm__C=3.845401188473625, svm__gamma=0.9607143064099162, svm__kernel=poly;, score=0.884 total time=   4.0s
[CV 1/1; 2/50] START svm__C=7.896910002727692, svm__gamma=0.606850157946487, svm__kernel=rbf
[CV 1/1; 2/50] END svm__C=7.896910002727692, svm__gamma=0.606850157946487, svm__kernel=rbf;, score=0.004 total time=  18.0s
[CV 1/1; 3/50] START svm__C=1.6599452033620266, svm__gamma=0.06808361216819946, svm__kernel=sigmoid
[CV 1/1; 3/50] END svm__C=1.6599452033620266, svm__gamma=0.06808361216819946, svm__kernel=sigmoid;, score=0.727 total time=   4.2s
[CV 1/1; 4/50] START svm__C=3.4370861113902182, svm__gamma=0.1528668179219408, svm__kernel=poly
[CV 1/1; 4/50] END svm__C=3.4370861113902182, svm__gamma=0.1528668179219408, svm__kernel=poly;, score=0.884 total time=   4.0s
[CV 1/1; 5/50] START svm__C=0.30584494295802445

[CV 1/1; 37/50] END svm__C=1.295942459383017, svm__gamma=0.723244787222995, svm__kernel=linear;, score=0.902 total time=  16.1s
[CV 1/1; 38/50] START svm__C=7.317295211648732, svm__gamma=0.24598491974895575, svm__kernel=rbf
[CV 1/1; 38/50] END svm__C=7.317295211648732, svm__gamma=0.24598491974895575, svm__kernel=rbf;, score=0.008 total time=  16.7s
[CV 1/1; 39/50] START svm__C=5.037955963643907, svm__gamma=0.5327328293819941, svm__kernel=poly
[CV 1/1; 39/50] END svm__C=5.037955963643907, svm__gamma=0.5327328293819941, svm__kernel=poly;, score=0.884 total time=   4.1s
[CV 1/1; 40/50] START svm__C=1.2089082081183133, svm__gamma=0.4493365018657701, svm__kernel=poly
[CV 1/1; 40/50] END svm__C=1.2089082081183133, svm__gamma=0.4493365018657701, svm__kernel=poly;, score=0.884 total time=   4.0s
[CV 1/1; 41/50] START svm__C=0.4142918568673425, svm__gamma=0.6464104112637804, svm__kernel=sigmoid
[CV 1/1; 41/50] END svm__C=0.4142918568673425, svm__gamma=0.6464104112637804, svm__kernel=sigmoid;, s

In [119]:
best_model_svm_text_fasttext= random_search_svm_text_ft.best_estimator_

print(f1_score(y_test_ft, best_model_svm_text_fasttext.predict(test_embeddings)))
print(accuracy_score(y_test_ft, best_model_svm_text_fasttext.predict(test_embeddings)))
print(precision_score(y_test_ft, best_model_svm_text_fasttext.predict(test_embeddings)))
print(recall_score(y_test_ft, best_model_svm_text_fasttext.predict(test_embeddings)))
print(confusion_matrix(y_test_ft, best_model_svm_text_fasttext.predict(test_embeddings)))

0.9218274111675127
0.9408602150537635
0.961864406779661
0.884990253411306
[[771  18]
 [ 59 454]]
