In [None]:
# February 2024
# Text classifier using logistic regression, support vector machine, and xlm-roberta
# Violeta Berdejo-Espinola

In [8]:
# linting
# !nbqa pylint 1.pre_process_main_text.ipynb

# background theme 
# !jt -t monokai -cellw 90% #grade3

from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

import os

import mpu
import numpy as np
import pandas as pd
from sklearn.base import clone

In [9]:
corpus = mpu.io.read('corpus_clean.pickle')
pos = mpu.io.read('pos.pickle')
neg = mpu.io.read('neg_complete.pickle')

In [10]:
x = pos + neg
y = [1] * len(pos) + [0] * len(neg)

print(len(y))
len(neg)

5019


4957

# splitting data

In [4]:
from collections import Counter
from sklearn.model_selection import train_test_split

# Hold out 20% of the documents as a test set; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified; with this few positives consider stratify=y.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=42
)

# Tally how many training documents fall into each binary class.
cntt = Counter(y_train)

cntt

Counter({0: 3966, 1: 49})

In [None]:
# return text with max length

def find_max_length(lst):
    """Return the longest element of ``lst`` together with its length.

    Parameters
    ----------
    lst : iterable of sized objects (e.g. strings)

    Returns
    -------
    tuple
        ``(longest_element, len(longest_element))``. When several elements
        share the maximum length, the first one encountered is returned.

    Raises
    ------
    ValueError
        If ``lst`` is empty.
    """
    # Pair every element with its length and keep the pair with the largest length.
    sized_items = ((item, len(item)) for item in lst)
    return max(sized_items, key=lambda pair: pair[1])

find_max_length(x_train)

# feature extraction

https://stackoverflow.com/questions/62812198/valueerror-in-while-predict-where-test-data-is-having-different-shape-of-word-ve

Transformer tokenizer: tokenizing (splitting strings into sub-word token strings), converting token strings to ids and back, and encoding/decoding (i.e., tokenizing and converting to integers).
Adding new tokens to the vocabulary in a way that is independent of the underlying structure (BPE, SentencePiece…).
Managing special tokens (like mask, beginning-of-sentence, etc.): adding them, assigning them to attributes in the tokenizer for easy access and making sure they are not split during tokenization.

In [45]:
%%time

# vectorise data using frequency counters and transformer embeddings -> returns arrays and a dict of tensors

from sklearn.feature_extraction.text import TfidfVectorizer # uses one-dim array of strings ~ shape (n,)
from sklearn.feature_extraction.text import CountVectorizer

# Default-configured vectorizers; both are fitted on the training texts only.
vect_tfidf = TfidfVectorizer()
vect_cv = CountVectorizer()

X_train_cv = vect_cv.fit_transform(x_train)                    # fit: tokenize & build vocab from x_train; transform: texts -> document-term matrix
X_train_tfidf = vect_tfidf.fit_transform(x_train)              # same two steps, producing TF-IDF weights instead of raw counts
X_test_cv = vect_cv.transform(x_test)                          # transform only: reuse the fitted vocabulary so test columns match train
X_test_tfidf = vect_tfidf.transform(x_test) 

from sentence_transformers import SentenceTransformer

embed_model = SentenceTransformer('distiluse-base-multilingual-cased-v1') 

embedding_train = embed_model.encode(x_train)
embedding_test = embed_model.encode(x_test)

import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
     
def tokenize(batch):
    """Encode ``batch`` with the notebook-level ``tokenizer``.

    Padding and truncation are enabled, sequences are capped at 512 tokens,
    and the result is returned as PyTorch tensors.
    """
    encoding_options = {
        'padding': True,
        'truncation': True,
        'max_length': 512,
        'return_tensors': 'pt',
    }
    return tokenizer(batch, **encoding_options)

X_train_xlm = tokenize(x_train)  # returns dictionary with two key:value. input ids:tensors and attention mask:tensors both of them contain tensors
X_test_xlm = tokenize(x_test)

a = X_train_xlm['input_ids'].size()
b = X_test_xlm['input_ids'].size()

xlm vocabulary size: 250002 
model context size: 512, ['input_ids', 'attention_mask']
CPU times: user 9.39 s, sys: 886 ms, total: 10.3 s
Wall time: 22.6 s


In [46]:
print('document-term matrix\n')
print(f'count based vectors - cv & tfidf:\n {X_train_cv.shape, X_test_cv.shape} \n {X_train_tfidf.shape, X_test_tfidf.shape}\n')
print(f'embedding - sentence transformer:\n {embedding_train.shape, embedding_test.shape}\n')
print(f'embedding - xlm roberta:\n {a} {b}\n') # size is batch_size, n_tokens

document-term matrix

count based vectors - cv & tfidf:
 ((4015, 34266), (1004, 34266)) 
 ((4015, 34266), (1004, 34266))

embedding - sentence transformer:
 ((4015, 512), (1004, 512))

embedding - xlm roberta:
 torch.Size([4015, 512]) torch.Size([1004, 512])



In [66]:
# transformers 

print(torch.is_tensor(X_train_xlm))
type(X_train_xlm)

print(f'xlm vocabulary size: {tokenizer.vocab_size} \nmodel context size: {tokenizer.model_max_length}\nmodel input {tokenizer.model_input_names}\n')
print(f'xlm input ids:\n {X_train_xlm.input_ids}\nxlm attention masks:\n {X_train_xlm.attention_mask}\n')

print('let\'s explore an example:\n')
print(X_train_xlm['input_ids'][10].size())

# the input sequences of each batch are padded with the pad token id [1] to the maximum sequence length in the batch (here the model context size)
# the attention mask array is used to ignore the padded areas of the batch
print(X_train_xlm['input_ids'][0])
print(tokenizer.convert_ids_to_tokens(X_train_xlm['input_ids'][0]))

False
xlm vocabulary size: 250002 
model context size: 512
model input ['input_ids', 'attention_mask']

xlm input ids:
 tensor([[    0, 99536, 43269,  ...,     1,     1,     1],
        [    0, 48643,  4503,  ...,     1,     1,     1],
        [    0,     6,  7456,  ...,     1,     1,     1],
        ...,
        [    0,  7828,   318,  ...,     1,     1,     1],
        [    0, 68541,   655,  ...,     1,     1,     1],
        [    0,  5456, 16386,  ...,     1,     1,     1]])
xlm attention masks:
 tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])

let's explore an example:

torch.Size([512])
tensor([     0,  99536,  43269,  43185,   1803,   9247,    219,  34573,   2638,
         62175,   6973,  73140,  43185,   1803,   9247,    219,  34573,   2638,
         62175,    520,  46391,  15595, 146684,  16757,    246, 161764, 

In [None]:
# hidden states from XLM are trained along with the parameters from the input data



In [None]:
from transformers import AutoModelForSequenceClassification, pipeline

# Load the checkpoint in this cell so it runs on a fresh kernel; the original
# relied on `model_xlm`, which is only assigned in a later cell.
model_xlm = AutoModelForSequenceClassification.from_pretrained('xlm-roberta-base')

# 'text-classification' is the registered task identifier ('text classifier' is not).
xlm_clf = pipeline('text-classification', model=model_xlm, tokenizer=tokenizer)

# A pipeline tokenizes internally, so it is given the raw texts rather than the
# unpacked tokenizer output; truncation keeps inputs within the 512-token context.
outputs = xlm_clf(x_train, truncation=True, max_length=512)

In [None]:
# attributes of a tensor

tensor = torch.rand(3,4)

print(f"Shape of tensor: {tensor.shape}")
print(f"Datatype of tensor: {tensor.dtype}")
print(f"Device tensor is stored on: {tensor.device}")

# standard numpy-like indexing and slicing:

tensor = torch.ones(4, 3)
print(tensor)
print(f"First row: {tensor[0]}")
print(f"First column: {tensor[:, 0]}")
print(f"Last column: {tensor[..., -1]}")
tensor[:,1] = 0


In [None]:
text = "hola soy Violeta!"
token_id = tokenizer(text)
print(token_id)
token = tokenizer.convert_ids_to_tokens(token_id.input_ids)
print(token)
print(tokenizer.convert_tokens_to_string(token))

# class imbalance

The distribution of one class is highly skewed, so the learning algorithm might
tend to be biased towards the majority class, leading to poor predictions for the minority class.

Approaches to deal with imbalanced datasets are:

- undersample majority class: discards potentially valuable data
- oversample minority class: can lead to overfitting and increases training times
- weight loss function: assigns higher importance to minority classes, providing a direct optimisation approach.
  - minority sample: w_j will be high
  - majority sample: w_j will be low
- synthetic datasets: could be generated to complement the minority class and increase its representation

In [None]:
%%time

# calculate weights for classes penalty (weighted cross-entropy loss)
#
#     weight_for_class_i = total_samples / (num_samples_in_class_i * num_classes)

weight_for_class_0 = len(x) / (len(neg) * 2)
weight_for_class_1 = len(x) / (len(pos) * 2)

print(weight_for_class_0, weight_for_class_1)

# resample vectorised x_train and y_train -> returns list of (X_resampled, y_resampled) tuples

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import ADASYN # generates synthetic samples in regions of the minority class where the class density is low

rus = RandomUnderSampler(random_state=42, sampling_strategy=1)
ros = RandomOverSampler(random_state=42, sampling_strategy='not majority')
ada = ADASYN(random_state=42)

resampler = [rus, ros, ada]

# Convert the label lists once instead of on every fit_resample call.
y_train_arr = np.array(y_train)
y_test_arr = np.array(y_test)

# The loop variable is named `sampler`, not `x`: the original loops rebound the
# corpus list `x`, which the class weights above read via len(x), so re-running
# this cell failed.

# train set

res_cv = [sampler.fit_resample(X_train_cv, y_train_arr) for sampler in resampler]
res_tfidf = [sampler.fit_resample(X_train_tfidf, y_train_arr) for sampler in resampler]

# test set
# NOTE(review): resampling the test set changes its class distribution, so scores
# computed on these sets do not reflect the original class prevalence - confirm
# this is intended.

res_test_cv = [sampler.fit_resample(X_test_cv, y_test_arr) for sampler in resampler]
res_test_tfidf = [sampler.fit_resample(X_test_tfidf, y_test_arr) for sampler in resampler]

# model training

In [None]:
from collections import namedtuple

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report

logreg = LogisticRegression(solver='lbfgs')
svm = SVC(kernel='linear')
logreg_weight = LogisticRegression(solver='liblinear', class_weight={0: weight_for_class_0, 1: weight_for_class_1})
svm_weight = SVC(kernel='linear', class_weight={0: weight_for_class_0, 1: weight_for_class_1})

Case = namedtuple("Case", "name model X Y x y vector weighting")

cases_baseline = [
    Case(name='Logistic Regression', model=logreg, X=X_train_cv, Y=y_train, x=X_test_cv, y=y_test, vector="word_frequency", weighting='None'),
    Case(name='SVM', model=svm, X=X_train_cv, Y=y_train, x=X_test_cv, y=y_test, vector="word_frequency", weighting='None'),
    Case(name='Logistic Regression', model=logreg, X=X_train_tfidf, Y=y_train, x=X_test_tfidf, y=y_test, vector="TF-IDF", weighting='None'),
    Case(name='SVM', model=svm, X=X_train_tfidf, Y=y_train, x=X_test_tfidf, y=y_test, vector="TF-IDF", weighting='None'),
    Case(name='Logistic Regression', model=logreg, X=embedding_train, Y=y_train, x=embedding_test, y=y_test, vector="embedding", weighting='None'),
    Case(name='SVM', model=svm, X=embedding_train, Y=y_train, x=embedding_test, y=y_test, vector="embedding", weighting='None')
]

cases_weighted = [
    Case(name='Logistic Regression', model=logreg_weight, X=X_train_cv, Y=y_train, x=X_test_cv, y=y_test, vector="word_frequency", weighting='BAL'),
    Case(name="SVM", model=svm_weight, X=X_train_cv, Y=y_train, x=X_test_cv, y=y_test, vector="word_frequency", weighting='BAL'),
    Case(name='Logistic Regression', model=logreg_weight, X=X_train_tfidf, Y=y_train, x=X_test_tfidf, y=y_test, vector="TF-IDF", weighting='BAL'),
    Case(name="SVM", model=svm_weight, X=X_train_tfidf, Y=y_train, x=X_test_tfidf, y=y_test, vector="TF-IDF", weighting='BAL'),
    Case(name='Logistic Regression', model=logreg_weight, X=embedding_train, Y=y_train, x=embedding_test, y=y_test, vector="embedding", weighting="BAL"),
    Case(name='SVM', model=svm_weight, X=embedding_train, Y=y_train, x=embedding_test, y=y_test, vector="embedding", weighting="BAL")
]

cases_resampled_tfidf = [
    Case(name="Logistic Regression", model=logreg, X=res_tfidf[0][0], Y=res_tfidf[0][1], x=res_test_tfidf[0][0], y=res_test_tfidf[0][1], vector="TF-IDF", weighting='RUS'),
    Case(name="SVM", model=svm, X=res_tfidf[0][0], Y=res_tfidf[0][1], x=res_test_tfidf[0][0], y=res_test_tfidf[0][1], vector="TF-IDF", weighting='RUS'),
    Case(name="Logistic Regression", model=logreg, X=res_tfidf[1][0], Y=res_tfidf[1][1], x=res_test_tfidf[1][0], y=res_test_tfidf[1][1], vector="TF-IDF", weighting='ROS'),
    Case(name="SVM", model=svm, X=res_tfidf[1][0], Y=res_tfidf[1][1], x=res_test_tfidf[1][0], y=res_test_tfidf[1][1], vector="TF-IDF", weighting='ROS'),
    Case(name="Logistic Regression", model=logreg, X=res_tfidf[2][0], Y=res_tfidf[2][1], x=res_test_tfidf[2][0], y=res_test_tfidf[2][1], vector="TF-IDF", weighting='ADA'),
    Case(name="SVM", model=svm, X=res_tfidf[2][0], Y=res_tfidf[2][1], x=res_test_tfidf[2][0], y=res_test_tfidf[2][1], vector="TF-IDF", weighting='ADA')
]

cases_resampled_cv = [
    Case(name="Logistic Regression", model=logreg, X=res_cv[0][0], Y=res_cv[0][1], x=res_test_cv[0][0], y=res_test_cv[0][1],  vector="word_frequency", weighting='RUS'),
    Case(name="SVM", model=svm, X=res_cv[0][0], Y=res_cv[0][1], x=res_test_cv[0][0], y=res_test_cv[0][1], vector="word_frequency", weighting='RUS'),
    Case(name="Logistic Regression", model=logreg, X=res_cv[1][0], Y=res_cv[1][1], x=res_test_cv[1][0], y=res_test_cv[1][1], vector="word_frequency", weighting='ROS'),
    Case(name="SVM", model=svm, X=res_cv[1][0], Y=res_cv[1][1], x=res_test_cv[1][0], y=res_test_cv[1][1], vector="word_frequency", weighting='ROS'),
    Case(name="Logistic Regression", model=logreg, X=res_cv[2][0], Y=res_cv[2][1], x=res_test_cv[2][0], y=res_test_cv[2][1], vector="word_frequency", weighting='ADA'),
    Case(name="SVM", model=svm, X=res_cv[2][0], Y=res_cv[2][1], x=res_test_cv[2][0], y=res_test_cv[2][1], vector="word_frequency", weighting='ADA')
]

In [None]:
# function to fit models, make predictions with train and test sets, and store results in df

from sklearn.metrics import f1_score, recall_score, precision_score

def get_scores(cases):
    """Fit each case's model and collect train/test precision, recall and F1.

    Parameters
    ----------
    cases : iterable of Case
        Each case provides ``name``, ``model``, training data ``X``/``Y``,
        test data ``x``/``y`` and the ``vector``/``weighting`` labels.

    Returns
    -------
    models_list : list of dict
        One ``{"name": ..., "model": fitted_estimator}`` entry per case, where
        the name is ``f'{name}_{weighting}_{vector}'``.
    scores_list : list of dict
        One row of metrics per case, rounded to 3 decimals. ``*_tr`` values come
        from 10-fold out-of-fold predictions on the training data, ``*_ts``
        values from predictions on the test data.
    """
    scores_list = []
    models_list = []

    for case in cases:

        # Several cases share the same estimator instance. Fitting a fresh clone
        # keeps every stored model tied to its own training data instead of being
        # silently overwritten when the next case refits the shared instance.
        model = clone(case.model).fit(case.X, case.Y)

        # cross_val_predict accepts no `random_state` argument (passing one raises
        # TypeError); an unshuffled StratifiedKFold is already deterministic.
        y_train_pred = cross_val_predict(case.model, case.X, case.Y, cv=StratifiedKFold(10), method='predict')
        y_test_pred = model.predict(case.x)

        # save fitted model
        models = {
            "name": f'{case.name}_{case.weighting}_{case.vector}',
            "model": model,
        }

        # save scores from predictions
        scores = {
            'Classifier': case.name,
            'Numeric_rep': case.vector,
            'Weighting': case.weighting,
            'Instances': len(case.Y),
            'F1_tr': round(f1_score(case.Y, y_train_pred), 3),
            'F1_ts': round(f1_score(case.y, y_test_pred), 3),
            'Precision_tr': round(precision_score(case.Y, y_train_pred), 3),
            'Precision_ts': round(precision_score(case.y, y_test_pred), 3),
            'Recall_tr': round(recall_score(case.Y, y_train_pred), 3),
            'Recall_ts': round(recall_score(case.y, y_test_pred), 3)
        }

        # store each model's dict and prediction scores' dict in a list
        models_list.append(models)
        scores_list.append(scores)

    return models_list, scores_list

In [None]:
%%time

# fit models and make predictions

models_baseline, scores_baseline = get_scores(cases_baseline)
models_weighted, scores_weighted = get_scores(cases_weighted)
models_resampled_tfidf, scores_resampled_tfidf = get_scores(cases_resampled_tfidf)
models_resampled_cv, scores_resampled_cv = get_scores(cases_resampled_cv)

df1 = pd.DataFrame(scores_baseline)
df2 = pd.DataFrame(scores_weighted)
df3 = pd.DataFrame(scores_resampled_tfidf)
df4 = pd.DataFrame(scores_resampled_cv)

results =  pd.concat([df1,df2,df3,df4])

In [None]:
# run xlm-roberta model

import torch
import torch.nn.functional as nn
from transformers import (
    AutoConfig,
    AutoModelForTokenClassification,
    AutoModelForSequenceClassification,
    AutoTokenizer)

model_xlm_seq = AutoModelForSequenceClassification.from_pretrained('xlm-roberta-base')
model_xlm_tok = AutoModelForTokenClassification.from_pretrained('xlm-roberta-base')

In [None]:
def get_scores_xlm(model, batch):
    """Run ``model`` on a tokenized ``batch`` and return the predicted label names.

    Parameters
    ----------
    model : callable
        Called as ``model(**batch)``; the result must expose ``.logits`` and the
        model must expose ``config.id2label``.
    batch : mapping
        Keyword arguments for the model (e.g. the tokenizer output).

    Returns
    -------
    list
        ``model.config.id2label`` entry of the highest-probability class for
        every row of the logits.
    """
    with torch.no_grad(): # disables gradient calculation
        outputs = model(**batch) ## ** unpacks the values in dictionary
    print(outputs) # raw outputs, logits
    # `nn` is the notebook's alias for torch.nn.functional
    predictions = nn.softmax(outputs.logits, dim=1)
    print(f'predictions {predictions}')
    labels = torch.argmax(predictions, dim=1)
    print(f'labels{labels}')
    # The original built this list but never returned it.
    return [model.config.id2label[label_id] for label_id in labels.tolist()]


In [None]:
import numpy as np

def compute_metrics(p):
    """Convert token-level predictions to label names and score them.

    Parameters
    ----------
    p : tuple
        ``(predictions, labels)``; ``predictions`` is arg-maxed over axis 2 and
        positions whose label equals -100 are dropped.

    Returns
    -------
    dict
        ``precision``, ``recall``, ``f1`` and ``accuracy`` taken from the
        ``overall_*`` entries of ``metric.compute``.

    Notes
    -----
    Reads the notebook-level names ``label_list`` and ``metric``.
    NOTE(review): neither is defined in the visible cells - confirm they are
    created before this function is called.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[pred_id] for (pred_id, label_id) in zip(prediction, label) if label_id != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[label_id] for (pred_id, label_id) in zip(prediction, label) if label_id != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    # The original dict literal was never closed, which made the cell a SyntaxError.
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
%%time

# https://www.youtube.com/watch?v=GSt00_-0ncQ&t=192s

# NOTE(review): this loads the base checkpoint without any fine-tuning step in
# this notebook, so the predicted labels are presumably not meaningful yet - confirm.
model_xlm = AutoModelForSequenceClassification.from_pretrained('xlm-roberta-base')

# NOTE(review): the whole tokenized test set is passed as a single batch;
# consider mini-batches if memory becomes a problem.
with torch.no_grad(): # disables gradient calculation
    outputs = model_xlm(**X_test_xlm) ## ** unpacks the values in dictionary
    print(outputs) # raw outputs, logits
    # `nn` is the notebook's alias for torch.nn.functional
    predictions = nn.softmax(outputs.logits, dim=1)
    print(f'predictions {predictions}')
    labels = torch.argmax(predictions, dim=1)
    print(f'labels{labels}')
    # Look the names up on the model that produced the logits; the original read
    # `model.config`, but no `model` is assigned at notebook level here.
    labels = [model_xlm.config.id2label[label_id] for label_id in labels.tolist()]

In [None]:
results = results.reset_index(drop=True).sort_values('F1_ts', ascending=False)
results

In [None]:
models_weighted

In [None]:
results[results['Classifier'] == 'SVM'] 

In [None]:
results[results['Classifier'] == 'Logistic Regression']

In [None]:
# extract models from list of dicts

print(models_weighted)
model_list = [entry["model"] for entry in models_weighted]
print(model_list)

# index the stored model dicts by their name
d = {model["name"]: model for model in models_weighted}

# get_scores builds names as f'{name}_{weighting}_{vector}', so the key is
# 'Logistic Regression_BAL_word_frequency' (the hyphenated form raises KeyError).
# NOTE(review): the original assignment had no right-hand side; the count-vector
# training matrix is assumed to be the intended "input" - confirm.
d["Logistic Regression_BAL_word_frequency"]["input"] = X_train_cv

# best_model = next(model for model in models_weighted if model["name"] == 'Logistic Regression_BAL_embedding')
# best_model = best_model['model']
# best_model

# xlm-roberta

In [None]:
# run end-to-end model using transformer architecture - sentence transformers and
# XLMRobertaForSequenceClassification: XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output)

# selected model

In [None]:
# Sentence-transformer embeddings of the train and test texts.
embedding_train = embed_model.encode(x_train)
embedding_test = embed_model.encode(x_test)

# Class-weighted logistic regression fitted on the training embeddings.
mod = LogisticRegression(solver='lbfgs', class_weight={0: weight_for_class_0, 1: weight_for_class_1}).fit(embedding_train,y_train)
# Out-of-fold predictions for the training set (10 stratified folds).
y_train_predict = cross_val_predict(mod, embedding_train, y_train, cv=StratifiedKFold(10), method='predict')
y_test_pred = mod.predict(embedding_test)

print(f'Accuracy of logistic regression classifier on test set: {mod.score(embedding_test, y_test)}')

from sklearn.metrics import classification_report
print(classification_report(y_test, y_test_pred))

from sklearn.metrics import confusion_matrix
# Store the result under its own name: assigning it to `confusion_matrix`
# replaced the imported function and broke any later call to it.
conf_mat = confusion_matrix(y_test, y_test_pred)
print(conf_mat)

# confusion matrix

- true negatives, false positives
- false negatives, true positives

# modify threshold

https://stackoverflow.com/questions/35692059/how-to-change-threshold-for-precision-and-recall-in-python-scikit-learn


https://stackoverflow.com/questions/31417487/sklearn-logisticregression-and-changing-the-default-threshold-for-classification

`decision_function`: tells us on which side of the hyperplane generated by the classifier samples in X are (and how far we are away from it)


`predict()` : returns class label for samples in X based on the values generated by the decision function


`predict_proba()` : returns probability estimates of a classification label for samples in X


In [None]:
# # to do
# try different thresholds ~ 50
# make predictions
# calculate precision, recall, f1 for these and compare
# plot

In [None]:
# NOTE(review): `model` is not assigned at notebook level in the visible cells
# (it is only a local name inside get_scores) - confirm which fitted estimator
# is meant before running this cell.

# decision function: signed score of the first 200 training documents
print(model.decision_function(X_train_cv[:200]))

# predicted class labels for the same documents
print(model.predict(X_train_cv[:200]))

# probability estimates for every training document
probabilities = model.predict_proba(X_train_cv)
print(probabilities[:200])
probabilities.shape

In [None]:
threshold = 0.01
preds = (probabilities[:, 1] > threshold)
preds = preds.astype(int)

In [None]:
# modify threshold for probability estimates
# Threshold the positive-class column only, as the `preds` cell does with
# probabilities[:, 1]. Comparing the whole array also thresholded the column
# indexed 0 and produced two flags per document.
positive_proba = probabilities[:, 1]
high_precision = (positive_proba > 0.7).astype(int)
high_recall = (positive_proba > 0.2).astype(int)

print(high_precision[:200], high_recall[:200])
# # # true = high_recall == 0
# # len(high_recall)
# type(probabilities)
# probabilities.shape
# probabilities[0][0]

# decisions = (model.predict_proba() >= mythreshold).astype(int)

In [None]:
# calculate precision, recall, and F1
# NOTE(review): `y_train_pred` is not assigned at notebook level in the visible
# cells - confirm which predictions are meant.

from sklearn.metrics import f1_score, precision_score, recall_score

precision = precision_score(y_train, y_train_pred)
recall = recall_score(y_train, y_train_pred)
# The original computed `f1` with recall_score, so it duplicated `recall`.
f1 = f1_score(y_train, y_train_pred)

# checking feature weights of positive and negative classes

what words have high positive and negative weights in the positive class

check if keywords appear in doc
https://machinelearningmastery.com/calculate-feature-importance-with-python/

In [None]:
# Linear machine learning algorithms fit a model where the prediction is the weighted sum of the input values.

In [None]:
len(y_train)

In [None]:
# get feature importance: coefficient row at index 0 of the fitted model
coeffs = model.coef_[0]

print(len(coeffs))

# summarize feature importance, one line per feature index
for feature_idx in range(len(coeffs)):
    print('Feature: %0d, Score: %.5f' % (feature_idx, coeffs[feature_idx]))

In [None]:
# Only the fitted vocabulary is needed here, so fit() replaces fit_transform(),
# whose document-term matrix was discarded.
vectorizer = CountVectorizer()
vectorizer.fit(x_train)

# extract feature names and coefficients
# get_feature_names_out takes no documents; the original passed x_train as
# `input_features`.
features = vectorizer.get_feature_names_out()
coeffs = model.coef_[0]
print(features[5:10])
print(len(features))
print(len(coeffs))

feat_coeff_dict = {}
feat_coeff_dict['_features'] = features
feat_coeff_dict['_coefficients'] = coeffs

# Build the frame column-wise so '_coefficients' keeps its numeric dtype; the
# from_dict(orient='index').T route produced object-dtype columns.
feature_importance_df = pd.DataFrame(feat_coeff_dict)

In [None]:

feature_importance_df.sort_values(by='_coefficients', ascending=False).head(20)

# precision-recall trade-off

plotting curve of precision / recall trade-off

the higher the precision, the lower the recall


In [None]:

from sklearn.metrics import precision_recall_curve

# The labels are the integers 0/1, so the positive label must be the int 1, not the string '1'.
# NOTE(review): `y_scores` is not assigned in the visible cells - confirm its source.
precisions, recalls, thresholds = precision_recall_curve(y_train, y_scores, pos_label=1)

import matplotlib as mpl
import matplotlib.pyplot as plt

def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Draw precision (blue) and recall (green) against the decision threshold.

    The last element of ``precisions`` and ``recalls`` is dropped before
    plotting so both series line up with ``thresholds``.
    """
    curves = (
        (precisions, "b-", 'Precision'),
        (recalls, "g-", 'Recall'),
    )
    for values, line_format, curve_label in curves:
        plt.plot(thresholds, values[:-1], line_format, label=curve_label)

plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.xlabel("Threshold")
plt.legend(['Precision', 'Recall'], loc ="right")
plt.show()

In [None]:
# plot precision-recall curve

fig, ax = plt.subplots(figsize=(6,6))
ax.plot(recalls, precisions, label='Logistic Regression')
# Baseline = share of positive labels. y_test is a plain list, so the original
# y_test[y_test==1] indexed it with a bool and then called len() on an int.
# NOTE(review): the curve above is computed from y_train; confirm whether the
# baseline should use y_train as well.
baseline = np.mean(np.asarray(y_test) == 1)
ax.plot([0, 1], [baseline, baseline], linestyle='--', label='Baseline')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.legend(loc='center left')

In [None]:
from sklearn.metrics import auc

auc(recalls, precisions)

# ROC curve

- Plots sensitivity against 1 − specificity
- That is, True Positive Rate (recall, TP/(TP+FN)) vs False Positive Rate (FP/(FP+TN))
- ROC is a probability curve
- AUC represents the degree or measure of separability. 
- Higher the AUC, the better the model is at predicting 0 classes as 0 and 1 classes as 1. 
- It tells how much the model is capable of distinguishing between classes as its discrimination threshold is varied. 

In [None]:
from sklearn.metrics import roc_auc_score

roc_score = roc_auc_score(y_train, y_scores)

roc_score

In [None]:
from sklearn.metrics import roc_curve

# The labels are the integers 0/1, so the positive label must be the int 1, not the string '1 '.
fpr, tpr, thresholds = roc_curve(y_train, y_scores, pos_label=1)

import matplotlib as mpl
import matplotlib.pyplot as plt

def plot_roc_curve(fpr, tpr, label=None):
    """Plot a ROC curve together with the dashed chance diagonal.

    Parameters
    ----------
    fpr, tpr : sequences
        False and true positive rates, plotted on x and y respectively.
    label : str, optional
        Legend label for the curve.
    """
    diagonal = [0, 1]
    plt.plot(fpr, tpr, label=label, linewidth=2)
    plt.plot(diagonal, diagonal, 'k--') # dashed diagonal
    
# Label the curve so the legend has an entry to show.
plot_roc_curve(fpr, tpr, label='Logistic Regression')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
# Draw the legend on the current figure; `ax` still points at the axes of the
# earlier precision-recall plot.
plt.legend(loc='center left')

In [None]:
# Collect the summary metrics in a one-row frame and write them to CSV.
# "f1_score" now stores the computed value `f1`; the original stored the
# sklearn function object `f1_score` itself.
# NOTE(review): `vocab` and `scores` are not assigned in the visible cells, and
# `d` reuses the name of the model lookup dict built earlier - confirm.
d = {'vocabulary size': len(vocab), 'mean_cross_val': np.mean(scores), 'precision': precision, "recall": recall, "f1_score": f1, "roc_score": roc_score}
df = pd.DataFrame(data=d, index=["body"])
# df.to_csv('/Users/uqvberde/Dropbox/TRANSLATE/Objective 2 - Machine Learning/classifier_spanish/datasets/py_outputs/metrics_9.csv')
# NOTE(review): absolute, machine-specific output path; consider a configurable output directory.
df.to_csv('C:\\Users\\uqvberde\\Dropbox\\TRANSLATE\\ML\\classifier_spanish\\datasets\\py_outputs\\metrics\\metrics_69_short.csv')

`C regularisation`: hyperparameter. Regularization generally refers to the concept that there should be a complexity penalty for more extreme parameters. The idea is that just looking at the training data and not paying attention to how extreme one's parameters are leads to overfitting. A high value of C tells the model to give high weight to the training data, and a lower weight to the complexity penalty. A low value tells the model to give more weight to this complexity penalty at the expense of fitting to the training data. Basically, a high C means "Trust this training data a lot", while a low value says "This data may not be fully representative of the real world data, so if it's telling you to make a parameter really large, don't listen to it".

https://stackoverflow.com/questions/67513075/what-is-c-parameter-in-sklearn-logistic-regression
