# Libraries

In [16]:
import pandas as pd
import numpy as np

import plotly.graph_objects as go

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import SMOTE
from collections import Counter

from joblib import dump, load

import torch
from transformers import AutoTokenizer
import classification_modules as cm
from classification_modules.scoring import get_prob_scores, load_checkpoint

# Data preparation

In [23]:
# Read the train and test datasets and keep only the rows that have a label.
# `.copy()` makes each filtered frame independent of the frame it was sliced
# from, so adding or overwriting columns on it does not raise pandas'
# SettingWithCopyWarning.
train, test = pd.read_csv("./data/train.csv"), pd.read_csv("./data/test.csv")
train, test = train[train.label.notna()].copy(), test[test.label.notna()].copy()
train.head()

Unnamed: 0,text,label,text_no_emoji,text_preprocessed
0,Dla mnie faworytem do tytułu będzie Cracovia. ...,0,Dla mnie faworytem do tytułu będzie Cracovia. ...,"['faworyt', 'tytuł', 'cracovia', 'zobaczyć', '..."
1,@anonymized_account @anonymized_account Brawo ...,0,Brawo ty Daria kibic ma być na dobre i złe,"['brawo', 'daria', 'kibic', 'dobry', 'zły']"
2,"@anonymized_account @anonymized_account Super,...",0,"Super, polski premier składa kwiaty na grobach...","['super', 'polski', 'premier', 'składać', 'kwi..."
3,@anonymized_account @anonymized_account Musi. ...,0,Musi. Innej drogi nie mamy.,"['innej', 'droga']"
4,"Odrzut natychmiastowy, kwaśna mina, mam problem",0,"Odrzut natychmiastowy, kwaśna mina, mam problem","['odrzut', 'natychmiastowy', 'kwaśna', 'mina',..."


## Easy Data Augmentation

In [25]:
# Report what share of the training labels falls into class 0 and class 1
label_counts = train.label.value_counts()
majority_pct = np.round(label_counts[0] / sum(label_counts) * 100, 2)
minority_pct = np.round(label_counts[1] / sum(label_counts) * 100, 2)
print(f"Majority class {majority_pct}%, minority class {minority_pct}%")

Majority class 91.52%, minority class 8.48%


In [26]:
# Perform easy data augmentation to overcome the class imbalance.
# The third argument is the difference between the class-0 and class-1 counts.
# NOTE(review): presumably this is the number of synthetic samples to create and
# 0.4 is the augmentation strength - confirm against cm.perform_eda.
counts_before_eda = train.label.value_counts()
class_gap = counts_before_eda[0] - counts_before_eda[1]
train_eda = cm.perform_eda(train, 0.4, class_gap)

# Re-check the class shares on the augmented frame
counts_after_eda = train_eda.label.value_counts()
majority_share = np.round(counts_after_eda[0] / sum(counts_after_eda) * 100, 2)
minority_share = np.round(counts_after_eda[1] / sum(counts_after_eda) * 100, 2)
print(f"Majority class {majority_share}%, minority class {minority_share}%")

Majority class 50.0%, minority class 50.0%


## Token extraction

In [27]:
def _token_list_repr_to_text(row):
    """Turn the string form of a token list ("['a', 'b']") into "a b".

    Strips the surrounding brackets, removes every single quote and replaces
    the ", " separators with single spaces.
    """
    return row.strip('][').replace("'", "").replace(', ', ' ').strip()


# Apply the same conversion to the augmented train set, the original train set
# and the test set: cast the token column to str, then overwrite `text` with
# the space-joined tokens.
for frame in (train_eda, train, test):
    frame.text_preprocessed = frame.text_preprocessed.astype('str')
    frame['text'] = frame.text_preprocessed.apply(_token_list_repr_to_text)

train_eda.head()

Unnamed: 0,text_preprocessed,label,eda,text
0,"['faworyt', 'tytuł', 'cracovia', 'zobaczyć', '...",0,0,faworyt tytuł cracovia zobaczyć typ sprawdzić
1,"['brawo', 'daria', 'kibic', 'dobry', 'zły']",0,0,brawo daria kibic dobry zły
2,"['super', 'polski', 'premier', 'składać', 'kwi...",0,0,super polski premier składać kwiat groba kolab...
3,"['innej', 'droga']",0,0,innej droga
4,"['odrzut', 'natychmiastowy', 'kwaśna', 'mina',...",0,0,odrzut natychmiastowy kwaśna mina problem


## Vectorization and over-sampling

In [29]:
# Prepare X, y

# EDA branch: raw text of the augmented corpus; it is vectorized inside the
# model pipelines, so no TF-IDF is applied here.
X = train_eda.text
y = train_eda.label

# SMOTE branch: starts from the original (non-augmented) training set
X_tr = train.text
y_tr = train.label

# SMOTE is applied to the TF-IDF matrix, so vectorize the training text first
vectorizer = TfidfVectorizer()
X_tf = vectorizer.fit_transform(X_tr)

# Test features are transformed with the vectorizer fitted on the training text
X_test_res = vectorizer.transform(test.text)

# Over-sample the vectorized training set with SMOTE at a fixed seed
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_tf, y_tr)

# Display the class counts after resampling
Counter(y_res)

Counter({0: 9190, 1: 9190})


# Modelling

## TF-IDF + NB

In [30]:
# TF-IDF features feeding a multinomial Naive Bayes classifier
estimator = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

# Only the n-gram range has more than one candidate; the other TF-IDF
# options are pinned to a single value.
param_grid = dict(
    tfidf__ngram_range=[(1, 1), (1, 2), (1, 3)],
    tfidf__norm=['l2'],
    tfidf__smooth_idf=[True],
    tfidf__sublinear_tf=[False],
)

# Cross-validate on the EDA-augmented text
nb_model, nb_results, grid_search = cm.perform_cross_validation(X, y, estimator, param_grid)

# Collapse every 'mean_train_<metric>' / 'mean_test_<metric>' entry to one number per metric.
# NOTE(review): if nb_results holds one value per grid candidate, np.mean averages
# over all candidates rather than reporting the selected one - confirm.
cv_train_results = {
    key.split('_')[-1]: np.mean(nb_results[key])
    for key in nb_results.keys()
    if 'mean_train_' in key
}
cv_val_results = {
    key.split('_')[-1]: np.mean(nb_results[key])
    for key in nb_results.keys()
    if 'mean_test_' in key
}

# Threshold the positive-class probability at 0.5 to get hard test predictions
positive_probs = nb_model.predict_proba(test.text)[:, 1]
y_pred = [int(prob >= 0.5) for prob in positive_probs]
y_test = test.label

print("Train CV results:\n", cv_train_results)
print("Validation CV results:\n", cv_val_results)

# Score the predictions on the held-out test set
scores_nb = cm.score(y_test, y_pred)
print("Test dataset results:\n", scores_nb)

# Persist the fitted model
dump(nb_model, 'models/nb_model.joblib')

Train CV results:
 {'accuracy': 0.9614074387354546, 'precision': 0.9572199796458322, 'recall': 0.9664301924899396, 'f1': 0.9616597162757913}
Validation CV results:
 {'accuracy': 0.8956110811501293, 'precision': 0.8477225112696504, 'recall': 0.9647090175078786, 'f1': 0.9022785524403458}
Test dataset results:
 {'accuracy': 0.793, 'precision': 0.3392070484581498, 'recall': 0.5746268656716418, 'f1': 0.42659279778393355}


['models/nb_model.joblib']

## SMOTE + TF-IDF + NB

In [32]:
# Plain multinomial Naive Bayes: the inputs are already TF-IDF vectors
# resampled with SMOTE, so there is no pipeline and an empty parameter grid.
# NOTE: the "bert" in the variable names below is historical - no BERT model
# is used in this cell.
estimator = MultinomialNB()
param_grid = dict()

# Cross-validate on the SMOTE-resampled matrix
nb_bert_model, nb_bert_results, grid_search = cm.perform_cross_validation(X_res, y_res, estimator, param_grid)

# Collapse every 'mean_train_<metric>' / 'mean_test_<metric>' entry to one number per metric
cv_train_results = {
    key.split('_')[-1]: np.mean(nb_bert_results[key])
    for key in nb_bert_results.keys()
    if 'mean_train_' in key
}
cv_val_results = {
    key.split('_')[-1]: np.mean(nb_bert_results[key])
    for key in nb_bert_results.keys()
    if 'mean_test_' in key
}

print("Train CV results:\n", cv_train_results)
print("Validation CV results:\n", cv_val_results)

# Threshold the positive-class probability at 0.5 to get hard test predictions
positive_probs = nb_bert_model.predict_proba(X_test_res)[:, 1]
y_pred = [int(prob >= 0.5) for prob in positive_probs]
y_test = test.label

# Score the predictions on the held-out test set
scores_nb_bert = cm.score(y_test, y_pred)
print("Test dataset results:\n", scores_nb_bert)

# Persist the fitted model
dump(nb_bert_model, 'models/nb_smote_model.joblib')

Train CV results:
 {'accuracy': 0.9557399161459047, 'precision': 0.9207857980163162, 'recall': 0.9972796932232274, 'f1': 0.9575062472081525}
Validation CV results:
 {'accuracy': 0.9047336510150629, 'precision': 0.8424953309555847, 'recall': 0.9956471467900517, 'f1': 0.9126811162032343}
Test dataset results:
 {'accuracy': 0.791, 'precision': 0.33766233766233766, 'recall': 0.582089552238806, 'f1': 0.4273972602739726}


['models/nb_smote_model.joblib']

## TF-IDF + SVM

In [34]:
# TF-IDF features feeding a support vector classifier
estimator = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('svm', SVC()),
])

# Search over the n-gram range, the SVM regularization strength C and the
# kernel; the remaining TF-IDF options are pinned to a single value.
param_grid = dict(
    tfidf__ngram_range=[(1, 1), (1, 2), (1, 3)],
    tfidf__norm=['l2'],
    tfidf__smooth_idf=[True],
    tfidf__sublinear_tf=[False],
    svm__C=[0.1, 1, 10],
    svm__kernel=['linear', 'rbf'],
)

# Cross-validate on the EDA-augmented text
svm_model, svm_results, grid_search = cm.perform_cross_validation(X, y, estimator, param_grid)

# Collapse every 'mean_train_<metric>' / 'mean_test_<metric>' entry to one number per metric.
# NOTE(review): if svm_results holds one value per grid candidate, np.mean averages
# over all candidates rather than reporting the selected one - confirm.
cv_train_results = {
    key.split('_')[-1]: np.mean(svm_results[key])
    for key in svm_results.keys()
    if 'mean_train_' in key
}
cv_val_results = {
    key.split('_')[-1]: np.mean(svm_results[key])
    for key in svm_results.keys()
    if 'mean_test_' in key
}

print("Train CV results:\n", cv_train_results)
print("Validation CV results:\n", cv_val_results)

# Hard class predictions on the held-out test text
y_test = test.label
y_pred = svm_model.predict(test.text)

# Score the predictions on the held-out test set
scores_svm = cm.score(y_test, y_pred)
print("Test dataset results:\n", scores_svm)

# Persist the fitted model
dump(svm_model, 'models/svm_model.joblib')

Train CV results:
 {'accuracy': 0.917319521376367, 'precision': 0.9825779035514124, 'recall': 0.8482800830386628, 'f1': 0.870744937809589}
Validation CV results:
 {'accuracy': 0.8745587593409896, 'precision': 0.8901401643075304, 'recall': 0.8424983566424747, 'f1': 0.8301067137762548}
Test dataset results:
 {'accuracy': 0.842, 'precision': 0.3, 'recall': 0.13432835820895522, 'f1': 0.18556701030927836}


['models/svm_model.joblib']

## SMOTE + TF-IDF + SVM

In [35]:
# Bare support vector classifier: the inputs are already TF-IDF vectors
# resampled with SMOTE, so no pipeline is needed.
# NOTE: the "bert" in the variable names below is historical - no BERT model
# is used in this cell.
estimator = SVC()

# Search over the regularization strength C and the kernel
param_grid = dict(
    C=[0.1, 1, 10],
    kernel=['linear', 'rbf'],
)

# Cross-validate on the SMOTE-resampled matrix
svm_bert_model, svm_bert_results, grid_search = cm.perform_cross_validation(X_res, y_res, estimator, param_grid)

# Collapse every 'mean_train_<metric>' / 'mean_test_<metric>' entry to one number per metric
cv_train_results = {
    key.split('_')[-1]: np.mean(svm_bert_results[key])
    for key in svm_bert_results.keys()
    if 'mean_train_' in key
}
cv_val_results = {
    key.split('_')[-1]: np.mean(svm_bert_results[key])
    for key in svm_bert_results.keys()
    if 'mean_test_' in key
}

print("Train CV results:\n", cv_train_results)
print("Validation CV results:\n", cv_val_results)

# Hard class predictions on the vectorized test set
y_test = test.label
y_pred = svm_bert_model.predict(X_test_res)

# Score the predictions on the held-out test set
scores_svm_bert = cm.score(y_test, y_pred)
print("Test dataset results:\n", scores_svm_bert)

# Persist the fitted model
dump(svm_bert_model, 'models/svm_smote_model.joblib')

Train CV results:
 {'accuracy': 0.9706974045575182, 'precision': 0.989607629251685, 'recall': 0.9513236528545207, 'f1': 0.9685071667719728}
Validation CV results:
 {'accuracy': 0.9573908360626068, 'precision': 0.9744828676740287, 'recall': 0.940149810416795, 'f1': 0.9541147018832018}
Test dataset results:
 {'accuracy': 0.871, 'precision': 0.6923076923076923, 'recall': 0.06716417910447761, 'f1': 0.12244897959183673}


['models/svm_smote_model.joblib']

## HerBERT

In [37]:
# Shuffle the EDA-augmented set with a fixed seed, then use the first 95% of
# the rows for training and the remaining 5% for validation.
# Positional slicing replaces np.split, which on a DataFrame goes through the
# deprecated DataFrame.swapaxes and emits a FutureWarning on recent pandas.
train_eda_shuffled = train_eda[['text', 'label']].sample(frac=1, random_state=42)
split_at = int(0.95 * len(train_eda))
df_train, df_val = train_eda_shuffled.iloc[:split_at], train_eda_shuffled.iloc[split_at:]

# The test frame keeps only the two columns used for the HerBERT model
df_test = test[['text', 'label']]

In [38]:
# Training hyper-parameters: number of epochs, batch size and learning rate
EPOCHS, BATCH_SIZE, LR = 5, 4, 2e-05

# Name stored in the result dicts and used for the checkpoint file name
model_custom_name = 'herbert_model'

# Initialize the HerBERT classifier
model = cm.HerBertForSequenceClassification(num_classes=1, dropout_rate=0.5)

# Train on the train split, validating on the held-out validation split
results, model = cm.train_model(
    model=model,
    train_data=df_train,
    val_data=df_val,
    learning_rate=LR,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    custom_model_name=model_custom_name,
)
results['model'] = model_custom_name

# Evaluate the trained model on the test dataset
test_res = cm.evaluate(model=model, test_data=df_test)
test_res['model'] = model_custom_name

# The checkpoint pairs a freshly constructed model object (same constructor
# arguments as above) with the trained weights.
checkpoint = dict(
    model=cm.HerBertForSequenceClassification(num_classes=1, dropout_rate=0.5),
    state_dict=model.state_dict(),
)
checkpoint_path = 'models/' + model_custom_name + '.pth'
torch.save(checkpoint, checkpoint_path)


LEARNING RATE:  2e-05
#####################
Setting Datasets...

Creating DataLoaders...

Setting up optimizer and criterion...

Training on GPU...



100%|██████████| 4366/4366 [22:33<00:00,  3.23it/s]
100%|██████████| 230/230 [00:21<00:00, 10.46it/s]



Best validation loss: 0.04700564593076706

Best deviation: 0.04618926718831062

Saving best model for epoch: 1


Train accuracy: 0.9497 | Validation accuracy: 0.9423

Train precision: 0.9806 | Validation precision: 0.9206

Train recall: 0.9177 | Validation recall: 0.9640

Train f1: 0.9481 | Validation f1: 0.9418
Epochs: 1
Train Loss:  0.0352 | Validation Loss:  0.0371
Train accuracy: 0.9497 | Validation accuracy: 0.9423
Train precision: 0.9806 | Validation precision: 0.9206
Train recall: 0.9177 | Validation recall: 0.9640
Train f1: 0.9481 | Validation f1: 0.9418


100%|██████████| 4366/4366 [22:29<00:00,  3.24it/s]
100%|██████████| 230/230 [00:22<00:00, 10.39it/s]


Epochs: 2
Train Loss:  0.0246 | Validation Loss:  0.0294
Train accuracy: 0.9613 | Validation accuracy: 0.9576
Train precision: 0.9716 | Validation precision: 0.9635
Train recall: 0.9506 | Validation recall: 0.9483
Train f1: 0.9610 | Validation f1: 0.9558


100%|██████████| 4366/4366 [22:31<00:00,  3.23it/s]
100%|██████████| 230/230 [00:22<00:00, 10.45it/s]


Epochs: 3
Train Loss:  0.0178 | Validation Loss:  0.0295
Train accuracy: 0.9743 | Validation accuracy: 0.9478
Train precision: 0.9753 | Validation precision: 0.9901
Train recall: 0.9735 | Validation recall: 0.9011
Train f1: 0.9744 | Validation f1: 0.9435


100%|██████████| 4366/4366 [21:01<00:00,  3.46it/s]
100%|██████████| 230/230 [00:20<00:00, 11.20it/s]


Epochs: 4
Train Loss:  0.0117 | Validation Loss:  0.0401
Train accuracy: 0.9833 | Validation accuracy: 0.9510
Train precision: 0.9828 | Validation precision: 0.9310
Train recall: 0.9839 | Validation recall: 0.9708
Train f1: 0.9833 | Validation f1: 0.9505


100%|██████████| 4366/4366 [20:48<00:00,  3.50it/s]
100%|██████████| 230/230 [00:20<00:00, 11.18it/s]


Epochs: 5
Train Loss:  0.0095 | Validation Loss:  0.0492
Train accuracy: 0.9880 | Validation accuracy: 0.9467
Train precision: 0.9883 | Validation precision: 0.9381
Train recall: 0.9877 | Validation recall: 0.9528
Train f1: 0.9880 | Validation f1: 0.9454
Setting test data as Dataset...

Setting up DataLoader...

Test accuracy: 0.888
Test precision: 0.6341463414634146
Test recall: 0.3880597014925373
Test f1: 0.4814814814814815


# Results

In [41]:
# Load the persisted models.
# Forward slashes match the paths the models were saved under and resolve on
# every OS; the previous "models\\..." literals only resolve on Windows.
svm_model = load("models/svm_model.joblib")
nb_model = load("models/nb_model.joblib")
herbert_model = load_checkpoint("models/herbert_model.pth")
svm_smote_model = load("models/svm_smote_model.joblib")
nb_smote_model = load("models/nb_smote_model.joblib")

# Display name -> fitted model. The names are matched as substrings when the
# ROC curves are drawn, so keep 'SMOTE', 'TF-IDF' and 'HerBERT' in them.
models = {
    'EDA + TF-IDF + NB' : nb_model,
    'SMOTE + TF-IDF + NB' : nb_smote_model,
    'EDA + TF-IDF + SVM' : svm_model,
    'SMOTE + TF-IDF + SVM' : svm_smote_model,
    'HerBERT' : herbert_model,
    }

## Classification metrics (ACC, Precision, Recall, F1)

| Model                | ACC   | Precision  | Recall   | F1   |
|----------------------|-------|-------|-------|-------|
| **EDA + TF-IDF + NB**    | **0.793** | **0.339** | **0.575** | **0.427** |
| SMOTE + TF-IDF + NB  | 0.791 | 0.338 | 0.582 | 0.427 |
| EDA + TF-IDF + SVM   | 0.842 | 0.3   | 0.134 | 0.186 |
| SMOTE + TF-IDF + SVM | 0.871 | 0.692 | 0.067 | 0.122 |
| **HerBERT**             | **0.888** | **0.634** | **0.388** | **0.481** |

## ROC + AUC

In [42]:
# Start from an empty figure with the dashed chance diagonal, then add one
# ROC curve per model.
fig = go.Figure()
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

y_true = test.label.values

for m, model in models.items():
    if m == 'HerBERT':
        _, y_score, _ = get_prob_scores(model, df_test)
    else:
        if 'SMOTE' in m:
            # SMOTE models were fitted on the TF-IDF matrix, not on raw text
            features = X_test_res
        elif 'TF-IDF' in m:
            # Pipeline models vectorize the raw text themselves
            features = test.text
        else:
            # Fail loudly instead of silently reusing the previous model's scores
            raise ValueError(f"Don't know how to score model '{m}'")

        # ROC/AUC need a continuous score. Hard 0/1 predictions collapse the
        # curve to a single operating point, so use the positive-class
        # probability when available and the decision function otherwise.
        if hasattr(model, 'predict_proba'):
            y_score = model.predict_proba(features)[:, 1]
        else:
            y_score = model.decision_function(features)

    fpr, tpr, _ = roc_curve(y_true, y_score)
    auc_score = roc_auc_score(y_true, y_score)

    name = f"{m} (AUC={auc_score:.2f})"
    fig.add_trace(go.Scatter(x=fpr, y=tpr, name=name, mode='lines'))

fig.update_layout(
    title = 'Krzywa ROC',
    xaxis_title='FPR',
    yaxis_title='TPR',
    yaxis=dict(scaleanchor="x", scaleratio=1),
    xaxis=dict(constrain='domain'),
    width=700, height=500
    )
fig.show()


In [43]:
# English version of the ROC chart.
# Start from an empty figure with the dashed chance diagonal, then add one
# ROC curve per model.
fig = go.Figure()
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

y_true = test.label.values

for m, model in models.items():
    if m == 'HerBERT':
        _, y_score, _ = get_prob_scores(model, df_test)
    else:
        if 'SMOTE' in m:
            # SMOTE models were fitted on the TF-IDF matrix, not on raw text
            features = X_test_res
        elif 'TF-IDF' in m:
            # Pipeline models vectorize the raw text themselves
            features = test.text
        else:
            # Fail loudly instead of silently reusing the previous model's scores
            raise ValueError(f"Don't know how to score model '{m}'")

        # ROC/AUC need a continuous score. Hard 0/1 predictions collapse the
        # curve to a single operating point, so use the positive-class
        # probability when available and the decision function otherwise.
        if hasattr(model, 'predict_proba'):
            y_score = model.predict_proba(features)[:, 1]
        else:
            y_score = model.decision_function(features)

    fpr, tpr, _ = roc_curve(y_true, y_score)
    auc_score = roc_auc_score(y_true, y_score)

    name = f"{m} (AUC={auc_score:.2f})"
    fig.add_trace(go.Scatter(x=fpr, y=tpr, name=name, mode='lines'))

fig.update_layout(
    title = 'ROC',
    xaxis_title='FPR',
    yaxis_title='TPR',
    yaxis=dict(scaleanchor="x", scaleratio=1),
    xaxis=dict(constrain='domain'),
    width=700, height=500
    )
fig.show()


[ENG]: The best results were achieved by the simplest models and by the most advanced ones. For the simple Naive Bayes classifier, the choice of oversampling technique makes little difference. For this reason EDA is recommended, as it is easy to supervise and to explain how the synthetic samples are obtained. SMOTE, in contrast, operates on vectorized observations (an additional layer of abstraction) and creates new samples in an unsupervised manner, which hinders explainability.

The HerBERT deep learning model, however, achieved a higher value of the F1 metric (which was the main evaluation metric in the PolEval 2019 competition). The difference between the HerBERT model and the NB model is apparent: HerBERT has higher precision at the expense of recall. This means that most of the observations it flags as harmful are indeed harmful, but not all harmful observations are detected. The opposite is true for the NB model: a higher share of the harmful content is detected, but with lower precision.

Assuming that detecting harmful content is more valuable (even at lower precision), the NB model is recommended. Moreover, this choice is supported by the AUC metric, which is higher for the NB model. Besides, the model is explainable, as it is based on a fundamental theorem of probability theory.

[PL]: Najlepsze wyniki osiągnęły modele najprostsze oraz te najbardziej zaawansowane. W przypadku prostego klasyfikatora Naiwnego Bayesa, technika oversamplingu nie ma większego znaczenia. Z tego powodu zalecany jest wybór EDA, gdyż w łatwy sposób można go nadzorować oraz wytłumaczyć sposób uzyskania syntetycznych próbek. W przypadku SMOTE, oparty jest on na obserwacjach zwektoryzowanych (więc pojawia się dodatkowa warstwa abstrakcji) i tworzy nowe próbki w sposób nienadzorowany co utrudnia wyjaśnialność. 

Model uczenia głębokiego HerBERT osiągnął jednak wyższą wartość metryki F1 (która była główną metryką oceny w zawodach PolEval 2019). Widoczna jest różnica między modelem HerBERT a modelem NB, gdyż HerBERT ma wyższy wskaźnik precyzji kosztem recall. Oznacza to, że spośród wykrytych obserwacji szkodliwych większość rzeczywiście jest szkodliwa, ale cierpi na tym fakt, że nie wszystkie obserwacje szkodliwe zostały wykryte. Odwrotnie jest w przypadku modelu NB, tutaj większy odsetek jest poprawnie wykrytych treści szkodliwych, ale za to mniej precyzyjnie. 

Wychodząc z założenia, że bardziej wartościowe jest wykrycie treści szkodliwej (nawet mniej precyzyjnie) zalecany jest wybór modelu NB. Co więcej, decyzję taką potwierdza też metryka AUC, która jest wyższa. Oprócz tego, model ten jest wyjaśnialny, gdyż opiera się na fundamentalnym twierdzeniu rachunku prawdopodobieństwa. 