# Task 1

Get acquainted with the data of the Polish Cyberbullying detection dataset. Pay special attention to the distribution of the positive and negative examples in the first task as well as distribution of the classes in the second task.

In [None]:
!pip install datasets
!pip install fasttext 
!pip install transformers 
from datasets import load_dataset
import re
from collections import defaultdict
import pandas as pd
from sklearn.metrics import  f1_score, precision_score, recall_score, accuracy_score
from sklearn.metrics import confusion_matrix
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
import fasttext
from transformers import create_optimizer
import tensorflow as tf
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification
import numpy as np
from sklearn.metrics import classification_report
from collections import Counter

dataset = load_dataset("poleval2019_cyberbullying", "task01")
dataset2 = load_dataset("poleval2019_cyberbullying", "task02")

In [2]:
def clean_text(line):
  line = re.sub("@[A-z]*", "", line)
  line = re.sub("https:[^ ]*", "", line)

  return line

In [3]:
train_X1 = []
test_X1 = []
for line in dataset['train']['text']:
  line = clean_text(line)
  train_X1.append(line)

for line in dataset['test']['text']:
  line = clean_text(line)
  test_X1.append(line)


train_y1 = dataset['train']['label'] # corresponds to Labels 0 non harmful, 1 cyberbullying
test_y1 = dataset['test']['label']

In [None]:
len(test_X1) == len(test_y1)

In [4]:
train_X2 = []
test_X2 = []

for line in dataset2['train']['text']:
  line = clean_text(line)
  train_X2.append(line)

for line in dataset2['test']['text']:
  line = clean_text(line)
  test_X2.append(line)

train_y2 = dataset2['train']['label']   # corresponds to Labels 0 non harmful, 1 cyberbullying, 2 hate speech
test_y2 = dataset2['test']['label']

In [None]:

ctr1 = Counter(dataset['train']['label'])
ctr2 = Counter(dataset2['train']['label'])

print(f'The distribution of clases in task 1: {dict(ctr1)}')
print(f'The distribution of clases in task 2: {dict(ctr2)}')

#Task 2

Train the following classifiers on the training sets (for the task 1 and the task 2)

### Bayesian classifier with TF * IDF weighting.


In [None]:
# loading CountVectorizer
tf_vectorizer = TfidfVectorizer() # or term frequency

X_train_tf1 = tf_vectorizer.fit_transform(train_X1).toarray()
X_train_tf2 = tf_vectorizer.fit_transform(train_X2).toarray()

print("n_samples: %d, n_features: %d" % X_train_tf1.shape)
print("n_samples: %d, n_features: %d" % X_train_tf2.shape)

In [7]:
X_test_tf1 = tf_vectorizer.transform(test_X1).toarray()
X_test_tf2 = tf_vectorizer.transform(test_X2).toarray()

In [None]:
naive_bayes_classifier1 = GaussianNB()
naive_bayes_classifier1.fit(X_train_tf1, train_y1)

naive_bayes_classifier2 = GaussianNB()
naive_bayes_classifier2.fit(X_train_tf2, train_y2)

In [None]:
y_pred1b = naive_bayes_classifier1.predict(X_test_tf1)

# compute the performance measures for task 1
score1 = metrics.accuracy_score(test_y1, y_pred1b)
print("accuracy task1:   %0.3f" % score1)

print(metrics.classification_report(test_y1, y_pred1b,
                                            target_names=['non-harmful', 'cyberbullying']))

print("confusion matrix:")
print(metrics.confusion_matrix(test_y1, y_pred1b))

print('------------------------------')

In [None]:
acc = accuracy_score(test_y1, y_pred1b)
f1 = f1_score(test_y1, y_pred1b, average='micro')
prec = precision_score(test_y1, y_pred1b, average='macro')
rec = recall_score(test_y1, y_pred1b, average='macro')

print(f'For the task 1 the accuracy of the model is {acc}, precision is {prec}, recall is {rec} and f1 score equals {f1}')

In [None]:
y_pred2b = naive_bayes_classifier2.predict(X_test_tf2)

# compute the performance measures for task 2
score2 = metrics.accuracy_score(test_y2, y_pred2b)
print("accuracy task2:   %0.3f" % score2)

print(metrics.classification_report(test_y2, y_pred2b,
                                            target_names=['non-harmful', 'cyberbullying', 'hate-speech']))

print("confusion matrix:")
print(metrics.confusion_matrix(test_y2, y_pred2b))

print('------------------------------')

In [None]:
acc = accuracy_score(test_y2, y_pred2b)
f1 = f1_score(test_y2, y_pred2b, average='micro')
prec = precision_score(test_y2, y_pred2b, average='macro')
rec = recall_score(test_y2, y_pred2b, average='macro')

print(f'For the task 2 the accuracy of the model is {acc}, precision is {prec}, recall is {rec} and f1 score equals {f1}')

### Fasttext text classifier

In [12]:
with open('train1.txt', 'a') as f:
  for i in range(len(dataset['train']['label'])):
    x = '__label__' + str(dataset['train']['label'][i]) + ' ' + dataset['train']['text'][i] + ' \n'
    f.write(x)

In [13]:
with open('train2.txt', 'a') as f:
  for i in range(len(dataset2['train']['label'])):
    x = '__label__' + str(dataset2['train']['label'][i]) + ' ' + dataset2['train']['text'][i] + ' \n'
    f.write(x)

In [37]:
model1 = fasttext.train_supervised('train1.txt', lr=1.0, epoch=25, wordNgrams=2)
model2 = fasttext.train_supervised('train2.txt', lr=1.0, epoch=25, wordNgrams=2)

In [38]:
result = []

for line in test_X1:
  a = model1.predict(line)
  result.append(a[0][0])

result2 = []

for line in test_X2:
  a = model2.predict(line)
  result2.append(a[0][0])

In [None]:
len(test_y1) == len(result)

In [17]:
test1 = []

for i in test_y1:
  x = '__label__' + str(i)
  test1.append(x)

test2 = []

for i in test_y2:
  x = '__label__' + str(i)
  test2.append(x)

In [None]:

confusion_matrix(test1, result)


In [None]:
acc = accuracy_score(test1, result)
f1 = f1_score(test1, result, average='micro')
prec = precision_score(test1, result, average='macro')
rec = recall_score(test1, result, average='macro')

print(f'For the task 1 the accuracy of the model is {acc}, precision is {prec}, recall is {rec} and f1 score equals {f1}')

In [None]:
confusion_matrix(test2, result2)

In [None]:
acc = accuracy_score(test2, result2)
f1 = f1_score(test2, result2, average='micro')
prec = precision_score(test2, result2, average='macro')
rec = recall_score(test2, result2, average='macro')

print(f'For the task 2 the accuracy of the model is {acc}, precision is {prec}, recall is {rec} and f1 score equals {f1}')

### Transformer classifier (take into account that a number of experiments should be performed for this model).


In [6]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [None]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenized_dataset = dataset.map(preprocess_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [8]:
batch_size = 16
num_epochs = 5
batches_per_epoch = len(tokenized_dataset["train"]) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

In [None]:
model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
model.compile(optimizer=optimizer)

In [None]:
tf_train_set = model.prepare_tf_dataset(
    tokenized_dataset["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_dataset["test"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

In [None]:
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3)


In [170]:
test_y1 = [int(x) for x in test_y1]

In [None]:
predicted_raw = model.predict(tf_validation_set)
predicted_raw = predicted_raw[0]
y_predicted = np.argmax(predicted_raw, axis = 1)
print(classification_report(test_y1, y_predicted))

In [None]:
confusion_matrix(test_y1, y_predicted)

In [None]:
acc = accuracy_score(test_y1, y_predicted)
f1 = f1_score(test_y1, y_predicted, average='micro')
prec = precision_score(test_y1, y_predicted, average='macro')
rec = recall_score(test_y1, y_predicted, average='macro')

print(f'For the task 1 the accuracy of the model is {acc}, precision is {prec}, recall is {rec} and f1 score equals {f1}')

Task 2

In [None]:
tokenized_dataset = dataset2.map(preprocess_function, batched=True)

batch_size = 16
num_epochs = 5
batches_per_epoch = len(tokenized_dataset["train"]) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

In [None]:
model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
model.compile(optimizer=optimizer)


In [13]:

tf_train_set = model.prepare_tf_dataset(
    tokenized_dataset["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_dataset["test"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

In [None]:
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3)


In [None]:
test_y2 = [int(x) for x in test_y2]
predicted_raw = model.predict(tf_validation_set)
predicted_raw = predicted_raw[0]
y_predicted2 = np.argmax(predicted_raw, axis = 1)
print(classification_report(test_y2, y_predicted2))

In [None]:
acc = accuracy_score(test_y2, y_predicted2)
f1 = f1_score(test_y2, y_predicted2, average='micro')
prec = precision_score(test_y2, y_predicted2, average='macro')
rec = recall_score(test_y2, y_predicted2, average='macro')

print(f'For the task 1 the accuracy of the model is {acc}, precision is {prec}, recall is {rec} and f1 score equals {f1}')

In [None]:
confusion_matrix(test_y2, y_predicted2)

# Task 3

Compare the results of classification on the test set. Select the appropriate measures (from accuracy, F1, macro/micro F1, MCC) to compare the results

Bayesian

1. Task 1
* accuracy = 0.782
* precision = 0.5657311238184153
* recall = 0.5776601978559857 
* f1 score = 0.782

2. Task 2
* accuracy = 0.787
* precision = 0.40132515731936985
* recall = 0.4081828647301029 
* f1 score = 0.787



Fasttext

1. Task 1
* accuracy = 0.87
* precision = 0.7695681511470984
* recall = 0.5275412774464858 
* f1 score = 0.87


2. Task 2

* accuracy = 0.866
* precision = 0.5352245862884161
* recall = 0.35697452521699824 
* f1 score = 0.866

Transformers:

1. Task 1
* accuracy = 0.876
* precision = 0.8142857142857143
* recall = 0.5530833132260178
* f1 score = 0.8759999999999999

2. Task 2
* accuracy = 0.866
* precision = 0.2886666666666667
* recall = 0.3333333333333333 
* f1 score = 0.866


# Task 4

Select 1 TP, 1 TN, 1 FP and 1 FN from your predictions (for the best classifier) and compare the decisions of each classifier on these examples using LIME.


In [None]:
!pip install lime

import lime

### Task1

In [None]:
import pandas as pd
result = [0 if i =='__label__0' else 1 for i in result]
temp_dict = dict()
temp_dict['text'] = dataset['test']['text']
temp_dict['true_labels'] = dataset['test']['label']
temp_dict['predicted_labels'] = result
temp_df = pd.DataFrame(temp_dict)

temp_df.head()

In [22]:
TP = temp_df['text'][temp_df['true_labels'] == 1][temp_df['predicted_labels'] == 1].iloc[:1].values[0]
TP_idx = temp_df['text'][temp_df['true_labels'] == 1][temp_df['predicted_labels'] == 1].iloc[:1].index[0]

FP = temp_df['text'][temp_df['true_labels'] == 0][temp_df['predicted_labels'] == 1].iloc[:1].values[0]
FP_idx = temp_df['text'][temp_df['true_labels'] == 0][temp_df['predicted_labels'] == 1].iloc[:1].index[0]

TN = temp_df['text'][temp_df['true_labels'] == 0][temp_df['predicted_labels'] == 0].iloc[:1].values[0]
TN_idx = temp_df['text'][temp_df['true_labels'] == 0][temp_df['predicted_labels'] == 0].iloc[:1].index[0]

FN = temp_df['text'][temp_df['true_labels'] == 1][temp_df['predicted_labels'] == 0].iloc[:1].values[0]
FN_idx = temp_df['text'][temp_df['true_labels'] == 1][temp_df['predicted_labels'] == 0].iloc[:1].index[0]

In [23]:
from lime import lime_text
from sklearn.pipeline import make_pipeline
from sklearn import feature_extraction
from mlxtend.preprocessing import DenseTransformer

from lime.lime_text import LimeTextExplainer


In [24]:
class_names = ['Non-harmful', 'Harmful']


TP = @anonymized_account Dokładnie wie co mówi. A Ty pajacu poczytaj ustawę domsie dowiesz kto decyduje o wysokości zarobków w samorządach.

Bayes

In [None]:
c = make_pipeline(tf_vectorizer, DenseTransformer(), naive_bayes_classifier1)
explainer = LimeTextExplainer(class_names=class_names)
exp = explainer.explain_instance(TP, c.predict_proba)
exp.show_in_notebook(text=False)

Fasttext

In [None]:
def get_prob(texts: str) -> np.array:
  result = []
  for text in texts:
    label, temp = model1.predict(str(text))
    label = int(label[0][-1])
    temp = np.array(temp)
    if label == 1:
      prob_0 = 1-temp
      prob_1 = temp
    else:
      prob_0 = temp
      prob_1 = 1-temp
    result.append(list(zip(prob_0, prob_1))[0])
  return np.array(result)

explainer = LimeTextExplainer(class_names=class_names)
exp = explainer.explain_instance(TP, get_prob)
exp.show_in_notebook(text=False)

FP = @anonymized_account A ty taki dziennikarz, że nie wiesz co oznacza znak zapytania 😂

Bayes

In [None]:
c = make_pipeline(tf_vectorizer, DenseTransformer(), naive_bayes_classifier1)
explainer = LimeTextExplainer(class_names=class_names)
exp = explainer.explain_instance(FP, c.predict_proba)
exp.show_in_notebook(text=False)

Fasttext

In [None]:
explainer = LimeTextExplainer(class_names=class_names)
exp = explainer.explain_instance(FP, get_prob)
exp.show_in_notebook(text=False)

TN = @anonymized_account Spoko, jak im Duda z Morawieckim zamówią po pięć piw to wszystko będzie ok.

Bayes

In [None]:
c = make_pipeline(tf_vectorizer, DenseTransformer(), naive_bayes_classifier1)
explainer = LimeTextExplainer(class_names=class_names)
exp = explainer.explain_instance(TN, c.predict_proba)
exp.show_in_notebook(text=False)

Fasttext

In [None]:
explainer = LimeTextExplainer(class_names=class_names)
exp = explainer.explain_instance(TN, get_prob)
exp.show_in_notebook(text=False)

FN = @anonymized_account Tej szmaty się nie komentuje

Bayes

In [None]:
c = make_pipeline(tf_vectorizer, DenseTransformer(), naive_bayes_classifier1)
explainer = LimeTextExplainer(class_names=class_names)
exp = explainer.explain_instance(FN, c.predict_proba)
exp.show_in_notebook(text=False)

Fasttext

In [None]:
explainer = LimeTextExplainer(class_names=class_names)
exp = explainer.explain_instance(FN, get_prob)
exp.show_in_notebook(text=False)

### Task 2

In [None]:
import pandas as pd
result2 = [0 if i =='__label__0' else 1 for i in result2]
temp_dict = dict()
temp_dict['text'] = dataset2['test']['text']
temp_dict['true_labels'] = dataset2['test']['label']
temp_dict['predicted_labels'] = result2
temp_df = pd.DataFrame(temp_dict)

temp_df.head()

In [59]:
TP = temp_df['text'][temp_df['true_labels'] == 1][temp_df['predicted_labels'] == 1].iloc[:1].values[0]
TP_idx = temp_df['text'][temp_df['true_labels'] == 1][temp_df['predicted_labels'] == 1].iloc[:1].index[0]

FP = temp_df['text'][temp_df['true_labels'] == 0][temp_df['predicted_labels'] == 1].iloc[:1].values[0]
FP_idx = temp_df['text'][temp_df['true_labels'] == 0][temp_df['predicted_labels'] == 1].iloc[:1].index[0]

TN = temp_df['text'][temp_df['true_labels'] == 0][temp_df['predicted_labels'] == 0].iloc[:1].values[0]
TN_idx = temp_df['text'][temp_df['true_labels'] == 0][temp_df['predicted_labels'] == 0].iloc[:1].index[0]

FN = temp_df['text'][temp_df['true_labels'] == 1][temp_df['predicted_labels'] == 0].iloc[:1].values[0]
FN_idx = temp_df['text'][temp_df['true_labels'] == 1][temp_df['predicted_labels'] == 0].iloc[:1].index[0]

TP = @anonymized_account @anonymized_account @anonymized_account Zreszta ty chuja zobaczysz, kutasa ziobry najwyzej

Bayes

In [None]:
c = make_pipeline(tf_vectorizer, DenseTransformer(), naive_bayes_classifier1)
explainer = LimeTextExplainer(class_names=class_names)
exp = explainer.explain_instance(TP, c.predict_proba)
exp.show_in_notebook(text=False)

Fasttext

In [None]:
explainer = LimeTextExplainer(class_names=class_names)
exp = explainer.explain_instance(TP, get_prob)
exp.show_in_notebook(text=False)

FP = @anonymized_account Bierz tego @anonymized_account razem jesteście mocni

Bayes

In [None]:
c = make_pipeline(tf_vectorizer, DenseTransformer(), naive_bayes_classifier1)
explainer = LimeTextExplainer(class_names=class_names)
exp = explainer.explain_instance(FP, c.predict_proba)
exp.show_in_notebook(text=False)

Fasttext

In [None]:
explainer = LimeTextExplainer(class_names=class_names)
exp = explainer.explain_instance(FP, get_prob)
exp.show_in_notebook(text=False)

TN = @anonymized_account Spoko, jak im Duda z Morawieckim zamówią po pięć piw to wszystko będzie ok.

Bayes

In [None]:
c = make_pipeline(tf_vectorizer, DenseTransformer(), naive_bayes_classifier1)
explainer = LimeTextExplainer(class_names=class_names)
exp = explainer.explain_instance(TN, c.predict_proba)
exp.show_in_notebook(text=False)

Fasttext

In [None]:
explainer = LimeTextExplainer(class_names=class_names)
exp = explainer.explain_instance(TN, get_prob)
exp.show_in_notebook(text=False)

FN = @anonymized_account Tej szmaty się nie komentuje

Bayes

In [None]:
c = make_pipeline(tf_vectorizer, DenseTransformer(), naive_bayes_classifier1)
explainer = LimeTextExplainer(class_names=class_names)
exp = explainer.explain_instance(FN, c.predict_proba)
exp.show_in_notebook(text=False)

Fasttext

In [None]:
explainer = LimeTextExplainer(class_names=class_names)
exp = explainer.explain_instance(FN, get_prob)
exp.show_in_notebook(text=False)

# Task 5

Answer the following questions:

* Which of the classifiers works the best for the task 1 and the task 2.

Analyzing F1 score the best classifier for task one was the transformer and for task 2 was fasttext.


* Did you achieve results comparable with the results of PolEval Task?

For task 1 my Fasttext managed to produce slightly better outcomes than the PolEval results. The outcomes of Bayessian classifier were worse than those recieved by PolEval.

* Did you achieve results comparable with the Klej leaderboard?

No. My resuslts were higher

* Describe strengths and weaknesses of each of the compared algorithms.

The Bayesian classifier is the easiest to understand and implement, but at the same time it has the weakest performance. It is called naive because it considers each feature of an object as an independent entity, but despite these simplifications it does not do so badly in practice at all. FastText's results were satisfactory although the model had specific conditions to be met in order to work properly. The transformer classifier took by far the most time to train (despite the use of Google Colaboratory with GPU) and several times led to the death of the kernel and loss of other results. It was also the most difficult one to implement.


* Do you think comparison of raw performance values on a single task is enough to assess the value of a given algorithm/model?

I think not, but a specialist could extract enough information from it. The matter is certainly made more difficult by the fact that different metrics are suggested for different tasks, and the ranking varies greatly depending on the selected parameter.


* Did LIME show that the models use valuable features/words when performing their decision?

Yes, it did show that. We can see that for both classifiers LIME suggested words that greatly impacted the narrative.