# Assignment 3: Subjectivity Mining
Matthias, Teo and Noa

**Instructions:**

To run this notebook, make sure that the data files (olid-train-small.csv, olid-test.csv and hasoc-train.csv) are stored in a folder called **data_SM** inside 'My Drive' on Google Drive. The data can be downloaded [here](https://canvas.vu.nl/courses/63973/files/5284148?wrap=1).
 


In [1]:
# install necessary libraries
# simpletransformers provides the ClassificationModel that is imported in the next cell
!pip install simpletransformers 
# NOTE(review): pytorch-pretrained-bert and pytorch-nlp are not imported in any of the visible cells - confirm they are still needed
!pip install pytorch-pretrained-bert pytorch-nlp
# NOTE(review): emoji is the only package with a pinned version - presumably required by the BERTweet tokenizer, TODO confirm
!pip install emoji==0.6.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting simpletransformers
  Downloading simpletransformers-0.63.9-py3-none-any.whl (250 kB)
[K     |████████████████████████████████| 250 kB 19.7 MB/s 
Collecting wandb>=0.10.32
  Downloading wandb-0.13.4-py2.py3-none-any.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 54.1 MB/s 
Collecting tokenizers
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 54.2 MB/s 
[?25hCollecting streamlit
  Downloading streamlit-1.13.0-py2.py3-none-any.whl (9.2 MB)
[K     |████████████████████████████████| 9.2 MB 35.4 MB/s 
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 1.3 MB/s 
Collecting datasets
  Downloading datasets-2.6.1-py3-none-any.whl (441 kB)
[K     |████████████████████████████████| 441 kB 46.2 MB/s 
Collec

In [2]:
# load libraries
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import pandas as pd
from google.colab import drive
import sklearn
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, classification_report
import numpy as np
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

# seed NumPy's global random number generator so that NumPy-based randomness repeats across runs
# NOTE(review): this call only affects NumPy; confirm whether the model training itself is also expected to be reproducible
np.random.seed(42)

In [3]:
# mount Google Drive at /content/gdrive so the data files can be read from it:
# a window should open that asks for access, which you should grant
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
# read the OLID train/test sets and the HASOC train set from the data_SM folder
# on the mounted drive (the folder has to live in 'My Drive' itself, not in a shared folder)
DATA_DIR = '/content/gdrive/MyDrive/data_SM'

traindata_olid = pd.read_csv(f'{DATA_DIR}/olid-train-small.csv')
testdata_olid = pd.read_csv(f'{DATA_DIR}/olid-test.csv')
traindata_hasoc = pd.read_csv(f'{DATA_DIR}/hasoc-train.csv')

In [5]:
# training models 

# bert base cased
def train_bert(traindata, dataset_name, args):
  """
  function that loads the saved trained bert-base-cased model, if file not found:
  it starts training and saves the file in the data_SM folder

  :param traindata: pandas dataframe traindata (only used when no saved model is found)
  :param dataset_name: name of the dataset for which the model should be saved;
                       it becomes part of the pickle file name
  :param args: options passed to ClassificationModel when a new model is trained
               (ignored when a saved model is loaded)

  :returns: bert model
  """
  # single place that defines where the model for this dataset is cached
  model_path = f'/content/gdrive/MyDrive/data_SM/bert_model_{dataset_name}.pkl'

  try:
    # try to find the saved trained pickled model
    # NOTE: pickle.load can execute code stored in the file, so only load
    # pickles that were written by this notebook itself
    with open(model_path, 'rb') as infile:
      model_bert = pickle.load(infile)

  except FileNotFoundError:
    # if saved model not found: train
    model_bert = ClassificationModel('bert', 'bert-base-cased', args=args)
    model_bert.train_model(traindata)

    # save model as pickle file so the next run can skip training
    with open(model_path, 'wb') as outfile:
      pickle.dump(model_bert, outfile)

  return model_bert


# bertweet
def train_bertweet(traindata, dataset_name, args):
  """
  return a trained bertweet-base model for the given dataset.
  a pickled model in the data_SM folder is reused when it exists; otherwise a
  new model is trained on traindata and pickled to that folder before returning

  :param traindata: pandas dataframe traindata
  :param dataset_name: name of the dataset, used in the name of the pickle file
  :param args: options handed to ClassificationModel when training is needed

  :returns: bertweet model
  """
  pickle_path = f'/content/gdrive/MyDrive/data_SM/bertweet_model_{dataset_name}.pkl'

  # fast path: reuse the model that an earlier run stored
  try:
    with open(pickle_path, 'rb') as stored:
      return pickle.load(stored)
  except FileNotFoundError:
    # nothing stored yet: fall through to training
    pass

  trained = ClassificationModel('bertweet', 'vinai/bertweet-base', args=args)
  trained.train_model(traindata)

  # keep the trained model so later runs can load it
  with open(pickle_path, 'wb') as target:
    pickle.dump(trained, target)

  return trained


In [None]:
#### PART 2.1: In-domain experiments (olid)
# (the HASOC-trained models created at the end of this cell are evaluated on the OLID test data further below)

# training/loading models

# overwrite_output_dir: allow every training run to reuse the same output directory
# manual_seed: np.random.seed(42) in the import cell only seeds NumPy; this asks
#              simpletransformers to seed the training run itself so results are repeatable.
#              It only matters when a model is actually trained, not when a pickle is loaded.
train_args = {
    "overwrite_output_dir": True,
    "manual_seed": 42}

# models trained on the OLID training data
bert_model_olid = train_bert(traindata=traindata_olid, dataset_name='olid', args=train_args)
bertweet_model_olid = train_bertweet(traindata=traindata_olid, dataset_name='olid', args=train_args)

# models trained on the HASOC training data
bert_model_hasoc = train_bert(traindata=traindata_hasoc, dataset_name='hasoc', args=train_args)
bertweet_model_hasoc = train_bertweet(traindata=traindata_hasoc, dataset_name='hasoc', args=train_args)


In [7]:
def conf_matrix(test_y, sys_y, title, labels):
    """
    Function to generate a confusion matrix and returns the classification report

    The heatmap is drawn on its own new figure and saved as a pdf in the current
    working directory; the file name is the lower-cased title with spaces
    replaced by underscores.

    :param test_y: actual labels of testdata in a list
    :param sys_y: predicted labels in a list
    :param title: main title of the plot in string
    :param labels: list of label names in corresponding order to be put on the axis
    :returns: classification report in string
    """
    report = classification_report(test_y, sys_y)
    conf = confusion_matrix(test_y, sys_y)

    # use an explicit, fresh figure: drawing on the implicit current axes makes
    # repeated calls within one cell paint over the previous heatmap
    fig, ax = plt.subplots()
    sns.heatmap(conf, annot=True, cmap='Blues', fmt='g', annot_kws={"size": 16}, cbar=False, ax=ax)

    # every font size is set explicitly below, so no global sns.set(font_scale=...)
    # is needed; calling it after drawing only restyled the figures that came later
    ax.set_title(title, fontsize=20)
    ax.xaxis.set_ticklabels(labels, fontsize=14)
    ax.yaxis.set_ticklabels(labels, fontsize=14)
    ax.set_ylabel('True label', fontsize=18)
    ax.set_xlabel('Predicted label', fontsize=18)

    plt_name = title.replace(' ', '_').lower()
    fig.savefig(f"{plt_name}.pdf", dpi=40, bbox_inches='tight')

    return report

In [8]:
# evaluate model on the testdata

def evaluate(model, testdata, title):
  """
  function that evaluates a trained model on the testdata, plots the confusion
  matrix and returns the classification report

  :param model: trained model with an eval_model method
  :param testdata: pandas dataframe testdata; its labels column holds the actual labels
  :param title: title of the confusion matrix plot in string

  :returns: classification report in string
  """

  # run the evaluation once (the original ran it twice and threw the first result
  # away); only the raw model outputs are needed here
  _, model_outputs, _ = model.eval_model(testdata, acc=sklearn.metrics.precision_recall_fscore_support)

  # predictions: index of the highest output per example (the first index wins on a tie)
  y_pred = [int(np.argmax(output)) for output in model_outputs]

  # confusion matrix & classification report
  return conf_matrix(test_y=testdata.labels, sys_y=y_pred, title=title, labels=['NON', 'OFF'])

In [None]:
# bertweet olid: BERTweet model trained on the OLID training data, evaluated on the OLID test data
# evaluate() returns the classification report as a string, which is printed here
bertweet_olid = evaluate(model=bertweet_model_olid, testdata=testdata_olid, title='BERTweet OLID matrix')
print(bertweet_olid)

In [None]:
# bertweet hasoc: BERTweet model trained on the HASOC training data, evaluated on the OLID test data
# evaluate() returns the classification report as a string, which is printed here
bertweet_hasoc = evaluate(model=bertweet_model_hasoc, testdata=testdata_olid, title='BERTweet HASOC matrix')
print(bertweet_hasoc)

In [None]:
# bert olid: BERT model trained on the OLID training data, evaluated on the OLID test data
# evaluate() returns the classification report as a string, which is printed here
bert_olid = evaluate(model=bert_model_olid, testdata=testdata_olid, title='BERT OLID matrix')
print(bert_olid)

In [None]:
# bert hasoc: BERT model trained on the HASOC training data, evaluated on the OLID test data
# evaluate() returns the classification report as a string, which is printed here
bert_hasoc = evaluate(model=bert_model_hasoc, testdata=testdata_olid, title='BERT HASOC matrix')
print(bert_hasoc)