# Assignment 4: Subjectivity Mining
Matthias, Teo and Noa

**Instructions:**

To run this notebook, make sure that the data files (`olid-train-small.csv`, `olid-test.csv`, `hasoc-train.csv` and `hatebase_dict_vua_format.csv`) are stored in a folder called **data_SM** inside 'My Drive' on Google Drive. The data can be downloaded [here](https://canvas.vu.nl/courses/63973/files/5284148?wrap=1).
 


In [None]:
# install necessary libraries
!pip install simpletransformers 
!pip install pytorch-pretrained-bert pytorch-nlp
!pip install emoji==0.6.0

In [2]:
# load libraries
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import pandas as pd
from google.colab import drive
import sklearn
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, classification_report
import numpy as np
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import LinearSVC
from scipy.special import softmax
from sklearn.linear_model import LogisticRegression
import spacy
import emoji
import regex
from tqdm import tqdm
from sklearn import svm

# Seed NumPy's global random state so that code drawing from it is repeatable on a fresh run.
# NOTE(review): this presumably does not seed PyTorch / simpletransformers, so fine-tuning itself
# may still vary between runs -- confirm whether a separate seed is needed.
np.random.seed(42)

In [None]:
# Mount Google Drive under /content/gdrive. A window opens asking for access; access must be granted.
drive.mount("/content/gdrive")

In [4]:
# Read the OLID train/test sets and the HASOC train set from the data_SM folder on the mounted drive
# (make sure the folder is stored in a non-shared folder).
traindata_olid, testdata_olid, traindata_hasoc = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/gdrive/MyDrive/data_SM/olid-train-small.csv',
        '/content/gdrive/MyDrive/data_SM/olid-test.csv',
        '/content/gdrive/MyDrive/data_SM/hasoc-train.csv',
    )
)

In [None]:
# training models

def _load_or_train(traindata, dataset_name, args, file_prefix, model_type, model_name):
  """
  shared helper for the train_* functions: loads the pickled trained model if it exists,
  otherwise trains the model and pickles it in the data_SM folder

  :param traindata: pandas dataframe traindata
  :param dataset_name: name of the dataset, used in the name of the pickle file
  :param args: options passed on to the simpletransformers ClassificationModel
  :param file_prefix: first part of the pickle filename (e.g. 'bert_model')
  :param model_type: first positional argument of ClassificationModel (e.g. 'bert')
  :param model_name: second positional argument of ClassificationModel (e.g. 'bert-base-cased')

  :returns: the loaded or newly trained model
  """
  model_path = f'/content/gdrive/MyDrive/data_SM/{file_prefix}_{dataset_name}.pkl'

  try:
    # try to find the saved trained pickled model
    # NOTE(review): pickle.load can execute arbitrary code; only load pickles you created yourself
    with open(model_path, 'rb') as infile:
      return pickle.load(infile)
  except FileNotFoundError:
    # if saved model not found: train below
    pass

  model = ClassificationModel(model_type, model_name, args=args, use_cuda=True)
  model.train_model(traindata)

  # save model as pickle file
  with open(model_path, 'wb') as outfile:
    pickle.dump(model, outfile)

  return model


# bert base cased
def train_bert(traindata, dataset_name, args):
  """
  function that loads the saved trained bert-base-cased model, if file not found:
  it starts training and saves the file in the data_SM folder
  
  :param traindata: pandas dataframe traindata
  :param dataset_name: name of the dataset for which the model should be saved
  :param args: options to load bert model in the same output directory

  :returns: bert model 
  """
  return _load_or_train(traindata, dataset_name, args, 'bert_model', 'bert', 'bert-base-cased')


# bertweet
def train_bertweet(traindata, dataset_name, args):
  """
  function that loads the saved trained bertweet-base model. 
  if file not found: it starts training and saves the file in the data_SM folder
  
  :param traindata: pandas dataframe traindata
  :param dataset_name: name of the dataset for which the model should be saved
  :param args: options to load bertweet model in the same output directory

  :returns: bertweet model 
  """
  return _load_or_train(traindata, dataset_name, args, 'bertweet_model', 'bertweet', 'vinai/bertweet-base')


# hateBERT
def train_hatebert(traindata, dataset_name, args):
  """
  function that loads the saved trained hateBERT model. 
  if file not found: it starts training and saves the file in the data_SM folder
  
  :param traindata: pandas dataframe traindata
  :param dataset_name: name of the dataset for which the model should be saved
  :param args: options to load hatebert model in the same output directory

  :returns: hatebert model 
  """
  return _load_or_train(traindata, dataset_name, args, 'hatebert_model', 'bert', 'GroNLP/hateBERT')


In [15]:
# #### PART 2.1: train (or load from the pickle cache) the base models

# Options shared by every simpletransformers model in this notebook; k_fold reads this global too.
# NOTE(review): the early-stopping options presumably only take effect when evaluation during
# training is switched on and an evaluation set is passed to train_model -- confirm in the
# simpletransformers documentation.
train_args = dict(
    reprocess_input_data=True,
    overwrite_output_dir=True,
    use_early_stopping=True,
    early_stopping_delta=0.01,
    early_stopping_metric="mcc",
    early_stopping_metric_minimize=False,
    early_stopping_patience=5,
    evaluate_during_training_steps=1000,
)

# models fine-tuned on OLID
bert_model_olid = train_bert(traindata=traindata_olid, dataset_name='olid', args=train_args)
bertweet_model_olid = train_bertweet(traindata=traindata_olid, dataset_name='olid', args=train_args)
hatebert_model_olid = train_hatebert(traindata=traindata_olid, dataset_name='olid', args=train_args)

# models fine-tuned on HASOC
bert_model_hasoc = train_bert(traindata=traindata_hasoc, dataset_name='hasoc', args=train_args)
bertweet_model_hasoc = train_bertweet(traindata=traindata_hasoc, dataset_name='hasoc', args=train_args)
hatebert_model_hasoc = train_hatebert(traindata=traindata_hasoc, dataset_name='hasoc', args=train_args)


In [5]:
def conf_matrix(test_y, sys_y, title, labels):
    """
    Function to plot a confusion matrix, save it as a pdf and return the classification report

    The plot is saved in the current working directory as '<title>.pdf', where the title is
    lowercased and spaces are replaced by underscores.

    :param test_y: actual labels of testdata in a list
    :param sys_y: predicted labels in a list
    :param title: main title of the plot in string
    :param labels: list of label names in corresponding order to be put on the axis
    :returns: classification report in string
    """
    report = classification_report(test_y, sys_y)
    conf = confusion_matrix(test_y, sys_y)

    # set the (global) seaborn theme before drawing, so the first figure looks like all later ones
    sns.set(font_scale=3)

    # draw on a dedicated figure, so repeated calls never overlay on whatever axes were current
    fig, ax = plt.subplots()
    sns.heatmap(conf, annot=True, cmap='Blues', fmt='g', annot_kws={"size": 16}, cbar=False, ax=ax)
    ax.set_title(title, fontsize=20)
    ax.xaxis.set_ticklabels(labels, fontsize=14)
    ax.yaxis.set_ticklabels(labels, fontsize=14)
    ax.set_ylabel('True label', fontsize=18)
    ax.set_xlabel('Predicted label', fontsize=18)

    plt_name = title.replace(' ', '_').lower()
    fig.savefig(f"{plt_name}.pdf", dpi=40, bbox_inches='tight')

    return report

In [6]:
# evaluation of one trained model on the testdata

def evaluate(model, testdata, title):
  """
  Function that runs a trained model on the testdata, plots/saves the confusion matrix
  and returns the classification report together with the predictions

  :param model: the trained model
  :param testdata: testdata in pandas dataframe (its 'labels' column holds the actual labels)
  :param title: title of model (used as plot title and for saving the plot)
  :returns: classification report, binary predictions and the model outputs per instance
            (called probabilities in this notebook; presumably raw scores, since k_fold applies
            a softmax to the outputs of predict -- confirm)
  """

  # eval_model returns (result, model_outputs, wrong_predictions); only the outputs are used
  _, model_outputs, _ = model.eval_model(testdata, acc=sklearn.metrics.precision_recall_fscore_support)

  # binary prediction: 0 when the first output is the highest one, otherwise 1
  y_binary_preds = []
  for output in model_outputs:
    y_binary_preds.append(0 if output[0] == max(output) else 1)

  # confusion matrix & classification report
  report = conf_matrix(test_y=testdata.labels, sys_y=y_binary_preds, title=title, labels=['NON', 'OFF'])

  return report, y_binary_preds, model_outputs

In [7]:
## HARD MAJORITY VOTE

def hard_majority_vote(predictions):
  """
  Function to perform the hard majority vote method based on binary predictions 

  Per row the label that is predicted by most models is chosen. In case of a tie (only
  possible with an even number of models) the label that occurs first in the row wins.

  :param predictions: pandas dataframe with one column of predicted binary values per model
  :returns: list of votings (one label per row)
  """

  votings = []
  for _, row in predictions.iterrows():
    # most_common(1) gives the (label, count) pair with the highest count;
    # taking the first key of the Counter would only give the label that was seen first
    vote = Counter(row.tolist()).most_common(1)[0][0]
    votings.append(vote)

  return votings

In [8]:
## SOFT MAJORITY VOTE

def soft_majority_vote(predictions):
  """
  Function to perform the soft majority vote method: per instance the scores of all models 
  are summed per class and the class with the highest sum is chosen (a tie results in 1)

  :param predictions: list with one entry per model (the callers pass bert, bertweet, hatebert); 
                      each entry holds per instance the two scores [non-offensive, offensive]
                      as numpy values
  :returns: list of votings (0 = non-offensive, 1 = offensive) and a dictionary that contains 
            per class, for every instance, the scores of the models rounded to two decimals 
            and joined by ';'
  """

  votings = []
  probs = defaultdict(list)
  for row in range(len(predictions[0])):

    # non_offensive scores of every model
    non_scores = [model_scores[row][0] for model_scores in predictions]
    sum_non = sum(non_scores)
    probs['probs_non (bert;bertweet;hatebert)'].append(';'.join(f"{score.round(2)}" for score in non_scores))

    # offensive scores of every model
    off_scores = [model_scores[row][1] for model_scores in predictions]
    sum_off = sum(off_scores)
    probs['probs_off (bert;bertweet;hatebert)'].append(';'.join(f"{score.round(2)}" for score in off_scores))

    if sum_non > sum_off:
      votings.append(0)
    else:
      votings.append(1)
    
  return votings, probs

In [9]:
def add_features(df, args):
  """
  Function that adds features to a copy of the given dataframe. Features are: number of characters, 
  relative frequency of uppercase characters, number of tokens, number of special symbols, 
  number of emojis and number of tokens that occur in the hate lexicon

  :param df: pandas dataframe with column 'text' to determine the additional features from
  :param args: dictionary of booleans that tells which features to add; it must contain the keys 
               'num_chars', 'num_uppercase', 'num_symbols', 'num_tokens', 'num_lex' and 'num_emojis'
  :returns: pandas dataframe with extra feature columns and the list of added column names
  """

  # work on a copy, so the dataframe of the caller is not changed
  df = df.copy()

  characters, freq_upper, tokens, symbols, emojis, hate_lexs = [], [], [], [], [], []

  # the spaCy pipeline is only needed for the token based features
  need_tokens = bool(args['num_tokens'] or args['num_lex'])
  nlp = spacy.load("en_core_web_sm") if need_tokens else None

  # lexicon entries in a set for fast lookup; stays empty when the lexicon feature is off,
  # so counting tokens without the lexicon does not fail on an undefined name
  lexicons = set()
  if args['num_lex']:
    hate_lexicons = pd.read_csv('/content/gdrive/MyDrive/data_SM/hatebase_dict_vua_format.csv', sep=';')
    lexicons = set(hate_lexicons['Entry'])

  print("Adding additional features:")
  for text in tqdm(df.text):
    text = str(text)

    # number of characters
    if args['num_chars']:
      characters.append(len(text))

    # relative frequency of uppercase characters (0 for an empty text instead of dividing by zero)
    if args['num_uppercase']:
      num_upper = sum(1 for c in text if c.isupper())
      freq_upper.append(num_upper / len(text) if text else 0.0)

    # number of special symbols: every character that is neither a letter nor a digit
    if args['num_symbols']:
      symbols.append(sum(1 for char in text if not (char.isalpha() or char.isdigit())))

    # number of tokens and/or number of tokens found in the hate lexicon
    if need_tokens:
      doc = nlp(text)
      len_tokens = 0
      len_hate_lex = 0
      for token in doc:
        len_tokens += 1
        if token.text in lexicons:
          len_hate_lex += 1
      tokens.append(len_tokens)
      hate_lexs.append(len_hate_lex)

    # number of emojis: the text is split with the regex \X and every piece that contains
    # a character from emoji.UNICODE_EMOJI is counted once
    if args['num_emojis']:
      num_emojis = 0
      for grapheme in regex.findall(r'\X', text):
        if any(char in emoji.UNICODE_EMOJI for char in grapheme):
          num_emojis += 1
      emojis.append(num_emojis)

  # add the requested features as columns, always in this fixed order
  requested = [('num_chars', 'num_chars', characters),
               ('num_uppercase', 'freq_uppercase', freq_upper),
               ('num_tokens', 'num_tokens', tokens),
               ('num_symbols', 'num_symbols', symbols),
               ('num_emojis', 'num_emojis', emojis),
               ('num_lex', 'num_hate_lexicons', hate_lexs)]
  features_cols = []
  for arg_key, col_name, values in requested:
    if args[arg_key]:
      df[col_name] = values
      features_cols.append(col_name)

  return df, features_cols


In [12]:
# K-FOLD + STACKED GENERALIZATION

def k_fold(trainset, model_code, model_type, filename):
  """
  Function to perform a five fold cross validation, in which every instance of the trainset 
  is predicted once by a model that was trained on the other four folds. The output is cached: 
  if the excel file already exists in the data_SM folder it is loaded instead of recomputed.

  Uses the global train_args as options for the models.

  :param trainset: dataset that contains training instances (columns 'id', 'text' and 'labels')
  :param model_code: first argument for the simple transformers ClassificationModel (e.g. 'bert')
  :param model_type: second argument for the simple transformers ClassificationModel (e.g. 'bert-base-cased')
  :param filename: name (without extension) of the excel file in which the output is cached
  :returns: pandas dataframe with the columns id, label, probabilities (softmax score of class 1) 
            and predicted
  """
  output_path = f'/content/gdrive/MyDrive/data_SM/{filename}.xlsx'

  try:
    # try to find the saved output
    output = pd.read_excel(output_path)

  except FileNotFoundError:
    # no random_state is given: per the scikit-learn docs the shuffle then uses numpy's global random state
    rskf = StratifiedKFold(n_splits=5, shuffle=True)
    ids, gold, probabilities, pred = [], [], [], []
    for train_index, test_index in rskf.split(trainset['text'], trainset['labels']):
      train_df = trainset.iloc[train_index]
      test_df = trainset.iloc[test_index]
      
      model = ClassificationModel(model_code, model_type, args=train_args, use_cuda=True) 
      model.train_model(train_df)
      preds, probs = model.predict(test_df.text.to_list())

      # the ids are taken directly from the held-out fold (no list of all ids rebuilt per instance)
      ids.extend(test_df['id'])
      gold.extend(test_df['labels'])
      pred.extend(preds)
      probabilities.extend(softmax(probs, axis=1)[:,1])

    output = pd.DataFrame({'id': ids, 'label': gold, 'probabilities': probabilities, 'predicted': pred})
    output.to_excel(output_path, index=False)
  return output


def stacking(traindata, testdata, title, feature_args):
  """
  Function to perform the stacked generalization method: a logistic regression meta learner is 
  trained on the kfold predictions of the three base models plus the additional features, and 
  is then applied to the base model predictions and features of the testdata

  :param traindata: the traindata that consists of the columns id, labels and text
  :param testdata: the testdata that consists of the columns id, labels, text, pred_bert, 
                   pred_bertweet and pred_hatebert
  :param title: title of the confusion matrix; also part of the filenames of the cached kfold output
  :param feature_args: dictionary that tells which additional features to include (see add_features)
  :returns: dataframe with the additional features of the testdata, dataframe with the kfold 
            predictions of the traindata, predictions of the meta learner and classification report
  """

  pred_cols = ['pred_bert', 'pred_bertweet', 'pred_hatebert']

  # kfold output per base model, each ordered by id
  base_models = [('bert', 'bert-base-cased', f'kfold_bert_{title}'),
                 ('bertweet', 'vinai/bertweet-base', f'kfold_bertweet_{title}'),
                 ('bert', 'GroNLP/hateBERT', f'kfold_berthate_{title}')]
  kfold_outputs = []
  for model_code, model_type, filename in base_models:
    fold_output = k_fold(traindata, model_code, model_type, filename)
    kfold_outputs.append(fold_output.sort_values(by=['id']).reset_index(drop=True))

  # one column with kfold predictions per base model
  output_kfold = pd.DataFrame(columns=pred_cols)
  for pred_col, fold_output in zip(pred_cols, kfold_outputs):
    output_kfold[pred_col] = fold_output['predicted']

  # order train and test data by id as well and add the additional features
  # NOTE(review): the kfold predictions are matched to the traindata by row position after
  # sorting on id; this presumably relies on the ids being unique -- confirm for the datasets used
  traindata = traindata.sort_values(by=['id']).reset_index(drop=True)
  testdata = testdata.sort_values(by=['id']).reset_index(drop=True)
  traindata_features, feature_cols = add_features(traindata, feature_args)
  new_traindata = pd.concat([traindata_features, output_kfold], axis=1)
  testdata_features, feature_cols = add_features(testdata, feature_args)

  cols = pred_cols + feature_cols

  # input of the meta learner: base model predictions + additional features
  y_train = kfold_outputs[0]['label']
  X_train = new_traindata.drop(columns=['labels', 'id', 'text'])[cols]
  y_test = testdata_features['labels']
  X_test = testdata_features.drop(columns=['labels', 'id', 'text'])[cols]

  # train the logistic regression meta learner and predict the testdata
  clf_lr = LogisticRegression(random_state=0, max_iter=10000)
  clf_lr.fit(X_train, y_train)
  preds_lr = clf_lr.predict(X_test)

  # confusion matrix & classification report
  report = conf_matrix(test_y=y_test, sys_y=preds_lr, title=title, labels=['NON', 'OFF'])

  return testdata_features[feature_cols], output_kfold, preds_lr, report


In [None]:
# BERTweet trained on OLID, evaluated on the OLID testdata
bertweet_report_olid, bertweet_ybinary_olid, bertweet_yprobs_olid = evaluate(
    model=bertweet_model_olid,
    testdata=testdata_olid,
    title='BERTweet OLID matrix',
)
print(bertweet_report_olid)

In [None]:
# BERT trained on OLID, evaluated on the OLID testdata
bert_report_olid, bert_ybinary_olid, bert_yprobs_olid = evaluate(
    model=bert_model_olid,
    testdata=testdata_olid,
    title='BERT OLID matrix',
)
print(bert_report_olid)

In [None]:
# HateBERT trained on OLID, evaluated on the OLID testdata
hatebert_report_olid, hatebert_ybinary_olid, hatebert_yprobs_olid = evaluate(
    model=hatebert_model_olid,
    testdata=testdata_olid,
    title='HateBERT OLID matrix',
)
print(hatebert_report_olid)

In [None]:
## PREDICTIONS OLID 

# work on a copy: without it preds_olid is just another name for testdata_olid, so the prediction
# columns below would also be added to the testdata that is evaluated again later in the notebook
preds_olid = testdata_olid.copy()
preds_olid['pred_bert'] = bert_ybinary_olid
preds_olid['pred_bertweet'] = bertweet_ybinary_olid
preds_olid['pred_hatebert'] = hatebert_ybinary_olid

# hard majority voting
preds_olid['hard_majority_vote'] = hard_majority_vote(preds_olid[['pred_bert', 'pred_bertweet', 'pred_hatebert']])
hard_report = conf_matrix(test_y=preds_olid['labels'], sys_y=preds_olid['hard_majority_vote'], title='Hard majority vote OLID', labels=['NON', 'OFF'])
print(hard_report)

In [None]:
# soft majority voting over the outputs of the three OLID models
votings, probs = soft_majority_vote(
    [bert_yprobs_olid, bertweet_yprobs_olid, hatebert_yprobs_olid]
)

# keep the per-model scores next to the predictions, followed by the resulting vote
preds_olid = pd.concat([preds_olid, pd.DataFrame(probs)], axis=1)
preds_olid['soft_majority_vote'] = votings

soft_report = conf_matrix(
    test_y=preds_olid['labels'],
    sys_y=preds_olid['soft_majority_vote'],
    title='Soft majority vote OLID',
    labels=['NON', 'OFF'],
)
print(soft_report)

In [None]:
# stacked generalization: meta learner trained on OLID, applied to the OLID testdata

testdata_features, kfold_preds, preds_meta, report = stacking(
    traindata_olid,
    preds_olid[['id', 'labels','text', 'pred_bert', 'pred_bertweet', 'pred_hatebert']],
    title='Stacking OLID',
    feature_args={
        'num_chars': True,
        'num_tokens': True,
        'num_emojis': True,
        'num_symbols': True,
        'num_uppercase': True,
        'num_lex': True,
    },
)
print(report)

# stacking orders the testdata by id, so order the predictions the same way before adding columns
preds_olid = preds_olid.sort_values(by=['id']).reset_index(drop=True)
# NOTE(review): kfold_preds holds the kfold predictions for the *train* instances; assigning them
# here matches them to test rows by index only -- confirm these columns are meant to be stored here
preds_olid['kfold_pred_bert'] = kfold_preds['pred_bert']
preds_olid['kfold_pred_bertweet'] = kfold_preds['pred_bertweet']
preds_olid['kfold_pred_hatebert'] = kfold_preds['pred_hatebert']
preds_olid['stacking_preds'] = preds_meta
preds_olid = pd.concat([preds_olid, testdata_features], axis=1)

# save all results
preds_olid.to_excel(f'/content/gdrive/MyDrive/data_SM/all_predictions_olid.xlsx',sheet_name="olid",index=False)

In [None]:
# BERTweet trained on HASOC, evaluated on the OLID testdata
bertweet_report_hasoc, bertweet_ybinary_hasoc, bertweet_yprobs_hasoc = evaluate(
    model=bertweet_model_hasoc,
    testdata=testdata_olid,
    title='BERTweet HASOC matrix',
)
print(bertweet_report_hasoc)

In [None]:
# BERT trained on HASOC, evaluated on the OLID testdata
bert_report_hasoc, bert_ybinary_hasoc, bert_yprobs_hasoc = evaluate(
    model=bert_model_hasoc,
    testdata=testdata_olid,
    title='BERT HASOC matrix',
)
print(bert_report_hasoc)

In [None]:
# HateBERT trained on HASOC, evaluated on the OLID testdata
hatebert_report_hasoc, hatebert_ybinary_hasoc, hatebert_yprobs_hasoc = evaluate(
    model=hatebert_model_hasoc,
    testdata=testdata_olid,
    title='HateBERT HASOC matrix',
)
print(hatebert_report_hasoc)

In [None]:
## PREDICTIONS HASOC 

# work on a copy: without it preds_hasoc is just another name for testdata_olid, so the
# prediction columns below would be written into the shared testdata itself
preds_hasoc = testdata_olid.copy()
preds_hasoc['pred_bert'] = bert_ybinary_hasoc
preds_hasoc['pred_bertweet'] = bertweet_ybinary_hasoc
preds_hasoc['pred_hatebert'] = hatebert_ybinary_hasoc

# hard majority voting
preds_hasoc['hard_majority_vote'] = hard_majority_vote(preds_hasoc[['pred_bert', 'pred_bertweet', 'pred_hatebert']])
hard_report = conf_matrix(test_y=preds_hasoc['labels'], sys_y=preds_hasoc['hard_majority_vote'], title='Hard majority vote HASOC', labels=['NON', 'OFF'])
print(hard_report)

In [None]:
# soft majority voting over the outputs of the three HASOC models
votings, probs = soft_majority_vote(
    [bert_yprobs_hasoc, bertweet_yprobs_hasoc, hatebert_yprobs_hasoc]
)

# keep the per-model scores next to the predictions, followed by the resulting vote
preds_hasoc = pd.concat([preds_hasoc, pd.DataFrame(probs)], axis=1)
preds_hasoc['soft_majority_vote'] = votings

soft_report = conf_matrix(
    test_y=preds_hasoc['labels'],
    sys_y=preds_hasoc['soft_majority_vote'],
    title='Soft majority vote HASOC',
    labels=['NON', 'OFF'],
)
print(soft_report)

In [None]:
# stacked generalization: meta learner trained on HASOC, applied to the OLID testdata
# The predictions built in the cells above (preds_hasoc) and the traindata loaded at the top of the
# notebook are used directly, like in the OLID stacking cell. Reading 'all_predictions.xlsx' here
# made the cell depend on a file that this notebook never writes.
testdata_features, kfold_preds, preds_meta, report  = stacking(traindata_hasoc, 
                                                               preds_hasoc[['id', 'labels','text', 'pred_bert', 'pred_bertweet', 'pred_hatebert']], 
                                                               title = 'Stacking HASOC',
                                                               feature_args= {'num_chars': True,
                                                                              'num_tokens': True,
                                                                              'num_emojis': True,
                                                                              'num_symbols': True,
                                                                              'num_uppercase': True,
                                                                              'num_lex': True})
print(report)

# stacking orders the testdata by id, so order the predictions the same way before adding columns
preds_hasoc = preds_hasoc.sort_values(by=['id']).reset_index(drop=True)
# NOTE(review): kfold_preds holds the kfold predictions for the *train* instances; assigning them
# here matches them to test rows by index only -- confirm these columns are meant to be stored here
preds_hasoc['kfold_pred_bert'] = kfold_preds['pred_bert']
preds_hasoc['kfold_pred_bertweet'] = kfold_preds['pred_bertweet']
preds_hasoc['kfold_pred_hatebert'] = kfold_preds['pred_hatebert']
preds_hasoc['stacking_preds'] = preds_meta
preds_hasoc = pd.concat([preds_hasoc, testdata_features], axis=1)

# save all results
preds_hasoc.to_excel(f'/content/gdrive/MyDrive/data_SM/all_predictions_hasoc.xlsx',sheet_name="hasoc",index=False)