# Code Objective:

*   mBERT Model for Fine Grained Evaluation
*   Reformulating the Multilabel Classification Problem as Independent Binary Classifications (One-vs-Rest)

# Code Results:
*   Accuracy - mBERT Model for Fake vs Non-Fake = 81.38 %
*   Accuracy - mBERT Model for Hate vs Non-Hate = 77.12 %
*   Accuracy - mBERT Model for Defamation vs Non-Defamation = 79.52 %
*   Accuracy - mBERT Model for Offensive vs Non-Offensive = 69.68 %





# Importing Libraries

In [None]:
!pip install transformers

In [None]:
import pandas as pd
import numpy as np
import json, re
from tqdm import tqdm_notebook
from uuid import uuid4

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from transformers import glue_compute_metrics
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig, BertForSequenceClassification
from transformers import XLMRobertaTokenizer, XLMRobertaModel, XLMRobertaConfig, XLMRobertaForSequenceClassification

# Report whether CUDA is usable and which PyTorch build is active.
print(f"GPU Torch Available = {torch.cuda.is_available()}")
print(f"Torch Version = {torch.__version__}")

# Data Loading

In [None]:
# Mount Google Drive at '/content/drive' so that files stored on Drive
# can be addressed with ordinary paths from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Loading Dataset for Finegrained Multilabel Evaluation which has been
# transformed into multiple independent binary classification tasks
# (One vs Rest Approach). Each task has its own train/validation file pair.

dataset = 'Fake'             # Choosing Dataset to Load

# Folder holding the '<task>_train.xlsx' / '<task>_validate.xlsx' files.
DATASET_DIR = '/content/drive/My Drive/CONSTRAINT 2021 Projects (AAAI)/Hindi_Task/Dataset/'
SUPPORTED_DATASETS = ('Fake', 'Hate', 'Offensive', 'Defamation')

if dataset not in SUPPORTED_DATASETS:
  # Fail right here: merely printing a hint would leave train_df/test_df
  # undefined and surface later as an unrelated NameError.
  raise ValueError(
      'Choose Correct Dataset: {!r} is not one of {}'.format(dataset, SUPPORTED_DATASETS))

# File names use the lower-cased task name as their prefix.
file_prefix = DATASET_DIR + dataset.lower()
train_df = pd.read_excel(file_prefix + '_train.xlsx')      # Training Data
test_df = pd.read_excel(file_prefix + '_validate.xlsx')    # Validation Data

In [None]:
# Data Preparation into Pandas Dataframe for Model Input

# Text label -> binary target, shared by all four one-vs-rest tasks.
_LABEL_TO_ID = {
    'non_offensive': 0, 'offensive': 1,
    'non_fake': 0, 'fake': 1,
    'non_hate': 0, 'hate': 1,
    'non_defamation': 0, 'defamation': 1,
}


def get_data(a):
  """Convert a raw task dataframe into the layout consumed by the model.

  Args:
    a: pandas DataFrame with the columns 'Unique ID', 'Post' and
      'Labels Set'; every 'Labels Set' value must be a key of _LABEL_TO_ID.

  Returns:
    A new DataFrame with the columns 'UID', 'sentence' and 'label', where
    'label' is 0 for the 'non_*' class and 1 for the positive class.

  Raises:
    ValueError: if 'Labels Set' contains a value that has no known mapping.
  """
  unique_ids = list(a['Unique ID'])
  sentences = list(a['Post'])
  text_labels = list(a['Labels Set'])

  # Skipping unknown labels would misalign labels and sentences, so report
  # them explicitly instead of letting pandas fail on a length mismatch.
  unknown = sorted({str(lab) for lab in text_labels if lab not in _LABEL_TO_ID})
  if unknown:
    raise ValueError('Unrecognised label(s) in "Labels Set": {}'.format(unknown))

  labels = [_LABEL_TO_ID[lab] for lab in text_labels]
  return pd.DataFrame({'UID': unique_ids, 'sentence': sentences, 'label': labels},
                      columns=['UID', 'sentence', 'label'])

# Convert both splits to the model-input layout, then preview three rows of each.
train_data, test_data = (get_data(split) for split in (train_df, test_df))

for preview in (train_data, test_data):
  print(preview[0:3])

# Model Parameters

In [None]:
# Choose and Load Model
model_name = 'Bert'

if model_name == 'Bert':
  # Bert Parameters
  checkpoint = 'bert-base-multilingual-cased'
  config = BertConfig.from_pretrained(checkpoint, num_labels=2)
  tokenizer = BertTokenizer.from_pretrained(checkpoint)
  # from_pretrained loads the pretrained encoder weights; building the class
  # straight from the config would start from random initialisation, i.e.
  # train from scratch rather than fine-tune.
  model = BertForSequenceClassification.from_pretrained(checkpoint, config=config)
elif model_name == 'Roberta':
  # XLMRoberta Parameters
  checkpoint = 'xlm-roberta-base'
  config = XLMRobertaConfig.from_pretrained(checkpoint, num_labels=2)
  tokenizer = XLMRobertaTokenizer.from_pretrained(checkpoint)
  model = XLMRobertaForSequenceClassification.from_pretrained(checkpoint, config=config)
else:
  # Stop here: continuing would leave config/tokenizer/model undefined.
  raise ValueError("Choose correct Model: {!r} is not 'Bert' or 'Roberta'".format(model_name))

# Data Preparation for Model Input

In [None]:
class CustomDataset(Dataset):
    """Dataset that tokenizes one sentence per item for sequence classification.

    Args:
        dataframe: pandas DataFrame with a ``sentence`` column and a ``label``
            column (integer class id).
        tokenizer: object exposing ``encode_plus`` that returns ``input_ids``,
            ``attention_mask`` and ``token_type_ids``.
        max_len: length every encoded sequence is truncated/padded to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.sentence = dataframe.sentence
        self.targets = self.data.label
        self.max_len = max_len

    def __len__(self):
        """Return the number of sentences in the dataset."""
        return len(self.sentence)

    def __getitem__(self, index):
        """Encode the ``index``-th row and return it as a dict of long tensors.

        The keys are 'input_ids', 'attention_mask', 'token_type_ids' and 'labels'.
        """
        # .iloc is positional, so a frame whose index is not 0..n-1 (e.g. after
        # filtering or shuffling) is still addressed correctly.
        sentence1 = str(self.sentence.iloc[index])

        # padding='max_length' is the supported spelling of the deprecated
        # pad_to_max_length=True flag.
        inputs = self.tokenizer.encode_plus(sentence1,
                                            truncation=True,
                                            add_special_tokens=True,
                                            max_length=self.max_len,
                                            padding='max_length',
                                            return_token_type_ids=True)

        return {'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
                'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
                'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
                'labels': torch.tensor(self.targets.iloc[index], dtype=torch.long)
               }

In [None]:
# Wrap both splits as tokenizing datasets for the Trainer.
MAX_LEN = 128  # Max Sequence Length

# Training Set
training_set = CustomDataset(dataframe=train_data, tokenizer=tokenizer, max_len=MAX_LEN)
# Validation Set
testing_set = CustomDataset(dataframe=test_data, tokenizer=tokenizer, max_len=MAX_LEN)

# Training and Evaluation Phase

In [None]:
# Device Mapping Select (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model to whichever device was detected; an unconditional .cuda()
# raises on a CPU-only runtime.
model = model.to(device)

# Training Arguments
# NOTE(review): `evaluate_during_training` appears to be accepted only by older
# transformers releases -- confirm against the installed version.
training_args = TrainingArguments(output_dir="./models/model_name",
                                  overwrite_output_dir=True,
                                  do_train=True,
                                  do_eval=True,
                                  per_device_train_batch_size=28,
                                  per_device_eval_batch_size=28,
                                  num_train_epochs=20,
                                  logging_steps=100,
                                  logging_first_step=True,
                                  save_steps=0,
                                  evaluate_during_training=True)

# Metric for Performance Evaluation
def compute_metrics(p):
  """Score one evaluation pass.

  The highest-scoring column of each row in ``p.predictions`` is taken as the
  predicted class and compared with ``p.label_ids`` through the GLUE "mnli"
  metric (presumably accuracy -- confirm against the transformers version).
  """
  predicted_classes = np.argmax(p.predictions, axis=1)
  gold_labels = p.label_ids
  return glue_compute_metrics("mnli", predicted_classes, gold_labels)

# Trainer for training Model: ties together the model, the training
# arguments, both datasets and the metric callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=training_set,
    eval_dataset=testing_set,
    compute_metrics=compute_metrics,
)

In [None]:
# Training Model
# Runs the training loop on the trainer configured in the previous cell.
trainer.train()

In [None]:
# Evaluation of Model on Validation Data
# Left as the cell's last expression so the notebook displays the returned metrics.
trainer.evaluate(testing_set)

# Trained Model Save and Load for later use

In [None]:
# Model Save
# A fresh UUID in the file name keeps every saved checkpoint distinct.
# NOTE(review): the file name is hard-coded to "offensive" -- confirm it
# matches the dataset that was actually trained.
model_save_path = '/content/drive/My Drive/CONSTRAINT 2021 Projects (AAAI)/Hindi_Task/Weights/BERT_state_dict_offensive_'
checkpoint_file = f'{model_save_path}{uuid4()}.pth'
torch.save(model.state_dict(), checkpoint_file)

In [None]:
# Model Load
# NOTE(review): the file name refers to an XLMR checkpoint -- confirm it
# matches the architecture of the `model` it is loaded into.
model_path = '/content/drive/My Drive/CONSTRAINT 2021 Projects (AAAI)/Hindi_Task/Weights/XLMR_state_dict_offensive_de181722-b72b-4ea0-9713-27769728db16.pth'
saved_weights = torch.load(model_path, map_location=device)
model.load_state_dict(saved_weights)

# Prediction

In [None]:
'''
Load Model, predict on validation or test data and get labels for each dataset
For 4 different datasets (Fake, Hate, Defamation, Offensive) 
we get 4 output numpy array of labels. 
'''

# Prediction
def prepare_features(seq_1, max_seq_length = 128, zero_pad = False, include_CLS_token = True, include_SEP_token = True):
    """Turn one text into model-ready token ids using the global ``tokenizer``.

    Args:
        seq_1: text to encode.
        max_seq_length: upper bound on the sequence length; two positions are
            always reserved for the special tokens, whether or not they are added.
        zero_pad: when True, pad ids and mask up to ``max_seq_length``.
        include_CLS_token: prepend ``tokenizer.cls_token`` when True.
        include_SEP_token: append ``tokenizer.sep_token`` when True.

    Returns:
        A tuple ``(input_ids, input_mask)``: ``input_ids`` is a tensor of shape
        (1, sequence_length); ``input_mask`` is a plain list holding 1 for real
        tokens and 0 for padding.
    """
    ## Tokenize input and truncate, keeping room for the two special tokens
    tokens_a = tokenizer.tokenize(seq_1)
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0:(max_seq_length - 2)]

    ## Assemble tokens with the optional special tokens around them
    tokens = []
    if include_CLS_token:
        tokens.append(tokenizer.cls_token)
    tokens.extend(tokens_a)
    if include_SEP_token:
        tokens.append(tokenizer.sep_token)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    ## Input Mask
    input_mask = [1] * len(input_ids)

    ## Pad up to the maximum sequence length
    if zero_pad:
        # Use the tokenizer's own pad id (it is not 0 for every vocabulary);
        # fall back to 0 when the tokenizer does not define one.
        pad_id = getattr(tokenizer, 'pad_token_id', None)
        if pad_id is None:
            pad_id = 0
        missing = max(0, max_seq_length - len(input_ids))
        input_ids = input_ids + [pad_id] * missing
        input_mask = input_mask + [0] * missing
    return torch.tensor(input_ids).unsqueeze(0), input_mask


def predict(text):
  """Classify one text with the global ``model``.

  Args:
    text: raw sentence to classify.

  Returns:
    A tuple ``(label_name, label_id)``: ``('non_offensive', 0)`` when class 0
    scores highest, otherwise ``('offensive', 1)``.
    NOTE(review): the label names are hard-coded to the offensive task --
    confirm this is intended when another dataset is selected.
  """
  model.eval()
  input_feature, _ = prepare_features(text)
  # Follow the model's actual device rather than CUDA availability, so the
  # input and the weights can never end up on different devices.
  input_feature = input_feature.to(next(model.parameters()).device)
  # Inference only: skip autograd bookkeeping to save memory and time.
  with torch.no_grad():
    output = model(input_feature)[0]
  _, pred_label = torch.max(output, 1)
  prediction = pred_label[0].item()
  if prediction == 0:
    return 'non_offensive', 0
  return 'offensive', 1

# Predict a label for every sentence of the chosen split and store the ids.
data = test_data

pred = []       # predicted label names
pred_lab = []   # predicted label ids
for text in data['sentence']:
  pred_text, pred_label = predict(text)
  pred.append(pred_text)
  pred_lab.append(pred_label)

# np.float was only an alias of the builtin float and has been removed from
# NumPy; the builtin yields the same float64 array.
pred_lab = np.array(pred_lab, dtype=float)
np.save('Final_Offensive_validation_Pred_Label.npy', pred_lab)