In [None]:
!pip install transformers[sentencepiece]
!pip install torchinfo
!pip install cryptography

In [None]:
# Name of the dataset variant; it is copied into DATA, which selects the download
# URL and prefixes the per-fold JSON file names written later in the notebook.
ed = 'MPQA2.0_cleaned'
# Number of cross-validation folds; the export loop iterates folds 1..k_fold.
k_fold = 5


In [None]:
import random
import time
from nltk import word_tokenize
import nltk
from nltk.corpus import stopwords
import re
# Download the NLTK resources named below into the local NLTK data directory.
# NOTE(review): no use of the stop-word list or the POS tagger is visible in this
# part of the notebook — confirm they are still required.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

In [None]:
import gc
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer, DataCollatorWithPadding, EarlyStoppingCallback, AutoConfig
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import json
from urllib.request import urlopen
import seaborn as sns
import matplotlib.pyplot as plt
import statistics
import os
from tqdm import tqdm
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from collections import defaultdict


%matplotlib inline
%config InlineBackend.figure_format='retina'

In [None]:
# Device selection: use CUDA when PyTorch reports it as available, else the CPU.

NUM_WORKERS = 0

if torch.cuda.is_available():
    # 'cuda' for torch; 0 is the integer form of the same choice.
    device_string, device_hf = 'cuda', 0
else:
    # 'cpu' for torch; -1 is the integer form of the same choice.
    device_string, device_hf = 'cpu', -1

device = torch.device(device_string)
print("Device:", device)

In [None]:
# Document-id lists. The three lists in this cell are concatenated into
# IGNORED_DOCS in the config cell.

# Document ids under the 'ula/' prefix (first of the two 'ula/' lists).
ULA_SUBSET_DOCS = [
  'ula/119CWL041', 'ula/RindnerBonnie', 'ula/HistoryGreek',
  'ula/Article247_3500', 'ula/NapierDianne', 'ula/sw2071-UTF16-ms98-a-trans',
  'ula/118CWL050', 'ula/114CUL059', 'ula/110CYL067', 'ula/PolkMaria',
  'ula/116CUL034', 'ula/115CVL037', 'ula/118CWL049', 'ula/Article247_66',
  'ula/110CYL068', 'ula/113CWL017', 'ula/112C-L015', 'ula/115CVL036',
  'ula/115CVL035', 'ula/Article247_328', 'ula/114CUL060', 'ula/112C-L012',
  'ula/118CWL048', 'ula/ReidSandra', 'ula/112C-L016', 'ula/HistoryJerusalem',
  'ula/110CYL070', 'ula/sw2014-UTF16-ms98-a-trans', 'ula/112C-L014',
  'ula/117CWL008', 'ula/sw2078-UTF16-ms98-a-trans', 'ula/110CYL071',
  'ula/114CUL057', 'ula/116CUL032', 'ula/110CYL069', 'ula/117CWL009',
  'ula/110CYL072', 'ula/chapter-10', 'ula/116CUL033', 'ula/ch5',
  'ula/sw2015-ms98-a-trans', 'ula/113CWL018', 'ula/110CYL200',
  'ula/Article247_327', 'ula/114CUL058', 'ula/112C-L013', 'ula/Article247_500',
  'ula/Article247_400'
]

# Second list of document ids under the 'ula/' prefix; it is one of the three
# lists concatenated into IGNORED_DOCS in the config cell.
ULA_LU_SUBSET_DOCS = [
  'ula/A1.E2-NEW', 'ula/wsj_1640.mrg-NEW', 'ula/AFGP-2002-600045-Trans',
  'ula/20000410_nyt-NEW', 'ula/20000415_apw_eng-NEW',
  'ula/AFGP-2002-602187-Trans', 'ula/20000815_AFP_ARB.0084.IBM-HA-NEW',
  'ula/CNN_AARONBROWN_ENG_20051101_215800.partial-NEW', 'ula/20000424_nyt-NEW',
  'ula/20000419_apw_eng-NEW', 'ula/20000416_xin_eng-NEW',
  'ula/enron-thread-159550', 'ula/wsj_2465', 'ula/AFGP-2002-600002-Trans',
  'ula/ENRON-pearson-email-25jul02', 'ula/im_401b_e73i32c22_031705-2',
  'ula/A1.E1-NEW', 'ula/CNN_ENG_20030614_173123.4-NEW-1',
  'ula/20000420_xin_eng-NEW', 'ula/IZ-060316-01-Trans-1',
  'ula/sw2025-ms98-a-trans.ascii-1-NEW', 'ula/SNO-525',
  'ula/AFGP-2002-600175-Trans', 'ula/602CZL285-1'
]

# Document ids under the 'xbank/' prefix; it is one of the three lists
# concatenated into IGNORED_DOCS in the config cell.
XBANK_DOCS = [
  'xbank/wsj_0904', 'xbank/wsj_0760', 'xbank/wsj_0713', 'xbank/wsj_0709',
  'xbank/wsj_0706', 'xbank/wsj_0662', 'xbank/wsj_0558', 'xbank/wsj_0555',
  'xbank/wsj_0551', 'xbank/wsj_0542', 'xbank/wsj_0541', 'xbank/wsj_0332',
  'xbank/wsj_0292', 'xbank/wsj_0189', 'xbank/wsj_0316', 'xbank/wsj_0175',
  'xbank/wsj_0321', 'xbank/wsj_0176', 'xbank/wsj_0173', 'xbank/wsj_0026',
  'xbank/wsj_0324', 'xbank/wsj_0187', 'xbank/wsj_0356', 'xbank/wsj_0325',
  'xbank/wsj_0340', 'xbank/wsj_0679', 'xbank/wsj_0695', 'xbank/wsj_0661',
  'xbank/wsj_0570', 'xbank/wsj_0557', 'xbank/wsj_0751', 'xbank/wsj_0805',
  'xbank/wsj_0762', 'xbank/wsj_0736', 'xbank/wsj_0806', 'xbank/wsj_1040',
  'xbank/wsj_1039', 'xbank/wsj_1042', 'xbank/wsj_0568', 'xbank/wsj_0778',
  'xbank/wsj_0160', 'xbank/wsj_0136', 'xbank/wsj_0135', 'xbank/wsj_0127',
  'xbank/wsj_0122', 'xbank/wsj_0032', 'xbank/wsj_0150', 'xbank/wsj_0165',
  'xbank/wsj_0157', 'xbank/wsj_0151', 'xbank/wsj_0685', 'xbank/wsj_0168',
  'xbank/wsj_0167', 'xbank/wsj_0161', 'xbank/wsj_0152', 'xbank/wsj_0073',
  'xbank/wsj_0068', 'xbank/wsj_0171', 'xbank/wsj_0144', 'xbank/wsj_0991',
  'xbank/wsj_0923', 'xbank/wsj_0907', 'xbank/wsj_0811', 'xbank/wsj_0667',
  'xbank/wsj_0534', 'xbank/wsj_0924', 'xbank/wsj_0815', 'xbank/wsj_1038',
  'xbank/wsj_1035', 'xbank/wsj_1033', 'xbank/wsj_0527', 'xbank/wsj_0928',
  'xbank/wsj_0973', 'xbank/wsj_0950', 'xbank/wsj_0927', 'xbank/wsj_0376',
  'xbank/wsj_0660', 'xbank/wsj_0650', 'xbank/wsj_0266', 'xbank/wsj_0006',
  'xbank/wsj_0768', 'xbank/wsj_1073', 'xbank/wsj_0816', 'xbank/wsj_0610',
  'xbank/wsj_0583'
]

In [None]:
def clean_text(txt, head):
    """Normalise whitespace in a sentence and its head span, and strip LU_ANNOTATE tags.

    Newlines and tabs become spaces, runs of spaces collapse to a single space,
    and in ``txt`` every ``<LU_ANNOTATE>``, ``</LU_ANNOTATE>`` or bare
    ``LU_ANNOTATE>`` fragment is replaced by a space before collapsing.

    Args:
        txt: Full text, or ``None``.
        head: Head span belonging to ``txt``.

    Returns:
        ``(None, None)`` when ``txt`` is ``None``; otherwise ``(txt, head)``
        cleaned as described. If either value is not a string, a diagnostic
        line is printed and the values are returned as far as they were
        processed (best effort, no exception is raised).
    """
    if txt is None:
        return None, None
    try:
        txt = re.sub(r"[\n\t]", " ", txt)
        # Match the whole tag in one pattern. Stripping "LU_ANNOTATE>" first, as
        # the previous version did, left the leading "<" or "</" in the text.
        txt = re.sub(r"(?:</?)?LU_ANNOTATE>", " ", txt)
        txt = re.sub(r" {2,}", " ", txt)

        head = re.sub(r"[\n\t]", " ", head)
        head = re.sub(r" {2,}", " ", head)
    except TypeError:
        # Non-string input (e.g. head is None): report it and keep going.
        print(txt, ' $$$ ', head)
    return txt, head


In [None]:
def change_misses(a, b):
    """Repair spacing artefacts around punctuation in two strings.

    The same ordered list of fixes is applied to both inputs: ' ,' -> ', ',
    ' .' -> '. ', " 's" -> "'s", and double spaces -> single space. Each fix is
    repeated until the pattern no longer occurs.

    Returns:
        The tuple ``(a, b)`` after all fixes.
    """
    fixes = (
        (' ,', ', '),
        (' .', '. '),
        (' \'s', '\'s'),
        ('  ', ' '),
    )

    def _replace_until_gone(value, wrong, right):
        # A single replace() can create a new occurrence, so loop until stable.
        while value.find(wrong) != -1:
            value = value.replace(wrong, right)
        return value

    for wrong, right in fixes:
        a = _replace_until_gone(a, wrong, right)
        b = _replace_until_gone(b, wrong, right)

    return a, b

In [None]:
# Config

# Concatenation of the three document-id lists defined in the doclists cell.
# NOTE(review): no filtering by IGNORED_DOCS is visible in the fold-export loop
# of this notebook — confirm where (or whether) it is applied.
IGNORED_DOCS = ULA_SUBSET_DOCS + ULA_LU_SUBSET_DOCS + XBANK_DOCS # e.g. set it to ['non_fbis/06.11.16-17420'] to remove all objects with that specific document id

# Annotation-type key -> text label; place_char() prepends ANNOT_DICT[c] to the
# model input string.
ANNOT_DICT = {'expressive_subjectivity': 'expressive subjectivity',
              'speculation': 'speculation', 'other_attitude': 'other',
              'intention': 'intention', 'arguing': 'arguing',
              'agreement': 'agreement', 'sentiment': 'sentiment'}

# Annotation-type key -> 5-element one-hot vector, stored as "annotationType".
# NOTE(review): 'speculation' and 'other_attitude' exist in ANNOT_DICT but not
# here, so ANNOTS[annotype] raises KeyError for them — confirm the data never
# contains those types.
ANNOTS= {'expressive_subjectivity': [1, 0, 0, 0, 0],
              'intention': [0, 1, 0, 0, 0], 'arguing': [0, 0, 1, 0, 0],
              'agreement': [0, 0, 0, 1, 0], 'sentiment': [0, 0, 0, 0, 1]}

# Intensity label -> 3-element 0/1 target vector. The in-between labels
# ('low-medium', 'medium-high') set two positions; 'high', 'high-extreme' and
# 'extreme' all share the same vector.
CLASSES = {'medium': [0, 1, 0], 'medium-high': [0, 1, 1], 'low': [1, 0, 0],
           'high': [0, 0, 1], 'low-medium': [1, 1, 0],
           'high-extreme': [0, 0, 1], 'extreme': [0, 0, 1]}

# Model checkpoint name; PRE_TRAINED_MODEL_NAME is what the tokenizer is loaded from.
MODEL_NAME = 'bert-base-uncased'
PRE_TRAINED_MODEL_NAME = MODEL_NAME
# Training hyper-parameters. Their consumers are not visible in this part of
# the notebook — presumably passed to the training setup; verify before tuning.
FREEZE_LAYER_COUNT = None
BATCH_SIZE = 16
TRAIN_BATCH_SIZE = 16
VAL_BATCH_SIZE = 64
TEST_BATCH_SIZE = 1
LOGGING_STEPS = 400
EVAL_STRATEGY = 'steps'
SAVE_STRATEGY = 'steps'
LOAD_BEST_MODEL_AT_END = True
METRIC_FOR_BEST_MODEL = 'eval_micro/f1'
DROPOUT = 0.1
BCE_WEIGHT_EXPONENT = 0
NUM_TRAIN_EPOCHS = 4
# 4 is passed positionally (presumably the early-stopping patience — confirm
# against the transformers EarlyStoppingCallback signature).
CALLBACKS = [EarlyStoppingCallback(4)]
# SEED is read by set_seed(); the two split seeds are not used in the visible cells.
SEED = 0
TEST_SPLIT_SEED = 0
VAL_SPLIT_SEED = 0
# Dataset name; selects the download URL and prefixes the exported fold files.
DATA = ed
VAL_SIZE = 0.2
# NOTE(review): hardcoded key material committed to the notebook; its use is not
# visible here. If it protects anything real, load it from the environment or a
# secrets store instead.
KEY = b'-TheMostSuperPowerfulKeyAvailableInTheWorld='

In [None]:
# Dataset name -> Google Drive share link. The values are placeholders and must
# be replaced with real links before the fetch cell can download anything.
data_name_to_google_drive_url = {
    'MPQA2.0_cleaned': '[Replace proper link here.]',
    'IDS': '[Replace proper link here.]'
}


# Get direct download link
def get_download_url_from_google_drive_url(google_drive_url):
    """Convert a Google Drive share link into a direct-download URL.

    The file id is taken, in order of preference, from:
      1. a ``/d/<id>`` path segment (``.../file/d/<id>/view?usp=sharing``),
      2. an ``id=<id>`` query parameter (``.../open?id=<id>``, ``.../uc?id=<id>``),
      3. the sixth ``/``-separated component of the URL (the original behaviour).

    Args:
        google_drive_url: Share link as copied from Google Drive.

    Returns:
        ``https://drive.google.com/uc?id=<id>&export=download&confirm=t``.

    Raises:
        IndexError: If no file id can be located. This is the exception type the
            previous implementation raised for such input.
    """
    match = (re.search(r'/d/([^/?#]+)', google_drive_url)
             or re.search(r'[?&]id=([^&#]+)', google_drive_url))
    if match:
        file_id = match.group(1)
    else:
        parts = google_drive_url.split("/")
        if len(parts) <= 5:
            raise IndexError(
                f'cannot find a Google Drive file id in {google_drive_url!r}')
        file_id = parts[5]
    return f'https://drive.google.com/uc?id={file_id}&export=download&confirm=t'


In [None]:
# Keys into data_name_to_google_drive_url: DATA names the dataset file and IDS
# names the file holding the per-fold id lists.
DATA = ed
IDS = 'IDS'

In [None]:
# Fetch the dataset
import os
from os import chdir
from google.colab import drive

FETCH_FROM_WEB = True ### Set it to true, to download the datasets from github and google drive ###


def _fetch_json(url):
    """Download ``url`` and return its body parsed as JSON.

    The HTTP response is used as a context manager so the connection is closed
    even when reading or parsing fails.
    """
    with urlopen(url) as response:
        return json.loads(response.read())


if FETCH_FROM_WEB:
    # Dataset: later cells read data['csds_objects'].
    data_url = get_download_url_from_google_drive_url(data_name_to_google_drive_url[DATA])
    data = _fetch_json(data_url)

    # Per-fold id lists: later cells read ids['IDs_<split>_fold_<k>'].
    ids_url = get_download_url_from_google_drive_url(data_name_to_google_drive_url[IDS])
    ids = _fetch_json(ids_url)
else:
    # NOTE(review): this branch only defines `csds_collection`; `data` and `ids`,
    # which the following cells need, stay undefined. Confirm what the offline
    # path should load before relying on it.
    file_address = '..\\json2csds\\data.json'
    # Explicit encoding so the result does not depend on the platform default.
    with open(file_address, encoding='utf-8') as file:
        csds_collection = json.load(file)

In [None]:
# Report how many ids fold 1 assigns to each split.
k = 1
for split_name in ('trainset', 'validationset', 'testset'):
    split_ids = ids['IDs_{}_fold_{}'.format(split_name, k)]
    print('Number of {}: {}'.format(split_name, len(split_ids)))

In [None]:
def set_seed(seed=None):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random`` module, NumPy's global generator, and PyTorch on
    the CPU and on all CUDA devices, then switches cuDNN to deterministic
    algorithm selection.

    Args:
        seed: Seed value to use. Defaults to the module-level ``SEED`` so that
            existing ``set_seed()`` calls behave as before.
    """
    if seed is None:
        seed = SEED
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; safe without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels between runs; keep it off.
    torch.backends.cudnn.benchmark = False

In [None]:
def place_char(a, b, beg, end, c=None):
    """Build the separator-delimited model input for one annotated span.

    The result joins, with single spaces, the annotation label ``ANNOT_DICT[c]``,
    the text before ``beg``, the head ``b`` and the text after position ``end``,
    with the global ``eos_token`` between consecutive parts.

    Args:
        a: Full text, or ``None``.
        b: Head span text.
        beg: Index in ``a`` where the left context stops.
        end: Index in ``a``; the right context starts at ``end + 1``.
        c: Annotation-type key; must be a key of ``ANNOT_DICT``.

    Returns:
        ``(None, None)`` when ``a`` is ``None`` (a diagnostic line is printed);
        otherwise ``(assembled_text, b)``.
    """
    if a is None:
        print(a, ' $$ ', b)
        return None, None

    # NOTE(review): the right context skips the character at index `end`. The
    # caller slices the head as text[start:end], so this drops one character
    # after the head — confirm whether that is intended.
    pieces = [
        ANNOT_DICT[c],
        eos_token,
        a[0:beg],
        eos_token,
        b,
        eos_token,
        a[end+1:],
    ]
    a = ' '.join(pieces)

    return a, b

In [None]:
# Load the tokenizer that matches the configured pre-trained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
###########
# The tokenizer's CLS / SEP token strings, each prefixed with one space.
# eos_token is the separator that place_char() inserts between input parts.
init_token = ' ' + tokenizer.cls_token
eos_token = ' ' + tokenizer.sep_token

In [None]:
# Mount Google Drive at /content/drive; the fold-export cell writes its JSON
# files below that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Peek at the schema: print the keys of the first object only, then stop.
for item in data['csds_objects']:
    print(item.keys())
    break

In [None]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import WhitespaceTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
# Shared detokenizer used by the fold-export cell to turn item['w_text'] token
# lists back into strings.
detokenizer = TreebankWordDetokenizer()
# Repeats the 'punkt' download from the earlier NLTK setup cell.
nltk.download('punkt')

In [None]:
# Print the shell's current working directory.
!pwd

In [None]:
# Read Trainset & Validationset & testset of each fold

def _build_example(item):
    """Turn one CSDS object into the fields stored for every split.

    The head's character offsets are obtained by detokenizing the token prefixes
    item['w_text'][:start] and item['w_text'][:end] and taking their lengths; the
    head is the slice of the detokenized full text between those offsets.
    NOTE(review): this assumes the detokenized prefix has the same length as the
    matching part of the detokenized full text — confirm for quotes/punctuation.

    Returns:
        dict keyed like the exported JSON: "head", "text", "all",
        "annotationType", "target", "uniqueID".
    """
    start, end = item['w_head_span'][0], item['w_head_span'][1]
    start_index = len(detokenizer.detokenize(item['w_text'][0:start]))
    end_index = len(detokenizer.detokenize(item['w_text'][0:end]))
    text = detokenizer.detokenize(item['w_text'])
    head = text[start_index:end_index]
    annotype = item['annotation_type']

    marked_text, _ = place_char(text, head, start_index, end_index, c=annotype)
    if not marked_text:
        # place_char could not build the marked input; fall back to the raw text.
        print('Error')
        marked_text = text

    return {
        "head": head,
        "text": text,
        "all": marked_text,
        "annotationType": ANNOTS[annotype],
        "target": CLASSES[item['intensity']],
        "uniqueID": item['unique_id'],
    }


# Output directory on the mounted Google Drive (must already exist).
path = "/content/drive/My Drive/Overall_Final_Files/kfold/"

for k in range(1, k_fold+1):
    # Kept for compatibility with the original cell; not used below.
    X_text, X_head, X = [], [], []

    X_train, X_text_train, X_head_train, X_annot_train, X_unique_id_train, y_train = [], [], [], [], [], []
    X_val, X_text_val, X_head_val, X_annot_val, X_unique_id_val, y_val = [], [], [], [], [], []
    X_test, X_text_test, X_head_test, X_annot_test, X_unique_id_test, y_test = [], [], [], [], [], []

    # Split name -> JSON payload. The payload lists are the variables above, so
    # they stay available in the notebook namespace after the loop. Dict order is
    # the lookup priority: train, then validation, then test.
    columns_by_split = {
        'trainset': {"head": X_head_train, "text": X_text_train, "all": X_train,
                     "annotationType": X_annot_train, "target": y_train,
                     "uniqueID": X_unique_id_train},
        'validationset': {"head": X_head_val, "text": X_text_val, "all": X_val,
                          "annotationType": X_annot_val, "target": y_val,
                          "uniqueID": X_unique_id_val},
        'testset': {"head": X_head_test, "text": X_text_test, "all": X_test,
                    "annotationType": X_annot_test, "target": y_test,
                    "uniqueID": X_unique_id_test},
    }

    # Sets give O(1) membership tests instead of scanning a list per object.
    # Assumes the unique ids are hashable (str/int).
    ids_by_split = {
        split_name: set(ids['IDs_{}_fold_{}'.format(split_name, k)])
        for split_name in columns_by_split
    }

    for item in data['csds_objects']:
        for split_name, split_ids in ids_by_split.items():
            if item['unique_id'] in split_ids:
                example = _build_example(item)
                for field, column in columns_by_split[split_name].items():
                    column.append(example[field])
                break  # an object goes into the first matching split only

    # save in json

    for split_name, result in columns_by_split.items():
        with open(path + DATA + '_{}_fold_{}.json'.format(split_name, k), 'w', encoding='utf-8') as json_file:
            json.dump(result, json_file, ensure_ascii=False, indent=4)