In [None]:
# Parameters

# Execution environment; the Colab-only setup cells run when this is 'COLAB'.
RUNTIME_TYPE = 'COLAB'
# Dataset edition tag; used as a prefix in the exported file names.
ed = 'MPQA2.0_v221219_cleaned'
# Number of cross-validation folds (fold indices start at 1).
k_fold = 5
# Name of the pretrained checkpoint whose tokenizer is loaded.
model_name = "bert-base-uncased"

# Intensity label -> multi-hot vector over the positions (low, medium, high).
# Range labels set both endpoints; 'high-extreme' and 'extreme' share the
# encoding of 'high'.
intensity = {
    'medium':       [0, 1, 0],
    'medium-high':  [0, 1, 1],
    'low':          [1, 0, 0],
    'high':         [0, 0, 1],
    'low-medium':   [1, 1, 0],
    'high-extreme': [0, 0, 1],
    'extreme':      [0, 0, 1],
}

# Polarity label -> integer class id.
polarity = {
    'negative': 0,
    'neutral': 1,
    'positive': 2,
}

# Annotation type -> one-hot vector (five types).
annotation = {
    'expressive_subjectivity': [1, 0, 0, 0, 0],
    'arguing':                 [0, 1, 0, 0, 0],
    'agreement':               [0, 0, 1, 0, 0],
    'intention':               [0, 0, 0, 1, 0],
    'sentiment':               [0, 0, 0, 0, 1],
}

# Annotation type -> integer id (six types: also covers 'direct').
# NOTE: the ordering here is independent of the one-hot positions in `annotation`.
annotation_num = {
    label: index
    for index, label in enumerate(
        ['arguing', 'expressive_subjectivity', 'agreement', 'sentiment', 'direct', 'intention']
    )
}

In [None]:
# Dataset

# URL of the JSON file with the fold definitions; it is fetched and parsed
# below and indexed with keys of the form 'IDs_<split>_fold_<k>'.
SPLITS_URL = 'https://raw.githubusercontent.com/theSaeed/opinion-mining-using-llms/master/dataset/folds/tpi-folds.json'

# PLACEHOLDER: must be replaced with the URL of the dataset JSON before running.
# The parsed document is expected to expose a 'csds_objects' entry.
DATA_URL = '[replace dataset link here]' # WEB

# PLACEHOLDER: output location for the exported fold files. The value is
# string-concatenated directly with the file name (no separator is inserted),
# so a directory path needs its trailing '/'.
path = '[Replace save location link here (Fortorch).]'

In [None]:
# Libraries Required for Google Colab

# Installs the third-party packages imported later in this notebook
# (`jsonlines`, `transformers`). Only executed when RUNTIME_TYPE is 'COLAB';
# other runtimes must already provide them. No versions are pinned.
if RUNTIME_TYPE == 'COLAB':
    %pip install jsonlines
    %pip install transformers

In [None]:
# Print the working directory of the runtime (shell command).
!pwd

In [None]:
# Print the GPU status reported by the NVIDIA command-line tool (shell command).
!nvidia-smi

In [None]:
# Colab only: mount Google Drive at /content/drive.
if RUNTIME_TYPE == 'COLAB':
    from google.colab import drive
    drive.mount('/content/drive')

In [None]:
import torch
from torch import nn, optim
import torch.nn.functional as F
import numpy as np
import pandas as pd
import os
import json
import jsonlines
import os.path
from urllib.request import urlopen
from transformers import BertTokenizer

In [None]:
# Support for third-party widgets

# Colab only: turn on Colab's custom widget manager.
if RUNTIME_TYPE == 'COLAB':
    from google.colab import output
    output.enable_custom_widget_manager()

In [None]:
# Setup device

# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Plain-string form of the device type: 'cuda' or 'cpu'.
device_string = device.type

# Integer device index: 0 when running on CUDA, -1 on CPU.
# NOTE(review): presumably meant for Hugging Face pipeline-style `device=` arguments - confirm at the call site.
device_hf = 0 if device_string == 'cuda' else -1

print("Device:", device)

# NOTE(review): presumably the worker count for data loaders - confirm at the call site.
NUM_WORKERS = 0

In [None]:
# Loading the saved JSON files

# Fold definitions, indexed later with keys such as 'IDs_trainset_fold_1'.
# The `with` block closes the HTTP response even if reading or parsing fails.
with urlopen(SPLITS_URL) as response:
    ids = json.loads(response.read())

# Dataset document; its 'csds_objects' entry holds the items that are split
# into folds further down.
with urlopen(DATA_URL) as response_data:
    data = json.loads(response_data.read())

In [None]:
# Load the pretrained tokenizer for `model_name`; the next cell reads its
# special tokens from it.
tokenizer = BertTokenizer.from_pretrained(model_name)

In [None]:
# Special tokens that are spliced between fields when the combined
# "head + text" strings are built for each fold.
init_token = tokenizer.cls_token  # the tokenizer's classification (CLS) token
eos_token = tokenizer.sep_token  # NOTE: despite the name, this is the separator (SEP) token

In [None]:
# Read Trainset & Validationset & testset of each fold

# Split names exactly as they appear in the fold-ID keys
# ('IDs_<split>_fold_<k>') and in the exported file names. Order matters:
# an item is assigned to the first split whose ID collection contains it.
_SPLITS = ('trainset', 'validationset', 'testset')

# Row keys in the order of the column-wise lists kept for each split:
# X, X_head, X_annot, X_unique_id, y, y_p, mix, mix_a.
_LEGACY_COLUMNS = ('text', 'head', 'annotationType', 'uniqueID',
                   'target', 'sentiment', 'mixHeadText', 'mixAnnot')


def _make_row(item):
    """Build the row that is exported for one entry of data['csds_objects'].

    Args:
        item: mapping providing 'clean_head', 'clean_text', 'annotation_type',
            'polarity' and 'unique_id'.

    Returns:
        dict with the keys head, text, annotationType, mixHeadText, sentiment,
        target, uniqueID and mixAnnot (in that order).

    Raises:
        KeyError: if a field is missing, or if the polarity / annotation type
            is not a key of `polarity` / `annotation_num`.
    """
    head = item['clean_head']
    text = item['clean_text']
    annot = item['annotation_type']
    # "<head><sep><cls><text>"
    head_text = head + eos_token + init_token + text
    return {
        "head": head,
        "text": text,
        "annotationType": annot,
        "mixHeadText": head_text,
        "sentiment": item['polarity'],
        "target": polarity[item['polarity']],
        "uniqueID": item['unique_id'],
        # "<annotation id><sep><cls><head><sep><cls><text>"
        "mixAnnot": str(annotation_num[annot]) + eos_token + init_token + head_text,
    }


def _columns(rows):
    """Return one list per key of _LEGACY_COLUMNS (same order) for `rows`."""
    return tuple([row[key] for row in rows] for key in _LEGACY_COLUMNS)


for k in range(1, k_fold + 1):
    # Sets make each membership test O(1) instead of a scan over the ID list.
    # Assumes the IDs are hashable values (e.g. strings or numbers).
    fold_ids = {split: set(ids['IDs_{}_fold_{}'.format(split, k)]) for split in _SPLITS}
    rows = {split: [] for split in _SPLITS}

    for item in data['csds_objects']:
        for split in _SPLITS:
            if item['unique_id'] in fold_ids[split]:
                rows[split].append(_make_row(item))
                break  # first matching split wins; items in no split are skipped

    # Column-wise views of the current fold, bound under the established names
    # so that code relying on them keeps working.
    X_train, X_head_train, X_annot_train, X_unique_id_train, y_train, y_p_train, mix_train, mix_train_a = _columns(rows['trainset'])
    X_val, X_head_val, X_annot_val, X_unique_id_val, y_val, y_p_val, mix_val, mix_val_a = _columns(rows['validationset'])
    X_test, X_head_test, X_annot_test, X_unique_id_test, y_test, y_p_test, mix_test, mix_test_a = _columns(rows['testset'])

    # save in json (one JSON object per line, despite the .json extension)
    for split in _SPLITS:
        with jsonlines.open(path + ed + '_fortorch_{}_fold_{}.json'.format(split, k), mode='w') as writer:
            for row in rows[split]:
                writer.write(row)