In [None]:
# Colab only: expose Google Drive under /content/drive, where the CSV files
# read further down are stored.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%%capture
!pip install transformers

import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer

from tqdm import tqdm
import transformers
import torch
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, AutoTokenizer, BertModel, BertConfig, AutoModel, AdamW
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Report which device torch can use: the GPU when one is visible, else the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [None]:
# Training articles for English subtask 2; the cells below rely on the
# `text` and `frames` columns of this frame.
df = pd.read_csv("/content/drive/MyDrive/CS546/course-project/SemEval/data/data_en_subtask2/train.csv")
# The test CSV is loaded in a later cell, after training.
# df_test = pd.read_csv("/content/drive/MyDrive/CS546/course-project/SemEval/data/data_en_subtask2/test.csv")

In [None]:
# Multi-hot encode the comma-separated `frames` string of every article.
# `encoder` is kept around so predictions can be decoded back to frame names.
encoder = MultiLabelBinarizer()

Y_train = df['frames'].str.split(',')
Y_train_binary = encoder.fit_transform(Y_train.to_numpy())

# Store one 0/1 vector per row (object column holding a numpy array each).
df['labels'] = pd.Series([row for row in Y_train_binary])

In [None]:
# Shuffle once with a fixed seed, then cut the rows 80% / 10% / 10% into
# train / validation / held-out parts.  Positional slicing is used instead of
# np.split, which on a DataFrame goes through the deprecated
# DataFrame.swapaxes code path.
# NOTE: `df_test` is reassigned in a later cell when test.csv is loaded, so
# the held-out part created here is not what `evaluate` ends up seeing.
df_shuffled = df.sample(frac=1, random_state=42)
train_end, val_end = int(.8*len(df)), int(.9*len(df))

df_train = df_shuffled.iloc[:train_end]
df_val = df_shuffled.iloc[train_end:val_end]
df_test = df_shuffled.iloc[val_end:]

print(len(df_train),len(df_val), len(df_test))

346 43 44


In [None]:
# Eyeball the shuffled training split, including the new `labels` column.
df_train

Unnamed: 0,id,text,frames,labels
425,832916508,Dems in Congress Demand Barr Release Full Muel...,"Political,Morality,Crime_and_punishment,Extern...","[0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0]"
75,731063195,One Trillion Stars\n\nThe nearest neighboring ...,"Capacity_and_resources,Quality_of_life","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]"
181,728153988,Illegal alien wanted for attempted murder in N...,"Crime_and_punishment,Security_and_defense","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
30,111111114,U.S. judge frees Indonesian immigrant held by ...,"Crime_and_punishment,Morality,Fairness_and_equ...","[0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0]"
364,787142429,Brett Kavanaugh accused in letter obtained by ...,"Crime_and_punishment,Health_and_safety,Legalit...","[0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0]"
...,...,...,...,...
351,790666929,Star Tribune Sues To Unseal Accused Serial Wom...,"Political,Policy_prescription_and_evaluation,L...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0]"
216,741655444,Travis Air Force Base terrorist identified as ...,"Crime_and_punishment,Morality,Security_and_def...","[0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1]"
279,755814432,Trump’s Plan for Iran: Put Terrorists in Charg...,"Crime_and_punishment,Morality,External_regulat...","[0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1]"
376,999001226,Broward County Election Supervisor Brenda Snip...,"Political,Crime_and_punishment,Morality","[0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0]"


In [None]:
# Shared tokenizer; the dataset classes in this notebook read it as a global.
# It uses the same 'bert-base-cased' checkpoint name as the classifier's encoder.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

class BERTDataset(torch.utils.data.Dataset):
    """Labelled dataset: one tokenized article plus its label vector per item.

    All texts are tokenized eagerly in the constructor with the notebook-level
    ``tokenizer`` (padded/truncated to 512 tokens, PyTorch tensors).
    """

    def __init__(self, df):
        """Read the ``labels`` and ``text`` columns of ``df``."""
        self.labels = df['labels'].values

        def encode(article):
            # One tokenizer call per article, always padded to the full length.
            return tokenizer(article,
                             truncation=True, max_length = 512,
                             padding='max_length',
                             return_tensors="pt")

        self.texts = list(map(encode, df['text']))

    def classes(self):
        """Return the stored label vectors."""
        return self.labels

    def __len__(self):
        """Number of items, taken from the label column."""
        n_items = len(self.labels)
        return n_items

    def get_batch_labels(self, idx):
        """Return the label entry at ``idx`` as a numpy array."""
        picked = self.labels[idx]
        return np.array(picked)

    def get_batch_texts(self, idx):
        """Return the tokenizer output stored at ``idx``."""
        encoding = self.texts[idx]
        return encoding

    def __getitem__(self, idx):
        """Return ``(tokenized_text, labels)`` for position ``idx``."""
        return self.get_batch_texts(idx), self.get_batch_labels(idx)

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

In [None]:
class BertClassifier(torch.nn.Module):
    """BERT encoder followed by dropout and a linear multi-label head.

    The head returns raw, unbounded scores (logits), one per label, which is
    the input ``torch.nn.BCEWithLogitsLoss`` expects.

    Args:
        dropout: dropout probability applied to the pooled BERT output.
        num_labels: number of output labels (defaults to the 14 frames used
            in this notebook).
    """

    def __init__(self, dropout=0.5, num_labels=14):

        super().__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = torch.nn.Dropout(dropout)
        # Size the head from the encoder config rather than hard-coding 768.
        self.linear = torch.nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_id, mask):
        """Return one logit per label for each input sequence.

        Args:
            input_id: token id tensor passed to BERT as ``input_ids``.
            mask: tensor passed to BERT as ``attention_mask``.
        """
        # With return_dict=False the model returns a tuple; the second entry
        # is the pooled output that feeds the classification head.
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)

        # No activation here: a ReLU would clamp every logit to >= 0, so the
        # sigmoid inside BCEWithLogitsLoss could never go below 0.5 and
        # negative labels could not be fitted.
        return self.linear(dropout_output)

In [None]:
from sklearn.metrics import f1_score

def train(model, train_data, val_data, learning_rate, epochs):
    """Fine-tune ``model`` and print loss / macro-F1 after every epoch.

    Args:
        model: module called as ``model(input_ids, attention_mask)`` that
            returns one score per label; scores are fed to
            ``BCEWithLogitsLoss`` and turned into probabilities with a sigmoid.
        train_data: DataFrame with ``text`` and ``labels`` columns.
        val_data: DataFrame with the same columns, used for validation only.
        learning_rate: learning rate for the Adam optimizer.
        epochs: number of passes over ``train_data``.

    Returns:
        None. The model is updated in place, moved to the GPU when available,
        and left in eval mode after the last validation pass.
    """
    # Distinct local names: the original local `train` shadowed this function.
    train_set, val_set = BERTDataset(train_data), BERTDataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=1, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=1)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = model.to(device)
    criterion = torch.nn.BCEWithLogitsLoss().to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)

    # Probability cut-off above which a label counts as predicted.
    threshold = 0.5

    for epoch_num in range(epochs):

        model.train()  # enable dropout for the optimisation pass
        total_loss_train = 0.0

        for train_input, train_label in tqdm(train_dataloader):

            train_label = train_label.to(device).float()
            # Each stored encoding carries its own leading axis of size 1;
            # drop it so both tensors are (batch, seq_len).
            mask = train_input['attention_mask'].squeeze(1).to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            optimizer.zero_grad()
            logits = model(input_id, mask)

            # The loss must see the differentiable model output. Thresholding
            # first would cut the graph and leave every weight untouched.
            batch_loss = criterion(logits, train_label)
            batch_loss.backward()
            optimizer.step()

            total_loss_train += batch_loss.item()

        model.eval()  # disable dropout so validation is deterministic
        total_loss_val = 0.0
        fin_targets = []
        fin_outputs = []

        with torch.no_grad():

            for val_input, val_label in val_dataloader:

                val_label = val_label.to(device)
                mask = val_input['attention_mask'].squeeze(1).to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                val_logits = model(input_id, mask)
                # Score against the validation label of this very batch.
                val_batch_loss = criterion(val_logits, val_label.float())
                total_loss_val += val_batch_loss.item()

                val_pred = (torch.sigmoid(val_logits) > threshold).int()

                fin_targets.extend(val_label.cpu().numpy().tolist())
                fin_outputs.extend(val_pred.cpu().numpy().tolist())

        macro_f1 = f1_score(fin_targets, fin_outputs, average='macro')

        # Each batch loss is already a mean, so average over batches.
        print(f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_dataloader): .3f} | Val Loss: {total_loss_val / len(val_dataloader): .3f} | Val macro F1: {macro_f1}')

In [None]:
# Training configuration and a freshly initialised classifier.
EPOCHS = 5  # full passes over the training split
model = BertClassifier()
# NOTE(review): 1e-6 looks very small for fine-tuning BERT (values around
# 2e-5 to 5e-5 are the commonly quoted range) - confirm this is intended.
LR = 1e-6

In [None]:
# Fine-tune on the training split; one summary line (losses and validation
# macro-F1) is printed per epoch.
train(model, df_train, df_val, LR, EPOCHS)

100%|██████████| 346/346 [00:13<00:00, 26.26it/s]


Epochs: 1 | Train Loss:  0.757 | Val Loss:  0.750 | Val macro F1: 0.12095740516793148


100%|██████████| 346/346 [00:14<00:00, 24.22it/s]


Epochs: 2 | Train Loss:  0.752 | Val Loss:  0.739 | Val macro F1: 0.03116883116883117


100%|██████████| 346/346 [00:12<00:00, 27.55it/s]


Epochs: 3 | Train Loss:  0.754 | Val Loss:  0.738 | Val macro F1: 0.08234153905795696


100%|██████████| 346/346 [00:12<00:00, 27.57it/s]


Epochs: 4 | Train Loss:  0.755 | Val Loss:  0.763 | Val macro F1: 0.08234153905795696


100%|██████████| 346/346 [00:13<00:00, 26.09it/s]


Epochs: 5 | Train Loss:  0.756 | Val Loss:  0.761 | Val macro F1: 0.17213011305705725


In [None]:
# Load test.csv for prediction. This rebinds `df_test`, replacing the 10%
# split of the training data that was created earlier under the same name.
df_test = pd.read_csv("/content/drive/MyDrive/CS546/course-project/SemEval/data/data_en_subtask2/test.csv")

In [None]:
class Dataset_test(torch.utils.data.Dataset):
    """Label-free dataset: every item is just the tokenized article text.

    Tokenization uses the notebook-level ``tokenizer`` with the same settings
    as ``BERTDataset`` (padding and truncation to 512 tokens, PyTorch tensors).
    """

    def __init__(self, df):
        """Tokenize every entry of ``df['text']`` up front."""
        def encode(article):
            return tokenizer(article,
                             truncation=True, max_length = 512,
                             padding='max_length',
                             return_tensors="pt")

        self.texts = list(map(encode, df['text']))

    def __len__(self):
        """Number of tokenized articles."""
        n_items = len(self.texts)
        return n_items

    def __getitem__(self, idx):
        """Return the tokenizer output stored at ``idx``."""
        encoding = self.texts[idx]
        return encoding

In [None]:
def evaluate(model, test_data):
    """Predict a 0/1 label vector for every row of ``test_data``.

    Args:
        model: module called as ``model(input_ids, attention_mask)`` that
            returns one score per label; a sigmoid turns the scores into
            probabilities before thresholding.
        test_data: DataFrame with a ``text`` column.

    Returns:
        A list with one list of ints (0 or 1 per label) per input row, in
        input order.
    """
    test_set = Dataset_test(test_data)
    test_dataloader = torch.utils.data.DataLoader(test_set, batch_size=1)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Probability cut-off above which a label counts as predicted.
    threshold = 0.5

    # Inference must run with dropout disabled, otherwise predictions are
    # randomly perturbed; the caller's mode is restored afterwards.
    was_training = model.training
    model.eval()

    test_res = []

    try:
        with torch.no_grad():

            for test_input in test_dataloader:

                # Each stored encoding carries its own leading axis of size 1;
                # drop it so both tensors are (batch, seq_len).
                mask = test_input['attention_mask'].squeeze(1).to(device)
                input_id = test_input['input_ids'].squeeze(1).to(device)

                logits = model(input_id, mask)
                preds = (torch.sigmoid(logits) > threshold).int()

                test_res.extend(preds.cpu().numpy().tolist())
    finally:
        model.train(was_training)

    return test_res

# One list of 0/1 ints per test article, in the row order of `df_test`.
test_res = evaluate(model, df_test)

In [None]:
# Map every predicted 0/1 row back to its frame names and join them with
# commas; a row with no predicted label becomes an empty string.
out = [
    ','.join(frame_names)
    for frame_names in encoder.inverse_transform(np.array(test_res))
]

In [None]:
# Inspect the decoded label strings (empty strings mean nothing was predicted).
out

['Legality_Constitutionality_and_jurisprudence,Policy_prescription_and_evaluation,Political',
 'Legality_Constitutionality_and_jurisprudence,Policy_prescription_and_evaluation,Political',
 'Policy_prescription_and_evaluation',
 'External_regulation_and_reputation,Legality_Constitutionality_and_jurisprudence,Policy_prescription_and_evaluation,Security_and_defense',
 'Legality_Constitutionality_and_jurisprudence,Policy_prescription_and_evaluation',
 '',
 'Cultural_identity,Policy_prescription_and_evaluation',
 'External_regulation_and_reputation,Policy_prescription_and_evaluation',
 'Legality_Constitutionality_and_jurisprudence',
 'Cultural_identity,External_regulation_and_reputation,Policy_prescription_and_evaluation',
 '',
 'Cultural_identity,Legality_Constitutionality_and_jurisprudence,Policy_prescription_and_evaluation',
 '',
 'Cultural_identity,External_regulation_and_reputation,Legality_Constitutionality_and_jurisprudence,Policy_prescription_and_evaluation',
 'Capacity_and_resource

In [None]:
# Add the decoded label strings as a new column; `out` has one entry per
# test row, in the same order the rows were fed to `evaluate`.
df_test['output'] = out

In [None]:
# Check the test frame now that the `output` column is attached.
df_test

Unnamed: 0,id,text,output
0,820791520,George III Lost America.\n\nTheresa May Could ...,"Legality_Constitutionality_and_jurisprudence,P..."
1,828866387,Brexit in 23 days: EU says still 'no solution'...,"Legality_Constitutionality_and_jurisprudence,P..."
2,821040551,Queen Elizabeth Would Be Evacuated in Event of...,Policy_prescription_and_evaluation
3,813552066,"You insult us, ambassador: Woody Johnson flagr...","External_regulation_and_reputation,Legality_Co..."
4,817176202,"The British People, as Well as the Politicians...","Legality_Constitutionality_and_jurisprudence,P..."
...,...,...,...
78,829815104,Brussels Shows Its Fear\n\nHungarian Prime Min...,"Cultural_identity,Policy_prescription_and_eval..."
79,817147979,BREXIT OR BRINO: U.K.\n\nDeep State Strikes Ba...,"Cultural_identity,Legality_Constitutionality_a..."
80,813623212,'Wishful thinking': Tory MPs dismiss May's hop...,"Crime_and_punishment,Policy_prescription_and_e..."
81,813953273,RICHARD LITTLEJOHN: The capital sees its first...,"Legality_Constitutionality_and_jurisprudence,P..."


In [None]:
# Submission file: one "<id>\t<comma-separated frames>" line per article,
# written without a header row or the DataFrame index.
output = df_test.loc[:, ["id", "output"]]
output.to_csv(
    "/content/drive/MyDrive/CS546/course-project/SemEval/notebooks/en_subtask2/output_st2.txt",
    sep='\t',
    header=None,
    index=False,
)