In [None]:
# Colab-only: mount Google Drive so the dataset paths under /content/drive resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%capture
!pip install transformers

import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer

from tqdm import tqdm
import transformers
import torch
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, AutoTokenizer, BertModel, BertConfig, AutoModel, AdamW
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Prefer the GPU whenever torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [None]:
# Labelled training sentences for subtask 3. The test CSV is read further down,
# right before inference, so it is not loaded here.
df = pd.read_csv("/content/drive/MyDrive/CS546/course-project/SemEval/data/data_en_subtask3/train.csv")

In [None]:
# Turn the comma-separated label names into one multi-hot vector per row.
encoder = MultiLabelBinarizer() #use sklearn binarizer

# 'labels' holds strings such as "Loaded_Language,Repetition"; split them into lists.
Y_train = df['labels'].str.split(',')
Y_train_binary = encoder.fit_transform(Y_train.values)

# Store one row array per cell. The Series is aligned to df by index, so this
# relies on df still carrying the default 0..n-1 index from read_csv.
df['labels_binary'] = pd.Series(list(Y_train_binary))

In [None]:
df

Unnamed: 0,id,line,text,labels,labels_binary
0,111111111,3,Geneva - The World Health Organisation chief o...,Doubt,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,111111111,5,"""The next transmission could be more pronounce...",Appeal_to_Authority,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,111111111,13,"But Tedros voiced alarm that ""plague in Madaga...",Repetition,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,111111111,17,He also pointed to the presence of the pneumon...,Appeal_to_Fear-Prejudice,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,111111111,19,He praised the rapid response from WHO and Mad...,Appeal_to_Fear-Prejudice,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...
3755,999001970,4,Also the Left killed comedy. This is what its ...,"Exaggeration-Minimisation,Slogans","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
3756,999001970,5,Saturday Night Live writer and comedian Nimesh...,Exaggeration-Minimisation,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
3757,999001970,6,That's what Columbia snowflakes thought was of...,Name_Calling-Labeling,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
3758,999001970,8,"Comrades, these jokes you have been listening ...","Exaggeration-Minimisation,Name_Calling-Labeling","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, ..."


In [None]:
# Shuffle once with a fixed seed, then cut 80% / 10% / 10% by position.
# Positional iloc slices are used instead of np.split(DataFrame, ...): np.split
# goes through DataFrame.swapaxes, which recent pandas deprecates. The resulting
# three frames are the same rows in the same order.
df_shuffled = df.sample(frac=1, random_state=42)
train_end, val_end = int(.8*len(df)), int(.9*len(df))

df_train = df_shuffled.iloc[:train_end]
df_val = df_shuffled.iloc[train_end:val_end]
df_test = df_shuffled.iloc[val_end:]

print(len(df_train),len(df_val), len(df_test))

3008 376 376


In [None]:
df_val

Unnamed: 0,id,line,text,labels,labels_binary
894,728343601,13,"""Holy Father, I decided to write this letter t...",Loaded_Language,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
828,727128296,10,"Haig does not blame himself, but within weeks ...",Loaded_Language,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
3655,999001296,17,"The records released by DOS, which is part of ...",Loaded_Language,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
2523,780414700,31,To our seminarians: If you are unchastely prop...,"Loaded_Language,Repetition","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
1683,761969692,20,"In percentage terms, the immigrant working-age...",Exaggeration-Minimisation,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...
3393,832916492,7,"I mean, we obviously can’t jump the gun here.",Loaded_Language,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
1527,759337941,1,Capitol Police “Accidentally” Gave Treasure Tr...,"Loaded_Language,Name_Calling-Labeling","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, ..."
3657,999001297,1,Migrant Caravan Reach Border & Climb Atop Fenc...,Loaded_Language,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
1300,738361208,9,"Again, if you liked Obama-era, world-on-fire, ...","Conversation_Killer,Loaded_Language","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."


In [None]:
# Module-level tokenizer; the dataset classes in this notebook read this name directly.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

class BERTDataset(torch.utils.data.Dataset):
    """Pairs each pre-tokenised text with its label vector.

    Tokenisation happens once, in the constructor, through the module-level
    ``tokenizer``; every text is padded or truncated to 512 tokens.
    """

    def __init__(self, df):
        """Read the 'labels_binary' and 'text' columns of ``df``."""
        self.labels = df['labels_binary'].values

        encoded_texts = []
        for text in df['text']:
            encoded_texts.append(
                tokenizer(
                    text,
                    padding='max_length',
                    max_length=512,
                    truncation=True,
                    return_tensors="pt",
                )
            )
        self.texts = encoded_texts

    def classes(self):
        """Return the stored label collection as-is."""
        return self.labels

    def __len__(self):
        """Number of examples, taken from the label collection."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the labels at ``idx`` as a numpy array."""
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        """Return the tokenizer output stored at ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return the ``(encoded_text, labels)`` pair for ``idx``."""
        return self.get_batch_texts(idx), self.get_batch_labels(idx)

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
num_classes = Y_train_binary.shape[1] # 19

class BertClassifier(torch.nn.Module):
    """BERT encoder followed by dropout and a single linear layer of per-label logits.

    The forward pass returns raw, unbounded logits. No activation is applied:
    the training loop in this notebook uses ``BCEWithLogitsLoss``, which applies
    the sigmoid itself. A ReLU on the outputs would clamp every logit to >= 0,
    so no label's probability could ever fall below 0.5.

    Args:
        dropout: Dropout probability applied to BERT's pooled output.
        num_labels: Width of the output layer. When omitted, the module-level
            ``num_classes`` (derived from the fitted label binarizer) is used
            instead of a hard-coded literal.
    """

    def __init__(self, dropout=0.5, num_labels=None):

        super().__init__()

        if num_labels is None:
            # Resolved at construction time so the head always matches the label space.
            num_labels = num_classes

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = torch.nn.Dropout(dropout)
        # Take the input width from the loaded checkpoint rather than assuming 768.
        self.linear = torch.nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_id, mask):
        """Return one logit per label for each sequence in the batch.

        Args:
            input_id: Token ids passed to BERT as ``input_ids``.
            mask: Attention mask passed to BERT as ``attention_mask``.
        """
        # With return_dict=False BERT returns a tuple; the second item is the pooled output.
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)

        return self.linear(dropout_output)

In [None]:
from sklearn.metrics import f1_score

def train(model, train_data, val_data, learning_rate, epochs, batch_size=1, threshold=0.5):
    """Fine-tune ``model`` on ``train_data`` and report validation loss and macro F1.

    The loss is computed on the model's raw outputs so that gradients reach the
    parameters. Hard 0/1 predictions are derived only for the F1 metric, by
    thresholding the sigmoid of the validation outputs.

    Args:
        model: Module called as ``model(input_id, mask)``, returning one score per label.
        train_data: DataFrame with 'text' and 'labels_binary' columns.
        val_data: DataFrame with the same columns, used for validation only.
        learning_rate: Learning rate handed to Adam.
        epochs: Number of passes over ``train_data``.
        batch_size: Examples per batch for both loaders (default 1, as before).
        threshold: Probability above which a label counts as predicted.

    Returns:
        None. One summary line is printed per epoch. The model is moved to the
        selected device and updated in place.
    """
    # Named *_dataset so the locals no longer shadow this function's own name.
    train_dataset, val_dataset = BERTDataset(train_data), BERTDataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    criterion = torch.nn.BCEWithLogitsLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    for epoch_num in range(epochs):

        # Enable dropout for the optimisation pass.
        model.train()
        total_loss_train = 0.0

        for train_input, train_label in tqdm(train_dataloader):

            train_label = train_label.to(device).float()
            # The tokenizer returns (1, seq) tensors, so batches arrive as
            # (batch, 1, seq); drop the singleton axis on ids and mask alike.
            mask = train_input['attention_mask'].squeeze(1).to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            optimizer.zero_grad()
            logits = model(input_id, mask)

            # The loss must see the differentiable outputs. Thresholding to
            # integers first detaches it from the graph, so nothing is learned.
            batch_loss = criterion(logits, train_label)
            batch_loss.backward()
            optimizer.step()

            total_loss_train += batch_loss.item()

        # Disable dropout so validation numbers are deterministic.
        model.eval()
        total_loss_val = 0.0
        fin_targets = []
        fin_outputs = []

        with torch.no_grad():

            for val_input, val_label in val_dataloader:

                val_label = val_label.to(device)
                mask = val_input['attention_mask'].squeeze(1).to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                val_logits = model(input_id, mask)

                # Score against the validation labels of this very batch.
                val_batch_loss = criterion(val_logits, val_label.float())
                total_loss_val += val_batch_loss.item()

                val_pred = torch.gt(torch.sigmoid(val_logits), threshold).int()

                fin_targets.extend(val_label.cpu().numpy().tolist())
                fin_outputs.extend(val_pred.cpu().numpy().tolist())

        macro_f1 = f1_score(fin_targets, fin_outputs, average='macro')

        # Each batch loss is already a mean, so average over batches. With the
        # default batch_size=1 this equals the per-example average.
        mean_loss_train = total_loss_train / max(len(train_dataloader), 1)
        mean_loss_val = total_loss_val / max(len(val_dataloader), 1)

        print(f'Epochs: {epoch_num + 1} | Train Loss: {mean_loss_train: .3f} | Val Loss: {mean_loss_val: .3f} | Val macro F1: {macro_f1}')

In [None]:
# Run configuration: number of passes over the training split, a freshly
# constructed classifier, and the learning rate passed to train().
EPOCHS = 5
model = BertClassifier()
LR = 1e-6

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [None]:
train(model, df_train, df_val, LR, EPOCHS)

100%|██████████| 3008/3008 [01:58<00:00, 25.41it/s]


Epochs: 1 | Train Loss:  0.812 | Val Loss:  0.802 | Val macro F1: 0.026476978757553615


100%|██████████| 3008/3008 [01:55<00:00, 26.03it/s]


Epochs: 2 | Train Loss:  0.812 | Val Loss:  0.807 | Val macro F1: 0.006220325150810634


100%|██████████| 3008/3008 [01:56<00:00, 25.90it/s]


Epochs: 3 | Train Loss:  0.811 | Val Loss:  0.807 | Val macro F1: 0.027872711936720678


100%|██████████| 3008/3008 [01:56<00:00, 25.87it/s]


Epochs: 4 | Train Loss:  0.812 | Val Loss:  0.828 | Val macro F1: 0.030406033997566053


100%|██████████| 3008/3008 [01:56<00:00, 25.89it/s]


Epochs: 5 | Train Loss:  0.811 | Val Loss:  0.798 | Val macro F1: 0.04171292649529416


In [None]:
# Test sentences to run inference on.
# NOTE(review): this rebinds df_test, discarding the 10% hold-out slice that the
# earlier train/val/test split stored under the same name.
df_test = pd.read_csv("/content/drive/MyDrive/CS546/course-project/SemEval/data/data_en_subtask3/test.csv")

In [None]:
class Dataset_test(torch.utils.data.Dataset):
    """Label-free dataset: holds only the tokenised texts of a DataFrame.

    Each entry of the 'text' column is run through the module-level
    ``tokenizer`` once, padded or truncated to 512 tokens.
    """

    def __init__(self, df):
        def encode(text):
            return tokenizer(
                text,
                padding='max_length',
                max_length=512,
                truncation=True,
                return_tensors="pt",
            )

        self.texts = list(map(encode, df['text']))

    def __len__(self):
        """Number of stored texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer output stored at ``idx``."""
        return self.texts[idx]

In [None]:
def evaluate(model, test_data, threshold=0.5):
    """Predict a 0/1 label vector for every row of ``test_data``.

    The model is switched to evaluation mode for the duration of the call so
    that dropout does not randomise the predictions; its previous
    training/eval mode is restored afterwards. The threshold is applied to
    sigmoid probabilities rather than to the raw model outputs.

    Args:
        model: Module called as ``model(input_id, mask)``, returning one score per label.
        test_data: DataFrame with a 'text' column.
        threshold: Probability above which a label counts as predicted.

    Returns:
        A list with one list of 0/1 integers per input row, in input order.
    """
    test = Dataset_test(test_data)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=1)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    test_res = []

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():

            for test_input in tqdm(test_dataloader):

                # Batches arrive as (batch, 1, seq); drop the singleton axis.
                mask = test_input['attention_mask'].squeeze(1).to(device)
                input_id = test_input['input_ids'].squeeze(1).to(device)

                logits = model(input_id, mask)
                predictions = torch.gt(torch.sigmoid(logits), threshold).int()

                test_res.extend(predictions.cpu().numpy().tolist())
    finally:
        # Leave the caller's model in whichever mode it was handed over in.
        model.train(was_training)

    return test_res

test_res = evaluate(model, df_test)

100%|██████████| 3127/3127 [01:59<00:00, 26.09it/s]


In [None]:
# Map each 0/1 row back to its tuple of label names, then flatten every tuple
# into a single comma-separated string.
decoded_label_sets = encoder.inverse_transform(np.array(test_res))
out = [','.join(label_set) for label_set in decoded_label_sets]

In [None]:
df_test['output'] = out

In [None]:
df_test

Unnamed: 0,id,line,text,output
0,820791520,1,George III Lost America.,"Appeal_to_Fear-Prejudice,False_Dilemma-No_Choice"
1,820791520,3,Theresa May Could Lose the United Kingdom Over...,"Causal_Oversimplification,Guilt_by_Association..."
2,820791520,5,Britain is locked in the most serious peacetim...,"Guilt_by_Association,Name_Calling-Labeling,Obf..."
3,820791520,6,Brexit has shown the world a British parliamen...,"Guilt_by_Association,Name_Calling-Labeling,Red..."
4,820791520,7,One veteran of Margaret Thatcher’s cabinet sai...,"Causal_Oversimplification,Conversation_Killer,..."
...,...,...,...,...
3122,813953273,43,Rough sleepers and aggressive beggars are a pe...,"False_Dilemma-No_Choice,Obfuscation-Vagueness-..."
3123,813953273,44,"Yet while innocent blood runs in the gutters, ...","Appeal_to_Hypocrisy,Appeal_to_Popularity,Name_..."
3124,813953273,45,And instead of being able to celebrate an opti...,"Appeal_to_Fear-Prejudice,False_Dilemma-No_Choi..."
3125,813953273,46,"The first of many more to come, no doubt.","Appeal_to_Fear-Prejudice,Appeal_to_Hypocrisy,F..."


In [None]:
# Keep only the columns that go into the output file, in this order.
output = df_test[["id", "line", "output"]]
# Tab-separated, with no header row and no index column.
output.to_csv("/content/drive/MyDrive/CS546/course-project/SemEval/notebooks/en_subtask3/output_st3.txt", sep='\t', header=None, index=False)