In [None]:
# Connect to Neptune; project name and API token are read from the environment.
# NOTE(review): this cell uses `os`, `neptune` and `training_params`, all of
# which are only defined in cells further down the notebook.
neptune.init(
    project_qualified_name=os.environ["NEPTUNE_PROJECT"],
    api_token=os.environ["NEPTUNE_API_TOKEN"],
)

# Open the experiment that every later `neptune.log_*` call reports to.
neptune.create_experiment(
    name="22122020_04",
    params=training_params,
    tags=["huggingface", "bert", "80-20"],
    upload_source_files=["SemEval-subtask1.ipynb"],
)

In [127]:
import itertools
import json
import logging
import os
import random
import subprocess
from pprint import pprint

import neptune
import numpy as np
import pandas as pd
import torch
import transformers
from sklearn import metrics
from sklearn.model_selection import train_test_split
from torch import cuda
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
# Run on the GPU when CUDA reports one, otherwise fall back to the CPU.
if cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [128]:
# Input files, relative to the notebook's working directory.
# Both are parsed as JSON by `read_data`.
train_path, dev_path = (
    "data/training_set_task1.txt",
    "data/dev_set_task1.txt",
)

In [130]:
def set_seed(seed):
    """Seed every random number generator this notebook touches.

    Seeds Python's `random`, NumPy's global generator and PyTorch (CPU and
    all CUDA devices), and switches cuDNN to deterministic mode with
    benchmarking disabled.

    Args:
        seed: integer seed shared by all generators.
    """
    # Python and NumPy generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # cuDNN flags.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True


set_seed(1)

In [131]:
def read_data(path, inference=False):
    """Load a JSON task file into a DataFrame and, for training, encode labels.

    Args:
        path: path of a JSON file holding a list of records. For training
            each record must provide a ``labels`` list.
        inference: when truthy, the frame is returned as loaded and no label
            processing happens.

    Returns:
        ``df`` alone when ``inference`` is truthy; otherwise a tuple
        ``(df, all_labels)`` where ``all_labels`` lists the distinct labels in
        order of first appearance and ``df`` has an extra ``label_list``
        column holding one 0/1 flag per entry of ``all_labels``.
    """
    # Explicit encoding so decoding does not depend on the platform default.
    with open(path, "r", encoding="utf-8") as f:
        df = pd.DataFrame(json.load(f))

    if inference:
        return df

    # Exploding an empty label list yields NaN. Drop it by value rather than
    # by a hard-coded position, which would remove a real label whenever the
    # first unlabelled row is not where the old `pop(1)` expected it.
    all_labels = list(df["labels"].explode().dropna().unique())

    def encode_labels(labels):
        """Return the multi-hot vector of `labels` over `all_labels`."""
        present = set(labels)
        return [1 if label in present else 0 for label in all_labels]

    df["label_list"] = df["labels"].map(encode_labels)
    return df, all_labels

In [132]:
# Parse the labelled training file; `all_labels` fixes the column order of
# the multi-hot vectors stored in `label_list`.
df, all_labels = read_data(path=train_path)
df.head(n=3)

Unnamed: 0,id,labels,text,label_list
0,128,[Black-and-white Fallacy/Dictatorship],THERE ARE ONLY TWO GENDERS\n\nFEMALE \n\nMALE\n,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,189,[],This is not an accident!,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,96,"[Slogans, Loaded Language, Smears, Name callin...",SO BERNIE BROS HAVEN'T COMMITTED VIOLENCE EH?\...,"[0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [133]:
# Hold out 20% of the labelled rows for validation; the fixed random_state
# keeps the split identical between runs.
train_df, valid_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [134]:
# Hyper-parameters and bookkeeping values for this run. The whole mapping is
# also handed to `neptune.create_experiment(params=...)`.
# NOTE(review): 0.001 is well above the 2e-5..5e-5 range usually quoted for
# fine-tuning BERT with Adam - confirm this value is intentional.
# NOTE(review): "shuffle" is not passed to any DataLoader in this notebook.
training_params = dict(
    model="BERT",
    epochs=5,
    optimizer="Adam",
    learning_rate=0.001,
    train_batch_size=8,
    val_batch_size=8,
    max_len=200,
    shuffle=False,
    num_workers=0,
    loss_fn="BCEWithLogitsLoss",
    metric1="train_f1_micro",
    metric2="train_f1_macro",
    metric3="val_f1_micro",
    metric4="val_f1_macro",
)

In [114]:
# Renumber both splits 0..n-1 (the shuffled original index is discarded) so
# downstream code can address rows by position.
train_df = train_df.reset_index(drop=True)
valid_df = valid_df.reset_index(drop=True)

In [115]:
class CustomDataset(Dataset):
    """Map-style dataset yielding tokenized text together with its targets.

    Args:
        dataframe: frame with a ``text`` column and a ``label_list`` column
            (one 0/1 list per row).
        tokenizer: object exposing ``encode_plus`` (a ``BertTokenizer`` in
            this notebook).
        max_len: length every encoded sequence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.label_list
        self.max_len = max_len

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.text)

    def __getitem__(self, index):
        """Return the encoded row at position ``index`` as a dict of tensors.

        Keys: ``ids``, ``mask``, ``token_type_ids`` (all ``torch.long``) and
        ``targets`` (``torch.float``).
        """
        # Positional lookup: the sampler hands out 0..len-1, which only
        # matches the frame's labels if its index happens to be a fresh
        # RangeIndex. `.iloc` works for any index.
        text = str(self.text.iloc[index])
        # Collapse every run of whitespace (including newlines) to one space.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            padding="max_length",
            # Without truncation, texts longer than max_len produce longer
            # sequences, which cannot be stacked into a batch.
            truncation=True,
            max_length=self.max_len,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [116]:
# Build the datasets and data loaders that feed the network.

# One tokenizer instance is enough; both datasets share it.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def seed_worker(worker_id):
    """Seed NumPy and `random` inside a DataLoader worker process.

    `worker_init_fn` must be a callable. The previous
    `worker_init_fn=random.seed(1)` called `random.seed` immediately and
    passed its return value (None), so no per-worker seeding was configured.
    """
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)


training_set = CustomDataset(dataframe=train_df,
                             tokenizer=tokenizer,
                             max_len=training_params["max_len"])

training_loader = DataLoader(training_set,
                             batch_size=training_params["train_batch_size"],
                             # Honour the configured flag (currently False,
                             # which is also the DataLoader default).
                             shuffle=training_params["shuffle"],
                             num_workers=training_params["num_workers"],
                             worker_init_fn=seed_worker)

validation_set = CustomDataset(dataframe=valid_df,
                               tokenizer=tokenizer,
                               max_len=training_params["max_len"])

validation_loader = DataLoader(validation_set,
                               batch_size=training_params["val_batch_size"],
                               shuffle=False,
                               num_workers=training_params["num_workers"],
                               worker_init_fn=seed_worker)

In [117]:
# Custom model: a dropout layer and a dense layer on top of BERT produce the final output of the model.

class BERTClass(torch.nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear output layer.

    Args:
        num_labels: number of output logits. Defaults to 20, the size that
            was previously hard-coded.
        dropout: dropout probability applied before the linear layer.
            Defaults to 0.3, the value that was previously hard-coded.
    """

    def __init__(self, num_labels=20, dropout=0.3):
        super().__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(dropout)
        # Take the input width from the loaded checkpoint instead of assuming 768.
        self.l3 = torch.nn.Linear(self.l1.config.hidden_size, num_labels)

    def forward(self, ids, mask, token_type_ids):
        """Return one logit per label for each sequence in the batch."""
        outputs = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids)
        # Element 1 of the encoder output feeds the head.
        # NOTE(review): for BertModel this should be the pooled output -
        # confirm against the installed transformers version.
        pooled = outputs[1]
        return self.l3(self.l2(pooled))

# Build the classifier and move its parameters to the selected device.
# `Module.to` returns the module itself, so the bare name below displays the
# same repr as before.
model = BERTClass().to(device)
model

BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

In [118]:
# Training function

def train(epochs):
    """Fine-tune the global `model` and report F1 scores.

    Runs `epochs` passes over the global `training_loader`, evaluating on the
    global `validation_loader` after each pass. Per-batch losses are printed
    and logged to Neptune. After the last epoch, micro/macro F1 is computed
    on the training predictions collected during that epoch and on the
    predictions of the final validation pass.

    Args:
        epochs: number of passes over the training data; must be at least 1.

    Returns:
        The global `model`, updated in place.

    Raises:
        ValueError: if `epochs` is smaller than 1.
    """
    if epochs < 1:
        # With zero epochs there would be no predictions to score.
        raise ValueError("epochs must be >= 1, got {}".format(epochs))

    criterion = torch.nn.BCEWithLogitsLoss()

    def loss_fn(outputs, targets):
        """Binary cross-entropy on raw logits."""
        return criterion(outputs, targets)

    def calculate_metrics(outputs, targets):
        """Return (micro F1, macro F1) for lists of multi-hot rows."""
        f1_micro = metrics.f1_score(targets, outputs, average="micro")
        f1_macro = metrics.f1_score(targets, outputs, average="macro")
        return f1_micro, f1_macro

    def make_preds(outputs, targets, fin_outputs, fin_targets):
        """Threshold one batch of logits at 0.5 and append to the running lists."""
        batch_preds = (torch.sigmoid(outputs.detach()) >= 0.5).int().cpu().tolist()
        batch_targets = targets.detach().int().cpu().tolist()
        fin_targets.extend(batch_targets)
        fin_outputs.extend(batch_preds)
        return fin_outputs, fin_targets

    def validate(model, validation_loader, epoch):
        """Run one evaluation pass; return (predictions, targets)."""
        val_targets = []
        val_outputs = []
        model.eval()

        # No gradients are needed for evaluation.
        with torch.no_grad():
            for batch_idx, data in enumerate(validation_loader, 0):
                ids = data['ids'].to(device, dtype=torch.long)
                mask = data['mask'].to(device, dtype=torch.long)
                token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
                targets = data['targets'].to(device, dtype=torch.float)
                outputs = model(ids, mask, token_type_ids)
                loss = loss_fn(outputs, targets)
                print("Validation Loss: {}".format(loss.item()))
                neptune.log_metric("Validation epoch", epoch + 1)
                neptune.log_metric("Validation batch", batch_idx)
                neptune.log_metric("Validation loss", loss.item())

                val_outputs, val_targets = make_preds(outputs, targets, val_outputs, val_targets)

        return val_outputs, val_targets

    # Train model

    optimizer = torch.optim.Adam(params=model.parameters(), lr=training_params["learning_rate"])
    train_targets = []
    train_outputs = []
    val_outputs, val_targets = [], []

    for epoch in range(epochs):
        print("Epoch {}".format(epoch + 1))

        # validate() leaves the model in eval mode, so training mode has to
        # be restored at the start of every epoch, not just once.
        model.train()

        for batch_idx, data in enumerate(training_loader, 0):
            optimizer.zero_grad()

            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)

            outputs = model(ids, mask, token_type_ids)

            loss = loss_fn(outputs, targets)
            print("Train Loss: {}".format(loss.item()))
            neptune.log_metric("Train epoch", epoch + 1)
            neptune.log_metric("Train batch", batch_idx)
            neptune.log_metric("Train loss", loss.item())

            loss.backward()
            optimizer.step()

            # Training predictions are only collected during the final epoch.
            if epoch == (epochs - 1):
                train_outputs, train_targets = make_preds(outputs, targets, train_outputs, train_targets)

        val_outputs, val_targets = validate(model, validation_loader, epoch)

    train_f1_micro, train_f1_macro = calculate_metrics(train_outputs, train_targets)
    print("Train F1 Micro score: ", train_f1_micro)
    print("Train F1 Macro score: ", train_f1_macro)
    neptune.log_metric("Train f1_micro", train_f1_micro)
    neptune.log_metric("Train f1_macro", train_f1_macro)

    val_f1_micro, val_f1_macro = calculate_metrics(val_outputs, val_targets)
    print("Validation F1 Micro score: ", val_f1_micro)
    print("Validation F1 Macro score: ", val_f1_macro)
    neptune.log_metric("Validation f1_micro", val_f1_micro)
    neptune.log_metric("Validation f1_macro", val_f1_macro)
    print(val_outputs)
    print("")
    print(val_targets)
    return model

In [None]:
# Fine-tune for the configured number of epochs and keep the returned model.
model = train(training_params["epochs"])

In [94]:
def save_model(PATH):
    """Write the global `model`'s state dict to `PATH` with `torch.save`."""
    weights = model.state_dict()
    torch.save(weights, PATH)


save_model("state_dict_model.pt")

In [66]:
def generate_prediction_file(df, preds, path="preds.txt", label_names=None):
    """Write predictions as a JSON list of ``{"id": ..., "labels": [...]}`` records.

    Args:
        df: frame with an ``id`` column, one row per prediction. It is no
            longer modified by this function.
        preds: one 0/1 sequence per row of ``df``; position ``i`` refers to
            entry ``i`` of the label list.
        path: output file. Defaults to ``'preds.txt'``, the previously
            hard-coded location.
        label_names: label list used for decoding. Defaults to the global
            ``all_labels``.

    Raises:
        ValueError: if ``preds`` and ``df`` differ in length.
    """
    names = all_labels if label_names is None else label_names

    def get_labels(pred_list):
        """Names of the labels whose flag equals 1."""
        return [names[idx] for idx, pred in enumerate(pred_list) if pred == 1]

    preds = list(preds)
    if len(preds) != len(df):
        raise ValueError(
            "got {} predictions for {} rows".format(len(preds), len(df))
        )

    # Work on a copy: adding columns to `df` itself would change the caller's
    # frame, and renaming in place on a column slice triggers pandas'
    # SettingWithCopyWarning.
    preds_df = df[["id"]].copy()
    preds_df["labels"] = [get_labels(pred_list) for pred_list in preds]

    preds_json = preds_df.to_json(orient="records")
    with open(path, "w", encoding="utf-8") as f:
        f.write(preds_json)

In [55]:
# neptune.log_artifact("preds.txt")

In [None]:
# neptune.log_artifact("state_dict_model.pt")

In [58]:
# Load model
PATH = "state_dict_model.pt"
model = BERTClass()
model.to(device)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU machine also loads on a CPU-only one.
model.load_state_dict(torch.load(PATH, map_location=device))

<All keys matched successfully>

In [120]:
class TestDataset(Dataset):
    """Map-style dataset yielding tokenized text only (no targets).

    Args:
        dataframe: frame with a ``text`` column.
        tokenizer: object exposing ``encode_plus`` (a ``BertTokenizer`` in
            this notebook).
        max_len: length every encoded sequence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.max_len = max_len

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.text)

    def __getitem__(self, index):
        """Return the encoded row at position ``index`` as a dict of tensors.

        Keys: ``ids``, ``mask`` and ``token_type_ids`` (all ``torch.long``).
        """
        # Positional lookup works for any frame index, not just 0..n-1.
        text = str(self.text.iloc[index])
        # Collapse every run of whitespace (including newlines) to one space.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            padding="max_length",
            # Without truncation, texts longer than max_len produce longer
            # sequences, which cannot be stacked into a batch.
            truncation=True,
            max_length=self.max_len,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
        }

In [121]:
# Load dev set (with inference=True, read_data returns just the frame).
dev_df = read_data(dev_path, inference=True)

dev_set = TestDataset(dataframe=dev_df,
                      tokenizer=BertTokenizer.from_pretrained('bert-base-uncased'),
                      max_len=training_params["max_len"])

# The former `worker_init_fn=random.seed(1)` argument called `random.seed`
# on the spot and passed its return value (None), i.e. the DataLoader
# default, so it is dropped. shuffle=False is stated explicitly because the
# predictions are later matched to the rows of `dev_df` by position.
dev_loader = DataLoader(dev_set,
                        batch_size=training_params["train_batch_size"],
                        shuffle=False,
                        num_workers=training_params["num_workers"])

In [122]:
def test(testing_loader, model):
    """Run `model` over `testing_loader` and return thresholded predictions.

    The model is switched to eval mode and gradients are disabled. Each
    batch must provide ``ids``, ``mask`` and ``token_type_ids`` tensors.

    Args:
        testing_loader: iterable of batch dicts.
        model: callable taking ``(ids, mask, token_type_ids)`` and returning
            logits.

    Returns:
        A list with one list of 0/1 ints per input row; an entry is 1 when
        the sigmoid of the corresponding logit is at least 0.5.
    """
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in testing_loader:
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            logits = model(ids, mask, token_type_ids)
            probabilities = torch.sigmoid(logits).cpu().detach().numpy()
            batch_predictions = (probabilities >= 0.5).astype(int).tolist()
            predictions.extend(batch_predictions)
    return predictions


In [123]:
# Predict a 0/1 vector for every row of the dev set.
dev_preds = test(testing_loader=dev_loader, model=model)

In [None]:
# Decode the dev predictions into label names and write them to disk.
generate_prediction_file(df=dev_df, preds=dev_preds)

In [101]:
# Hand the prediction file written by `generate_prediction_file` to Neptune.
# NOTE(review): presumably this uploads it to the experiment created at the
# top of the notebook - confirm against the Neptune client docs.
neptune.log_artifact("preds.txt")

In [125]:
# End the Neptune session started by `neptune.init` / `create_experiment`.
# NOTE(review): presumably no further `neptune.log_*` calls are accepted
# after this - confirm against the Neptune client docs.
neptune.stop()