In [None]:
#  !pip install transformers seqeval[gpu]

In [None]:
!pip install transformers==3.1.0

In [None]:
from pprint import pprint
from google.colab import drive
from google.colab import files
import regex as re
import pandas as pd
import numpy as np
from transformers import RobertaTokenizer, RobertaTokenizerFast, RobertaConfig, RobertaForSequenceClassification, AdamW
import torch
from torch.utils.data import Dataset, DataLoader
from torch import cuda
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

import glob
import os.path
import numpy as np
import sys
import codecs

In [None]:
# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cpu


In [None]:
# Mount Google Drive under /content/drive; the data files and the saved model
# used by later cells live below /content/drive/MyDrive.
# NOTE(review): force_remount=True presumably re-mounts even when a mount
# already exists, so the cell can be re-run — confirm against google.colab docs.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Read sentences from the stored files
def read_sentences_and_labels(filename):
    """Load lower-cased sentences and integer labels from a TSV file.

    Each non-blank line must hold exactly three tab-separated fields:
    ``article_id``, ``sentence`` and ``label``. The article id is ignored.

    Args:
        filename: Path of the tab-separated text file to read.

    Returns:
        A ``(sentences, labels)`` tuple of two parallel lists: the lower-cased
        sentence strings and their labels converted to ``int``.

    Raises:
        ValueError: If a non-blank line does not have exactly three fields or
            its label is not an integer.
    """
    sentences, labels = [], []
    # Decode explicitly as UTF-8: the corpus contains non-ASCII punctuation
    # (dashes, curly quotes), so relying on the platform default is fragile.
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Skip blank lines (e.g. a trailing newline at the end of the file)
            # instead of failing on the three-way unpack below.
            if not line:
                continue
            article_id, sentence, label = line.split('\t')
            sentences.append(sentence.lower())
            labels.append(int(label))
    return sentences, labels

In [None]:
# Load the training and dev splits from Drive.
train_sentences, train_labels = read_sentences_and_labels(
    "/content/drive/MyDrive/NLP/train_sentence_classification.txt"
)
dev_sentences, dev_labels = read_sentences_and_labels(
    "/content/drive/MyDrive/NLP/dev_sentence_classification.txt"
)

# Sentence and label counts must agree within each split.
print("Train Size:", len(train_sentences), len(train_labels))
print("Dev Size:", len(dev_sentences), len(dev_labels))

# Labels are integers, so their sum is the total of the label values per split.
print("Train Data Propanda Sentences:", sum(train_labels))
print("Dev Data Propanda Sentences:", sum(dev_labels))

In [None]:
# Sample the training data to overcome data imbalance:
# keep every label-1 sentence plus a random subset of at most 5000 label-0
# sentences, then shuffle the combined selection.
# NOTE(review): no random seed is set, so the subset differs between runs.

train_labels_np = np.array(train_labels)
train_sentences_np = np.array(train_sentences)

indices_of_1 = np.where(train_labels_np == 1)[0]
indices_of_0 = np.where(train_labels_np == 0)[0]

# Cap the request at the number of available negatives: np.random.choice
# raises ValueError when size exceeds the population and replace=False.
subset_of_0 = np.random.choice(
    indices_of_0, size=min(5000, len(indices_of_0)), replace=False
)

final_indices = np.append(subset_of_0, indices_of_1)
np.random.shuffle(final_indices)

train_sentences_final = train_sentences_np[final_indices]
train_labels_final = train_labels_np[final_indices]

# Spot-check that sentences and labels are still aligned after sampling.
# Compare through the sampled source index rather than list.index(): index()
# returns the first occurrence of a sentence, which may carry a different
# label when the corpus contains duplicate sentences.
for check_pos in (0, 50):
    if check_pos < len(final_indices):
        source_idx = final_indices[check_pos]
        assert train_sentences_final[check_pos] == train_sentences[source_idx]
        assert train_labels_final[check_pos] == train_labels[source_idx]


In [None]:
# Bare expression: the notebook echoes the array of sampled training sentences.
train_sentences_final

array(['credit for this dangerous scenario — in which getting an ar-15-style rifle is just a matter of a few computer clicks — goes to the trump administration for its inexplicable decision to settle a lawsuit it was on the verge of winning.',
       'and if so, could we have been this wrong?',
       '"the dna of our culture is preserved.', ...,
       'the duran’s alex christoforou and editor-in-chief alexander mercouris discuss the insurrection taking place at the us department of justice, as democrats, ex-obama officials, and doj directors are doing everything in their power to make sure the truth, about how the fisa warrant to spy on carter page was obtained, remains hidden from the eyes of the american public.',
       '8 – in coming to goldman sachs, powell joined a firm that has long been deeply tied to the clintons.',
       'the inspector general made clear when he launched his investigation in january 2017 that “his review will not substitute the oig\'s judgment for the judg

In [None]:
# # Code to perform lemmatization or Stemming
# # Commenting out entire cell as either approach didn't yield higher accuracy. 
# # Rather these approaches reduced the accuracy
# import nltk
# nltk.download('wordnet')
# nltk.download('punkt')
# from nltk.stem import WordNetLemmatizer, PorterStemmer
# stemmer = nltk.stem.SnowballStemmer('english')
# lemmatizer = WordNetLemmatizer()

# train_sentences_stemmed = []
# for sent in train_sentences_final:
#     # words = nltk.word_tokenize(sent)
#     words = sent.split(' ')
#     stemmed_output = ' '.join([lemmatizer.lemmatize(w) for w in words if len(w) > 1])
#     train_sentences_stemmed.append(stemmed_output)


# dev_sentences_stemmed = []
# for sent in dev_sentences:
#     words = nltk.word_tokenize(sent)
#     stemmed_output = ' '.join([lemmatizer.lemmatize(w) for w in words if len(w) > 1])
#     dev_sentences_stemmed.append(stemmed_output)

In [None]:
# print(train_sentences_final[0:3])
# print(train_sentences_stemmed[0:3])
# len(train_sentences_stemmed)

In [None]:
# Torch Dataset definition
class SC_Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-example labels.

    ``encodings`` is any mapping exposing ``.items()`` whose values can be
    indexed by example position; ``labels`` is an indexable sequence with one
    entry per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, followed by the label under 'labels'.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Hyperparameters for tokenization and training.
MAX_LEN = 256  # max_length handed to the tokenizer (used together with truncation=True)
TRAIN_BATCH_SIZE = 8  # batch size of the training DataLoader
VALID_BATCH_SIZE = 4  # NOTE(review): not referenced anywhere in the visible notebook
DEV_BATCH_SIZE = 4  # batch size of the dev DataLoader
EPOCHS = 5  # number of training epochs
LEARNING_RATE = 1e-5  # NOTE(review): the optimizer cell passes lr=2e-5 literally, not this constant — confirm which is intended
MAX_GRAD_NORM = 10  # only referenced by the commented-out gradient clipping in train()

In [None]:
#Load pretrained tokenizer and model
# The size of the classification head (num_labels) is the number of distinct
# label values found in the full training label list.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=len(set(train_labels)))
# Move the model to the selected device; as the cell's last expression the
# returned value is echoed by the notebook.
model.to(device)

In [None]:
# Print the tokenizer output for one training sentence as a quick sanity check.
print(tokenizer(train_sentences_final[0]))

# Tokenize both splits with truncation at MAX_LEN and padding enabled.
# The training sentences are a numpy array, so they are converted to a plain
# list first; dev_sentences is already a list.
train_encodings = tokenizer(train_sentences_final.tolist(), truncation=True, padding=True, max_length=MAX_LEN)
dev_encodings = tokenizer(dev_sentences, truncation=True, padding=True, max_length=MAX_LEN)

In [None]:
# Wrap encodings and labels in SC_Dataset: the training set uses the sampled
# (balanced) labels, the dev set uses the full dev labels.
train_dataset = SC_Dataset(train_encodings, train_labels_final)
dev_dataset = SC_Dataset(dev_encodings, dev_labels)

In [None]:
# Show both dataset sizes and the first training item (a dict of tensors).
len(train_dataset), len(dev_dataset), train_dataset[0]

In [None]:
# Training batches are reshuffled on every pass. Dev batches keep the dataset
# order so the collected predictions line up with dev_sentences / dev_labels;
# shuffling adds nothing at evaluation time.
train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=DEV_BATCH_SIZE, shuffle=False)
# NOTE(review): lr is hard-coded to 2e-5 while the config cell defines
# LEARNING_RATE = 1e-5 — confirm which value is intended before unifying them.
optimizer = AdamW(model.parameters(), lr=2e-5)

In [None]:
# Code to train the model with our dataset
def train(epoch):
    """Run one pass over ``train_loader`` and update ``model`` in place.

    Uses the notebook-level ``model``, ``optimizer``, ``train_loader`` and
    ``device``. Every 100 batches it prints the mean loss accumulated since
    the start of this call.

    Args:
        epoch: Index of the current epoch; accepted for the caller's
            convenience but not used inside the function.
    """
    model.train()
    running_loss = 0
    steps_done = 0

    for step, batch in enumerate(train_loader, start=1):
        optimizer.zero_grad()

        # Move the batch to the target device and run the forward pass;
        # with labels supplied, the first output element is the loss.
        outputs = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=batch['labels'].to(device),
        )
        loss = outputs[0]

        running_loss += loss.item()
        steps_done += 1

        if step % 100 == 0:
            loss_step = running_loss / steps_done
            print(f"Training loss per 100 training steps: {loss_step}")

        loss.backward()

        # Gradient clipping is disabled. If re-enabled it belongs here, after
        # backward() has produced the gradients and before the optimizer step:
        # torch.nn.utils.clip_grad_norm_(
        #     parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        # )

        optimizer.step()
    # # Uncomment to save model after each epoch
    # torch.save(model, "/content/drive/MyDrive/NLP/RoBERTa_Task_SentClassify.pt")

In [None]:
# Train for EPOCHS passes over the training data (the config constant is
# EPOCHS; the previous name EPOCH was undefined and raised NameError).
# Note: the "Model saved" message is only accurate when the torch.save line
# inside train() is uncommented.
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)
    print("Model saved after training for {} epochs".format(epoch+1))
# Switch to evaluation mode once training is done.
model.eval()

In [None]:
# Save and Load the model
# Comment/Uncomment as required
# The file name now matches the path used by the load line below (it was
# previously written with a doubled ".pt.pt" suffix that the load would miss).
torch.save(model, "/content/drive/MyDrive/NLP/RoBERTa_Task_SentClassify.pt")
# NOTE: torch.load unpickles the whole model object — only load files you trust.
# model = torch.load("/content/drive/MyDrive/NLP/RoBERTa_Task_SentClassify.pt", map_location=torch.device('cpu'))
# model.eval()

In [None]:
#Get predictions for Dev dataset
# Runs the model over every batch of dev_loader with gradient tracking
# disabled and collects, batch by batch, the logits and the gold labels as
# numpy arrays in `predictions` and `true_labels`.
import numpy as np
model.eval()
result = 0
predictions,  true_labels = [], []

bat_test = []

with torch.no_grad():
    for batch in dev_loader:
        # Keeps a reference to the most recent batch (not read again in this cell).
        bat_test = batch
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Labels are supplied, so the output is indexed as [0] = loss (as in
        # train()) and [1] = logits.
        result = model(ids, attention_mask=mask, labels=labels)
        logits = result[1]

        # Move to CPU and convert to numpy so batches can be concatenated later.
        predictions.append(logits.detach().cpu().numpy())
        true_labels.append(labels.detach().cpu().numpy())


In [None]:
# Inspect the gold-label array collected for the first evaluated batch.
true_labels[0]

In [None]:
# Join the per-batch logit arrays into one array and reduce each row to the
# index of its largest logit, i.e. the predicted class id.
flat_predictions = np.concatenate(predictions, axis=0).argmax(axis=1).flatten()
# Join the per-batch gold labels into one array.
flat_true_labels = np.concatenate(true_labels, axis=0)

In [None]:
# Compute the model metrics on the dev set, one line per metric.
for metric_label, metric_fn in (
    ("Dev Accuracy:", accuracy_score),
    ("Dev Precision:", precision_score),
    ("Dev Recall:", recall_score),
    ("Dev F1 score:", f1_score),
):
    print(metric_label, metric_fn(flat_true_labels, flat_predictions))