In [17]:
1+2

3

In [None]:
!pip install transformers==3.0.2
!pip install seaborn

Collecting seaborn
  Downloading seaborn-0.11.2-py3-none-any.whl (292 kB)
[K     |################################| 292 kB 23.8 MB/s eta 0:00:01
[?25hCollecting matplotlib>=2.2
  Downloading matplotlib-3.3.4-cp36-cp36m-manylinux1_x86_64.whl (11.5 MB)
[K     |################################| 11.5 MB 32.4 MB/s eta 0:00:01
Collecting kiwisolver>=1.0.1
  Downloading kiwisolver-1.3.1-cp36-cp36m-manylinux1_x86_64.whl (1.1 MB)
[K     |################################| 1.1 MB 29.3 MB/s eta 0:00:01
[?25hCollecting pillow>=6.2.0
  Downloading Pillow-8.3.2-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[K     |################################| 3.0 MB 30.1 MB/s eta 0:00:01
[?25hCollecting cycler>=0.10
  Downloading cycler-0.10.0-py2.py3-none-any.whl (6.5 kB)
Installing collected packages: kiwisolver, pillow, cycler, matplotlib, seaborn
Successfully installed cycler-0.10.0 kiwisolver-1.3.1 matplotlib-3.3.4 pillow-8.3.2 seaborn-0.11.2
You should consider upgrading via the

In [18]:
# Importing the libraries needed
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer
import logging
logging.basicConfig(level=logging.ERROR)

In [19]:
# Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [20]:
train = pd.read_csv('GEN-sarc-notsarc.csv')
data=train.copy()
data.drop(columns=['id'],axis=1,inplace=True)
classes = {"notsarc" : 0,"sarc" : 1}
data["class"] = data["class"].map(classes)
train=data

In [21]:
new_df = train[['text', 'class']]

In [22]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 256
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
# EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)

In [23]:
class SentimentData(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data['class']
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [24]:
train_size = 0.8
train_data=new_df.sample(frac=train_size,random_state=200)
test_data=new_df.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)


print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = SentimentData(train_data, tokenizer, MAX_LEN)
testing_set = SentimentData(test_data, tokenizer, MAX_LEN)

FULL Dataset: (6520, 2)
TRAIN Dataset: (5216, 2)
TEST Dataset: (1304, 2)


In [25]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [26]:
class RobertaClass(torch.nn.Module):
    def __init__(self):
        super(RobertaClass, self).__init__()
        self.l1 = RobertaModel.from_pretrained("roberta-base")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 5)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [27]:
model = RobertaClass()
model.to(device)

RobertaClass(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-05, eleme

In [28]:
# Creating the loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [29]:
def calcuate_accuracy(preds, targets):
    n_correct = (preds==targets).sum().item()
    return n_correct

In [30]:
# Defining the training function on the 80% of the dataset for tuning the distilbert model

def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accuracy(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        
        if _%5000==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return 

In [31]:
EPOCHS = 3
for epoch in range(EPOCHS):
    train(epoch)

0it [00:00, ?it/s]

Training Loss per 5000 steps: 1.6288453340530396
Training Accuracy per 5000 steps: 0.0


652it [09:13,  1.18it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 0: 72.60352760736197
Training Loss Epoch: 0.5740465472560902
Training Accuracy Epoch: 72.60352760736197
Training Loss per 5000 steps: 0.6465620398521423
Training Accuracy per 5000 steps: 62.5


652it [09:12,  1.18it/s]
0it [00:00, ?it/s]

The Total Accuracy for Epoch 1: 82.66871165644172
Training Loss Epoch: 0.39820578068129125
Training Accuracy Epoch: 82.66871165644172
Training Loss per 5000 steps: 0.0729588195681572
Training Accuracy per 5000 steps: 100.0


652it [09:14,  1.18it/s]

The Total Accuracy for Epoch 2: 88.24769938650307
Training Loss Epoch: 0.2944144820080259
Training Accuracy Epoch: 88.24769938650307





In [32]:
cuda0 = torch.device('cuda:0')
y_pred=torch.tensor([], dtype=torch.long, device=cuda0)
y_actual=torch.tensor([], dtype=torch.long, device=cuda0)

In [33]:
def valid(model, testing_loader):
    model.eval()
    n_correct = 0; n_wrong = 0; total = 0; tr_loss=0; nb_tr_steps=0; nb_tr_examples=0
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, mask, token_type_ids).squeeze()
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accuracy(big_idx, targets)
            global y_pred
            global y_actual
            y_pred=torch.cat((y_pred,big_idx))
            y_actual=torch.cat((y_actual,targets))

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)
            
            if _%5000==0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accu_step}")
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")
    
    return epoch_accu


In [34]:
acc = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)

1it [00:00,  5.15it/s]

Validation Loss per 100 steps: 0.025589914992451668
Validation Accuracy per 100 steps: 100.0


326it [00:50,  6.51it/s]

Validation Loss Epoch: 0.4802223685557491
Validation Accuracy Epoch: 81.05828220858896
Accuracy on test data = 81.06%





In [36]:
from sklearn.metrics import classification_report
labels = [0, 1]
print(classification_report(y_actual.cpu().numpy(),y_pred.cpu().numpy(),labels=labels))

              precision    recall  f1-score   support

           0       0.82      0.80      0.81       671
           1       0.80      0.82      0.81       633

    accuracy                           0.81      1304
   macro avg       0.81      0.81      0.81      1304
weighted avg       0.81      0.81      0.81      1304



In [37]:
testdata2 = pd.read_csv('HYP-sarc-notsarc.csv')
testdata2.drop(columns=['id'],axis=1,inplace=True)
classes = {"notsarc" : 0,"sarc" : 1}
testdata2["class"] = testdata2["class"].map(classes)
testdata2 = testdata2[['text', 'class']]

testing_set2 = SentimentData(testdata2, tokenizer, MAX_LEN)
testing_loader2 = DataLoader(testing_set2, **test_params)
acc2 = valid(model, testing_loader2)
print(classification_report(y_actual.cpu().numpy(),y_pred.cpu().numpy(),labels=labels))

2it [00:00,  6.63it/s]

Validation Loss per 100 steps: 1.7272158861160278
Validation Accuracy per 100 steps: 50.0


291it [00:45,  6.43it/s]

Validation Loss Epoch: 0.8704795841498882
Validation Accuracy Epoch: 66.1512027491409
              precision    recall  f1-score   support

           0       0.81      0.64      0.71      1253
           1       0.69      0.84      0.76      1215

    accuracy                           0.74      2468
   macro avg       0.75      0.74      0.74      2468
weighted avg       0.75      0.74      0.74      2468






In [38]:
testdata3 = pd.read_csv('RQ-sarc-notsarc.csv')
testdata3.drop(columns=['id'],axis=1,inplace=True)
classes = {"notsarc" : 0,"sarc" : 1}
testdata3["class"] = testdata3["class"].map(classes)
testdata3 = testdata3[['text', 'class']]

testing_set3 = SentimentData(testdata3, tokenizer, MAX_LEN)
testing_loader3 = DataLoader(testing_set3, **test_params)
acc3 = valid(model,testing_loader3)
print(classification_report(y_actual.cpu().numpy(),y_pred.cpu().numpy(),labels=labels))

2it [00:00,  6.54it/s]

Validation Loss per 100 steps: 0.08930418640375137
Validation Accuracy per 100 steps: 100.0


426it [01:06,  6.42it/s]

Validation Loss Epoch: 0.48683289336390406
Validation Accuracy Epoch: 80.25851938895417
              precision    recall  f1-score   support

           0       0.81      0.70      0.75      2104
           1       0.73      0.83      0.78      2066

    accuracy                           0.77      4170
   macro avg       0.77      0.77      0.76      4170
weighted avg       0.77      0.77      0.76      4170






In [39]:
output_model_file = 'Sarcasm_RoBERTa_Model.bin'
output_vocab_file = './'

model_to_save = model
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')

All files saved
