## Import packages

In [1]:
!cp -r ../input/earlystopping/early-stopping-pytorch-master/* ./
from pytorchtools import EarlyStopping

In [2]:
!pip install transformers

You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer
import logging
logging.basicConfig(level=logging.ERROR)

In [4]:
# Pick the compute device once: the GPU when CUDA is usable, the CPU otherwise.

from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

## Import dataset

In [5]:
# Hyper-parameters shared by the cells below.
# MAX_LEN is handed to SentimentData as the per-example token length.
MAX_LEN = 128
# Batch sizes for the training and validation DataLoaders.
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
# Learning rate given to the Adam optimizer.
LEARNING_RATE = 1e-05
# NOTE(review): the recorded training output warns that truncation "was not
# explicitly activated", so `truncation=True` passed here does not appear to
# enable truncation at encode time. `do_lower_case=True` is presumably not
# needed for RoBERTa (cased vocabulary) — TODO confirm whether it has any effect
# in the installed transformers version.
tokenizer = RobertaTokenizer.from_pretrained('roberta-large', truncation=True, do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




In [6]:
class SentimentData(Dataset):
    """Dataset that tokenizes the ``text`` column of a DataFrame on access.

    Args:
        dataframe: pandas DataFrame with a ``text`` column and, when
            ``mode == "train"``, a ``sentiment_label`` column.
        tokenizer: object exposing ``encode_plus`` that returns a mapping with
            ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        max_len: length every encoded example is padded / truncated to.
        mode: ``"train"`` to also return the label under ``'targets'``;
            any other value yields inputs only.
    """

    def __init__(self, dataframe, tokenizer, max_len, mode = "train"):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.mode = mode
        if self.mode == "train":
            self.targets = self.data.sentiment_label
        self.max_len = max_len

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.text)

    def __getitem__(self, index):
        """Return the encoded tensors (and the target in train mode) for row ``index``.

        ``index`` is positional, so the DataFrame does not need a default
        0..n-1 index (e.g. it may have been filtered or split beforehand).
        """
        # .iloc = positional access; plain [] is label-based and raises KeyError
        # as soon as the frame's index is not a clean RangeIndex.
        text = str(self.text.iloc[index])
        # Collapse any run of whitespace into a single space.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # Explicit padding/truncation replaces the deprecated
            # `pad_to_max_length` flag and guarantees fixed-length tensors.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )

        item = {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
        }
        if self.mode == "train":
            # Kept as float for backward compatibility; the training and
            # validation loops cast it to long before computing the loss.
            item['targets'] = torch.tensor(self.targets.iloc[index], dtype=torch.float)
        return item

In [7]:
# Labelled splits: train.csv is used for fitting, val.csv (held in `test_data`)
# for validation during training.
train_data = pd.read_csv(
    '../input/twitter-sentimentmis583/twitter_sentiment/train.csv'
)
test_data = pd.read_csv(
    '../input/twitter-sentimentmis583/twitter_sentiment/val.csv'
)

# Quick sanity check of (rows, columns) for each split.
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

# Wrap both frames in the tokenizing dataset (default mode "train" => with targets).
training_set = SentimentData(dataframe=train_data, tokenizer=tokenizer, max_len=MAX_LEN)
validation_set = SentimentData(dataframe=test_data, tokenizer=tokenizer, max_len=MAX_LEN)

TRAIN Dataset: (10248, 2)
TEST Dataset: (1464, 2)


In [8]:
# DataLoader keyword arguments for each split; both shuffle and load in the
# main process (num_workers=0).
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

training_loader = DataLoader(training_set, **train_params)
validation_loader = DataLoader(validation_set, **test_params)

## Building model (RoBERTa)

In [9]:
class RobertaClass(torch.nn.Module):
    """RoBERTa encoder followed by a small classification head.

    The head takes the hidden state of the first token, applies
    Linear -> ReLU -> Dropout -> Linear and returns raw logits.

    Args:
        model_name: checkpoint name/path passed to ``RobertaModel.from_pretrained``.
        num_labels: number of output classes (size of the logits' last dim).
        dropout: dropout probability applied before the final linear layer.

    The defaults reproduce the original hard-coded configuration
    ('roberta-large', 3 classes, dropout 0.5).
    """

    def __init__(self, model_name='roberta-large', num_labels=3, dropout=0.5):
        super(RobertaClass, self).__init__()
        self.l1 = RobertaModel.from_pretrained(model_name)
        # Read the encoder width from its config instead of hard-coding 1024,
        # so other checkpoints (e.g. a base-size model) work as well.
        hidden_size = self.l1.config.hidden_size
        self.pre_classifier = torch.nn.Linear(hidden_size, hidden_size)
        self.dropout = torch.nn.Dropout(dropout)
        self.classifier = torch.nn.Linear(hidden_size, num_labels, bias=True)
        torch.nn.init.normal_(self.classifier.weight, std=0.02)
        # The bias was previously initialised with normal_(bias, 0), i.e.
        # mean 0 and the *default std of 1.0*, which gives each class a large
        # random offset at start. Zero-initialise it instead.
        torch.nn.init.zeros_(self.classifier.bias)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return classification logits, one row per input sequence."""
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        # Use the first token's hidden state as the sequence representation.
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.relu(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [10]:
# Build the classifier and place its parameters on the selected device.
model = RobertaClass().to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=482.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1425941629.0, style=ProgressStyle(descr…




In [11]:
# Objective: cross-entropy over the class logits.
loss_function = torch.nn.CrossEntropyLoss()
# Optimiser: Adam over every model parameter (encoder and head alike).
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [12]:
def calcuate_accuracy(preds, targets):
    """Count how many predictions equal their targets.

    Despite the name, this returns the number of correct predictions
    (a Python scalar), not a ratio.
    """
    matches = preds == targets
    return matches.sum().item()

In [13]:
def train():
    """Run one training epoch over the module-level ``training_loader``.

    Uses the module-level ``model``, ``optimizer``, ``loss_function`` and
    ``device``.

    Returns:
        (accuracy, loss): accuracy in percent over all seen examples and the
        mean of the per-batch losses.
    """
    model.train()

    running_loss = 0
    correct = 0
    steps = 0
    seen = 0

    progress = tqdm(training_loader)
    for batch in progress:
        progress.set_description("Training")

        # Move the batch to the compute device.
        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.long)

        # Forward pass and bookkeeping.
        logits = model(ids, mask, token_type_ids)
        loss = loss_function(logits, targets)
        running_loss += loss.item()
        _, predicted = torch.max(logits.data, dim=1)
        correct += calcuate_accuracy(predicted, targets)

        steps += 1
        seen += targets.size(0)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = running_loss / steps
    epoch_accu = (correct * 100) / seen

    return epoch_accu, epoch_loss

In [14]:
def valid(model, validation_loader):
    """Evaluate ``model`` on ``validation_loader`` without updating weights.

    Uses the module-level ``loss_function`` and ``device``.

    Args:
        model: module called as ``model(ids, mask, token_type_ids)`` that
            returns one row of class logits per example.
        validation_loader: iterable of batches with the keys ``'ids'``,
            ``'mask'``, ``'token_type_ids'`` and ``'targets'``.

    Returns:
        (accuracy, loss): accuracy in percent over all examples and the mean
        of the per-batch losses.
    """
    model.eval()
    n_correct = 0
    tr_loss = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    with torch.no_grad():
        pbar = tqdm(validation_loader)
        for data in pbar:
            pbar.set_description("Validation")
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            # Deliberately no .squeeze(): with a final batch of size 1 it would
            # drop the batch dimension and break both the loss and max(dim=1).
            outputs = model(ids, mask, token_type_ids)
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs, dim=1)
            n_correct += calcuate_accuracy(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)

    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples

    return epoch_accu, epoch_loss


## Training

In [15]:
# Upper bound on epochs, and how many non-improving epochs are tolerated.
epoch, patience = 10, 3
early_stopping = EarlyStopping(patience=patience, verbose=True, path='checkpoint.pt')

for i in range(epoch):
    banner = '=' * 20
    print(banner, 'Epoch', i+1, banner)

    training_accuracy, training_loss = train()
    valid_accuracy, valid_loss = valid(model, validation_loader)
    print('Train Acc: {:.6f} Train Loss: {:.6f}'.format(training_accuracy, training_loss))
    print('  Val Acc: {:.6f}   Val Loss: {:.6f}'.format(valid_accuracy, valid_loss))

    # The accuracy is negated before being handed to EarlyStopping: per the
    # recorded log it checkpoints when the tracked value decreases, so this
    # makes it stop on validation accuracy rather than validation loss.
    early_stopping(-valid_accuracy, model)
    if not early_stopping.early_stop:
        continue
    print("Early stopping")
    break
    

  0%|          | 0/641 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Training:   0%|          | 0/641 [00:00<?, ?it/s]



Training: 100%|██████████| 641/641 [07:53<00:00,  1.35it/s]
Validation: 100%|██████████| 92/92 [00:22<00:00,  4.13it/s]


Train Acc: 80.493755 Train Loss: 0.488431
  Val Acc: 86.133880   Val Loss: 0.371501
Validation loss decreased (inf --> -86.133880).  Saving model ...


Training:   0%|          | 0/641 [00:00<?, ?it/s]



Training: 100%|██████████| 641/641 [07:52<00:00,  1.36it/s]
Validation: 100%|██████████| 92/92 [00:22<00:00,  4.17it/s]
Training:   0%|          | 0/641 [00:00<?, ?it/s]

Train Acc: 87.890320 Train Loss: 0.321105
  Val Acc: 85.382514   Val Loss: 0.385962
EarlyStopping counter: 1 out of 3


Training: 100%|██████████| 641/641 [07:53<00:00,  1.35it/s]
Validation: 100%|██████████| 92/92 [00:22<00:00,  4.18it/s]


Train Acc: 91.081187 Train Loss: 0.244546
  Val Acc: 86.612022   Val Loss: 0.464761
Validation loss decreased (-86.133880 --> -86.612022).  Saving model ...


Training:   0%|          | 0/641 [00:00<?, ?it/s]



Training: 100%|██████████| 641/641 [07:52<00:00,  1.36it/s]
Validation: 100%|██████████| 92/92 [00:22<00:00,  4.17it/s]


Train Acc: 94.115925 Train Loss: 0.177270
  Val Acc: 87.568306   Val Loss: 0.406807
Validation loss decreased (-86.612022 --> -87.568306).  Saving model ...


Training:   0%|          | 0/641 [00:00<?, ?it/s]



Training: 100%|██████████| 641/641 [07:53<00:00,  1.35it/s]
Validation: 100%|██████████| 92/92 [00:22<00:00,  4.18it/s]
Training:   0%|          | 0/641 [00:00<?, ?it/s]

Train Acc: 95.716237 Train Loss: 0.129488
  Val Acc: 86.612022   Val Loss: 0.428446
EarlyStopping counter: 1 out of 3


Training: 100%|██████████| 641/641 [07:53<00:00,  1.35it/s]
Validation: 100%|██████████| 92/92 [00:22<00:00,  4.17it/s]
Training:   0%|          | 0/641 [00:00<?, ?it/s]

Train Acc: 96.955504 Train Loss: 0.099239
  Val Acc: 86.543716   Val Loss: 0.538367
EarlyStopping counter: 2 out of 3


Training: 100%|██████████| 641/641 [07:54<00:00,  1.35it/s]
Validation: 100%|██████████| 92/92 [00:22<00:00,  4.17it/s]

Train Acc: 97.316550 Train Loss: 0.086335
  Val Acc: 86.612022   Val Loss: 0.553511
EarlyStopping counter: 3 out of 3
Early stopping





## Predict testing set

In [16]:
# Unlabelled test split: mode="test" makes the dataset return inputs only.
testing_csv = pd.read_csv(
    "../input/twitter-sentimentmis583/twitter_sentiment/test.csv"
)
testing_set = SentimentData(
    dataframe=testing_csv,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    mode="test",
)
# One example per batch, in file order, so predictions line up with the rows.
testing_loader = DataLoader(dataset=testing_set, shuffle=False, batch_size=1)

In [17]:
# Predicted class index for every test row, in loader order.
testing_prediction = []
# Restore the checkpoint written by EarlyStopping. map_location remaps the
# stored tensors onto the current device, so a checkpoint saved on a GPU can
# still be loaded when this cell runs on a CPU-only kernel.
model.load_state_dict(torch.load('./checkpoint.pt', map_location=device))
model.eval()
with torch.no_grad():
    for data in testing_loader:
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        outputs = model(ids, mask, token_type_ids)
        # Index of the largest logit per row = predicted label.
        _, prediction = torch.max(outputs, 1)
        testing_prediction.extend(prediction.cpu().tolist())

In [18]:
import csv

# Write one row per prediction; 'index' is the 0-based position of the
# prediction, which matches the order of the (unshuffled) test loader.
with open('submission.csv', 'w', newline='') as csvFile:
    writer = csv.DictWriter(csvFile, fieldnames=['index', 'sentiment_label',])
    writer.writeheader()
    # enumerate() supplies the 0-based row number directly, replacing the
    # manual counter that was incremented and then offset by one.
    writer.writerows(
        {'index': row_index, 'sentiment_label': result}
        for row_index, result in enumerate(testing_prediction)
    )