In [1]:
import numpy as np
import pandas as pd
import string

In [2]:
# Labelled training split (tab-separated). `df` is just a second name for the
# same frame; later cells read the 'Phrase' and 'Sentiment' columns from it.
data = pd.read_csv(
    'data/train.tsv/train.tsv',
    sep='\t',
)
df = data
print(df.head(10))

print('=='*40)

# Test split, reduced to the 'Phrase' column for a quick look.
data2 = pd.read_csv(
    'data/test.tsv/test.tsv',
    sep='\t',
)[['Phrase']]
print(data2.head(10))

FileNotFoundError: [Errno 2] No such file or directory: '../Datasets/train.tsv/train.tsv'

In [None]:
# Class balance of the training labels: one bar per 'Sentiment' value.
# Title and axis labels are set so the figure can be read on its own; the
# trailing semicolon keeps the return value of `ax.set` out of the cell output.
ax = df.groupby(['Sentiment']).size().plot.bar()
ax.set(title='Phrases per sentiment class', xlabel='Sentiment', ylabel='Number of phrases');

In [None]:
import torch
import torch.utils.data
import numpy as np
from transformers import BertTokenizer

# Tokenizer for the 'prajjwal1/bert-tiny' checkpoint. It is kept at notebook
# level because the Dataset class defined in this cell reads it as a global.
tokenizer = BertTokenizer.from_pretrained('prajjwal1/bert-tiny')

class Dataset(torch.utils.data.Dataset):
    """Pre-tokenized (phrase, sentiment) pairs for use with a DataLoader.

    Every phrase is tokenized once, up front, and padded/truncated to
    ``max_length`` tokens, so memory use grows with ``len(df) * max_length``.

    Args:
        df: Frame with a 'Phrase' column (text) and a 'Sentiment' column (label).
        text_tokenizer: Callable used to encode each phrase. It is called as
            ``text_tokenizer(text, padding='max_length', max_length=...,
            truncation=True, return_tensors="pt")``. When omitted, the
            notebook-level ``tokenizer`` is used, as before.
        max_length: Padded/truncated sequence length. Defaults to 512, the
            value that was previously hard-coded; a smaller value reduces
            memory and compute for short phrases.
    """

    def __init__(self, df, text_tokenizer=None, max_length=512):
        # The global is only looked up when no tokenizer is supplied, so the
        # class can be used (and tested) without the notebook-level object.
        encode = tokenizer if text_tokenizer is None else text_tokenizer

        self.labels = list(df['Sentiment'])
        self.texts = [
            encode(text, padding='max_length', max_length=max_length,
                   truncation=True, return_tensors="pt")
            for text in df['Phrase']
        ]

    def classes(self):
        """Return the list of labels, in the same order as the samples."""
        return self.labels

    def __len__(self):
        """Number of samples."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label at position ``idx`` wrapped in a NumPy array."""
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        """Return the tokenizer output stored for position ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return ``(encoded_text, label)`` for position ``idx``."""
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y

In [None]:
from torch import nn
from transformers import BertModel

class BertClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        dropout: Dropout probability applied to the pooled encoder output.
        model_name: Checkpoint passed to ``BertModel.from_pretrained``.
            Defaults to 'prajjwal1/bert-tiny', the previously hard-coded value.
        num_classes: Number of output classes. Defaults to 5, as before.

    ``forward`` returns raw, unnormalized logits of shape
    ``(batch, num_classes)``. No activation is applied to them: they are fed
    to ``nn.CrossEntropyLoss``, which expects raw logits, and a ReLU here
    would clamp every negative logit to zero and block its gradient.
    """

    def __init__(self, dropout=0.1, model_name='prajjwal1/bert-tiny', num_classes=5):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        # Take the head's input width from the encoder config instead of
        # hard-coding 128, so other checkpoints work without edits.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_id, mask):
        """Compute class logits.

        Args:
            input_id: Token ids, passed to the encoder as ``input_ids``.
            mask: Attention mask, passed to the encoder as ``attention_mask``.

        Returns:
            Tensor of raw logits, one row per input sequence.
        """
        # With return_dict=False the encoder returns a tuple; only the second
        # element (the pooled output) is used.
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        return self.linear(dropout_output)

In [None]:
# Sanity check of the Dataset output on a handful of rows only. Tokenizing
# the full frame here and printing every sample would duplicate the work done
# inside the training function and flood the notebook output.
train_preview = Dataset(df.head(3))
for sample_idx in range(len(train_preview)):
    print(train_preview[sample_idx])

In [None]:
from torch.optim import Adam
from tqdm import tqdm

def _run_epoch(model, dataloader, criterion, device, optimizer=None, show_progress=False):
    """Run one pass over ``dataloader`` and return ``(loss_sum, correct)``.

    If ``optimizer`` is given, the model is put in training mode and updated
    after every batch. Otherwise the model is put in eval mode (dropout off)
    and gradient tracking is disabled.

    ``loss_sum`` is the per-sample loss summed over all samples (each batch
    mean is weighted by its batch size), so dividing it by the number of
    samples yields the true mean loss.
    """
    training = optimizer is not None
    model.train(training)

    batches = tqdm(dataloader) if show_progress else dataloader
    loss_sum = 0.0
    correct = 0

    with torch.set_grad_enabled(training):
        for inputs, labels in batches:
            labels = labels.to(device).long()
            # Each sample is tokenized on its own with return_tensors="pt", so
            # the collated tensors presumably carry a singleton dim at axis 1;
            # drop it from BOTH tensors so mask and ids share the same shape.
            mask = inputs['attention_mask'].squeeze(1).to(device)
            input_id = inputs['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)
            batch_loss = criterion(output, labels)

            # criterion returns the batch mean; weight by batch size so the
            # final division by sample count is correct for any batch size.
            loss_sum += batch_loss.item() * labels.size(0)
            correct += (output.argmax(dim=1) == labels).sum().item()

            if training:
                optimizer.zero_grad()
                batch_loss.backward()
                optimizer.step()

    return loss_sum, correct


def train(model, data, learning_rate, epochs, batch_size=2, train_fraction=0.99, seed=None):
    """Fine-tune ``model`` on ``data`` and print metrics after every epoch.

    Args:
        model: Module called as ``model(input_id, mask)`` that returns class
            scores. It is moved to the selected device and updated in place.
        data: DataFrame with 'Phrase' and 'Sentiment' columns.
        learning_rate: Learning rate for the Adam optimizer.
        epochs: Number of passes over the training split.
        batch_size: Samples per batch (default 2, the former fixed value).
        train_fraction: Share of the shuffled rows used for training; the
            remainder is the validation split (default 0.99, as before).
        seed: Optional ``random_state`` for the shuffle, for reproducible
            splits. ``None`` keeps the former unseeded behaviour.

    Raises:
        ValueError: If the training split would be empty.
    """
    # Shuffle, then split by position. Slicing with iloc replaces np.split on
    # a DataFrame, which goes through a pandas code path that is deprecated.
    shuffled = data.sample(frac=1, random_state=seed)
    split_at = int(train_fraction * len(shuffled))
    train_data, val_data = shuffled.iloc[:split_at], shuffled.iloc[split_at:]
    print(len(train_data), len(val_data))
    if len(train_data) == 0:
        raise ValueError("training split is empty; provide more data or a larger train_fraction")

    train_set, val_set = Dataset(train_data), Dataset(val_data)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)

    # The loaders do not change between epochs, so build them once.
    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=batch_size)

    n_train, n_val = len(train_data), len(val_data)

    for epoch_num in range(epochs):

        total_loss_train, total_acc_train = _run_epoch(
            model, train_dataloader, criterion, device,
            optimizer=optimizer, show_progress=True)

        total_loss_val, total_acc_val = _run_epoch(
            model, val_dataloader, criterion, device)

        # An empty validation split reports NaN instead of dividing by zero.
        val_loss = total_loss_val / n_val if n_val else float('nan')
        val_acc = total_acc_val / n_val if n_val else float('nan')

        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / n_train: .3f} '\
            f'| Train Accuracy: {total_acc_train / n_train: .3f} '\
            f'| Val Loss: {val_loss: .3f} '\
            f'| Val Accuracy: {val_acc: .3f}')
                  
# Hyperparameters for the fine-tuning run.
LR = 1e-5
EPOCHS = 3

# Fresh classifier, trained in place on the labelled frame.
model = BertClassifier()
train(model, df, learning_rate=LR, epochs=EPOCHS)

In [None]:
# Persist only the parameters (state_dict), not the whole module object.
torch.save(
    obj=model.state_dict(),
    f="bert-tiny-movie-review.model",
)

In [None]:
tokenizer = BertTokenizer.from_pretrained('prajjwal1/bert-tiny')

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

model = BertClassifier()
# map_location lets a checkpoint written on a GPU machine load on a CPU-only one.
# NOTE(review): torch.load unpickles the file, so only load checkpoints you trust.
model.load_state_dict(torch.load("bert-tiny-movie-review.model", map_location=device))
model = model.to(device)
# Switch off dropout; without this, repeated predictions for the same text can differ.
model.eval()

test_text = "I think this movie is great"
with torch.no_grad():
    test_input = tokenizer(test_text, padding='max_length',
        max_length=512, truncation=True, return_tensors="pt")
    # A single text encoded with return_tensors="pt" is already a batch of
    # one, so the tensors are passed to the model as they are.
    mask = test_input['attention_mask'].to(device)
    input_id = test_input['input_ids'].to(device)

    output = model(input_id, mask)
    label = output.argmax(dim=1).item()
    print("Label:", label)

In [None]:
# Full test split. Note that this rebinds `df`, which until now referred to
# the training frame; the prediction cell that follows iterates over it.
df = pd.read_csv(
    'data/test.tsv/test.tsv',
    sep='\t',
)
df.head()

In [None]:
# Predict a sentiment label for every test phrase.
# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# is quadratic in the number of rows.
model.eval()  # make sure dropout is off even if this cell is run on its own
records = []
with torch.no_grad():
    for phrase_id, phrase in zip(df['PhraseId'], df['Phrase']):
        # NOTE(review): a missing phrase (NaN after parsing) would make the
        # tokenizer raise; it is scored as an empty string so every PhraseId
        # still gets a row. Confirm this is the desired fallback.
        test_text = "" if pd.isna(phrase) else str(phrase)
        test_input = tokenizer(test_text, padding='max_length',
            max_length=512, truncation=True, return_tensors="pt")
        mask = test_input['attention_mask'].to(device)
        input_id = test_input['input_ids'].to(device)

        output = model(input_id, mask)
        label = output.argmax(dim=1).item()
        records.append({'PhraseId': phrase_id, 'Sentiment': label})

answer = pd.DataFrame(records, columns=['PhraseId', 'Sentiment'])
answer.head()

In [None]:
# Write the predictions without the positional index column.
answer.to_csv(
    "submission.csv",
    index=False,
)