In [4]:
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch
import re
from transformers import RobertaTokenizer

In [6]:
# Load the Corona tweets CSV; the file encoding is passed explicitly as latin-1.
train_data = pd.read_csv('../data/corona_tweets/coronanlp.csv',encoding='latin-1')
# NOTE(review): this reads the very same path as train_data above, so df_test
# holds the same rows as the training frame — confirm the intended test CSV.
df_test = pd.read_csv('../data/corona_tweets/coronanlp.csv', encoding="latin-1")

In [7]:
# Show the first five rows as a quick look at the available columns.
train_data.head(5)

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [10]:
# Display the full text of the tweet stored under index label 2.
train_data['OriginalTweet'][2]

'Coronavirus Australia: Woolworths to give elderly, disabled dedicated shopping hours amid COVID-19 outbreak https://t.co/bInCA9Vp8P'

In [3]:
class DataProcessor():
    """
    Prepare the tweet DataFrame for training.

    The processor cleans the ``OriginalTweet`` text, maps the ``Sentiment``
    labels to integer class ids and tokenizes every tweet with the supplied
    tokenizer. The DataFrame passed in is never modified, so ``process`` can
    be called repeatedly with the same result.
    """

    # Punctuation that is dropped from every tweet.
    _PUNCT_TABLE = str.maketrans('', '', ',;:.')

    # Sentiment label -> class id.
    _SENTIMENT_TO_ID = {
        "Extremely Negative": 0,
        "Negative": 1,
        "Neutral": 2,
        "Positive": 3,
        "Extremely Positive": 4,
    }

    def __init__(self, data: pd.DataFrame, tokenizer: "RobertaTokenizer"):
        """
        Args:
            data: frame with an ``OriginalTweet`` and a ``Sentiment`` column.
            tokenizer: object exposing ``encode(text, padding=..., truncation=...)``.
        """
        self.data = data
        self.tokenizer = tokenizer

    def _delete_hash(self, text: str) -> str:
        """
        Normalise one tweet.

        Removes URLs, turns line breaks into spaces (so the words around them
        are not glued together), drops ``, ; : .``, lower-cases the text and
        collapses runs of spaces into a single space.
        """
        out = re.sub(r'http\S+', '', text)
        out = re.sub(r'[\r\n]+', ' ', out)
        out = out.translate(self._PUNCT_TABLE)
        return re.sub(' +', ' ', out.lower())

    def _remove_extra_char(self, data: pd.DataFrame) -> pd.DataFrame:
        """Return a new frame whose ``OriginalTweet`` column has been cleaned."""
        return data.assign(OriginalTweet=data['OriginalTweet'].apply(self._delete_hash))

    def _map_reviews_to_num(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Return a new frame whose ``Sentiment`` column holds integer class ids.

        Raises:
            ValueError: if a label is not one of the five known sentiments
                (including missing values), because such rows would otherwise
                become NaN targets.
        """
        mapped = data['Sentiment'].map(self._SENTIMENT_TO_ID)
        unknown = mapped.isna()
        if unknown.any():
            bad = sorted({str(value) for value in data.loc[unknown, 'Sentiment']})
            raise ValueError("Unknown Sentiment label(s): {}".format(bad))
        return data.assign(Sentiment=mapped.astype('int64'))

    def _tokenize(self, x):
        """Encode one tweet, padded and truncated to the tokenizer's max length."""
        # truncation=True keeps every sequence at the same fixed length; without
        # it a long tweet yields more ids than the padded ones.
        return self.tokenizer.encode(x, padding='max_length', truncation=True)

    def _tokenize_data(self, data):
        """Return one list of token ids per row of ``OriginalTweet``."""
        return [self._tokenize(tweet) for tweet in data["OriginalTweet"]]

    def process(self):
        """
        Run the full pipeline on ``self.data``.

        Returns:
            ``(tokens, labels)`` where ``tokens`` is a list of token-id lists
            and ``labels`` is the integer ``Sentiment`` Series, re-indexed
            0..n-1 so that position ``i`` matches ``tokens[i]``.
        """
        cleaned = self._remove_extra_char(self.data)
        encoded = self._map_reviews_to_num(cleaned)
        tokens = self._tokenize_data(encoded)
        labels = encoded["Sentiment"].reset_index(drop=True)
        return tokens, labels
        

In [4]:
# Load the pretrained RoBERTa tokenizer and cap its maximum length at 200, the
# sequence length the classifier's final linear layer (512*200) is sized for.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
tokenizer.model_max_length = 200
# Wrap the training frame; nothing is processed until process() is called.
data = DataProcessor(train_data, tokenizer)

In [5]:
# Clean, label-encode and tokenize the tweets: `train` is a list of token-id
# lists, `target` is the Sentiment column mapped to integer class ids.
train,target = data.process()

In [6]:
from torch.utils.data import Dataset

In [7]:
class CreateDataset(Dataset):
    """
    Map-style dataset pairing token-id sequences with their class labels.

    Args:
        data: indexable collection of token-id lists.
        labels: indexable collection of labels (list, array or pandas Series)
            with the same length as ``data``.

    Raises:
        ValueError: if ``data`` and ``labels`` differ in length.
    """

    def __init__(self, data, labels):
        super().__init__()
        if len(data) != len(labels):
            raise ValueError(
                "data and labels must have the same length, got {} and {}".format(
                    len(data), len(labels)
                )
            )
        self.data = data
        self.labels = labels

    def __getitem__(self, idx):
        """Return ``(tokens, label)`` tensors for the sample at position ``idx``."""
        # A pandas Series indexed with [] looks up by index *label*; use .iloc so
        # the lookup is positional even when the index is not 0..n-1.
        if hasattr(self.labels, "iloc"):
            label = self.labels.iloc[idx]
        else:
            label = self.labels[idx]
        return torch.tensor(self.data[idx]), torch.tensor(label)

    def __len__(self):
        """Number of samples."""
        return len(self.labels)

In [8]:
# Pair the tokenized tweets with their integer labels in a torch Dataset.
train_dataset = CreateDataset(train, target)

In [9]:
from torch.utils.data import DataLoader

In [10]:
# Reshuffle the samples every epoch so mini-batches are not always drawn in the
# order the rows appear in the CSV.
train_loader = DataLoader(train_dataset, batch_size = 32, shuffle = True)

In [11]:
class NetWork(nn.Module):
    """
    Embedding -> stacked bidirectional LSTM -> linear classifier.

    The LSTM outputs of every time step are flattened and fed to the linear
    layer, so the model only accepts inputs of exactly ``seq_len`` tokens.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_size: LSTM hidden size per direction.
        num_layers: number of stacked LSTM layers.
        dropout: dropout between LSTM layers (ignored for a single layer).
        seq_len: fixed number of tokens per input sequence.
        num_classes: number of output logits.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size=256, num_layers=4,
                 dropout=0.2, seq_len=200, num_classes=5):
        super().__init__()
        self.seq_len = seq_len
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size = embedding_dim,
                            hidden_size = hidden_size,
                            num_layers = num_layers,
                            # inter-layer dropout has no effect on a single layer
                            dropout = dropout if num_layers > 1 else 0.0,
                            batch_first = True,
                            bidirectional = True)
        # 2 * hidden_size features per time step because the LSTM is bidirectional.
        self.linear = nn.Linear(2 * hidden_size * seq_len, num_classes)

    def forward(self, inputs):
        """
        Args:
            inputs: integer token ids of shape ``(batch, seq_len)``.

        Returns:
            Logits of shape ``(batch, num_classes)``.

        Raises:
            ValueError: if ``inputs`` is not 2-D with ``seq_len`` columns.
        """
        if inputs.dim() != 2 or inputs.size(1) != self.seq_len:
            raise ValueError(
                "expected inputs of shape (batch, {}), got {}".format(
                    self.seq_len, tuple(inputs.shape)
                )
            )
        emb = self.embedding(inputs)
        lstm_out, _ = self.lstm(emb)
        # Flatten all time steps into one feature vector per sample.
        return self.linear(lstm_out.reshape(lstm_out.size(0), -1))

In [12]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Classifier over the tokenizer's vocabulary with 256-dimensional embeddings.
model = NetWork(tokenizer.vocab_size, 256).to(device)
# Multi-class cross-entropy applied to the logits returned by the model.
loss_function = torch.nn.CrossEntropyLoss()
# Adam over all model parameters, with no hyper-parameters overridden.
optimizer = torch.optim.Adam(model.parameters())

In [13]:
class Train_text():
    """
    Small training / evaluation helper for a classification model.

    Args:
        model: ``nn.Module`` mapping a batch of inputs to per-class logits.
        train_loader: iterable yielding ``(inputs, target)`` batches.
    """

    def __init__(self, model, train_loader):
        self.model = model
        self.train_loader = train_loader

    def _model_device(self):
        """Return the device of the model's parameters (CPU if it has none)."""
        first_param = next(self.model.parameters(), None)
        return first_param.device if first_param is not None else torch.device("cpu")

    def train(self, loss_function, optimizer, epochs=2, log_interval=100,
              max_grad_norm=0.1):
        """
        Train ``self.model`` on ``self.train_loader``.

        Args:
            loss_function: callable ``(outputs, target) -> loss``.
            optimizer: optimizer stepping the model's parameters.
            epochs: number of passes over the training loader.
            log_interval: print the running accuracy every this many batches.
            max_grad_norm: gradient-norm clipping threshold.
        """
        device = self._model_device()
        self.model.train()
        for epoch in range(epochs):
            # Start every epoch with a fresh window so the first log line of an
            # epoch does not include batches from the previous one.
            total_acc, total_count = 0, 0
            for i, (inputs, target) in enumerate(self.train_loader):
                inputs, target = inputs.to(device), target.to(device)
                self.model.zero_grad()
                outputs = self.model(inputs)
                loss = loss_function(outputs, target)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_grad_norm)
                optimizer.step()
                total_acc += (outputs.argmax(1) == target).sum().item()
                total_count += target.size(0)
                if i % log_interval == 0 and i > 0:
                    print('| epoch {:3d} | {:5d}/{:5d} batches '
                          '| accuracy {:8.3f}'.format(epoch, i, len(self.train_loader),
                                                      total_acc/total_count))
                    total_acc, total_count = 0, 0

    def evaluate(self, dataloader):
        """
        Compute the accuracy of ``self.model`` on ``dataloader``.

        Args:
            dataloader: iterable yielding ``(inputs, target)`` batches.

        Returns:
            Fraction of correctly classified samples, or ``0.0`` when the
            loader yields no samples.
        """
        device = self._model_device()
        self.model.eval()
        total_acc, total_count = 0, 0
        with torch.no_grad():
            for inputs, target in dataloader:
                inputs, target = inputs.to(device), target.to(device)
                outputs = self.model(inputs)
                total_acc += (outputs.argmax(1) == target).sum().item()
                total_count += target.size(0)
        return total_acc / total_count if total_count else 0.0

In [14]:
# Bundle the model with its training DataLoader.
trainer = Train_text(model, train_loader)


In [15]:
# Run the training loop; the running accuracy is printed every 100 batches.
trainer.train(loss_function,optimizer)

| epoch   0 |   100/ 1287 batches | accuracy    0.277
| epoch   0 |   200/ 1287 batches | accuracy    0.327
| epoch   0 |   300/ 1287 batches | accuracy    0.389
| epoch   0 |   400/ 1287 batches | accuracy    0.439
| epoch   0 |   500/ 1287 batches | accuracy    0.463
| epoch   0 |   600/ 1287 batches | accuracy    0.518
| epoch   0 |   700/ 1287 batches | accuracy    0.544
| epoch   0 |   800/ 1287 batches | accuracy    0.544
| epoch   0 |   900/ 1287 batches | accuracy    0.558
| epoch   0 |  1000/ 1287 batches | accuracy    0.605
| epoch   0 |  1100/ 1287 batches | accuracy    0.632
| epoch   0 |  1200/ 1287 batches | accuracy    0.649
| epoch   1 |   100/ 1287 batches | accuracy    0.666
| epoch   1 |   200/ 1287 batches | accuracy    0.673
| epoch   1 |   300/ 1287 batches | accuracy    0.681
| epoch   1 |   400/ 1287 batches | accuracy    0.681
| epoch   1 |   500/ 1287 batches | accuracy    0.690
| epoch   1 |   600/ 1287 batches | accuracy    0.720
| epoch   1 |   700/ 1287 ba