In [5]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import json

from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

import torch
from torch.utils.data import Dataset

from transformers import BertModel
from transformers import BertTokenizer
from transformers import BertPreTrainedModel
from transformers.models.bert.modeling_bert import BertEmbeddings, BertEncoder, BertPooler
from transformers.modeling_outputs import BaseModelOutputWithPoolingAndCrossAttentions

from typing import *

# Read Corona_NLP_train.csv (resolved relative to the working directory) into
# df_train; the file is decoded as latin-1 rather than pandas' default UTF-8.
df_train = pd.read_csv("Corona_NLP_train.csv", encoding='latin-1')
# Show the first rows so the parsed columns can be checked by eye.
df_train.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [7]:
# Discard the tweet metadata columns, then relabel the remaining columns
# positionally as 'context' and 'label' (exactly two columns must be left,
# otherwise the relabelling raises).
df = (
    df_train
    .drop(columns=['ScreenName', 'UserName', 'Location', 'TweetAt'])
    .set_axis(['context', 'label'], axis=1)
)
# Persist the reduced frame; the row index is written as the first CSV column.
df.to_csv('Sentiment_analysis.csv')


In [11]:
class Sentiment_Dataset(Dataset):
    """Map-style dataset that tokenizes text for BERT and encodes its sentiment label.

    Each item is a 4-tuple ``(input_ids, mask, token, label)``: three 1-D
    tensors of equal length built from the tokenizer output (token ids,
    attention mask, token type ids) followed by the integer class id taken
    from ``label_map``. Sequences are NOT padded here; padding is expected to
    happen when a batch is assembled.
    """

    def __init__(self, df: pd.DataFrame, max_length: int = 512):
        """Store the frame and load the tokenizer.

        Args:
            df: Frame with a 'context' column (text) and a 'label' column whose
                values are keys of ``label_map``. Its index may be arbitrary;
                rows are addressed by position.
            max_length: Maximum number of tokens per example, special tokens
                included. Longer texts are truncated. The default of 512 is the
                usual position limit of BERT-base checkpoints.
        """
        self.df = df
        self.max_length = max_length
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        self.label_map = {'Extremely Negative' : 0, 'Negative' : 1, 'Neutral' : 2, 'Positive' : 3, 'Extremely Positive' : 4}

    def __getitem__(self, index: int):
        """Return ``(input_ids, mask, token, label)`` for the row at position ``index``.

        Raises:
            IndexError: if ``index`` is outside ``[-len(self), len(self))``.
            KeyError: if the row's label is not a key of ``label_map``.
        """
        # Samplers hand out positions 0..len-1, so rows must be fetched by
        # position (.iloc). Label-based lookup breaks as soon as the frame has
        # been filtered, split or shuffled without resetting its index.
        text = self.df['context'].iloc[index]
        raw_label = self.df['label'].iloc[index]

        # Calling the tokenizer directly is the supported replacement for the
        # deprecated encode_plus; truncation keeps over-long texts within the
        # configured token limit.
        encoded = self.tokenizer(text, truncation=True, max_length=self.max_length)

        input_ids = torch.tensor(encoded['input_ids'])
        mask = torch.tensor(encoded['attention_mask'])
        token = torch.tensor(encoded['token_type_ids'])
        label = self.label_map[raw_label]

        return input_ids, mask, token, label

    def __len__(self) -> int:
        """Number of rows in the wrapped frame."""
        return len(self.df)

In [9]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def collate_batch(sample):
    """Merge a list of dataset items into one zero-padded batch.

    Args:
        sample: List of items; each item holds, in order, three 1-D tensors
            (input ids, attention mask, token type ids) and an integer label.

    Returns:
        Tuple ``(input_ids, mask, token, labels)``. The first three are 2-D
        tensors with the batch dimension first, right-padded with zeros to the
        longest sequence in the batch; ``labels`` is a 1-D tensor.
    """
    id_seqs, mask_seqs, type_seqs, labels = [], [], [], []
    for item in sample:
        id_seqs.append(item[0])
        mask_seqs.append(item[1])
        type_seqs.append(item[2])
        labels.append(item[3])

    # pad_sequence fills with 0 by default; the same fill value is applied to
    # the ids, the mask and the token types.
    padded_ids, padded_mask, padded_types = (
        pad_sequence(seqs, batch_first=True)
        for seqs in (id_seqs, mask_seqs, type_seqs)
    )

    return padded_ids, padded_mask, padded_types, torch.tensor(labels)

# Number of samples per batch; passed as batch_size to the DataLoader that
# builds train_loader.
BATCH_SIZE = 2

In [12]:
# Wrap the prepared frame (columns 'context' and 'label') in the tokenizing dataset.
train_dataset = Sentiment_Dataset(df)
# shuffle=True reorders the samples on every pass; collate_batch turns the
# variable-length items of a batch into padded tensors.
# NOTE(review): no random seed is set in the cells visible here, so the batch
# drawn below will presumably differ between runs - confirm whether that is intended.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)

# Pull a single batch and display it as a smoke check of the dataset/collate pipeline.
data = next(iter(train_loader))
data

(tensor([[  101,  1188,  1110,  1139,  5102,  1124,   188,   170,  4482, 14359,
           1120,   170,  1469, 14206,  2984,  3570,  1152, 17983,  1115,  1155,
           3239,  1538,  4330, 17944,  3570,  1220,  1243,  1141,  7739,  1115,
           1152,  1138,  1106,  4044,  2310,  1321,  1313,  1107,   170,   195,
           9717,  6726,  3821,  1105,  2498,  1111,  1155, 12644,   102,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0],
         [  101,  2408,  1132,  6123,  1725,  8186,   112,   189,   155, 26939,
           1162,  1260,  1643,  1874,  6052,  1906, 23173,   119,  2119,  1103,
           4583,  4882,  1553,  1111,  1412,  4190,  1110,   140, 25980,  1162,
           7352,  1134,  1132,  1120,  4321,  1214,  1822,  1116,   108, 13150,
           2346,  2723,   113,  1743,   119,  3078,   109,  