In [None]:
import pandas as pd
import numpy as np

import torch
import torch.nn.functional as F
import torchtext
import random
import pandas as pd

In [None]:
numepoch=10
DEVICE = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')

In [81]:
#Data Extraction
data = pd.read_csv('Tweets.csv')
data =data.drop('textID',axis=1)
data=data.drop('text',axis=1)
data.to_csv('Tweetsbis.csv', index=False)

In [85]:
#We convert the data frame into Tokens
TEXT = torchtext.legacy.data.Field(tokenize='spacy', tokenizer_language='en_core_web_sm')

LABEL = torchtext.legacy.data.LabelField(dtype=torch.long)

fields = [('selected_text', TEXT), ('sentiment', LABEL)]

dataset = torchtext.legacy.data.TabularDataset(path='Tweetsbis.csv', format='csv', skip_header=True, fields=fields)

#We Split the data
train_data, test_data = dataset.split(split_ratio=[0.8, 0.2], random_state=random.seed(123))

TEXT.build_vocab(train_data, max_size=20000)
LABEL.build_vocab(train_data)

#We create loader 
train_loader, test_loader = torchtext.legacy.data.BucketIterator.splits((train_data, test_data), sort_within_batch=False, sort_key=lambda x: len(x.selected_text), batch_size= 64, device=torch.device(DEVICE))

In [86]:
class RNN(torch.nn.Module):
    
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        self.embedding = torch.nn.Embedding(input_dim, embedding_dim)
        self.rnn = torch.nn.LSTM(embedding_dim, hidden_dim)              
        self.fc = torch.nn.Linear(hidden_dim, output_dim)
        

    def forward(self, text):      
        
        embedded = self.embedding(text)
        output, (hidden, cell) = self.rnn(embedded)
        hidden.squeeze_(0)        
        output = self.fc(hidden)
        return output
    
def compute_accuracy(model, data_loader, device):

    with torch.no_grad():

        correct_pred, num_examples = 0, 0

        for i, (features, targets) in enumerate(data_loader):

            features = features.to(device)
            targets = targets.float().to(device)

            logits = model(features)
            _, predicted_labels = torch.max(logits, 1)

            num_examples += targets.size(0)
            correct_pred += (predicted_labels == targets).sum()
    return correct_pred.float()/num_examples * 100


#Init of the model
torch.manual_seed(123)
model = RNN(input_dim=len(TEXT.vocab),embedding_dim=128, hidden_dim=256, output_dim=3)

model = model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

#Training with 10 layer (num of epochs) and 344 sample per layer
for epoch in range(numepoch):
    model.train()
    for batch_idx, batch_data in enumerate(train_loader):
        
        text = batch_data.selected_text.to(DEVICE)
        labels = batch_data.sentiment.to(DEVICE)

        logits = model(text)
        loss = F.cross_entropy(logits, labels)
        optimizer.zero_grad()
        
        loss.backward()
        optimizer.step()
        if not batch_idx % 100:
            print (f'Epoch: {epoch+1:03d}/{numepoch:03d} | 'f'Batch {batch_idx:03d}/{len(train_loader):03d} | 'f'Loss: {loss:.4f}')

    with torch.set_grad_enabled(False):
        print(f'training accuracy: 'f'{compute_accuracy(model, train_loader, DEVICE):.2f}%')
        print(f'testing accuracy: 'f'{compute_accuracy(model, test_loader, DEVICE):.2f}%')

Epoch: 001/010 | Batch 000/344 | Loss: 1.1275
Epoch: 001/010 | Batch 100/344 | Loss: 0.9017
Epoch: 001/010 | Batch 200/344 | Loss: 0.8748
Epoch: 001/010 | Batch 300/344 | Loss: 0.7506
training accuracy: 60.49%
testing accuracy: 51.76%
Epoch: 002/010 | Batch 000/344 | Loss: 0.6898
Epoch: 002/010 | Batch 100/344 | Loss: 0.7689
Epoch: 002/010 | Batch 200/344 | Loss: 0.7246
Epoch: 002/010 | Batch 300/344 | Loss: 0.5803
training accuracy: 78.44%
testing accuracy: 66.18%
Epoch: 003/010 | Batch 000/344 | Loss: 0.6209
Epoch: 003/010 | Batch 100/344 | Loss: 0.6696
Epoch: 003/010 | Batch 200/344 | Loss: 0.3480
Epoch: 003/010 | Batch 300/344 | Loss: 0.4305
training accuracy: 84.00%
testing accuracy: 69.87%
Epoch: 004/010 | Batch 000/344 | Loss: 0.2918
Epoch: 004/010 | Batch 100/344 | Loss: 0.3969
Epoch: 004/010 | Batch 200/344 | Loss: 0.4812
Epoch: 004/010 | Batch 300/344 | Loss: 0.3138
training accuracy: 88.90%
testing accuracy: 75.04%
Epoch: 005/010 | Batch 000/344 | Loss: 0.3043
Epoch: 005/010