In [16]:
import pandas as pd
import numpy as np

In [3]:
dataset = pd.read_csv('data/imdb-reviews-pt-br.csv', encoding='utf-8')

dataset.head()

Unnamed: 0,id,text_en,text_pt,sentiment
0,1,Once again Mr. Costner has dragged out a movie...,"Mais uma vez, o Sr. Costner arrumou um filme p...",neg
1,2,This is an example of why the majority of acti...,Este é um exemplo do motivo pelo qual a maiori...,neg
2,3,"First of all I hate those moronic rappers, who...","Primeiro de tudo eu odeio esses raps imbecis, ...",neg
3,4,Not even the Beatles could write songs everyon...,Nem mesmo os Beatles puderam escrever músicas ...,neg
4,5,Brass pictures movies is not a fitting word fo...,Filmes de fotos de latão não é uma palavra apr...,neg


In [75]:
reviews = dataset['text_pt'].to_numpy()
labels = dataset['sentiment']

labels

0        neg
1        neg
2        neg
3        neg
4        neg
        ... 
49454    pos
49455    pos
49456    pos
49457    pos
49458    pos
Name: sentiment, Length: 49459, dtype: object

In [76]:
import nltk

nltk.download('stopwords')

stopwords = nltk.corpus.stopwords.words('portuguese')


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/vinicius/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [77]:
words = list()
for review in reviews:
    review_split = review.split(' ')
    for word in review_split:
        word = word.lower()
        words.append(word)
words

['mais',
 'uma',
 'vez,',
 'o',
 'sr.',
 'costner',
 'arrumou',
 'um',
 'filme',
 'por',
 'muito',
 'mais',
 'tempo',
 'do',
 'que',
 'o',
 'necessário.',
 'além',
 'das',
 'terríveis',
 'seqüências',
 'de',
 'resgate',
 'no',
 'mar,',
 'das',
 'quais',
 'há',
 'muito',
 'poucas,',
 'eu',
 'simplesmente',
 'não',
 'me',
 'importei',
 'com',
 'nenhum',
 'dos',
 'personagens.',
 'a',
 'maioria',
 'de',
 'nós',
 'tem',
 'fantasmas',
 'no',
 'armário,',
 'e',
 'o',
 'personagem',
 'costers',
 'é',
 'realizado',
 'logo',
 'no',
 'início,',
 'e',
 'depois',
 'esquecido',
 'até',
 'muito',
 'mais',
 'tarde,',
 'quando',
 'eu',
 'não',
 'me',
 'importava.',
 'o',
 'personagem',
 'com',
 'o',
 'qual',
 'deveríamos',
 'nos',
 'importar',
 'é',
 'muito',
 'arrogante',
 'e',
 'superconfiante,',
 'ashton',
 'kutcher.',
 'o',
 'problema',
 'é',
 'que',
 'ele',
 'sai',
 'como',
 'um',
 'garoto',
 'que',
 'pensa',
 'que',
 'é',
 'melhor',
 'do',
 'que',
 'qualquer',
 'outra',
 'pessoa',
 'ao',
 'seu',

In [78]:
from collections import Counter

count_words = Counter(words)

total_words = len(words)

sorted_words=count_words.most_common(total_words)

In [79]:
vocab_to_int={w:i+1 for i,(w,c) in enumerate(sorted_words)}


In [14]:
encoded_reviews = list()

for review in reviews:
    encoded_review = list()
    for word in review.split():
        if word not in vocab_to_int.keys():
            encoded_review.append(0)
        else:
            encoded_review.append(vocab_to_int[word])
    encoded_reviews.append(encoded_review)

In [80]:
sequence_length = 256

features = np.zeros((len(encoded_reviews), sequence_length), dtype=int)


for i, review in enumerate(encoded_reviews):
  review_len=len(review)
  if review_len <= sequence_length:
    zeros=list(np.zeros(sequence_length-review_len))
    new=zeros+review
  else:
    new=review[:sequence_length]
features[ i ,:] = np.array( new)

features

array([[  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       ...,
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   5, 611,   1]])

In [87]:
labels_encoded = list()
positive_counts = 0
negative_counts = 0
for label in labels:
    if label == 'pos':
        labels_encoded.append(1.)
        positive_counts += 1
    else:
        labels_encoded.append(0.)
        negative_counts += 1
labels_encoded = np.array(labels_encoded)
print('Valores positivos: ')
print(positive_counts)
print('Valores negativos: ')
print(negative_counts)
print('Balanceamento do dataset: ')
print((positive_counts / negative_counts) * 100)

print(type(labels_encoded))

Valores positivos: 
24694
Valores negativos: 
24765
Balanceamento do dataset: 
99.71330506763577
<class 'numpy.ndarray'>


In [88]:
#split_dataset into 80% training , 10% test and 10% Validation Dataset
train_x=features[:int(0.8*len(features))]
train_y=labels_encoded[:int(0.8*len(features))]
valid_x=features[int(0.8*len(features)):int(0.9*len(features))]
valid_y=labels_encoded[int(0.8*len(features)):int(0.9*len(features))]
test_x=features[int(0.9*len(features)):]
test_y=labels_encoded[int(0.9*len(features)):]
print(len(train_y), len(valid_y), len(test_y), type(train_y))

39567 4946 4946 <class 'numpy.ndarray'>


In [94]:
import torch
from torch.utils.data import DataLoader, TensorDataset

#create Tensor Dataset
train_data=TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data=TensorDataset(torch.from_numpy(valid_x), torch.from_numpy(valid_y))
test_data=TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

#dataloader
batch_size=50
train_loader=DataLoader(train_data, batch_size=batch_size, shuffle=True)
valid_loader=DataLoader(valid_data, batch_size=batch_size, shuffle=True)
test_loader=DataLoader(test_data, batch_size=batch_size, shuffle=True)

In [95]:
dataiter = iter(train_loader)
sample_x, sample_y = dataiter.next()
print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

Sample input size:  torch.Size([50, 256])
Sample input: 
 tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])

Sample label size:  torch.Size([50])
Sample label: 
 tensor([0., 0., 1., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1.,
        1., 1., 1., 0., 1., 1., 0., 0., 0., 0., 1., 0., 1., 0., 1., 0., 1., 1.,
        0., 0., 1., 0., 0., 0., 0., 1., 0., 1., 1., 0., 1., 1.],
       dtype=torch.float64)


In [91]:
import torch.nn as nn

class SentimentalAnalytics(nn.Module):

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        super().__init__()
        self.output_size = vocab_size
        self.n_layers=n_layers
        self.hidden_dim=hidden_dim

        #Embedding and LSTM layers
        self.embedding=nn.Embedding(vocab_size, embedding_dim)
        self.lstm=nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)

        #dropout layer
        self.dropout=nn.Dropout(0.3)

        #Linear and sigmoid layer
        self.fc1=nn.Linear(hidden_dim, 64)
        self.fc2=nn.Linear(64, 16)
        self.fc3=nn.Linear(16,output_size)
        self.sigmoid=nn.Sigmoid()

    def forward(self, x, hidden):
            """
            Perform a forward pass of our model on some input and hidden state.
            """
            batch_size=x.size()

            #Embadding and LSTM output
            embedd=self.embedding(x)
            lstm_out, hidden=self.lstm(embedd, hidden)

            #stack up the lstm output
            lstm_out=lstm_out.contiguous().view(-1, self.hidden_dim)

            #dropout and fully connected layers
            out=self.dropout(lstm_out)
            out=self.fc1(out)
            out=self.dropout(out)
            out=self.fc2(out)
            out=self.dropout(out)
            out=self.fc3(out)
            sig_out=self.sigmoid(out)

            sig_out=sig_out.view(batch_size, -1)
            sig_out=sig_out[:, -1]

            return sig_out, hidden

    def init_hidden(self, batch_size):
        """Initialize Hidden STATE"""
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        train_on_gpu = torch.cuda.is_available()
        if train_on_gpu:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())

        return hidden

In [92]:
# Instantiate the model w/ hyperparams
vocab_size = len(vocab_to_int)+1 # +1 for the 0 padding
output_size = 1
embedding_dim = 400
hidden_dim = 256
n_layers = 2

net = SentimentalAnalytics(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
print(net)

SentimentalAnalytics(
  (embedding): Embedding(312125, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc1): Linear(in_features=256, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=16, bias=True)
  (fc3): Linear(in_features=16, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [96]:
lr = 0.001

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)


epochs = 3

counter = 0
train_on_gpu = torch.cuda.is_available()

print_every = 100
clip=5 # gradient clipping

# move model to GPU, if available
if train_on_gpu:
    net.cuda()

net.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    h = net.init_hidden(50)

    # batch loop
    for inputs, labels in train_loader:
        counter += 1

        if(train_on_gpu):
            inputs=inputs.cuda()
            labels=labels.cuda()
        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        net.zero_grad()

        # get the output from the model
        output, h = net(inputs, h)

        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_h = net.init_hidden(50)
            val_losses = []
            net.eval()
            for inputs, labels in valid_loader:

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])

                inputs, labels = inputs.cuda(), labels.cuda()
                output, val_h = net(inputs, val_h)
                val_loss = criterion(output.squeeze(), labels.float())

                val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

AssertionError: Torch not compiled with CUDA enabled