<a href="https://colab.research.google.com/github/viniciusrpb/natural_language_processing/blob/main/tweet_sentiment_classification_rnn_fastai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
!pip install -Uqq fastbook
import fastbook
fastbook.setup_book()

from fastbook import *
from fastai.text.all import *
import pandas as pd

### Carrega o corpus para um DataFrame

In [9]:
df = pd.read_csv('/content/SemEval2014-task3-train-valid.txt',sep='\t')
df.head()

Unnamed: 0,label,text,is_valid
0,negative,Theo Walcott is still shitc watch Rafa and Johnny deal with him on Saturday.,False
1,negative,its not that Im a GSP fanc i just hate Nick Diaz. cant wait for february.,False
2,negative,Iranian general says Israels Iron Dome cant deal with their missiles (keep talking like that and we may end up finding out),False
3,neutral,Tehranc Mon Amour: Obama Tried to Establish Ties with the Mullahs http://t.co/TZZzrrKa via @PJMedia_com No Barack Obama - Vote Mitt Romney,False
4,neutral,I sat through this whole movie just for Harry and Ron at christmas. ohlawd,False


### Cria um datablock, separando o atributo de texto e o label

Observe que o tamanho da sequencia de palavras a ser considerada é igual a 8 => parâmetro seq_len

O DataBlock já faz a tokenização e a numericalização de palavras e subpalavras

In [10]:
dls_lm = DataBlock(blocks=(TextBlock.from_df('text', seq_len=8), CategoryBlock),
                      get_x=ColReader('text'),
                      get_y=ColReader('label'),
                      splitter=ColSplitter())

### Obtém o dataloader com batch_size (bs) = 64

In [11]:
dls_lm = dls_lm.dataloaders(df, bs=64)
dls_lm.show_batch()

Unnamed: 0,text,category
0,xxbos # # $ $ # # xxmaj black xxmaj friday xxmaj deals xxmaj olympus xxup om - d xxup e - m5 xxmaj digital xxmaj camera - xxmaj black - with xxmaj olympus 12 - 50 mm f / xxunk - 5.6 xxup xxunk xxmaj zoom xxmaj lens - b … http : / / t.co / xxunk,neutral
1,xxbos xxunk cafe xxup one ? ! xxmaj they lose xxmaj dongwoo on a xxmaj tuesday . is the first line xxup is xxup that xxup the xxup right xxup xxunk xxup no xxup it xxup does nt xxup seem xxup like xxup it xxup wait xxup link xxup me ? ! xxup lmao,negative
2,xxbos xxup oct 27th / / xxup one xxup sports xxup lounge / / xxup monta xxup ellis ( of the xxmaj milwaukee xxmaj bucks ) / / xxup xxunk xxup xxunk / / xxup xxunk . / / $ 500 xxup cash xxup pr xxunk http : / / t.co / xxunk,neutral
3,"xxbos { haha z xxrep 3 i n xxrep 3 g } xxup rt xxunk xxrep 3 0 : xxmaj all across the xxup us , with xxmaj live xxmaj feeds down , xxmaj big xxmaj brother fans are seeing the xxmaj sun for the first time in months .",neutral
4,xxbos 4 xxmaj xxunk xxmaj hoops xxmaj xxunk tonight 1 . xxmaj san xxmaj diego xxmaj state @ xxmaj new xxmaj mexico 2 . xxmaj xxunk @ xxmaj west xxmaj virginia 3 . xxmaj st . xxmaj joe @ xxmaj xxunk 4 . xxmaj wichita xxmaj st @ xxup uni,neutral
5,xxbos # xxup new # xxup shit 10 xxmaj things to xxmaj know for xxmaj friday : xxmaj the xxup u.s . xxmaj postal xxmaj xxunk on the brink of default on a second … http : / / t.co / xxunk t # xxup new # xxup swag,negative
6,xxbos xxup november xxup super xxup tour : xxmaj nov 10 - 11 : xxmaj xxunk xxmaj ten ; xxmaj nov 9c 16c 23 : xxup abs - cbn xxmaj friday xxmaj market . xxmaj meet xxmaj super xxmaj xxunk and xxmaj friends ! xxmaj see you ! :-),positive
7,xxbos xxmaj oh and we are playing @ the xxmaj united xxmaj center at 1 pm on xxmaj december 1st xxup come xxup support xxup us xxup and xxup buy a xxup ticket xxup from xxup me xxup to xxup watch xxup the xxup bulls xxrep 7 !,positive
8,xxbos xxup leo xxup or xxup xxunk ? ! xxup club xxup pulse is the xxup move this xxup thursday ! xxmaj leos & xxmaj xxunk are free before 12 ! xxmaj also co - hosted by # xxmaj evalesco ! xxmaj evalesco xxup will be there !,neutral


In [18]:
#print(dls_lm.vocab)

### Imprime o tamanho do vocabulário dos dados de treinamento

In [13]:
print(dls_lm.train_ds.max_vocab)

60000


### Criação do Modelo de Linguagem: Rede Neural Recorrente simples

In [14]:
class RNN_v1(Module):
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)  
        self.h_h = nn.Linear(n_hidden, n_hidden)     
        self.h_o = nn.Linear(n_hidden,vocab_sz)
        
    def forward(self, x):
        h = F.relu(self.h_h(self.i_h(x[:,0])))
        h = h + self.i_h(x[:,1])
        h = F.relu(self.h_h(h))
        h = h + self.i_h(x[:,2])
        h = F.relu(self.h_h(h))
        return self.h_o(h)

In [15]:
class RNN(Module):
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)  
        self.h_h = nn.Linear(n_hidden, n_hidden)     
        self.h_o = nn.Linear(n_hidden,vocab_sz)
        self.h = 0
        
    def forward(self, x):
        outs = []
        for i in range(sl):
            self.h = self.h + self.i_h(x[:,i])
            self.h = F.relu(self.h_h(self.h))
            outs.append(self.h_o(self.h))
        self.h = self.h.detach()
        return torch.stack(outs, dim=1)
    
    def reset(self): self.h = 0

### Definição do objeto de aprendizado PyTorch para viabilizar o treinamento do Modelo de Linguagem

Considera o vocabulário dos dados de treinamento

In [16]:
learn = Learner(dls_lm, RNN_v1(dls_lm.train_ds.max_vocab, 64), loss_func=F.cross_entropy,metrics=accuracy)
learn.fit_one_cycle(5, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,4.728925,1.196079,0.428312,00:37
1,1.195659,1.088256,0.408953,00:38
2,1.014696,1.093568,0.441621,00:38
3,0.992632,1.081303,0.451906,00:37
4,0.988467,1.070554,0.448881,00:43


In [17]:
sl = 16
learn = Learner(dls_lm, RNN(dls_lm.train_ds.max_vocab, 64), loss_func=F.cross_entropy,metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(10, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,0.0,00:02,,


ValueError: ignored