# RNN in Pytorch
## example : Yelp classifier

In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [None]:
import os
import re
import string

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchtext import data as ttdata
from torchtext.data import Dataset as ttDataset
from torchtext.data import Dataset, Example, Field
from torchtext.data import Iterator, BucketIterator
import spacy

from TextDataloader import TextData

from generate_model import *
from utils import *

In [3]:
if torch.cuda.device_count()>1:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
elif torch.cuda.device_count()>0:
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## Dataload
* reference : Natural language processing with PYTORCH
1. load data file into a dataframe
2. change text into right form using tokenizer or corpus
3. change modified text into vectors

In [213]:
"""
data shape
rating, review, split
negative, "sentence", train
positive, "sentence", train
"""
f1_s = "/home/bwlee/data/yelp_review_polarity_csv/reviews_with_splits_full.csv"
df = pd.read_csv(f1_s, header=0, skiprows=lambda x: x%10>0)
for i in df.index: 
    if df['rating'].iloc[i] == 'positive':
        df['rating'].iloc[i] = 1
    else:
        df['rating'].iloc[i] = 0

In [214]:
train_df = df[df['split']=='train'][['review', 'rating']]
val_df = df[df['split']=='val'][['review', 'rating']]
test_df = df[df['split']=='test'][['review', 'rating']]
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

In [232]:
print(train_df.review[5])

went here with another couple for restaurant week . the place was fairly busy . we waited for mins and were seated . we had a reservation , but that didn t seem to affect the folks seating tables . we got to our table outside wand waited another half hour just to see our waitress . we noticed everyone around us was complaining about the slow service . we ordered drinks and about another half hour later we ordered our food . my lobster was overcooked and stringy and my steak was mediocre at best . all in all my visit to the mariner s inn was terrible . 


In [104]:
nlp = spacy.load("en_core_web_sm")

In [105]:
TEXT = ttdata.Field(sequential=True, use_vocab=True,
                 tokenize='spacy', lower=True,
                 batch_first=True, fix_length=100,
                 init_token='<SOS>', eos_token='<EOS>')

LABEL = ttdata.Field(sequential=False, use_vocab=False,
                  batch_first=True, is_target=True)

In [268]:
fields0 = (('review', TEXT), ('rating', LABEL))
train_data0 = [ Example.fromlist(
    train_df.iloc[ii].values.tolist(),
                              fields0)
              for ii in range(len(train_df)) ]

In [267]:
train_data[:1]

[<torchtext.data.example.Example at 0x7f85829516a0>]

In [270]:
n_data = len(train_data)
train_data = ttDataset(train_data0[:n_data//5*4], fields=fields0)
test_data = ttDataset(train_data0[n_data//5*4:], fields=fields0)
TEXT.build_vocab(train_data, min_freq=10, max_size=10000)

In [271]:
iter_train = Iterator(train_data, batch_size=100, shuffle=True)
iter_test = Iterator(test_data, batch_size=100, shuffle=False)

In [275]:
class Net(nn.Module):
    def __init__(self, model=None, loss=None, 
                 optimizer=None, device='cuda'):
        super(Net, self).__init__()
        self.model = model
        self.loss = loss
        self.optimizer = optimizer
        self.device = device
    
    def run_batch(self, i_batch, data):
        self.optimizer.zero_grad()
        data_in, tgt = data
        data_in = data_in.to(self.device)
        tgt = tgt.to(self.device)
        out = self.model(data_in)
        loss = self.loss(out, tgt)
        loss.backward()
        self.optimizer.step()
        return loss.detach().cpu().item()
    
    def run_train(self, n_epoch, data, test_data=None):
        self.model.train()
        for i_epoch in range(n_epoch):
            loss = 0
            for i_batch, data_batch in enumerate(data):
                loss_temp = self.run_batch(i_batch, data_batch)
                loss += loss_temp
            loss /= 1.0*len(data)
            print('epoch', i_epoch, 'loss', loss)
            
        if test_data is None:
            return self.run_eval(test_data)
        else:
            return self.run_eval(data)
        
    def run_eval(self, data):
        self.model.eval()
        loss = 0
        outs = None
        with torch.no_grad():
            for i_batch, data_batch in enumerate(data):
                data_in, tgt = data_batch
                data_in = data_in.to(self.device)
                tgt = tgt.to(self.device)
                out = self.model(data_in)
                loss += self.loss(out, tgt).detach().cpu()
                if outs is None:
                    outs = out
                else:
                    outs = torch.cat((outs, out), dim=0)
        loss /= 1.0*(i_batch+1)
        print('evaluate', 'loss', loss)
        return outs, loss

In [276]:
class RNN_classifier(Net):
    def __init__(self, embed = None, rnn=None, downnet=None,
                 loss=None, optimizer=None, device='cuda'):
        super(RNN_classifier, self).__init__()
        self.embed = embed
        self.rnn = rnn
        self.downnet = downnet
        self.loss = loss
        self.optimizer = optimizer
        self.device = device
        self.model = self.rnn
        self.init_weights()
    
    def init_weights(self):
        initrange = 0.5
        self.embed.weight.data.uniform_(-initrange, initrange)
        torch.nn.init.xavier_uniform_(self.downnet[1].weight)
        #self.fc.weight.data.uniform_(-initrange, initrange)
        #self.fc.bias.data.zero_()
    
    def forward(self, x):
        if self.embed is not None:
            x = self.embed(x)
        out, hidden = self.rnn(x)
        out = self.downnet(out[:,-1]) # choose last output
    
    def run_batch(self, i_batch, data):
        self.optimizer.zero_grad()
        data_in, tgt = data
        data_in = data_in.to(device)
        data_temp = data_in
        tgt = tgt.to(device)        
        if self.embed is not None:
            data_in = self.embed(data_in)
        out, hidden = self.rnn(data_in)
        temp = out[:,-1]
        out = self.downnet(out[:,-1]) # choose last output
        #"""
        if i_batch % 10 == 0:
            print('in', data_temp[:2,:4])
            print(temp[:2,:3])
            print(data_in[:2,:4,:3])
            print('out', out[:5], tgt[:5])
            print()
        #"""
        loss = self.loss(out, tgt)
        loss.backward()
        self.optimizer.step()
        return loss.detach().cpu().item()
    
    def run_train(self, n_epoch, data, test_data=None):
        self.embed.train()
        self.rnn.train()
        self.downnet.train()
        return super().run_train(n_epoch, data, test_data)
    
    def run_eval(self, data):
        self.embed.eval()
        self.rnn.eval()
        self.downnet.eval()
        loss = 0
        outs = None
        tgts = None
        with torch.no_grad():
            for i_batch, data_batch in enumerate(data):
                data_in, tgt = data_batch
                data_in = data_in.to(self.device)
                tgt = tgt.to(self.device)
                
                if self.embed is not None:
                    data_in = self.embed(data_in)
                out, hidden = self.rnn(data_in)
                out = self.downnet(out[:,-1]) # choose last output
                
                if outs is None:
                    outs = out
                    tgts = tgt
                else:
                    outs = torch.cat((outs, out), dim=0)
                    tgts = torch.cat((tgts, tgt), dim=0)
        loss /= 1.0*i_batch
        print('evaluate---', 'loss', loss)
        outs_np = outs.cpu().numpy()
        tgts_np = tgts.cpu().numpy()
        
        print('accuracy', accuracy(outs_np, tgts_np))
        return outs, loss

In [277]:
dim_embed = 200
dim_hidden = 200
embed = nn.Embedding(num_embeddings=len(TEXT.vocab),
                    embedding_dim=dim_embed,
                    padding_idx=TEXT.vocab.stoi['<pad>']).to(device)

lstm = nn.LSTM(input_size=dim_embed, 
        hidden_size=dim_hidden, 
        num_layers= 2,
        batch_first=True,
        bidirectional=True).to(device)

#classifier = get_MLP([dim_hidden*2, dim_hidden*3, dim_hidden, 2])
classifier = get_MLP([dim_hidden*2, 2], end=True)
classifier = classifier.to(device)

#net = nn.Sequential(embed, lstm, classifier)
#get_RNN(10, 20, 4)

In [278]:
loss = nn.CrossEntropyLoss()
parms = list(embed.parameters())
parms += list(lstm.parameters())
parms += list(classifier.parameters())

optimizer = optim.Adam(parms)
seq_class = RNN_classifier(embed, lstm, classifier, loss, optimizer)

In [272]:
seq_class.run_train(1, iter_train, iter_test)

TypeError: run_train() takes 3 positional arguments but 4 were given

In [256]:
for _ in range(20):
    seq_class.run_train(1, iter1)

epoch 0 loss 0.6377113946542448
evaluate--- loss 0.0
accuracy 0.7318367346938776
epoch 0 loss 0.43713363054759646
evaluate--- loss 0.0
accuracy 0.8792091836734693
epoch 0 loss 0.27339637579814513
evaluate--- loss 0.0
accuracy 0.898469387755102
epoch 0 loss 0.21460443157322553
evaluate--- loss 0.0
accuracy 0.9386734693877551
epoch 0 loss 0.17209600320808133
evaluate--- loss 0.0
accuracy 0.9575255102040816
epoch 0 loss 0.1389554777486744
evaluate--- loss 0.0
accuracy 0.9680102040816326
epoch 0 loss 0.10917216761284793
evaluate--- loss 0.0
accuracy 0.9788775510204082
epoch 0 loss 0.08162904679727721
evaluate--- loss 0.0
accuracy 0.9871683673469388
epoch 0 loss 0.05522642822755615
evaluate--- loss 0.0
accuracy 0.9913265306122448
epoch 0 loss 0.05046790648056954
evaluate--- loss 0.0
accuracy 0.9929591836734694
epoch 0 loss 0.032294202867980897
evaluate--- loss 0.0
accuracy 0.9910459183673469
epoch 0 loss 0.025824555308868093
evaluate--- loss 0.0
accuracy 0.993545918367347
epoch 0 loss 0.022

In [None]:
accuracy