# RNN in PyTorch
## Example: Yelp review classifier

In [None]:
import os
# Expose only the GPU with index 1 to this process. This is set here, in the
# first cell, before torch is imported in the next cell.
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [2]:
import os
import re
import string
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchtext import data as ttdata
from torchtext.data import Dataset as ttDataset
from torchtext.data import Dataset, Example, Field
from torchtext.data import Iterator, BucketIterator
import spacy

from TextDataloader import TextData

from generate_model import *
from kbutils.evaluation import accuracy

In [3]:
# Pick the device every model/tensor in this notebook is moved to.
# The original if/elif chain had no fallback branch, so on a machine with no
# visible GPU `device` was never assigned and later cells raised NameError.
if torch.cuda.is_available():
    # Same naming as before: generic "cuda" when several GPUs are visible,
    # explicit "cuda:0" when exactly one is.
    device = torch.device("cuda" if torch.cuda.device_count() > 1 else "cuda:0")
else:
    device = torch.device("cpu")

## Data loading
* Reference: *Natural Language Processing with PyTorch*
1. Load the data file into a DataFrame.
2. Convert the text into the right form using a tokenizer or corpus.
3. Convert the processed text into vectors.

In [213]:
"""
data shape
rating, review, split
negative, "sentence", train
positive, "sentence", train
"""
# NOTE(review): absolute, user-specific path; consider a configurable DATA_DIR.
f1_s = "/home/bwlee/data/yelp_review_polarity_csv/reviews_with_splits_full.csv"

# Line 0 is the header; the skiprows predicate drops every line whose index is
# not a multiple of 10, so only about one tenth of the file is loaded.
df = pd.read_csv(f1_s, header=0, skiprows=lambda x: x%10>0)

# Encode the label as an integer: 'positive' -> 1, anything else -> 0.
# A single vectorized comparison replaces the former per-row
# `df['rating'].iloc[i] = ...` loop, which was chained assignment (slow,
# triggers SettingWithCopyWarning, and does nothing under copy-on-write).
df['rating'] = (df['rating'] == 'positive').astype(int)

In [214]:
# Carve the frame into its three predefined partitions, keeping only the text
# and label columns and renumbering each partition from 0.
train_df = df.loc[df['split'] == 'train', ['review', 'rating']].reset_index(drop=True)
val_df = df.loc[df['split'] == 'val', ['review', 'rating']].reset_index(drop=True)
test_df = df.loc[df['split'] == 'test', ['review', 'rating']].reset_index(drop=True)

In [232]:
# Show one raw training review as a sanity check of the loaded text.
print(train_df['review'][5])

went here with another couple for restaurant week . the place was fairly busy . we waited for mins and were seated . we had a reservation , but that didn t seem to affect the folks seating tables . we got to our table outside wand waited another half hour just to see our waitress . we noticed everyone around us was complaining about the slow service . we ordered drinks and about another half hour later we ordered our food . my lobster was overcooked and stringy and my steak was mediocre at best . all in all my visit to the mariner s inn was terrible . 


In [104]:
# Load spaCy's small English pipeline.
# NOTE(review): `nlp` is not referenced by any other cell visible here; the
# TEXT field below asks torchtext for tokenize='spacy' by name instead.
# Confirm whether this object is still needed.
nlp = spacy.load("en_core_web_sm")

In [105]:
# Field describing how a review string becomes a tensor of token ids.
TEXT = ttdata.Field(
    sequential=True,
    use_vocab=True,
    tokenize='spacy',     # tokenizer selected by name
    lower=True,           # lowercase the text
    init_token='<SOS>',   # sentinel added at the start of each sequence
    eos_token='<EOS>',    # sentinel added at the end of each sequence
    fix_length=100,       # every example is brought to the same length
    batch_first=True,     # batches are laid out as (batch, seq)
)

# Field for the label: already numeric, so no vocabulary is built for it.
LABEL = ttdata.Field(
    sequential=False,
    use_vocab=False,
    is_target=True,
    batch_first=True,
)

In [321]:
# (column name, Field) pairs, in the same order as the columns of train_df.
fields0 = (('review', TEXT), ('rating', LABEL))

# Random visiting order over the training rows.
# NOTE(review): no seed is set, so the order differs from run to run.
iis = list(range(len(train_df)))
random.shuffle(iis)

# Materialise all rows once as [review, rating] lists, then build one
# torchtext Example per row following the shuffled order.
rows = train_df.values.tolist()
train_data0 = [Example.fromlist(rows[ii], fields0) for ii in iis]

In [323]:
# Size of the shuffled example list built in the previous cell.
# The original read len(train_data) here, but train_data is only created two
# lines below, so on a fresh kernel this cell raised NameError.
n_data = len(train_data0)

# First 4/5 of the shuffled examples are used for training, the remainder is
# held out for evaluation (named test_data here).
n_train = n_data // 5 * 4
train_data = ttDataset(train_data0[:n_train], fields=fields0)
test_data = ttDataset(train_data0[n_train:], fields=fields0)

# The vocabulary is built from the training portion only: tokens seen fewer
# than 10 times are dropped and the size is capped at 10000 entries.
TEXT.build_vocab(train_data, min_freq=10, max_size=10000)

In [324]:
# Batch iterators over the two datasets: the training batches are reshuffled,
# the held-out batches keep a fixed order.
# NOTE(review): no `device` is passed here; presumably the training helper
# moves each batch to the model's device. Confirm against Net.run_train.
iter_train = Iterator(dataset=train_data, batch_size=100, shuffle=True)
iter_test = Iterator(dataset=test_data, batch_size=100, shuffle=False)

In [392]:
class RNN_classifier(Net):
    """Sequence classifier built from three parts: embed -> rnn -> downnet.

    The loss, optimizer and device are handed to the ``Net`` base class, which
    is defined outside this notebook.
    """

    def __init__(self, embed = None, rnn=None, downnet=None,
                 loss=None, optimizer=None, device='cuda'):
        """
        The network consists of [embed, rnn, downnet].

        :param embed: optional embedding layer applied to the input first;
            when None the input is fed to ``rnn`` unchanged
        :param rnn: recurrent module returning ``(outputs, hidden)``
        :param downnet: downstream head applied to the last RNN output
        :param loss: forwarded to ``Net``
        :param optimizer: forwarded to ``Net``
        :param device: forwarded to ``Net``
        """
        super(RNN_classifier, self).__init__(loss=loss,
                                            optimizer=optimizer,
                                            device=device)
        self.embed = embed
        self.rnn = rnn
        self.downnet = downnet
        # Not used in forward(), which returns the raw downnet output.
        self.softmax = nn.Softmax(dim=-1)

        self.init_weights()

    def set_train(self):
        """Switch every configured sub-module to training mode."""
        for part in (self.embed, self.rnn, self.downnet):
            # embed is optional (see forward), so skip parts that are absent
            if part is not None:
                part.train()

    def set_eval(self):
        """Switch every configured sub-module to evaluation mode."""
        for part in (self.embed, self.rnn, self.downnet):
            if part is not None:
                part.eval()

    def init_weights(self):
        """Initialise the embedding table and one layer of the head."""
        initrange = 0.5
        if self.embed is not None:
            torch.nn.init.uniform_(self.embed.weight, -initrange, initrange)
            # The uniform re-initialisation also overwrites the row reserved
            # for padding; put it back to zero so padding contributes nothing.
            pad_idx = getattr(self.embed, 'padding_idx', None)
            if pad_idx is not None:
                with torch.no_grad():
                    self.embed.weight[pad_idx].fill_(0)
        if self.downnet is not None:
            # NOTE(review): assumes downnet[1] is a layer with a `.weight`;
            # this depends on how get_MLP orders its modules. Confirm.
            torch.nn.init.xavier_uniform_(self.downnet[1].weight)

    def forward(self, x):
        """Return the head's output for the last time step of the RNN.

        :param x: input batch; presumably (batch, seq) token ids when an
            embedding is configured. TODO confirm against the caller.
        :return: output of ``downnet``
        """
        if self.embed is not None:
            x = self.embed(x)
        out, hidden = self.rnn(x)
        # Index -1 along dim 1 picks the last time step (batch-first layout).
        # NOTE(review): for a bidirectional RNN the reverse-direction half of
        # this slice has only processed the final token, and with fixed-length
        # padded inputs the final positions may be padding. Consider building
        # the sentence vector from `hidden` instead.
        out = self.downnet(out[:,-1]) # choose last output
        return out

In [393]:
# Layer widths shared by the components below.
dim_embed, dim_hidden = 100, 200

# Token-id -> vector lookup table; the '<pad>' entry is registered as padding.
embed = nn.Embedding(
    len(TEXT.vocab),
    dim_embed,
    padding_idx=TEXT.vocab.stoi['<pad>'],
)
embed = embed.to(device)

# Two-layer bidirectional LSTM over batch-first input; because it runs in both
# directions, each time step yields 2 * dim_hidden features.
lstm = nn.LSTM(
    dim_embed,
    dim_hidden,
    num_layers=2,
    bidirectional=True,
    batch_first=True,
)
lstm = lstm.to(device)

# MLP head mapping the 2 * dim_hidden LSTM features to 2 class scores.
# A lighter alternative is a single layer: get_MLP([dim_hidden*2, 2], end=True)
classifier = get_MLP(
    [2 * dim_hidden, 3 * dim_hidden, dim_hidden, 2],
    end=True,
).to(device)

In [394]:
# Two-class classification objective.
loss = nn.CrossEntropyLoss()

# A single flat list with every trainable tensor of the three components, so
# that one optimizer updates all of them.
parms = [
    *embed.parameters(),
    *lstm.parameters(),
    *classifier.parameters(),
]
optimizer = optim.Adam(parms)

# NOTE(review): `device` is not passed, so the wrapper falls back to its
# default 'cuda' even though the modules were moved to `device` above.
# Confirm this is intended.
seq_class = RNN_classifier(
    embed=embed,
    rnn=lstm,
    downnet=classifier,
    loss=loss,
    optimizer=optimizer,
)

In [396]:
# Call the training helper five times; each call prints a training loss, an
# evaluation loss and an accuracy (see the output below).
# NOTE(review): the first argument is presumably the number of epochs per
# call. Confirm against Net.run_train.
for _ in range(5):
    seq_class.run_train(1, iter_train, iter_test)

epoch 0 loss 0.6797516005379813
evaluate mom loss tensor(0.6868)
accuracy 0.5300241921548298
epoch 0 loss 0.5731219926976269
evaluate mom loss tensor(0.4280)
accuracy 0.8170468290997063
epoch 0 loss 0.33886991765188135
evaluate mom loss tensor(0.3308)
accuracy 0.8513046483497494
epoch 0 loss 0.229146563118289
evaluate mom loss tensor(0.3187)
accuracy 0.8649991359944703
epoch 0 loss 0.17484407871961594
evaluate mom loss tensor(0.3177)
accuracy 0.8749783998617591
