<a href="https://colab.research.google.com/github/wanyun-yang/RNN_Movie_Review_Sentiment_Analysis/blob/main/RNN_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files, drive
# Mount Google Drive at /content/drive/ so files stored on Drive can be read
# from this Colab runtime by the cells below.
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
import os, sys
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer


# Review folders on the Drive mounted at /content/drive/. Absolute paths are
# used so this cell does not depend on the kernel's working directory.
pos_root_dir = '/content/drive/My Drive/reviews/pos/'
neg_root_dir = '/content/drive/My Drive/reviews/neg/'

# os.listdir returns entries in arbitrary order; sorting makes the selected
# subset (and therefore every downstream result) reproducible across runs.
pos_files = sorted(os.listdir(pos_root_dir))
neg_files = sorted(os.listdir(neg_root_dir))

max_features = 200  # vocabulary size kept by the TF-IDF vectorizer
n_per_class = 500   # maximum number of reviews read from each class


def _read_reviews(root_dir, file_names, limit):
  """Read at most `limit` files from `root_dir`, flattening newlines to spaces.

  Args:
    root_dir: directory containing the review text files.
    file_names: file names (relative to `root_dir`) to read, in order.
    limit: maximum number of files to read.

  Returns:
    A list with one string per file read.
  """
  texts = []
  # Slicing (rather than indexing 0..limit-1) tolerates folders that hold
  # fewer than `limit` files.
  for name in file_names[:limit]:
    with open(os.path.join(root_dir, name)) as fh:
      texts.append(fh.read().replace('\n', ' '))
  return texts


pos_docs = _read_reviews(pos_root_dir, pos_files, n_per_class)
neg_docs = _read_reviews(neg_root_dir, neg_files, n_per_class)

# Positive reviews first, then negative; labels are one-hot [pos, neg].
corpus = pos_docs + neg_docs
labels = [[1, 0] for _ in pos_docs] + [[0, 1] for _ in neg_docs]

vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
X = vectorizer.fit_transform(corpus)
y = np.array(labels)
print(X.shape, y.shape)
print(X[0, 5])

(1000, 200) (1000, 2)
0.0


In [3]:
import torch
from sklearn.model_selection import train_test_split

# Tokenise every document the same way the fitted vectorizer did: its
# preprocessor (which lower-cases text unless configured otherwise) followed by
# its tokenizer. Using the tokenizer alone would leave capitalised tokens such
# as "Great" unmatched against the lower-cased vocabulary.
preprocess = vectorizer.build_preprocessor()
word_tokenizer = vectorizer.build_tokenizer()
vocab = vectorizer.vocabulary_

doc_terms_list_train = [word_tokenizer(preprocess(doc_str)) for doc_str in corpus]

# Keep only in-vocabulary terms, preserving their order of appearance.
docs = [[w for w in doc_terms if w in vocab] for doc_terms in doc_terms_list_train]

# Longest in-vocabulary sequence; every document is padded to this length.
seq_length = max((len(terms) for terms in docs), default=0)

# Allocate directly as float32 to avoid building a float64 array of the same
# size and then copying it.
datasets = np.zeros((X.shape[0], seq_length, max_features), dtype=np.float32)

for i, terms in enumerate(docs):
  # Left-pad: real tokens occupy the final len(terms) time steps, so the last
  # time step always holds the document's last in-vocabulary token.
  n_padding = seq_length - len(terms)

  # Densify the document's TF-IDF row once instead of doing one sparse lookup
  # per token.
  tfidf_row = X[i].toarray().ravel()

  for j, w in enumerate(terms):
    idx = vocab[w]
    # Each time step is a one-hot position scaled by the term's TF-IDF weight.
    datasets[i, j + n_padding, idx] = tfidf_row[idx]

y = y.astype(np.float32)

X_train,X_val,y_train,y_val = train_test_split(datasets, y, test_size=0.2, random_state=2020)
print(X_train.shape,y_train.shape,X_val.shape,y_val.shape)

(800, 312, 200) (800, 2) (200, 312, 200) (200, 2)


In [4]:
from torch.utils.data import DataLoader, TensorDataset

batch_size = 16

# Wrap the NumPy splits as tensor datasets (from_numpy shares memory with the arrays).
train_data = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
val_data = TensorDataset(torch.from_numpy(X_val), torch.from_numpy(y_val))

# Shuffle only the training data. Validation is iterated in a fixed order so
# that the mean of per-batch losses is comparable between evaluations (a
# shuffled loader changes which samples land in the short final batch).
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_data, shuffle=False, batch_size=batch_size)

In [5]:
from __future__ import unicode_literals, print_function, division

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class Model(nn.Module):
  """Sequence classifier: stacked RNN -> linear projection -> 2-way linear head.

  Args:
    input_size: number of features per time step.
    output_size: width of the intermediate linear layer (`fc1`).
    hidden_size: size of the RNN hidden state.
    n_layers: number of stacked RNN layers.
  """

  def __init__(self, input_size, output_size, hidden_size, n_layers):
    super().__init__()

    self.hidden_size = hidden_size
    self.n_layers = n_layers

    # Define three types of layer.
    self.rnn = nn.RNN(input_size,hidden_size,n_layers,batch_first=True)
    self.fc1 = nn.Linear(hidden_size,output_size)
    self.fc2 = nn.Linear(output_size,2)

  def forward(self, x, hidden=None):
    """Classify a batch of sequences.

    Args:
      x: input tensor of shape (batch, seq_len, input_size); the RNN is built
        with batch_first=True.
      hidden: accepted for call-site compatibility but ignored; a fresh zero
        state is created on every call, so sequences in different batches
        never share state and any batch size is accepted.

    Returns:
      A tuple (out, hidden): `out` holds the raw, unnormalised class scores
      (logits) of shape (batch, 2), and `hidden` is the final RNN state of
      shape (n_layers, batch, hidden_size).
    """
    batch_size = x.size(0)

    hidden = self.init_hidden(batch_size)

    rnn_out, hidden = self.rnn(x, hidden)
    # Only the last time step feeds the classifier, so project just that step
    # instead of running fc1 over the whole sequence.
    last_out = self.fc1(rnn_out[:, -1, :])
    # Return logits: nn.CrossEntropyLoss applies log-softmax itself, so a
    # softmax here would be applied twice and flatten the gradients. Apply
    # F.softmax(out, dim=1) at the call site if probabilities are needed.
    out = self.fc2(last_out)

    return out, hidden

  def init_hidden(self, batch_size):
    """Return a zero hidden state of shape (n_layers, batch_size, hidden_size).

    The tensor is created on the same device and with the same dtype as the
    model's parameters, so it works on CPU as well as on GPU.
    """
    ref = next(self.parameters())
    hidden = torch.zeros(self.n_layers, batch_size, self.hidden_size,
                         device=ref.device, dtype=ref.dtype)
    return hidden

# The input size must equal the per-time-step feature width of the dataset,
# which is `max_features`; reuse the constant instead of repeating 200.
model = Model(input_size=max_features, output_size=32, hidden_size=256, n_layers=2)
print(model)

Model(
  (rnn): RNN(200, 256, num_layers=2, batch_first=True)
  (fc1): Linear(in_features=256, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=2, bias=True)
)


In [None]:
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define hyperparameters
n_epochs = 10
lr = 1e-4
counter = 0   # global batch counter across epochs
clip = 5      # max gradient norm

#Define Loss, Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=lr)

model.train()
for epoch in range(n_epochs):
  # Initialize hidden state
  h = model.init_hidden(batch_size)

  # Batch loop. The loop variables are named `targets` rather than `labels`
  # so the `labels` list built in the data-loading cell is not overwritten.
  for inputs, targets in train_loader:
    counter += 1
    inputs, targets = inputs.to(device), targets.to(device)

    model.zero_grad()

    output, h = model(inputs, h)

    # Targets are one-hot rows; CrossEntropyLoss expects class indices.
    loss = criterion(output, targets.argmax(dim=1))
    loss.backward()

    # clip_grad_norm_ is the in-place replacement for the deprecated
    # clip_grad_norm.
    nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizer.step()

    #Validation loss
    if counter % 10 == 0:
      val_h = model.init_hidden(batch_size)
      val_losses = []

      model.eval()

      # No gradients are needed for evaluation; skipping graph construction
      # saves memory and time.
      with torch.no_grad():
        for val_inputs, val_targets in val_loader:
          val_inputs, val_targets = val_inputs.to(device), val_targets.to(device)
          val_outputs, val_h = model(val_inputs, val_h)
          val_loss = criterion(val_outputs, val_targets.argmax(dim=1))
          val_losses.append(val_loss.item())

      model.train()

      print('Epoch:{}/{}'.format(epoch+1,n_epochs),
            'Batch:{}'.format(counter),
            'Train Loss:{:.5f}'.format(loss.item()),
            'Val Loss:{:.5f}'.format(np.mean(val_losses)))


