<a href="https://colab.research.google.com/github/yala/introML_chem/blob/master/lab3/cnn_and_rnn_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction to CNNs and RNNs in PyTorch
In this tutorial, we'll take you through developing convolutional neural networks (CNNs) and recurrent neural networks (RNNs) in PyTorch to classify beer reviews.

Let's get started!

# Preliminaries

The next few sections will set up the necessary components of the tutorial, including:


1.   Installing PyTorch
2.   Importing dependencies
3.   Downloading and processing data
4.   Defining training and evaluation procedures



## Install PyTorch

In [0]:
# http://pytorch.org/
from os import path
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag

platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
accelerator = 'cu100' if path.exists('/opt/bin/nvidia-smi') else 'cpu'

print(f'Platform = {platform}, Accelerator = {accelerator}')

!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-1.1.0-{platform}-linux_x86_64.whl
!pip install -q torchvision

import torch
print(f'Torch version = {torch.__version__}')
print(f'Cuda available = {torch.cuda.is_available()}')
print(f'Cuda version = {torch.version.cuda}')
print(f'Cuda devices = {torch.cuda.get_device_name(0)}')

## Imports

In [0]:
import argparse
from collections import Counter
import pickle
import re

import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms
from tqdm import tqdm

## Download and Process Data

In [0]:
!apt-get install wget
!wget https://raw.githubusercontent.com/yala/MLCodeLab/master/lab1/data/beer/overall_train.p
!wget https://raw.githubusercontent.com/yala/MLCodeLab/master/lab1/data/beer/overall_dev.p
!wget https://raw.githubusercontent.com/yala/MLCodeLab/master/lab1/data/beer/overall_test.p

train_path = "overall_train.p"
dev_path   = "overall_dev.p"
test_path  = "overall_test.p"

train_set =  pickle.load(open(train_path, 'rb'))
dev_set =  pickle.load(open(dev_path, 'rb'))
test_set =  pickle.load(open(test_path, 'rb'))

def preprocess_data(data):
    """Normalize review text in place and return the same list.

    Each entry is replaced by a ``(text, label)`` tuple whose text has every run
    of non-word characters collapsed to a single space, is lower-cased, and has
    surrounding whitespace stripped.

    Args:
        data: list whose entries are either raw samples (dicts with ``'text'``
            and ``'y'`` keys) or ``(text, label)`` tuples produced by an earlier
            call. Accepting the tuple form keeps the function safe to re-run.

    Returns:
        The same ``data`` list, now holding ``(text, label)`` tuples.
    """
    for indx, sample in enumerate(data):
        if isinstance(sample, dict):
            text, label = sample['text'], sample['y']
        else:
            # Already processed by a previous call; cleaning again is a no-op.
            text, label = sample
        # Raw string: '\W' is not a valid escape in an ordinary string literal.
        text = re.sub(r'\W+', ' ', text).lower().strip()
        data[indx] = text, label
    return data

# Clean every split; preprocess_data hands back the list it was given
train_set = preprocess_data(train_set)
dev_set = preprocess_data(dev_set)
test_set = preprocess_data(test_set)

# Report how many examples each split holds
print(f'Num Train = {len(train_set):,}')
print(f'Num Dev   = {len(dev_set):,}')
print(f'Num Test  = {len(test_set):,}')
print()

# Break each split's (text, label) pairs into two parallel lists
trainText = [text for text, _ in train_set]
trainY = [label for _, label in train_set]

devText = [text for text, _ in dev_set]
devY = [label for _, label in dev_set]

testText = [text for text, _ in test_set]
testY = [label for _, label in test_set]

# Share of the training examples that falls into each label, in label order
print('Train class balance')
y_count = Counter(trainY)
for y in sorted(y_count):
    print(f'{y} = {100. * y_count[y] / len(trainY):.2f}%')

## Define Dataset Class

In [0]:
class BeerReviewDataset(Dataset):
    """Dataset that pairs the features ``X[i]`` with the label ``Y[i]``."""

    def __init__(self, X, Y):
        """Keep references to the parallel sequences; their lengths must match."""
        self.X = X
        self.Y = Y
        assert len(X) == len(Y)

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.X)

    def __getitem__(self, i):
        """Return example ``i`` as ``(features as a NumPy array, label)``."""
        features = np.array(self.X[i])
        label = self.Y[i]
        return features, label

## Define Training Procedure




In [0]:
# Training settings
batch_size = 64  # examples per batch, passed to every DataLoader
epochs = 10  # number of train + dev passes in each training loop
lr = 1e-3  # learning rate handed to the Adam optimizers
dropout = 0.5  # dropout probability passed to each model's constructor
use_cuda = True  # move models to the GPU when CUDA is also available

In [0]:
def train_epoch(model, train_loader, optimizer, epoch):
    """Train ``model`` for one pass over ``train_loader`` and print summary metrics.

    Args:
        model: nn.Module that maps a batch of inputs to per-class logits.
        train_loader: DataLoader yielding ``(data, target)`` batches; the length
            of its ``dataset`` is used as the sample count.
        optimizer: optimizer that updates ``model``'s parameters.
        epoch: epoch number, used only in the printed summary.

    Prints the per-sample average loss and the accuracy for the epoch.
    """
    model.train()  # Set the nn.Module to train mode.
    # Batches are moved to whichever device holds the model's parameters.
    device = next(model.parameters()).device
    total_loss = 0.0
    correct = 0
    num_samples = len(train_loader.dataset)
    for data, target in tqdm(train_loader, total=len(train_loader)):  # 1) get batch
        data, target = data.to(device), target.to(device)

        # Reset gradient data to 0
        optimizer.zero_grad()
        # Get prediction for batch
        output = model(data)
        # 2) Compute loss (mean over the batch)
        loss = F.cross_entropy(output, target)
        # 3) Do backprop
        loss.backward()
        # 4) Update model
        optimizer.step()

        # Do book-keeping to track accuracy and avg loss
        pred = output.argmax(dim=1)  # index of the largest logit
        correct += (pred == target).sum().item()
        # cross_entropy returned the batch mean; weight it by the batch size so
        # that dividing by num_samples below gives a true per-sample average.
        total_loss += loss.item() * target.size(0)

    denom = max(num_samples, 1)  # avoid dividing by zero on an empty dataset
    print(f'Train Epoch: {epoch} \t'
          f'Loss: {total_loss / denom:.4f}, '
          f'Accuracy: {correct}/{num_samples} ({100. * correct / denom:.0f}%)')

## Define Evaluation Procedure

In [0]:
def eval_epoch(model, test_loader, name):
    """Evaluate ``model`` on ``test_loader`` and print loss and accuracy.

    Args:
        model: nn.Module that maps a batch of inputs to per-class logits.
        test_loader: DataLoader yielding ``(data, target)`` batches; the length
            of its ``dataset`` is used as the sample count.
        name: label for the split, used only in the printed summary.

    Prints the per-sample average loss and the accuracy for the split.
    """
    model.eval()
    # Batches are moved to whichever device holds the model's parameters.
    device = next(model.parameters()).device
    test_loss = 0.0
    correct = 0
    num_samples = len(test_loader.dataset)
    # No gradients are needed for evaluation, so skip building the autograd graph.
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)

            output = model(data)
            # Sum (not mean) over the batch so the division below is a per-sample average
            test_loss += F.cross_entropy(output, target, reduction='sum').item()
            pred = output.argmax(dim=1)  # index of the largest logit
            correct += (pred == target).sum().item()

    denom = max(num_samples, 1)  # avoid dividing by zero on an empty dataset
    test_loss /= denom
    print(f'\n{name} set: '
          f'Average loss: {test_loss:.4f}, '
          f'Accuracy: {correct}/{num_samples} ({100. * correct / denom:.0f}%)\n')

## Define Utility Functions

In [0]:
def param_count(model):
    """Return the total element count of ``model``'s trainable parameters."""
    total = 0
    for tensor in model.parameters():
        # Frozen parameters (requires_grad=False) are left out of the count
        if tensor.requires_grad:
            total += tensor.numel()
    return total

# Word Embeddings

In [0]:
# Download pre-trained embeddings
!wget https://github.com/yala/introML_chem/raw/master/lab1/data/beer/words.p
!wget https://github.com/yala/introML_chem/raw/master/lab1/data/beer/embeddings.npz

In [0]:
# Load embeddings
# NOTE: pickle.load can execute arbitrary code while unpickling; only load files
# from a source you trust.
with open('words.p', 'rb') as f:
    words = pickle.load(f)

# Read the array inside a context manager so the .npz archive's file handle is
# closed once the 'features' array has been loaded into memory.
with np.load('embeddings.npz') as npz_file:
    embeddings = npz_file['features']

# Pair each vocabulary word with the embedding at the same position
word_to_embedding = dict(zip(words, embeddings))

In [0]:
# Number of distinct words that have an embedding
print(f'Vocab size = {len(word_to_embedding):,}')

# Peek at the first five components of two example word vectors
print(f'Embedding of "good" = {word_to_embedding["good"][:5]}')
print(f'Embedding of "bad" = {word_to_embedding["bad"][:5]}')

In [0]:
# Replace every whitespace-separated token of every review with its embedding.
# A token missing from word_to_embedding raises KeyError.
trainX = [[word_to_embedding[token] for token in review.split()] for review in trainText]
devX = [[word_to_embedding[token] for token in review.split()] for review in devText]
testX = [[word_to_embedding[token] for token in review.split()] for review in testText]

In [0]:
# Define padding
max_len = 150
# Size the all-zero pad vector from the loaded embedding matrix instead of
# hard-coding 300, so it always matches the width of the word vectors
pad = np.zeros(embeddings.shape[1])

# Add padding: keep at most max_len vectors per review, then right-pad shorter
# reviews with `pad` (a negative repeat count yields no padding)
trainX = [seq[:max_len] + [pad] * (max_len - len(seq)) for seq in trainX]
devX =   [seq[:max_len] + [pad] * (max_len - len(seq)) for seq in devX]
testX =  [seq[:max_len] + [pad] * (max_len - len(seq)) for seq in testX]

In [0]:
# Convert to Dataset
train = BeerReviewDataset(trainX, trainY)
dev = BeerReviewDataset(devX, devY)
test = BeerReviewDataset(testX, testY)

# Convert to DataLoader
# Only the training data is shuffled; dev/test are read in a fixed order so
# evaluation batches are the same on every run.
train_loader = DataLoader(train, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test, batch_size=batch_size, shuffle=False)

# Multi-Layer Perceptron (MLP)

## Define MLP

In [0]:
class MLP(nn.Module):
    """Classifier that sums a sequence's embeddings and feeds the sum through three linear layers."""

    def __init__(self, embedding_dim=300, hidden_size=300, output_size=3, dropout=0.0):
        """Create the three linear layers and the shared dropout module."""
        super().__init__()
        self.fc1 = nn.Linear(in_features=embedding_dim, out_features=hidden_size)
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.fc3 = nn.Linear(in_features=hidden_size, out_features=output_size)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Map ``x`` (batch_size x seq_length x embedding_dim) to logits (batch_size x output_size)."""
        # Cast to float32 and collapse the sequence axis by summation
        pooled = x.float().sum(dim=1)  # batch_size x embedding_dim

        # Two ReLU + dropout hidden layers followed by the output projection
        first = self.dropout(F.relu(self.fc1(pooled)))
        second = self.dropout(F.relu(self.fc2(first)))
        return self.fc3(second)

## Build MLP and Optimizer

In [0]:
# Instantiate the MLP with the notebook-wide dropout rate
model = MLP(dropout=dropout)

# Show the architecture and how many trainable parameters it has
print(model)
print(f'Number of parameters = {param_count(model):,}')

# Use the GPU only when it is both requested and available
if use_cuda and torch.cuda.is_available():
    model = model.to('cuda')

# Adam over all of the model's parameters
optimizer = optim.Adam(params=model.parameters(), lr=lr)

## Train MLP

In [0]:
# Epochs are numbered from 1; each one trains on the training loader and then
# reports metrics on the dev loader, followed by a separator line.
for epoch in range(1, epochs + 1):
    train_epoch(model, train_loader, optimizer, epoch)
    eval_epoch(model,  dev_loader, "Dev")
    print("---")

## Test MLP

In [0]:
# Print loss and accuracy of the current `model` on the test loader
eval_epoch(model,  test_loader, "Test")

# Recurrent Neural Network (RNN)

## Define RNN

In [0]:
class RNN(nn.Module):
    """LSTM classifier that max-pools the per-step outputs before a linear output layer."""

    def __init__(self, embedding_dim=300, hidden_size=300, output_size=3, dropout=0.0):
        """Create the batch-first LSTM, the output projection and the dropout module."""
        super().__init__()
        self.rnn = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        self.output = nn.Linear(in_features=hidden_size, out_features=output_size)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Map ``x`` (batch_size x seq_length x embedding_dim) to logits (batch_size x output_size)."""
        # Per-step LSTM outputs; the final (h, c) state is not used
        states, _ = self.rnn(x.float())  # batch_size x seq_length x hidden_size

        # Dropout is applied to every step's output before pooling
        states = self.dropout(states)

        # Element-wise maximum over the sequence axis
        pooled = states.max(dim=1)[0]  # batch_size x hidden_size

        return self.output(pooled)  # batch_size x output_size

## Build RNN and Optimizer

In [0]:
# Instantiate the RNN with the notebook-wide dropout rate
model = RNN(dropout=dropout)

# Show the architecture and how many trainable parameters it has
print(model)
print(f'Number of parameters = {param_count(model):,}')

# Use the GPU only when it is both requested and available
if use_cuda and torch.cuda.is_available():
    model = model.to('cuda')

# Adam over all of the model's parameters
optimizer = optim.Adam(params=model.parameters(), lr=lr)

## Train RNN

In [0]:
# Epochs are numbered from 1; each one trains on the training loader and then
# reports metrics on the dev loader, followed by a separator line.
for epoch in range(1, epochs + 1):
    train_epoch(model, train_loader, optimizer, epoch)
    eval_epoch(model,  dev_loader, "Dev")
    print("---")

## Test RNN

In [0]:
# Print loss and accuracy of the current `model` on the test loader
eval_epoch(model,  test_loader, "Test")

# Convolutional Neural Network (CNN)

## Define CNN

In [0]:
class CNN(nn.Module):
    """Classifier with three parallel 1-D convolutions (widths 3, 5, 7) that are summed and max-pooled over time."""

    def __init__(self, embedding_dim=300, hidden_size=100, output_size=3, dropout=0.0):
        """Create the three convolutions, the output projection and the dropout module.

        The paddings (0, 1, 2) are paired with the kernel sizes (3, 5, 7) so that
        every branch yields seq_length - 2 positions and the outputs can be added.
        """
        super().__init__()
        self.conv1 = nn.Conv1d(embedding_dim, hidden_size, kernel_size=3, padding=0)
        self.conv2 = nn.Conv1d(embedding_dim, hidden_size, kernel_size=5, padding=1)
        self.conv3 = nn.Conv1d(embedding_dim, hidden_size, kernel_size=7, padding=2)
        self.output = nn.Linear(in_features=hidden_size, out_features=output_size)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Map ``x`` (batch_size x seq_length x embedding_dim) to logits (batch_size x output_size)."""
        # Conv1d expects channels before length: batch_size x embedding_dim x seq_length
        channels_first = x.float().permute(0, 2, 1)

        # Run each branch through conv -> ReLU -> dropout
        branch_3 = self.dropout(F.relu(self.conv1(channels_first)))
        branch_5 = self.dropout(F.relu(self.conv2(channels_first)))
        branch_7 = self.dropout(F.relu(self.conv3(channels_first)))

        # Combine the branches element-wise
        combined = branch_3 + branch_5 + branch_7

        # Maximum over the remaining sequence positions
        pooled = combined.max(dim=-1)[0]  # batch_size x hidden_size

        return self.output(pooled)  # batch_size x output_size

## Build CNN and Optimizer

In [0]:
# Instantiate the CNN with the notebook-wide dropout rate
model = CNN(dropout=dropout)

# Show the architecture and how many trainable parameters it has
print(model)
print(f'Number of parameters = {param_count(model):,}')

# Use the GPU only when it is both requested and available
if use_cuda and torch.cuda.is_available():
    model = model.to('cuda')

# Adam over all of the model's parameters
optimizer = optim.Adam(params=model.parameters(), lr=lr)

## Train CNN

In [0]:
# Epochs are numbered from 1; each one trains on the training loader and then
# reports metrics on the dev loader, followed by a separator line.
for epoch in range(1, epochs + 1):
    train_epoch(model, train_loader, optimizer, epoch)
    eval_epoch(model,  dev_loader, "Dev")
    print("---")

## Test CNN

In [0]:
# Print loss and accuracy of the current `model` on the test loader
eval_epoch(model,  test_loader, "Test")