<a href="https://colab.research.google.com/github/yala/introML_chem/blob/master/lab2/beer_review_solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Beer Review Exercise!
In this exercise, you'll build on the tutorial from lab1 and implement neural networks that learn to analyze beer reviews.

Let's get started!

In [0]:
# http://pytorch.org/
# Colab bootstrap: build the URL of a pinned PyTorch 0.4.0 wheel that matches
# this interpreter and accelerator, install it, then print what was loaded.
from os import path
# NOTE(review): `wheel.pep425tags` appears to have been dropped from newer
# `wheel` releases -- confirm this import still resolves in the target runtime.
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
# Tag pieces formatted as '<impl><version>-<abi>'; spliced into the wheel
# filename in the pip command below.
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())

# Pick the 'cu80' wheel directory when /opt/bin/nvidia-smi exists, else 'cpu'.
accelerator = 'cu80' if path.exists('/opt/bin/nvidia-smi') else 'cpu'

# NOTE(review): the wheel is fetched over plain http -- consider https if the
# host supports it.
!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.0-{platform}-linux_x86_64.whl torchvision==0.2.0
import torch
# Sanity check: report the installed version and whether CUDA is usable.
print(torch.__version__)
print(torch.cuda.is_available())

In [0]:
import argparse
from sklearn.feature_extraction.text import CountVectorizer
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import pickle
import re


In [0]:
!apt-get install wget
!wget https://raw.githubusercontent.com/yala/MLCodeLab/master/lab1/data/beer/overall_train.p
!wget https://raw.githubusercontent.com/yala/MLCodeLab/master/lab1/data/beer/overall_dev.p
!wget https://raw.githubusercontent.com/yala/MLCodeLab/master/lab1/data/beer/overall_test.p

train_path = "overall_train.p"
dev_path   = "overall_dev.p"
test_path  = "overall_test.p"

train_set =  pickle.load(open(train_path, 'rb'))
dev_set =  pickle.load(open(dev_path, 'rb'))
test_set =  pickle.load(open(test_path, 'rb'))



def preprocess_data(data):
    """Normalize raw review records into ``(text, label)`` tuples.

    For every record, each run of non-word characters in ``record['text']`` is
    collapsed to a single space, the result is lower-cased and stripped of
    leading/trailing whitespace. ``record['y']`` is passed through unchanged.

    Args:
        data: iterable of mappings that have ``'text'`` and ``'y'`` keys.

    Returns:
        A new list of ``(clean_text, label)`` tuples in input order. The input
        collection is left untouched, so the function can be called again on
        the same raw data.
    """
    cleaned = []
    for sample in data:
        text, label = sample['text'], sample['y']
        # Raw string: '\W' inside a plain literal is an invalid escape sequence.
        text = re.sub(r'\W+', ' ', text).lower().strip()
        cleaned.append((text, label))
    return cleaned

# Clean all three splits (train, then dev, then test) and rebind the names to
# the cleaned (text, label) collections.
train_set, dev_set, test_set = map(preprocess_data, (train_set, dev_set, test_set))

# Report how many examples each split holds.
for template, split in (
    ("Num Train: {}", train_set),
    ("Num Dev: {}", dev_set),
    ("Num Test: {}", test_set),
):
    print(template.format(len(split)))

In [0]:
# Define Beer review dataset
class BeerReviewDataset(torch.utils.data.Dataset):
    """Dataset that pairs rows of a feature matrix ``X`` with labels ``Y``.

    ``X`` must expose ``.shape`` and row indexing whose result has a
    ``.todense()`` method; ``Y`` must support ``len()`` and integer indexing.
    """

    def __init__(self, X, Y):
        """Store ``(X, Y)`` after checking they hold the same number of examples."""
        assert X.shape[0] == len(Y)
        self.dataset = (X, Y)

    def __len__(self):
        """Return the number of rows in the feature matrix."""
        features, _ = self.dataset
        return features.shape[0]

    def __getitem__(self, i):
        """Return ``(dense_row, label)`` for example ``i``."""
        features, labels = self.dataset
        dense_row = features[i].todense()
        return np.array(dense_row[0]), labels[i]
        

## Prepare your dataset (Feature Engineering)

In [0]:
# Split each (text, label) pair into parallel lists of review texts and labels.
trainText = [text for text, _ in train_set]
trainY = [label for _, label in train_set]

devText = [text for text, _ in dev_set]
devY = [label for _, label in dev_set]

testText = [text for text, _ in test_set]
testY = [label for _, label in test_set]

# Vocabulary limits: a token must occur in at least `min_df` training reviews,
# and at most `max_features` tokens are kept.
min_df = 5
max_features = 1000
countVec = CountVectorizer(min_df=min_df, max_features=max_features)

# The vocabulary is learned from the training reviews only.
countVec.fit(trainText)

# Encode every split as a matrix of bag-of-words count vectors.
trainX, devX, testX = (
    countVec.transform(texts) for texts in (trainText, devText, testText)
)

In [0]:

# Mini-batch size used by all three loaders. It has to be defined here: the
# loaders are built before the "Training settings" cell runs, so on a fresh
# kernel this cell would otherwise fail with a NameError on `batch_size`.
batch_size = 64

# Wrap each split's feature matrix and labels in a Dataset.
train = BeerReviewDataset(trainX, trainY)
dev = BeerReviewDataset(devX, devY)
test = BeerReviewDataset(testX, testY)

# Only the training loader shuffles. Dev and test are evaluated in a fixed
# order, since shuffling does not change their metrics.
train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
dev_loader = torch.utils.data.DataLoader(dev, batch_size=batch_size, shuffle=False)
test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)

In [0]:

# Sanity check: print the feature and label tensor shapes of the first
# mini-batch only, then stop iterating.
for batch in train_loader:
    for tensor in batch[:2]:
        print(tensor.shape)
    break


## Define your model

In [0]:
class Model(nn.Module):
    """Single linear layer that maps a feature vector to class scores.

    Args:
        input_dim: number of input features per example. Defaults to 1000.
        num_classes: number of output scores per example. Defaults to 3.
    """

    def __init__(self, input_dim=1000, num_classes=3):
        super(Model, self).__init__()
        self.fc = nn.Linear(input_dim, num_classes)

    def forward(self, x):
        """Return the unnormalized class scores ``self.fc(x)``."""
        return self.fc(x)


## Define your training procedure




In [0]:
# Training settings
# NOTE: the DataLoaders are constructed in an earlier cell, so changing
# `batch_size` here has no effect on them unless that cell is re-run.
batch_size = 64
epochs = 10
lr = .01
momentum = 0.5
seed = 0

# Seed torch's global RNG so the model's initial weights (and any later
# torch-driven shuffling) are the same on every Restart & Run All.
torch.manual_seed(seed)

model = Model()
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)


To train our model:

1) we'll randomly sample batches from our train loader

2) compute our loss (using standard `cross_entropy`)

3) compute our gradients (by calling `backward()` on our loss)

4) update our neural network with an `optimizer.step()`, and go back to 1)

The training function below also does some extra book-keeping so that it can log the accuracy and average loss for each epoch.


In [0]:
def train_epoch(model, train_loader, optimizer, epoch):
    """Train ``model`` for one pass over ``train_loader`` and print a summary.

    Args:
        model: the ``nn.Module`` to update in place.
        train_loader: iterable of ``(x, target)`` batches; it must expose a
            ``dataset`` attribute supporting ``len()``. Dimension 1 of ``x`` is
            squeezed away before the forward pass.
        optimizer: optimizer holding ``model``'s parameters.
        epoch: epoch number, used only in the printed summary.

    Prints the per-sample average loss and the accuracy for the epoch.
    """
    model.train()  # Set the nn.Module to train mode.
    total_loss = 0.0
    correct = 0
    num_samples = len(train_loader.dataset)
    for x, target in train_loader:  # 1) get batch
        x = x.float().squeeze(1)
        # cross_entropy needs integer class indices; same cast as eval_epoch.
        target = target.long()
        # Reset gradient data to 0
        optimizer.zero_grad()
        # Get prediction for batch
        output = model(x)
        # 2) Compute loss
        loss = F.cross_entropy(output, target)
        # 3) Do backprop
        loss.backward()
        # 4) Update model
        optimizer.step()

        ## Do book-keeping to track accuracy and avg loss
        pred = output.max(1, keepdim=True)[1]  # index of the highest score
        correct += pred.eq(target.view_as(pred)).sum().item()
        # `loss` is the mean over the batch. Weight it by the batch size so
        # that dividing by num_samples below gives a true per-sample average.
        total_loss += loss.item() * x.size(0)

    print('Train Epoch: {} \tLoss: {:.4f}, Accuracy: {}/{} ({:.0f}%)'.format(
            epoch, total_loss / num_samples,
            correct,
            num_samples,
            100. * correct / num_samples))


## Define our evaluation loop
Similar to above, we'll also loop through our dev or test set, and compute our loss and accuracy. 
This lets us see how well our model is generalizing. 

In [0]:
def eval_epoch(model, test_loader, name):
    """Evaluate ``model`` on ``test_loader`` and print loss and accuracy.

    Args:
        model: the ``nn.Module`` to evaluate; its parameters are not updated.
        test_loader: iterable of ``(data, target)`` batches; it must expose a
            ``dataset`` attribute supporting ``len()``. Dimension 1 of ``data``
            is squeezed away before the forward pass.
        name: label for the split (e.g. "Dev"), used only in the printed line.

    Prints the per-sample average loss and the accuracy over the whole loader.
    """
    model.eval()
    test_loss = 0.0
    correct = 0
    num_samples = len(test_loader.dataset)
    # No gradients are needed for evaluation, so skip building the graph.
    with torch.no_grad():
        for data, target in test_loader:
            data = data.float().squeeze(1)
            target = target.long()
            output = model(data)
            # cross_entropy returns the batch mean. Weight it by the batch
            # size so the final division yields a per-sample average.
            test_loss += F.cross_entropy(output, target).item() * data.size(0)
            pred = output.max(1, keepdim=True)[1]  # index of the highest score
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= num_samples
    print('\n{} set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        name,
        test_loss,
        correct,
        num_samples,
        100. * correct / num_samples))


## Train your model

In [0]:

# One training pass followed by a dev-set evaluation per epoch (epochs are
# numbered from 1), with a separator line between epochs.
for epoch in range(1, epochs + 1):
    train_epoch(model=model, train_loader=train_loader, optimizer=optimizer, epoch=epoch)
    eval_epoch(model=model, test_loader=dev_loader, name="Dev")
    print("---")

In [0]:
# Final check: report loss and accuracy on the held-out test split.
eval_epoch(model=model, test_loader=test_loader, name="Test")