# Simple CNN

# Readings
1. (torchtext 1) https://towardsdatascience.com/use-torchtext-to-load-nlp-datasets-part-ii-f146c8b9a496
1. (torchtext 2) http://mlexplained.com/2018/02/08/a-comprehensive-tutorial-to-torchtext/
1. (torchtext 3) http://mlexplained.com/2018/02/15/language-modeling-tutorial-in-torchtext-practical-torchtext-part-2/
1. (torchtext 4) http://anie.me/On-Torchtext/
1. (conv 1) https://medium.com/@TalPerry/convolutional-methods-for-text-d5260fd5675f
1. (conv 2) http://debajyotidatta.github.io/nlp/deep/learning/word-embeddings/2016/11/27/Understanding-Convolutions-In-Text/

In [1]:
import random

import pandas as pd
import numpy as np
from sklearn.externals import joblib
import nltk
import gensim
import spacy

from sklearn import metrics

import torch as tt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchtext.data import Field, LabelField, BucketIterator, TabularDataset, Iterator



# Seed every RNG the notebook draws from, so that Restart & Run All reproduces
# the same dataset splits, batch order and initial model weights.
SEED = 42
random.seed(SEED)      # Python RNG: torchtext uses it for Dataset.split and iterator shuffling
tt.manual_seed(SEED)   # PyTorch RNG: parameter initialisation of the model
np.random.seed(SEED)   # NumPy RNG (kept last: returns None, so the cell shows no output)

In [2]:
# Preview the first ten lines of the raw CSV to see its column layout.
!head Tweets.csv

tweet_id,airline_sentiment,airline,retweet_count,text
570306133677760513,neutral,Virgin America,0,@VirginAmerica What @dhepburn said.
570301130888122368,positive,Virgin America,0,@VirginAmerica plus you've added commercials to the experience... tacky.
570301083672813571,neutral,Virgin America,0,@VirginAmerica I didn't today... Must mean I need to take another trip!
570301031407624196,negative,Virgin America,0,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse"
570300817074462722,negative,Virgin America,0,@VirginAmerica and it's a really big bad thing about it
570300767074181121,negative,Virgin America,0,"@VirginAmerica seriously would pay $30 a flight for seats that didn't have this playing.
it's really the only bad thing about flying VA"
570300616901320704,positive,Virgin America,0,"@VirginAmerica yes, nearly every time I fly VX this “ear worm” won’t go away :)"
570300248553349120,neutral,Virgin Am

# 1 TorchText

In [3]:
import spacy


# Load the spaCy English pipeline once; tokenizer() reuses this module-level object.
# NOTE(review): the 'en' shortcut only resolves on spaCy 2.x installs that have the
# shortcut link; newer spaCy needs the full package name (e.g. 'en_core_web_sm').
# Confirm against the pinned spaCy version.
spacy_en = spacy.load('en')

def tokenizer(text):
    """Tokenize ``text`` with spaCy and keep only purely alphabetic tokens.

    Any token whose text fails ``str.isalpha()`` (digits, punctuation,
    symbols, mixed tokens) is discarded. Casing is left untouched.

    Returns:
        list[str]: the surviving token strings, in their original order.
    """
    alphabetic_tokens = []
    for token in spacy_en.tokenizer(text):
        surface = token.text
        if surface.isalpha():
            alphabetic_tokens.append(surface)
    return alphabetic_tokens

In [4]:
# Integer id for each sentiment string in the CSV's airline_sentiment column.
classes = dict(negative=0, neutral=1, positive=2)

# Tweet text: tokenized by tokenizer(), lower-cased, NLTK English stop words removed,
# and an '<eos>' token appended. Batches are laid out batch-first, and
# include_lengths=True makes the field return the sequence lengths alongside the
# padded id tensor.
# NOTE(review): nltk.corpus.stopwords needs the 'stopwords' corpus to be downloaded.
TEXT = Field(include_lengths=True, batch_first=True,
             tokenize=tokenizer,
             eos_token='<eos>',
             lower=True,
             stop_words=nltk.corpus.stopwords.words('english'))

# `preprocessing` already turns each sentiment string into its integer id from
# `classes`, so the field must not route those ids through a vocabulary again:
# a vocabulary orders its entries by frequency, which would silently permute the
# ids whenever the class counts are not ordered negative > neutral > positive.
# use_vocab=False feeds the `classes` ids straight into the label tensor.
LABEL = LabelField(dtype=tt.int64, use_vocab=False, preprocessing=lambda x: classes[x])

# One entry per CSV column, in file order; (None, None) marks a column that is not loaded.
csv_fields = [
    (None, None),      # tweet_id
    ('label', LABEL),  # airline_sentiment
    (None, None),      # airline
    (None, None),      # retweet_count
    ('text', TEXT),    # text
]

# The header row is skipped because it holds column names, not data.
dataset = TabularDataset(path='Tweets.csv', format='csv', fields=csv_fields, skip_header=True)

In [5]:
# Build the text vocabulary, keeping only tokens that occur at least 5 times.
# Variant with pretrained vectors, kept for reference:
#   TEXT.build_vocab(dataset, min_freq=10, vectors="glove.6B.100d")
# NOTE(review): the vocabulary is built from the full dataset, i.e. before the
# train/valid/test split, so it also sees held-out text.
TEXT.build_vocab(dataset, min_freq=5)
len(TEXT.vocab)

2748

In [6]:
# Peek at the first ten vocabulary entries; the special tokens come first.
TEXT.vocab.itos[:10]

['<unk>',
 '<pad>',
 '<eos>',
 'flight',
 'get',
 'thanks',
 'cancelled',
 'service',
 'help',
 'time']

In [7]:
# Build the label vocabulary from the integer-encoded labels of the full dataset.
LABEL.build_vocab(dataset)

In [8]:
# Two stratified 70/30 cuts: first hold out the test set, then carve the
# validation set out of the remainder (about 49% train, 21% valid, 30% test).
train_and_valid, test = dataset.split(split_ratio=0.7, stratified=True)
train, valid = train_and_valid.split(split_ratio=0.7, stratified=True)

In [9]:
# Per-class example counts in the training split (0=negative, 1=neutral, 2=positive).
train_labels = [example.label for example in train.examples]
np.unique(train_labels, return_counts=True)

(array([0, 1, 2]), array([4498, 1518, 1158]))

In [10]:
# Per-class example counts in the validation split (0=negative, 1=neutral, 2=positive).
valid_labels = [example.label for example in valid.examples]
np.unique(valid_labels, return_counts=True)

(array([0, 1, 2]), array([1927,  651,  496]))

In [11]:
# Per-class example counts in the test split (0=negative, 1=neutral, 2=positive).
test_labels = [example.label for example in test.examples]
np.unique(test_labels, return_counts=True)

(array([0, 1, 2]), array([2753,  930,  709]))

# 3 Convolutional NN for text classification

Formal definition of the convolution of functions $f$ and $g$:
$$ (f * g)(t) = \int_{-\infty}^{\infty} f(\tau)\, g(t - \tau)\, d\tau $$

![img](http://www.stokastik.in/wp-content/uploads/2016/09/convolution_ilustration.png)

In [39]:
class MyModel(nn.Module):
    """Multi-kernel 1-D CNN text classifier.

    Token ids are embedded, passed through one ``Conv1d`` per kernel size in
    parallel, each feature map is averaged over the sequence axis, and the
    pooled vectors are concatenated and projected to class logits.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, kernels,
                 num_classes=3, padding=5):
        """Build the embedding, the convolution branches and the output layer.

        Args:
            vocab_size: number of rows in the embedding table.
            embed_size: embedding dimension (input channels of every conv).
            hidden_size: output channels of every conv branch.
            kernels: iterable of kernel widths, one conv branch per entry.
            num_classes: number of output logits (default 3).
            padding: zero padding added to both ends of the sequence in every
                conv (default 5); it lets sequences shorter than a kernel
                still be convolved.

        Raises:
            ValueError: if ``kernels`` is empty.
        """
        super(MyModel, self).__init__()
        kernels = list(kernels)
        if not kernels:
            # Without any branch, forward() would fail later inside tt.cat.
            raise ValueError("kernels must contain at least one kernel size")

        self.embedding = nn.Embedding(vocab_size, embed_size)

        self.convs = nn.ModuleList(
            [nn.Conv1d(embed_size, hidden_size, k, padding=padding) for k in kernels]
        )

        self.fc = nn.Linear(hidden_size * len(kernels), num_classes)

    def forward(self, x):
        """Compute class logits for a batch of token-id sequences.

        Args:
            x: integer tensor of shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, num_classes) with unnormalised logits.
        """
        x = self.embedding(x)   # (batch, seq_len, embed_size)
        x = x.transpose(1, 2)   # Conv1d expects channels on dim 1

        concatenated = []
        for conv in self.convs:
            z = conv(x)
            # Average over the whole (padded) sequence axis, then drop that axis.
            z = F.avg_pool1d(z, kernel_size=z.size(2))
            z = z.squeeze(2)
            concatenated.append(z)

        x = tt.cat(concatenated, 1)   # (batch, hidden_size * n_kernels)
        x = self.fc(x)
        return x

In [40]:
# Release cached GPU memory that earlier runs of this cell may still hold.
tt.cuda.empty_cache()

batch_size = 32

# One convolution branch per kernel width 2..5 over 100-d embeddings.
model = MyModel(
    len(TEXT.vocab),
    embed_size=100,
    hidden_size=128,
    kernels=[2, 3, 4, 5],
)

# All three splits use the same batch size; examples are bucketed by token count.
# (sort_within_batch=True is left disabled, as before.)
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train, valid, test),
    batch_sizes=(batch_size,) * 3,
    shuffle=True,
    sort_key=lambda example: len(example.text),
)

# Adam with default settings; the scheduler lowers the learning rate when the
# monitored metric stops improving for 5 epochs, then waits 5 epochs before
# it resumes monitoring.
optimizer = optim.Adam(model.parameters())
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    patience=5,
    cooldown=5,
    verbose=True,
)
criterion = nn.CrossEntropyLoss()

In [41]:
# train ...

HBox(children=(IntProgress(value=0, description='epoch 0', max=225, style=ProgressStyle(description_width='ini…

validation loss 0.94024


HBox(children=(IntProgress(value=0, description='epoch 1', max=225, style=ProgressStyle(description_width='ini…

validation loss 0.92357


HBox(children=(IntProgress(value=0, description='epoch 2', max=225, style=ProgressStyle(description_width='ini…

validation loss 0.86877
