In [1]:
import torch 
import torch.autograd as autograd 
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np

## Lab1 
Create a bag of word model for a text classification problem. Note that this is not the same as the continous bag of word problem that we solved here but you can reuse the tokenization part.

https://github.com/yanneta/ML-notebooks/blob/master/cbow.ipynb

###  Download data

In [2]:
def get_data():
    ! wget http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz
    ! mkdir data
    ! tar -xvf rotten_imdb.tar.gz -C data

In [3]:
get_data()
! ls data

--2020-05-20 18:21:57--  http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz
Resolving www.cs.cornell.edu (www.cs.cornell.edu)... 132.236.207.20
Connecting to www.cs.cornell.edu (www.cs.cornell.edu)|132.236.207.20|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 519599 (507K) [application/x-gzip]
Saving to: ‘rotten_imdb.tar.gz.2’


2020-05-20 18:21:59 (452 KB/s) - ‘rotten_imdb.tar.gz.2’ saved [519599/519599]

mkdir: data: File exists
x quote.tok.gt9.5000
x plot.tok.gt9.5000
x subjdata.README.1.0
plot.tok.gt9.5000   quote.tok.gt9.5000  subjdata.README.1.0


### Split data

In [4]:
def read_file(path):
    """ Read file returns a list of lines.
    """
    with open(path, encoding = "ISO-8859-1") as f:
        content = f.readlines()
    return content

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
sub_content = read_file("data/quote.tok.gt9.5000")
obj_content = read_file("data/plot.tok.gt9.5000")
sub_content = np.array([line.strip().lower() for line in sub_content])
obj_content = np.array([line.strip().lower() for line in obj_content])
sub_y = np.zeros(len(sub_content))
obj_y = np.ones(len(obj_content))
X = np.append(sub_content, obj_content)
y = np.append(sub_y, obj_y)

In [7]:
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)

In [8]:
x_train.shape, y_train.shape

((8000,), (8000,))

In [9]:
x_train[0]

"both lead performances are oscar-size . quaid is utterly fearless as the tortured husband living a painful lie , and moore wonderfully underplays the long-suffering heroine with an unflappable '50s dignity somewhere between jane wyman and june cleaver ."

### Compute a vocabulary
* Split your sentences in tokens by spliting on spaces.
* Compute the frequency of every word.
* Pick top frequency words (4000 or so) to be part of your vocabulary.
* Create a map from each word to an index. Keep 0 for out of the vocabulary workds (<UNK>).

In [10]:
from collections import defaultdict

In [11]:
# split the sentences and count the frequency of every word
def get_vocab(content):
    """Computes Dict of counts of words.
    
    Computes the number of times a word is on a document.
    """
    vocab = defaultdict(float)
    for line in content:
        words = set(line.split())
        for word in words:
            vocab[word] += 1
    return vocab

In [12]:
word_count = get_vocab(x_train)

In [13]:
len(word_count.keys())

21242

In [14]:
# pick top frequency word by dropping words appears less than 5 times
for word in list(word_count):
    if word_count[word] < 5:
        del word_count[word]

In [15]:
len(word_count)

4008

In [16]:
## Finally we need an index for each word in the vocab
vocab2index = {"UNK":0} # init with padding and unknown
words = ["UNK"]
for word in word_count:
    vocab2index[word] = len(words)
    words.append(word)

### Bag of word representation

* Given a piece of text compute the following features $x$.
$x_i = 1$ if word with index $i$ appears in the text. Otherwise $x_i = 0$. Note that length $x$ is the size of the vocabulary.  

In [17]:
# encode the sentence to an numpy array of 0s and 1s
def encode_sentence(s, vocab2index):
    N=len(vocab2index)
    enc = np.zeros(N, dtype=np.int32)
    enc1 = np.array([vocab2index.get(w, vocab2index["UNK"]) for w in s.split()])
    for i in enc1:
        enc[i]=1
    return enc

In [18]:
len(encode_sentence(x_train[0], vocab2index))

4009

###  Dataset and dataloaders
Write a dataset for this problem

In [19]:
class BOW(Dataset):
    def __init__(self, x, y, vocab2index):
        self.x = x
        self.y = y
        self.vocab2index = vocab2index
        
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
        x = encode_sentence(self.x[idx], self.vocab2index)
        return x, self.y[idx]

In [20]:
train_ds = BOW(x_train, y_train, vocab2index)
val_ds = BOW(x_val, y_val, vocab2index)
train_dl = DataLoader(train_ds, batch_size=500, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=500, shuffle=False)

### Model

Define a simpler linear model or a two layer neural network.

In [21]:
class BOWModel(nn.Module):
    def __init__(self, vocab_size, hidden=50):
        super(BOWModel, self).__init__()
        self.linear1 = nn.Linear(vocab_size, hidden)
        self.linear2 = nn.Linear(hidden, 1)
        
    def forward(self, x):
        x = self.linear1(x)
        x = F.relu(x)
        x = self.linear2(x)
        return x

## Training and valid functions

In [22]:
def val_metrics(model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    loss_sum = 0
    for x, y in valid_dl:
        y_hat = model(x.float()).squeeze(1)
        loss = F.binary_cross_entropy_with_logits(y_hat, y.float())
        y_pred = y_hat > 0
        correct += (y_pred.float() == y.float()).float().sum()
        total += x.size(0)
        loss_sum += loss.item()*x.size(0)
    accuracy = correct.item()/total
    return loss_sum/total, accuracy

###  Training loop

In [23]:
def train_epocs(model, train_dl, valid_dl, epochs=10, lr=0.001):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for i in range(epochs):
        model.train()
        total = 0
        loss_sum = 0
        for x, y in train_dl:
            y_hat = model(x.float()).squeeze(1)
            loss = F.binary_cross_entropy_with_logits(y_hat, y.float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total += x.size(0)
            loss_sum += loss.item()*x.size(0)
        val_loss, val_acc = val_metrics(model, valid_dl)
        print("train loss %.3f val loss %.3f and accuracy %.3f" % (loss_sum/total, val_loss, val_acc))

In [24]:
model = BOWModel(vocab_size=len(vocab2index.keys()))
train_epocs(model,train_dl, val_dl, 15, 0.005)

train loss 0.516 val loss 0.316 and accuracy 0.898
train loss 0.213 val loss 0.227 and accuracy 0.905
train loss 0.129 val loss 0.229 and accuracy 0.904
train loss 0.089 val loss 0.242 and accuracy 0.900
train loss 0.064 val loss 0.264 and accuracy 0.899
train loss 0.049 val loss 0.283 and accuracy 0.899
train loss 0.037 val loss 0.307 and accuracy 0.900
train loss 0.029 val loss 0.335 and accuracy 0.895
train loss 0.023 val loss 0.356 and accuracy 0.898
train loss 0.018 val loss 0.375 and accuracy 0.895
train loss 0.014 val loss 0.398 and accuracy 0.895
train loss 0.011 val loss 0.418 and accuracy 0.895
train loss 0.009 val loss 0.428 and accuracy 0.894
train loss 0.008 val loss 0.445 and accuracy 0.895
train loss 0.006 val loss 0.464 and accuracy 0.893


### Word importance
To get the words that affect the most the positive label we find the words with higher weights. Similarly to get the words that affect the most the 0 label we find the words with lower weights.

In [25]:
parms = [p for p in model.parameters()]
parms

[Parameter containing:
 tensor([[ 0.1674,  0.0476,  0.2405,  ...,  0.1993, -0.2115, -0.2042],
         [ 0.1217,  0.0367,  0.2313,  ...,  0.1845, -0.2021, -0.2154],
         [-0.0281,  0.1071, -0.2015,  ..., -0.1542,  0.1943,  0.2144],
         ...,
         [-0.0297,  0.0991, -0.1881,  ..., -0.1780,  0.1667,  0.2032],
         [-0.0354,  0.0850, -0.2251,  ..., -0.1934,  0.2254,  0.2239],
         [-0.0237,  0.1089, -0.2301,  ..., -0.1776,  0.2014,  0.2473]],
        requires_grad=True), Parameter containing:
 tensor([0.0794, 0.0718, 0.0858, 0.0717, 0.0948, 0.0900, 0.0788, 0.0955, 0.0668,
         0.0969, 0.0814, 0.0713, 0.0801, 0.0871, 0.0753, 0.1152, 0.0575, 0.0674,
         0.0780, 0.0720, 0.0780, 0.0678, 0.0751, 0.0756, 0.0801, 0.0636, 0.0977,
         0.0825, 0.0846, 0.1001, 0.1029, 0.0747, 0.0980, 0.0740, 0.0787, 0.1016,
         0.0570, 0.0766, 0.0916, 0.1093, 0.1020, 0.0752, 0.0774, 0.0786, 0.0919,
         0.0612, 0.2007, 0.0948, 0.0989, 0.1111], requires_grad=True), Parameter

In [26]:
weights = parms[0].detach().numpy()
weights

array([[ 0.16735557,  0.04764399,  0.24045864, ...,  0.19933857,
        -0.21148476, -0.20415619],
       [ 0.12170083,  0.03665189,  0.23129152, ...,  0.18451473,
        -0.20206094, -0.21537167],
       [-0.02809834,  0.10714494, -0.20148   , ..., -0.1541967 ,
         0.19432327,  0.21444435],
       ...,
       [-0.02966496,  0.0991099 , -0.18811   , ..., -0.17804377,
         0.1667381 ,  0.20321971],
       [-0.0354009 ,  0.08496693, -0.22514299, ..., -0.19336474,
         0.22538666,  0.22393951],
       [-0.02368307,  0.10891414, -0.23014365, ..., -0.17760113,
         0.20144713,  0.24732086]], dtype=float32)

In [27]:
weights[0].shape

(4009,)

In [28]:
sorted_indeces = np.argsort(weights[0])

In [29]:
weights[0, sorted_indeces[0]], weights[0, sorted_indeces[-1]],

(-0.33721027, 0.41792864)

In [30]:
[words[i] for i in sorted_indeces[:10]]

['finds',
 'obsessed',
 'birth',
 'devdas',
 'discover',
 'kung-fu',
 'murders',
 'door',
 'army',
 'discovers']

In [31]:
[words[i] for i in sorted_indeces[3998:]]

['material',
 'worth',
 'performances',
 'dull',
 'laughs',
 'screenwriters',
 'flick',
 'enjoy',
 '--',
 'solid',
 'entertaining']