In [9]:
import torch 
import torch.autograd as autograd 
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np

## Lab1 
Create a bag of word model for a text classification problem. Note that this is not the same as the continous bag of word problem that we solved here but you can reuse the tokenization part.

https://github.com/yanneta/ML-notebooks/blob/master/cbow.ipynb

###  Download data

In [10]:
def get_data():
    ! wget http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz
    ! mkdir data
    ! tar -xvf rotten_imdb.tar.gz -C data

In [3]:
get_data()
! ls data

--2020-05-14 11:42:27--  http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz
Resolving www.cs.cornell.edu (www.cs.cornell.edu)... 132.236.207.20
Connecting to www.cs.cornell.edu (www.cs.cornell.edu)|132.236.207.20|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 519599 (507K) [application/x-gzip]
Saving to: ‘rotten_imdb.tar.gz.1’


2020-05-14 11:42:29 (557 KB/s) - ‘rotten_imdb.tar.gz.1’ saved [519599/519599]

mkdir: data: File exists
x quote.tok.gt9.5000
x plot.tok.gt9.5000
x subjdata.README.1.0
plot.tok.gt9.5000   quote.tok.gt9.5000  subjdata.README.1.0


### Split data

In [11]:
def read_file(path):
    """ Read file returns a list of lines.
    """
    with open(path, encoding = "ISO-8859-1") as f:
        content = f.readlines()
    return content

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
sub_content = read_file("data/quote.tok.gt9.5000")
obj_content = read_file("data/plot.tok.gt9.5000")
sub_content = np.array([line.strip().lower() for line in sub_content])
obj_content = np.array([line.strip().lower() for line in obj_content])
sub_y = np.zeros(len(sub_content))
obj_y = np.ones(len(obj_content))
X = np.append(sub_content, obj_content)
y = np.append(sub_y, obj_y)

In [14]:
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)

In [15]:
x_train.shape, y_train.shape

((8000,), (8000,))

In [17]:
x_train[0]

"both lead performances are oscar-size . quaid is utterly fearless as the tortured husband living a painful lie , and moore wonderfully underplays the long-suffering heroine with an unflappable '50s dignity somewhere between jane wyman and june cleaver ."

### Compute a vocabulary
* Split your sentences in tokens by spliting on spaces.
* Compute the frequency of every word.
* Pick top frequency words (4000 or so) to be part of your vocabulary.
* Create a map from each word to an index. Keep 0 for out of the vocabulary workds (<UNK>).

### Bag of word representation

* Given a piece of text compute the following features $x$.
$x_i = 1$ if word with index $i$ appears in the text. Otherwise $x_i = 0$. Note that length $x$ is the size of the vocabulary.  

###  Dataset and dataloaders
Write a dataset for this problem

In [20]:
class BOW(Dataset):
    def __init__(self, ):
        self.x = None
        self.y = None
        
    def __len__(self):
        return None
    
    def __getitem__(self, idx):
        
        return None

### Model

Define a simpler linear model or a two layer neural network.

## Training and valid functions

In [33]:
def val_metrics(model):
    model.eval()
    correct = 0
    total = 0
    loss_sum = 0
    for x, y in valid_dl:
        y_hat = model(x.float())
        loss = F.binary_cross_entropy_with_logits(y_hat, y.float())
        y_pred = y_hat > 0
        correct += (y_pred.float() == y.float()).float().sum()
        total += x.size(0)
        loss_sum += loss.item()*x.size(0)
    accuracy = correct.item()/total
    return loss_sum/total, accuracy

###  Training loop

In [35]:
def train_epocs(model, epochs=10, lr=0.001):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for i in range(epochs):
        model.train()
        total = 0
        loss_sum = 0
        for x, y in train_dl:
            y_hat = model(x.float())
            loss = F.binary_cross_entropy_with_logits(y_hat, y.float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total += x.size(0)
            loss_sum += loss.item()*x.size(0)
        val_loss, val_acc = val_metrics(model)
        print("train loss %.3f val loss %.3f and accuracy %.3f" % (loss_sum/total, val_loss, val_acc))

In [36]:
model = BOWModel(vocab_size=len(vocab2index.keys()))
train_epocs(model, 15, 0.005)

train loss 0.656 val loss 0.615 and accuracy 0.875
train loss 0.576 val loss 0.551 and accuracy 0.889
train loss 0.512 val loss 0.501 and accuracy 0.893
train loss 0.461 val loss 0.461 and accuracy 0.897
train loss 0.422 val loss 0.429 and accuracy 0.898
train loss 0.390 val loss 0.404 and accuracy 0.899
train loss 0.364 val loss 0.383 and accuracy 0.898
train loss 0.342 val loss 0.366 and accuracy 0.901
train loss 0.324 val loss 0.352 and accuracy 0.902
train loss 0.308 val loss 0.340 and accuracy 0.903
train loss 0.294 val loss 0.329 and accuracy 0.904
train loss 0.282 val loss 0.320 and accuracy 0.906
train loss 0.271 val loss 0.312 and accuracy 0.907
train loss 0.261 val loss 0.305 and accuracy 0.908
train loss 0.253 val loss 0.299 and accuracy 0.910


### Word importance
To get the words that affect the most the positive label we find the words with higher weights. Similarly to get the words that affect the most the 0 label we find the words with lower weights.

In [37]:
parms = [p for p in model.parameters()]
parms

[Parameter containing:
 tensor([[-0.1818, -0.1209, -0.0691,  ..., -0.1578,  0.2273,  0.2485]],
        requires_grad=True), Parameter containing:
 tensor([-0.0347], requires_grad=True)]

In [38]:
weights = parms[0].detach().numpy()
weights

array([[-0.18177307, -0.12088173, -0.06908546, ..., -0.15775996,
         0.22728996,  0.24849562]], dtype=float32)

In [39]:
weights[0].shape

(4009,)

In [40]:
sorted_indeces = np.argsort(weights[0])

In [41]:
weights[0, sorted_indeces[0]], weights[0, sorted_indeces[-1]],

(-0.4535758, 0.44974813)

In [42]:
[words[i] for i in sorted_indeces[:10]]

['material',
 'performance',
 'actors',
 'movie',
 'its',
 'interesting',
 'script',
 'beautifully',
 'movies',
 "film's"]

In [43]:
[words[i] for i in sorted_indeces[3998:]]

['obsessed',
 'kill',
 'secret',
 'school',
 'patricia',
 'sam',
 'however',
 'she',
 'they',
 '-',
 'discover']