In [1]:
import torch 
import torch.autograd as autograd 
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np

# Bag of word model for text classification

###  Download data

In [2]:
def get_data():
    ! wget http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz
    ! mkdir data
    ! tar -xvf rotten_imdb.tar.gz -C data

In [3]:
get_data()
! ls data

--2020-06-20 13:01:57--  http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz
Resolving www.cs.cornell.edu (www.cs.cornell.edu)... 132.236.207.20
Connecting to www.cs.cornell.edu (www.cs.cornell.edu)|132.236.207.20|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 519599 (507K) [application/x-gzip]
Saving to: ‘rotten_imdb.tar.gz’


2020-06-20 13:01:58 (539 KB/s) - ‘rotten_imdb.tar.gz’ saved [519599/519599]

x quote.tok.gt9.5000
x plot.tok.gt9.5000
x subjdata.README.1.0
plot.tok.gt9.5000   quote.tok.gt9.5000  subjdata.README.1.0


### Split data

In [4]:
def read_file(path):
    """ Read file returns a list of lines.
    """
    with open(path, encoding = "ISO-8859-1") as f:
        content = f.readlines()
    return content

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
sub_content = read_file("data/quote.tok.gt9.5000")
obj_content = read_file("data/plot.tok.gt9.5000")
sub_content = np.array([line.strip().lower() for line in sub_content])
obj_content = np.array([line.strip().lower() for line in obj_content])
sub_y = np.zeros(len(sub_content))
obj_y = np.ones(len(obj_content))
X = np.append(sub_content, obj_content)
y = np.append(sub_y, obj_y)

In [7]:
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)

In [8]:
x_train.shape, y_train.shape

((8000,), (8000,))

### Compute a vocabulary

In [9]:
from collections import defaultdict

In [10]:
def get_vocab(content):
    """Computes Dict of counts of words.
    
    Computes the number of times a word is on a document.
    """
    vocab = defaultdict(float)
    for line in content:
        words = set(line.split())
        for word in words:
            vocab[word] += 1
    return vocab

In [11]:
word_count = get_vocab(x_train)

In [12]:
for word in list(word_count):
    if word_count[word] < 5:
        del word_count[word]

In [13]:
len(word_count.keys())

4008

In [14]:
vocab2index = {"UNK":0} # init with unknown
words = ["UNK"]
for word in word_count:
    vocab2index[word] = len(words)
    words.append(word)

In [15]:
#vocab2index

### Bag of word representation

In [16]:
def bow(x, vocab2index):
    enc = np.zeros(len(vocab2index.keys()))
    words = set(x.split())
    for word in words:
        enc[vocab2index.get(word, 0)] = 1 # 0 if the UNK index
    return enc

In [17]:
x_train[0]

"both lead performances are oscar-size . quaid is utterly fearless as the tortured husband living a painful lie , and moore wonderfully underplays the long-suffering heroine with an unflappable '50s dignity somewhere between jane wyman and june cleaver ."

In [18]:
x = bow(x_train[0], vocab2index)
x, x.shape

(array([1., 1., 1., ..., 0., 0., 0.]), (4009,))

###  Dataset

In [19]:
class BOW(Dataset):
    def __init__(self, x, y, vocab2index):
        self.x = x
        self.y = y
        self.vocab2index = vocab2index
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        x = self.x[idx]
        x = bow(x, self.vocab2index)
        return x, self.y[idx, None]

In [20]:
train_ds = BOW(x_train, y_train, vocab2index)
val_ds = BOW(x_val, y_val, vocab2index)

In [21]:
train_dl = DataLoader(train_ds, batch_size=1000, shuffle=True)
valid_dl = DataLoader(val_ds, batch_size=1000)

In [22]:
x, y = next(iter(train_dl))
x.shape, y.shape

(torch.Size([1000, 4009]), torch.Size([1000, 1]))

### Validation accuracy

Two ways of defining a model

In [23]:
vocab_size=len(vocab2index.keys())

In [24]:
x, y = next(iter(valid_dl))
x = x.float()
y = y.float()

In [25]:
class BOWModel(nn.Module):
    def __init__(self, vocab_size):
        super(BOWModel, self).__init__()
        self.linear = nn.Linear(vocab_size, 1)
        
    def forward(self, x):
        x = self.linear(x)
        return x

In [26]:
model = BOWModel(vocab_size=len(vocab2index.keys()))

In [27]:
def val_metrics(model):
    model.eval()
    correct = 0
    total = 0
    loss_sum = 0
    for x, y in valid_dl:
        y_hat = model(x.float())
        loss = F.binary_cross_entropy_with_logits(y_hat, y.float())
        y_pred = y_hat > 0
        correct += (y_pred.float() == y.float()).float().sum()
        total += x.size(0)
        loss_sum += loss.item()*x.size(0)
    accuracy = correct.item()/total
    return loss_sum/total, accuracy

In [28]:
val_metrics(model)

(0.6922248601913452, 0.494)

###  Training loop

In [29]:
def train_epocs(model, epochs=10, lr=0.001):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for i in range(epochs):
        model.train()
        total = 0
        loss_sum = 0
        for x, y in train_dl:
            y_hat = model(x.float())
            loss = F.binary_cross_entropy_with_logits(y_hat, y.float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total += x.size(0)
            loss_sum += loss.item()*x.size(0)
        val_loss, val_acc = val_metrics(model)
        print("train loss %.3f val loss %.3f and accuracy %.3f" % (loss_sum/total, val_loss, val_acc))

In [30]:
model = BOWModel(vocab_size=len(vocab2index.keys()))
train_epocs(model, 15, 0.005)

train loss 0.660 val loss 0.619 and accuracy 0.875
train loss 0.579 val loss 0.555 and accuracy 0.885
train loss 0.515 val loss 0.504 and accuracy 0.896
train loss 0.464 val loss 0.464 and accuracy 0.899
train loss 0.424 val loss 0.431 and accuracy 0.896
train loss 0.392 val loss 0.406 and accuracy 0.900
train loss 0.365 val loss 0.385 and accuracy 0.902
train loss 0.344 val loss 0.368 and accuracy 0.904
train loss 0.325 val loss 0.353 and accuracy 0.903
train loss 0.309 val loss 0.341 and accuracy 0.906
train loss 0.296 val loss 0.330 and accuracy 0.907
train loss 0.283 val loss 0.321 and accuracy 0.909
train loss 0.272 val loss 0.313 and accuracy 0.909
train loss 0.263 val loss 0.306 and accuracy 0.909
train loss 0.254 val loss 0.300 and accuracy 0.910


### Word importance
To get the words that affect the most the positive label we find the words with higher weights. Similarly to get the words that affect the most the 0 label we find the words with lower weights.

In [31]:
parms = [p for p in model.parameters()]
parms

[Parameter containing:
 tensor([[-0.1923,  0.1539, -0.3411,  ..., -0.2199,  0.2210,  0.2873]],
        requires_grad=True), Parameter containing:
 tensor([-0.0330], requires_grad=True)]

In [32]:
weights = parms[0].detach().numpy()
weights

array([[-0.19232845,  0.15388994, -0.34107742, ..., -0.21992844,
         0.22099222,  0.28726828]], dtype=float32)

In [33]:
weights[0].shape

(4009,)

In [34]:
sorted_indeces = np.argsort(weights[0])

In [35]:
weights[0, sorted_indeces[0]], weights[0, sorted_indeces[-1]],

(-0.4617616, 0.4669453)

In [36]:
[words[i] for i in sorted_indeces[:10]]

['performance',
 'material',
 'performances',
 'fascinating',
 'actors',
 'beautifully',
 'screen',
 'melodrama',
 'movie',
 'audience']

In [37]:
[words[i] for i in sorted_indeces[3998:]]

['learns',
 'realize',
 'him',
 'school',
 'known',
 'secret',
 'relationship',
 '-',
 'they',
 'however',
 'discover']