# Subjectivity classification with CNNs

In [1]:
from pathlib import Path
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split
import re
import _pickle as pickle 

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

### Data Import

In [2]:
# !wget http://www.cs.cornell.edu/people/pabo/movie-review-data/rotten_imdb.tar.gz -P ~/data/rotten_imdb/

In [3]:
# !ls ~/data/wikitext/

In [4]:
# !gunzip /home/paperspace/data/rotten_imdb/rotten_imdb.tar.gz

In [5]:
# !tar -xf /home/paperspace/data/rotten_imdb/rotten_imdb.tar -C /home/paperspace/data/rotten_imdb/

In [6]:
# !wget http://nlp.stanford.edu/data/glove.6B.zip -P ~/data/glove/

In [7]:
# !unzip ~/data/glove/glove.6B.zip -d ~/data/rotten_imdb/

In [8]:
from pathlib import Path
PATH = Path("/home/paperspace/data/rotten_imdb/")
list(PATH.iterdir())

[PosixPath('/home/paperspace/data/rotten_imdb/glove.6B.100d.txt'),
 PosixPath('/home/paperspace/data/rotten_imdb/quote.tok.gt9.5000'),
 PosixPath('/home/paperspace/data/rotten_imdb/glove.6B.50d.txt'),
 PosixPath('/home/paperspace/data/rotten_imdb/plot.tok.gt9.5000'),
 PosixPath('/home/paperspace/data/rotten_imdb/glove.6B.200d.txt'),
 PosixPath('/home/paperspace/data/rotten_imdb/rotten_imdb.tar.gz'),
 PosixPath('/home/paperspace/data/rotten_imdb/subjdata.README.1.0'),
 PosixPath('/home/paperspace/data/rotten_imdb/glove.6B.300d.txt'),
 PosixPath('/home/paperspace/data/rotten_imdb/rotten_imdb.tar')]

## sample Subjective

In [9]:
!head -5 /home/paperspace/data/rotten_imdb/quote.tok.gt9.5000

smart and alert , thirteen conversations about one thing is a small gem . 
color , musical bounce and warm seas lapping on island shores . and just enough science to send you home thinking . 
it is not a mass-market entertainment but an uncompromising attempt by one artist to think about another . 
a light-hearted french film about the spiritual quest of a fashion model seeking peace of mind while in a love affair with a veterinarian who is a non-practicing jew . 
my wife is an actress has its moments in looking at the comic effects of jealousy . in the end , though , it is only mildly amusing when it could have been so much more . 


## sample Objective

In [10]:
!head -5 /home/paperspace/data/rotten_imdb/plot.tok.gt9.5000

the movie begins in the past where a young boy named sam attempts to save celebi from a hunter . 
emerging from the human psyche and showing characteristics of abstract expressionism , minimalism and russian constructivism , graffiti removal has secured its place in the history of modern art while being created by artists who are unconscious of their artistic achievements . 
spurning her mother's insistence that she get on with her life , mary is thrown out of the house , rejected by joe , and expelled from school as she grows larger with child . 
amitabh can't believe the board of directors and his mind is filled with revenge and what better revenge than robbing the bank himself , ironic as it may sound . 
she , among others excentricities , talks to a small rock , gertrude , like if she was alive . 


## Functions

In [11]:
# ======================================================
# Data cleaning
# ======================================================
 
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Every dataset is lower cased except for TREC
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)     
    string = re.sub(r"\'s", " \'s", string) 
    string = re.sub(r"\'ve", " \'ve", string) 
    string = re.sub(r"n\'t", " n\'t", string) 
    string = re.sub(r"\'re", " \'re", string) 
    string = re.sub(r"\'d", " \'d", string) 
    string = re.sub(r"\'ll", " \'ll", string) 
    string = re.sub(r",", " , ", string) 
    string = re.sub(r"!", " ! ", string) 
    string = re.sub(r"\(", " \( ", string) 
    string = re.sub(r"\)", " \) ", string) 
    string = re.sub(r"\?", " \? ", string) 
    string = re.sub(r"\s{2,}", " ", string)    
    return string.strip().lower()


def read_file(path):
    """ Read file returns a shuttled list.
    """
    with open(path, encoding = "ISO-8859-1") as f:
        content = np.array(f.readlines())
    return content


def get_vocab(list_of_content):
    """Computes Dict of counts of words.
    
    Computes the number of times a word is on a document.
    """
    vocab = defaultdict(float)
    for content in list_of_content:
        for line in content:
            line = clean_str(line.strip())
            words = set(line.split())
            for word in words:
                vocab[word] += 1
    return vocab


def add_unknown_words(word_vecs, vocab, min_df=1, D=300):
    """
    For words that occur in at least min_df documents, create a separate word vector.    
    0.25 is chosen so the unknown vectors have (approximately) same variance 
    as pre-trained ones
    """
    for word in vocab:
        if word not in word_vecs and vocab[word] >= min_df:
            word_vecs[word] = np.random.uniform(-0.25,0.25,D)
    # here for rare words we will use UNK
    word_vecs["UNK"] = np.random.uniform(-0.25,0.25,D)

    
def create_embedding_matrix(word_vecs, D=300):
    """
    Creates embedding matrix from word vectors. 
    Embedding Matrix - word vectors in numpy form stacked
    """
    V = len(word_vecs.keys())
    vocab2index = {}
    vocab = []
    W = np.zeros((V+1, D), dtype="float32")
    W[0] = np.zeros(D, dtype='float32')
    i = 1
    for word in word_vecs:
        W[i] = word_vecs[word]
        vocab2index[word] = i
        vocab.append(word)
        i += 1
    return W, np.array(vocab), vocab2index


def encode_sentence(s, vocab2index, N=40):
    """
    takes in a sentence and replaces words with indices otherwise UNK
    
    encode_sentence(X_tr[0])
    array([    44,   1534,    887,     72,    808,     47,    456,     72,
            8,     51,   2819, 400001,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0],
      dtype=int32)
    
    """
    enc = np.zeros(N, dtype=np.int32)
    enc1 = np.array([vocab2index.get(w, vocab2index["UNK"]) for w in s.split()])
    l = min(N, len(enc1))
    enc[:l] = enc1[:l]
    return enc

# ======================================================
# Data Prep XY
# ======================================================

def make_XY():
    sub_content = read_file(PATH/"quote.tok.gt9.5000")
    obj_content = read_file(PATH/"plot.tok.gt9.5000")
    sub_content = np.array([clean_str(line.strip()) for line in sub_content])
    obj_content = np.array([clean_str(line.strip()) for line in obj_content])
    sub_y = np.zeros(len(sub_content))
    obj_y = np.ones(len(obj_content))
    X = np.append(sub_content, obj_content)
    y = np.append(sub_y, obj_y)
    return X,y


def make_train_val(X,y):
    X_tr, X_vl, y_tr, y_vl = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_tr, X_vl, y_tr, y_vl

# ======================================================
# Load pretrained wordvecs
# ======================================================

def loadGloveModel(gloveFile="/home/paperspace/data/rotten_imdb/glove.6B.300d.txt"):
    """ Loads word vectors into a dictionary."""
    f = open(gloveFile,'r')
    word_vecs = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        word_vecs[word] = np.array([float(val) for val in splitLine[1:]])
    return word_vecs


# ======================================================
# Sentence Model
# ======================================================

class SentenceCNN(nn.Module):
    
    def __init__(self, V, D, glove_weights):
        """
        This model is based on the glove weights (naive)
        """
        super(SentenceCNN, self).__init__()
        
        # 
        self.glove_weights = glove_weights
        self.embedding = nn.Embedding(V, D, padding_idx=0)
        self.embedding.weight.data.copy_(torch.from_numpy(self.glove_weights))
        self.embedding.weight.requires_grad = False ## freeze embeddings

        self.conv_3 = nn.Conv1d(in_channels=D, out_channels=100, kernel_size=3)
        self.conv_4 = nn.Conv1d(in_channels=D, out_channels=100, kernel_size=4)
        self.conv_5 = nn.Conv1d(in_channels=D, out_channels=100, kernel_size=5)
        
        self.dropout = nn.Dropout(p=0.5)
        self.fc = nn.Linear(300, 1)
        
    def forward(self, x):
        x = self.embedding(x)
        x = x.transpose(1,2)
        x3 = F.relu(self.conv_3(x))
        x4 = F.relu(self.conv_4(x))
        x5 = F.relu(self.conv_5(x))
        x3 = nn.MaxPool1d(kernel_size = 38)(x3)
        x4 = nn.MaxPool1d(kernel_size = 37)(x4)
        x5 = nn.MaxPool1d(kernel_size = 36)(x5)
        out = torch.cat([x3, x4, x5], 2)
        out = out.view(out.size(0), -1)
        out = self.dropout(out)
        return self.fc(out)
    
     
# ======================================================
# Training functions
# ======================================================

def train_epocs(model, x_train, y_train, x_test, y_test, epochs=10, lr=0.01):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    model.train()
    for i in range(epochs):
        x = Variable(torch.LongTensor(x_train)).cuda()
        y = Variable(torch.Tensor(y_train)).cuda().unsqueeze(1)
        y_hat = model(x)
        loss = F.binary_cross_entropy_with_logits(y_hat, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(loss.data[0])
    test_metrics(model, x_test, y_test)


def test_metrics(m, x_test, y_test):
    model.eval()
    x = Variable(torch.LongTensor(x_test)).cuda()
    y = Variable(torch.Tensor(y_test)).cuda().unsqueeze(1)
    y_hat = m(x)
    loss = F.binary_cross_entropy_with_logits(y_hat, y)
    y_pred = y_hat > 0
    correct = (y_pred.float() == y).float().sum()
    accuracy = correct/y_pred.shape[0]
    print("test loss %.3f and accuracy %.3f" % (loss.data[0], accuracy.data[0]))

In [12]:
X, y = make_XY()
X_tr, X_vl, y_tr, y_vl = make_train_val(X,y)
vocab = get_vocab([X_tr])

word_vecs = loadGloveModel()
add_unknown_words(word_vecs, vocab, min_df=10, D=300)
pretrained_weight, vocab, vocab2index = create_embedding_matrix(word_vecs)

In [13]:
# initializing the embedding matrix with the word vectors
D = 300
V = len(pretrained_weight)
emb = nn.Embedding(V, D)
emb.weight.data.copy_(torch.from_numpy(pretrained_weight))

# finding the longest sentence
x_len = np.array([len(x.split()) for x in X_tr])
print(np.percentile(x_len, 95)) # let set the max sequence len to N=40

# encode all the training and validation with the correct tokens Ids
x_tr_enc = np.vstack([encode_sentence(x, vocab2index) for x in X_tr])
x_tr_enc.shape
x_vl = np.vstack([encode_sentence(x, vocab2index) for x in X_vl])
x_vl.shape

42.0


(2000, 40)

In [14]:
# number of word vec terms
V = len(pretrained_weight)

# dimension of the embedding vector
D = 300

# 
N = 40
model = SentenceCNN(V, D, glove_weights=pretrained_weight).cuda()

In [15]:
for lr in [0.01, 0.01, 0.001,0.001, 0.0001, 0.0001]:
    train_epocs(model, x_tr_enc, y_tr, x_vl, y_vl, epochs=10, lr=lr)

0.7096273899078369
2.2384562492370605
0.3440512418746948
0.870844841003418
0.6296365857124329
0.4508425295352936
0.42016124725341797
0.4354369342327118
0.4553316533565521
0.4700770974159241
test loss 0.473 and accuracy 0.795
0.4713665246963501
0.4935859441757202
0.3727800250053406
0.2967967092990875
0.3181613087654114
0.2531818151473999
0.2877346873283386
0.23348352313041687
0.2424101084470749
0.21849432587623596
test loss 0.237 and accuracy 0.908
0.18760180473327637
0.19064520299434662
0.1845017820596695
0.17814302444458008
0.1783904731273651
0.17854417860507965
0.16784361004829407
0.16526371240615845
0.16534225642681122
0.162178173661232
test loss 0.230 and accuracy 0.910
0.16207671165466309
0.1612558811903
0.15554997324943542
0.15317478775978088
0.1477155089378357
0.14563331007957458
0.1436101496219635
0.1422356516122818
0.13875055313110352
0.13828939199447632
test loss 0.227 and accuracy 0.912
0.13667140901088715
0.13471026718616486
0.13348564505577087
0.13348692655563354
0.1324147

In [16]:
# load the encoding dictionary from the language model
with open('dict17.pkl','rb') as f:
    word2idx_lm, idx2word_lm = pickle.load(f)
    
with open('lm_emb_np_trained.pkl','rb') as f:
    learned_emb_np = pickle.load(f)
    
print(learned_emb_np.shape)

x_tr_enc_lm = np.vstack([encode_sentence(x, word2idx_lm) for x in X_tr])
x_tr_enc_lm.shape

x_vl_lm = np.vstack([encode_sentence(x, word2idx_lm) for x in X_vl])
x_vl_lm.shape

(33279, 300)


(2000, 40)

In [17]:
x_tr_enc_lm[0]

array([ 301, 5011, 7292,  362, 6357,  311, 2194,  362,   28,  579, 9429,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0], dtype=int32)

In [18]:
X_tr[0]

'will god let her fall or give her a new path \\?'

In [19]:
x_tr_enc[0]

array([    44,   1534,    887,     72,    808,     47,    456,     72,
            8,     51,   2819, 400001,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0],
      dtype=int32)

In [20]:
pretrained_weight.shape

(400007, 300)

In [21]:
learned_emb_np.shape

(33279, 300)

In [22]:
# number of word vec terms
V = learned_emb_np.shape[0]

# dimension of the embedding vector
D = 300

# 
N = 40
model_lm = SentenceCNN(V, D, glove_weights=learned_emb_np).cuda()

In [25]:
for lr in [0.01, 0.01, 0.001,0.001, 0.0001, 0.0001]:
    train_epocs(model_lm, x_tr_enc_lm, y_tr, x_vl_lm, y_vl, epochs=10, lr=lr)

0.08356434106826782
0.13237479329109192
0.8512473702430725
0.09559396654367447
0.38252323865890503
0.480146199464798
0.2070934921503067
0.10103637725114822
0.15601587295532227
0.2621101140975952
test loss 0.722 and accuracy 0.779
0.25699707865715027
0.5025117993354797
0.18561911582946777
0.11384200304746628
0.2420462816953659
0.2445473074913025
0.1383257508277893
0.10515493154525757
0.13474813103675842
0.1756541132926941
test loss 0.522 and accuracy 0.821
0.1668291687965393
0.1251341998577118
0.1057162657380104
0.09714513272047043
0.09732341766357422
0.09979969263076782
0.10536742210388184
0.10613448917865753
0.1069868728518486
0.09919300675392151
test loss 0.467 and accuracy 0.841
0.09513245522975922
0.08595099300146103
0.08557414263486862
0.08511672914028168
0.08385423570871353
0.07995212823152542
0.07866086065769196
0.0755494087934494
0.07421944290399551
0.07373911142349243
test loss 0.448 and accuracy 0.843
0.07112368196249008
0.07097770273685455
0.07182207703590393
0.0717436969280