In [8]:
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import jdc

In [9]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
embeddingFile = "model/glove.6B.50d.txt"
trainFile = "data/train.tsv"
models = []  # A collection of models

In [10]:
# Get all of the words and create a bijection between word and index
def getAll(text):
    vocab = set()
    MAX_PADDING = 0 
    for line in text:
        wds = line.lower().split()
        MAX_PADDING = max(MAX_PADDING, len(wds))
        vocab.update(wds)
    
    vocab.add("<pad>")  # <pad> for padding
    ivocab = {idx: w for idx, w in enumerate(vocab)}
    vocab = {w: idx for idx, w in enumerate(vocab)}
    return vocab, ivocab, MAX_PADDING

In [11]:
# Sentence to vector
def encode(text, vocab, MAX_PADDING):
    vectors = []
    for line in text:
        line = line.lower().split()
        line += ['<pad>'] * (MAX_PADDING - len(line))
        vector = [vocab[word] for word in line]
        vectors.append(vector)
    return np.array(vectors)

In [12]:
# Load the GloVe pretrained embeddings
class GloVe(object):
    def __init__(self):
        with open(embeddingFile, 'r', encoding='utf-8') as f:
            words = [x.rstrip().split(' ')[0] for x in f.readlines()]
        with open(embeddingFile, 'r', encoding='utf-8') as f:
            vectors = {}
            for line in f:
                vals = line.rstrip().split(' ')
                vectors[vals[0]] = [float(x) for x in vals[1:]]
        
        words.append("<pad>")
        vectors["<pad>"] = None
        vocab_size = len(words)
        self.vocab = {w: idx for idx, w in enumerate(words)}
        self.ivocab = {idx: w for idx, w in enumerate(words)}

        vector_dim = len(vectors[self.ivocab[0]])
        W = np.zeros((vocab_size, vector_dim))
        for word, v in vectors.items():
            if word == "<pad>":
                v = [0 for _ in range(vector_dim)]
            W[self.vocab[word], :] = v
        
        # normalize each word vector to unit variance
        self.W_norm = np.zeros(W.shape)
        d = (np.sum(W ** 2, 1) ** (0.5))
        d[-1] = 1 # zero-divisor
        self.W_norm = (W.T / d).T
    
    def __getitem__(self, word):
        return self.W_norm[self.vocab[word], :]

In [14]:
%%add_to GloVe
# Create the embedding
def createEmbedding(self, target_vocab, freeze=True):
    num_vocab = len(target_vocab)
    num_feature = self.W_norm.shape[1]
    W = torch.zeros((num_vocab, num_feature))
    
    for i, word in enumerate(target_vocab):
        try: 
            W[i] = torch.from_numpy(self[word])
        except KeyError:
            W[i] = torch.from_numpy(np.random.normal(scale=0.6, size=(num_feature)))
        except IndexError:
            print(word)
    
    emb = nn.Embedding.from_pretrained(W, freeze=freeze)
    return emb, num_vocab, num_feature

In [15]:
# Initialize with a random embedding
def createRandomEmbedding(target_vocab, num_feature=100):
    num_vocab = len(target_vocab)
    W = torch.zeros((num_vocab, num_feature))
    
    for i, word in enumerate(target_vocab):
        W[i] = torch.from_numpy(np.random.normal(scale=0.6, size=(num_feature)))
    
    emb = nn.Embedding.from_pretrained(W)
    return emb, num_vocab, num_feature

In [16]:
allData = pd.read_csv(trainFile, delimiter="\t", index_col="PhraseId")
allText = allData.Phrase
allLabel = allData.Sentiment
vocab, ivocab, MAX_PADDING = getAll(allText)
allID = encode(allText, vocab, MAX_PADDING)

In [17]:
# Split Dataset
from sklearn.model_selection import train_test_split

# Train Test Split
trainInput, testInput, trainLabel, testLabel = train_test_split(
    allID, allLabel, test_size=0.2, random_state=42)

trainInput, valInput, trainLabel, valLabel = train_test_split(
    trainInput, trainLabel, test_size=0.25, random_state=42)

trainInd = np.arange(trainLabel.shape[0])
trainInput = torch.from_numpy(trainInput)
trainLabel = torch.from_numpy(trainLabel.to_numpy())

valInput = torch.from_numpy(valInput)
valLabel = torch.from_numpy(valLabel.to_numpy())

testInput = torch.from_numpy(testInput)
testLabel = torch.from_numpy(testLabel.to_numpy())

In [18]:
glove = GloVe()

### Model Construction

In [19]:
class CNN(nn.Module):
    def __init__(self, emb, emb_dim, pad_dim, num_cls, dropout=0.5, ker_size=[3, 4, 5, 6], num_ker=[100, 100, 100, 100]):
        super(CNN, self).__init__()
        self.emb = emb
        self.conv1 = nn.ModuleList([nn.Sequential(
                                    nn.Conv1d(emb_dim, n, k),
                                    nn.ReLU(),
                                    nn.MaxPool1d(kernel_size=pad_dim-k+1)
                                    ) for n, k in zip(num_ker, ker_size)])
        self.fc = nn.Linear(in_features=np.sum(num_ker),
                            out_features=num_cls)
        self.dropout = nn.Dropout(p=dropout)

        
    def forward(self, X):
        X = self.emb(X)
        X = X.permute(0, 2, 1)
        X = [conv(X) for conv in self.conv1]
        X = torch.cat(X, dim=1)
        X = X.view(-1, X.size(1))
        X = self.fc(self.dropout(X))
        return X
        

In [20]:
class LSTM(nn.Module):
    def __init__(self, emb, emb_dim, pad_dim, num_cls, dropout=0.5, h_size=100):
        super(LSTM, self).__init__()
        self.emb = emb
        self.h_size = h_size
        self.lstm = nn.LSTM(input_size=emb_dim,
                            hidden_size=h_size,
                            num_layers=1,
                            batch_first=True)
        self.fc = nn.Linear(in_features=h_size,
                            out_features=num_cls)
        self.dropout = nn.Dropout(p=dropout)

        
    def forward(self, X):
        h = torch.zeros(1, X.size(0), self.h_size)
        c = torch.zeros(1, X.size(0), self.h_size)
        X = self.emb(X)
        X, _ = self.lstm(X, )
        X = X[:, -1, :]
        X = self.fc(self.dropout(X))
        return X

### Model Train

#### Random Embedding + CNN

In [13]:
emb, num_size, num_feature = createRandomEmbedding(vocab)
dataloader = DataLoader(trainInd, batch_size=100, shuffle=True)

model = CNN(emb, num_feature, MAX_PADDING, 5).to(device)
models.append(model)
loss = nn.CrossEntropyLoss()
optimizer = optim.Adadelta(model.parameters(), lr=0.05)

for i in tqdm(range(10)):
    totalLoss = 0
    model.train()
    for batch in dataloader:
        SELECT = batch.tolist()
        X, y = trainInput[SELECT].to(device), trainLabel[SELECT].to(device)
        model.zero_grad()
        logits = model(X)
        L = loss(logits, y)
        totalLoss += L.item()
        L.backward()
        optimizer.step()
    
    totalLoss /= len(dataloader)
    if not (i+1)%1:
        print(f"epoch {i+1}, training loss:{totalLoss}")

  return torch.max_pool1d(input, kernel_size, stride, padding, dilation, ceil_mode)
 10%|████████▎                                                                          | 1/10 [00:36<05:25, 36.15s/it]

epoch 1, training loss:1.2405600439649762


 20%|████████████████▌                                                                  | 2/10 [01:13<04:53, 36.70s/it]

epoch 2, training loss:1.2058526189121137


 30%|████████████████████████▉                                                          | 3/10 [01:51<04:20, 37.19s/it]

epoch 3, training loss:1.1857372168670215


 40%|█████████████████████████████████▏                                                 | 4/10 [02:28<03:43, 37.21s/it]

epoch 4, training loss:1.1696165574271244


 50%|█████████████████████████████████████████▌                                         | 5/10 [03:05<03:06, 37.24s/it]

epoch 5, training loss:1.1535215012037385


 60%|█████████████████████████████████████████████████▊                                 | 6/10 [03:43<02:30, 37.65s/it]

epoch 6, training loss:1.1389119273316008


 70%|██████████████████████████████████████████████████████████                         | 7/10 [04:21<01:53, 37.68s/it]

epoch 7, training loss:1.1254214150165163


 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [04:59<01:15, 37.86s/it]

epoch 8, training loss:1.1119742578605196


 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [05:37<00:37, 37.82s/it]

epoch 9, training loss:1.0990055622006174


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [06:15<00:00, 37.55s/it]

epoch 10, training loss:1.0854281590995056





#### Untrainable GloVe Embedding + CNN

In [14]:
pretrained, num_size, num_feature = glove.createEmbedding(vocab)
dataloader = DataLoader(trainInd, batch_size=100, shuffle=True)

model = CNN(pretrained, num_feature, MAX_PADDING, 5).to(device)
models.append(model)
loss = nn.CrossEntropyLoss()
optimizer = optim.Adadelta(model.parameters(), lr=0.05)

for i in tqdm(range(10)):
    totalLoss = 0
    model.train()
    for batch in dataloader:
        SELECT = batch.tolist()
        X, y = trainInput[SELECT].to(device), trainLabel[SELECT].to(device)
        model.zero_grad()
        logits = model(X)
        L = loss(logits, y)
        totalLoss += L.item()
        L.backward()
        optimizer.step()
    
    totalLoss /= len(dataloader)
    if not (i+1)%1:
        print(f"epoch {i+1}, training loss:{totalLoss}")

 10%|████████▎                                                                          | 1/10 [00:26<04:02, 26.91s/it]

epoch 1, training loss:1.317599594020538


 20%|████████████████▌                                                                  | 2/10 [00:54<03:38, 27.27s/it]

epoch 2, training loss:1.2019674664754878


 30%|████████████████████████▉                                                          | 3/10 [01:22<03:14, 27.78s/it]

epoch 3, training loss:1.165718745332899


 40%|█████████████████████████████████▏                                                 | 4/10 [01:50<02:46, 27.71s/it]

epoch 4, training loss:1.1423982229151364


 50%|█████████████████████████████████████████▌                                         | 5/10 [02:18<02:18, 27.72s/it]

epoch 5, training loss:1.1202839004828047


 60%|█████████████████████████████████████████████████▊                                 | 6/10 [02:45<01:50, 27.68s/it]

epoch 6, training loss:1.1028792397571348


 70%|██████████████████████████████████████████████████████████                         | 7/10 [03:13<01:22, 27.62s/it]

epoch 7, training loss:1.0896853110579061


 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [03:41<00:55, 27.75s/it]

epoch 8, training loss:1.080199436800457


 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [04:09<00:27, 27.82s/it]

epoch 9, training loss:1.0725859490758962


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [04:37<00:00, 27.76s/it]

epoch 10, training loss:1.066162345121866





#### Trainable GloVe Embedding + CNN

In [15]:
pretrained, num_size, num_feature = glove.createEmbedding(vocab, freeze=False)
dataloader = DataLoader(trainInd, batch_size=100, shuffle=True)

model = CNN(pretrained, num_feature, MAX_PADDING, 5).to(device)
models.append(model)
loss = nn.CrossEntropyLoss()
optimizer = optim.Adadelta(model.parameters(), lr=0.05)

for i in tqdm(range(10)):
    totalLoss = 0
    model.train()
    for batch in dataloader:
        SELECT = batch.tolist()
        X, y = trainInput[SELECT].to(device), trainLabel[SELECT].to(device)
        model.zero_grad()
        logits = model(X)
        L = loss(logits, y)
        totalLoss += L.item()
        L.backward()
        optimizer.step()
    
    totalLoss /= len(dataloader)
    if not (i+1)%1:
        print(f"epoch {i+1}, training loss:{totalLoss}")

 10%|████████▎                                                                          | 1/10 [00:47<07:07, 47.48s/it]

epoch 1, training loss:1.2978745772719003


 20%|████████████████▌                                                                  | 2/10 [01:35<06:21, 47.73s/it]

epoch 2, training loss:1.196729633953172


 30%|████████████████████████▉                                                          | 3/10 [02:23<05:35, 47.94s/it]

epoch 3, training loss:1.1609844944520338


 40%|█████████████████████████████████▏                                                 | 4/10 [03:11<04:47, 47.95s/it]

epoch 4, training loss:1.1286219759837286


 50%|█████████████████████████████████████████▌                                         | 5/10 [03:59<03:59, 47.87s/it]

epoch 5, training loss:1.096937464357313


 60%|█████████████████████████████████████████████████▊                                 | 6/10 [04:47<03:11, 47.94s/it]

epoch 6, training loss:1.0673280503986484


 70%|██████████████████████████████████████████████████████████                         | 7/10 [05:35<02:23, 47.87s/it]

epoch 7, training loss:1.0440866806591078


 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [06:23<01:36, 48.02s/it]

epoch 8, training loss:1.024910254183294


 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [07:11<00:47, 47.92s/it]

epoch 9, training loss:1.011315764268918


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [07:58<00:00, 47.87s/it]

epoch 10, training loss:0.9968385993480173





#### RandomEmbedding + LSTM

In [16]:
emb, num_size, num_feature = createRandomEmbedding(vocab)
dataloader = DataLoader(trainInd, batch_size=100, shuffle=True)

model = LSTM(emb, num_feature, MAX_PADDING, 5).to(device)
models.append(model)
loss = nn.CrossEntropyLoss()
optimizer = optim.Adadelta(model.parameters(), lr=0.5)

for i in tqdm(range(10)):
    totalLoss = 0
    model.train()
    for batch in dataloader:
        SELECT = batch.tolist()
        X, y = trainInput[SELECT].to(device), trainLabel[SELECT].to(device)
        model.zero_grad()
        logits = model(X)
        L = loss(logits, y)
        totalLoss += L.item()
        L.backward()
        optimizer.step()
    
    totalLoss /= len(dataloader)
    if not (i+1)%1:
        print(f"epoch {i+1}, training loss:{totalLoss}")

 10%|████████▎                                                                          | 1/10 [01:02<09:25, 62.89s/it]

epoch 1, training loss:1.289078432187963


 20%|████████████████▌                                                                  | 2/10 [02:05<08:22, 62.77s/it]

epoch 2, training loss:1.284036505693051


 30%|████████████████████████▉                                                          | 3/10 [03:08<07:20, 62.97s/it]

epoch 3, training loss:1.2825565992068328


 40%|█████████████████████████████████▏                                                 | 4/10 [04:12<06:18, 63.11s/it]

epoch 4, training loss:1.2824663519223192


 50%|█████████████████████████████████████████▌                                         | 5/10 [05:16<05:17, 63.43s/it]

epoch 5, training loss:1.2818492161171668


 60%|█████████████████████████████████████████████████▊                                 | 6/10 [06:20<04:15, 63.81s/it]

epoch 6, training loss:1.2811234588938563


 70%|██████████████████████████████████████████████████████████                         | 7/10 [07:25<03:12, 64.01s/it]

epoch 7, training loss:1.2812424572453045


 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [08:30<02:09, 64.53s/it]

epoch 8, training loss:1.280995098605609


 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [09:36<01:04, 64.90s/it]

epoch 9, training loss:1.2806394264308723


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [10:42<00:00, 64.29s/it]

epoch 10, training loss:1.2806248742462858





#### Untrainable GloVe Embedding + LSTM

In [21]:
pretrained, num_size, num_feature = glove.createEmbedding(vocab)
dataloader = DataLoader(trainInd, batch_size=100, shuffle=True)

model = LSTM(pretrained, num_feature, MAX_PADDING, 5).to(device)
models.append(model)
loss = nn.CrossEntropyLoss()
optimizer = optim.Adadelta(model.parameters(), lr=0.5)

for i in tqdm(range(10)):
    totalLoss = 0
    model.train()
    for batch in dataloader:
        SELECT = batch.tolist()
        X, y = trainInput[SELECT].to(device), trainLabel[SELECT].to(device)
        model.zero_grad()
        logits = model(X)
        L = loss(logits, y)
        totalLoss += L.item()
        L.backward()
        optimizer.step()
    
    totalLoss /= len(dataloader)
    if not (i+1)%1:
        print(f"epoch {i+1}, training loss:{totalLoss}")

 10%|████████▎                                                                          | 1/10 [01:00<09:06, 60.70s/it]

epoch 1, training loss:1.2905629374555998


 20%|████████████████▌                                                                  | 2/10 [02:01<08:08, 61.03s/it]

epoch 2, training loss:1.2832786895676636


 30%|████████████████████████▉                                                          | 3/10 [03:03<07:09, 61.36s/it]

epoch 3, training loss:1.282492292856076


 40%|█████████████████████████████████▏                                                 | 4/10 [04:05<06:09, 61.56s/it]

epoch 4, training loss:1.281854307511573


 50%|█████████████████████████████████████████▌                                         | 5/10 [05:06<05:07, 61.46s/it]

epoch 5, training loss:1.2816361412803543


 60%|█████████████████████████████████████████████████▊                                 | 6/10 [06:07<04:05, 61.32s/it]

epoch 6, training loss:1.2813271632698329


 70%|██████████████████████████████████████████████████████████                         | 7/10 [07:09<03:03, 61.29s/it]

epoch 7, training loss:1.2809435143160337


 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [08:10<02:02, 61.17s/it]

epoch 8, training loss:1.2811414924477056


 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [09:11<01:01, 61.15s/it]

epoch 9, training loss:1.2808794019825303


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [10:12<00:00, 61.21s/it]

epoch 10, training loss:1.280553351217044





### Model Selection

In [22]:
valInput, valLabel = valInput.to(device), valLabel.to(device)
bestAcc = 0
bestInd = 0
for i, model in enumerate(models):
    with torch.no_grad():
        logits = model(valInput)
        preds = torch.argmax(logits, dim=1).flatten()
        accuracy = (preds == valLabel).cpu().numpy().mean() * 100
        if bestAcc < accuracy:
            bestAcc = accuracy
            bestInd = i
    print(f'Model: {i}; Accuracy on validation set: {accuracy}')

Model: 0; Accuracy on validation set: 55.17108804306036
Model: 1; Accuracy on validation set: 56.455850313981806
Model: 2; Accuracy on validation set: 59.24323977957195
Model: 3; Accuracy on validation set: 50.88747917467641
Model: 4; Accuracy on validation set: 50.88747917467641


In [23]:
bestModel = models[bestInd]
testInput, testLabel = testInput.to(device), testLabel.to(device)
with torch.no_grad():
    logits = bestModel(testInput)
    preds = torch.argmax(logits, dim=1).flatten()
    accuracy = (preds == testLabel).cpu().numpy().mean() * 100
print(f'Best Model {bestModel}. Accuracy on test set: {accuracy}')

Best Model CNN(
  (emb): Embedding(16532, 50)
  (conv1): ModuleList(
    (0): Sequential(
      (0): Conv1d(50, 100, kernel_size=(3,), stride=(1,))
      (1): ReLU()
      (2): MaxPool1d(kernel_size=50, stride=50, padding=0, dilation=1, ceil_mode=False)
    )
    (1): Sequential(
      (0): Conv1d(50, 100, kernel_size=(4,), stride=(1,))
      (1): ReLU()
      (2): MaxPool1d(kernel_size=49, stride=49, padding=0, dilation=1, ceil_mode=False)
    )
    (2): Sequential(
      (0): Conv1d(50, 100, kernel_size=(5,), stride=(1,))
      (1): ReLU()
      (2): MaxPool1d(kernel_size=48, stride=48, padding=0, dilation=1, ceil_mode=False)
    )
    (3): Sequential(
      (0): Conv1d(50, 100, kernel_size=(6,), stride=(1,))
      (1): ReLU()
      (2): MaxPool1d(kernel_size=47, stride=47, padding=0, dilation=1, ceil_mode=False)
    )
  )
  (fc): Linear(in_features=400, out_features=5, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
). Accuracy on test set: 58.80430603613994


In [21]:
pretrained, num_size, num_feature = glove.createEmbedding(vocab)
dataloader = DataLoader(trainInd, batch_size=100, shuffle=True)

model = CNN(pretrained, num_feature, MAX_PADDING, 5).to(device)
models.append(model)
loss = nn.CrossEntropyLoss()
optimizer = optim.Adadelta(model.parameters(), lr=0.05)

for i in tqdm(range(10)):
    totalLoss = 0
    model.train()
    for batch in dataloader:
        SELECT = batch.tolist()
        X, y = trainInput[SELECT].to(device), trainLabel[SELECT].to(device)
        model.zero_grad()
        logits = model(X)
        L = loss(logits, y)
        totalLoss += L.item()
        L.backward()
        optimizer.step()
    
    totalLoss /= len(dataloader)
    if not (i+1)%1:
        print(f"epoch {i+1}, training loss:{totalLoss}")

  return torch.max_pool1d(input, kernel_size, stride, padding, dilation, ceil_mode)
 10%|████████▎                                                                          | 1/10 [00:27<04:07, 27.52s/it]

epoch 1, training loss:1.322865056126451


 20%|████████████████▌                                                                  | 2/10 [00:58<03:55, 29.41s/it]

epoch 2, training loss:1.2007749759400286


 30%|████████████████████████▉                                                          | 3/10 [01:27<03:24, 29.17s/it]

epoch 3, training loss:1.1623185905473845


 40%|█████████████████████████████████▏                                                 | 4/10 [01:54<02:51, 28.58s/it]

epoch 4, training loss:1.137987034615992


 50%|█████████████████████████████████████████▌                                         | 5/10 [02:23<02:22, 28.45s/it]

epoch 5, training loss:1.1153583380939358


 60%|█████████████████████████████████████████████████▊                                 | 6/10 [02:50<01:52, 28.19s/it]

epoch 6, training loss:1.0970954977079288


 70%|██████████████████████████████████████████████████████████                         | 7/10 [03:18<01:24, 28.05s/it]

epoch 7, training loss:1.086183250586531


 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [03:49<00:57, 28.98s/it]

epoch 8, training loss:1.077152532854355


 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [04:17<00:28, 28.69s/it]

epoch 9, training loss:1.0689304706381122


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [04:45<00:00, 28.51s/it]

epoch 10, training loss:1.0646403490415632





#### ref:
(1): https://chriskhanhtran.github.io/posts/cnn-sentence-classification/

(2): https://www.youtube.com/watch?v=0_PgWWmauHk