In [81]:
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import jdc

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
embeddingFile = "model/glove.6B.50d.txt"
trainFile = "data/train.tsv"

In [97]:
def getAll(text):
    vocab = set()
    MAX_PADDING = 0 
    for line in text:
        wds = line.lower().split()
        MAX_PADDING = max(MAX_PADDING, len(wds))
        vocab.update(wds)
    
    vocab.add("<pad>")
    ivocab = {idx: w for idx, w in enumerate(vocab)}
    vocab = {w: idx for idx, w in enumerate(vocab)}
    return vocab, ivocab, MAX_PADDING

In [109]:
def encode(text, vocab, MAX_PADDING):
    vectors = []
    for line in text:
        line = line.lower().split()
        line += ['<pad>'] * (MAX_PADDING - len(line))
        vector = [vocab[word] for word in line]
        vectors.append(vector)
    return np.array(vectors)

In [50]:
class GloVe(object):
    def __init__(self):
        with open(embeddingFile, 'r', encoding='utf-8') as f:
            words = [x.rstrip().split(' ')[0] for x in f.readlines()]
        with open(embeddingFile, 'r', encoding='utf-8') as f:
            vectors = {}
            for line in f:
                vals = line.rstrip().split(' ')
                vectors[vals[0]] = [float(x) for x in vals[1:]]
        
        words.append("<pad>")
        vectors["<pad>"] = None
        vocab_size = len(words)
        self.vocab = {w: idx for idx, w in enumerate(words)}
        self.ivocab = {idx: w for idx, w in enumerate(words)}

        vector_dim = len(vectors[self.ivocab[0]])
        W = np.zeros((vocab_size, vector_dim))
        for word, v in vectors.items():
            if word == "<pad>":
                v = [0 for _ in range(vector_dim)]
            W[self.vocab[word], :] = v
        
        # normalize each word vector to unit variance
        self.W_norm = np.zeros(W.shape)
        d = (np.sum(W ** 2, 1) ** (0.5))
        d[-1] = 1 # zero-divisor
        self.W_norm = (W.T / d).T
    
    def __getitem__(self, word):
        return self.W_norm[self.vocab[word], :]

In [69]:
%%add_to GloVe 
def createEmbedding(self, target_vocab, freeze=True):
    num_vocab = len(target_vocab)
    num_feature = self.W_norm.shape[1]
    W = torch.zeros((num_vocab, num_feature))
    
    for i, word in enumerate(target_vocab):
        try: 
            W[i] = torch.from_numpy(glove[word])
        except KeyError:
            W[i] = torch.from_numpy(np.random.normal(scale=0.6, size=(num_feature)))
        except IndexError:
            print(word)
    
    emb = nn.Embedding.from_pretrained(W, freeze=freeze)
    return emb, num_vocab, num_feature

In [110]:
allData = pd.read_csv(trainFile, delimiter="\t", index_col="PhraseId")
allText = allData.Phrase
allLabel = allData.Sentiment
vocab, ivocab, MAX_PADDING = getAll(allText)
allID = encode(allText, vocab, MAX_PADDING)

In [134]:
from sklearn.model_selection import train_test_split

# Train Test Split
trainInput, testInput, trainLabel, testLabel = train_test_split(
    allID, allLabel, test_size=0.2, random_state=42)

trainInput, valInput, trainLabel, valLabel = train_test_split(
    trainInput, trainLabel, test_size=0.25, random_state=42)

trainInd = np.arange(trainLabel.shape[0])
trainInput = torch.from_numpy(trainInput)
trainLabel = torch.from_numpy(trainLabel.to_numpy())

valInput = torch.from_numpy(valInput)
valLabel = torch.from_numpy(valLabel.to_numpy())

testInput = torch.from_numpy(testInput)
testLabel = torch.from_numpy(testLabel.to_numpy())

In [117]:
trainInput

tensor([[12310,  5433,    77,  ..., 15280, 15280, 15280],
        [ 6349, 12023,  2664,  ..., 15280, 15280, 15280],
        [13089, 11293, 15309,  ..., 15280, 15280, 15280],
        ...,
        [15431,  5346,  6349,  ..., 15280, 15280, 15280],
        [12144,  4252,  6101,  ..., 15280, 15280, 15280],
        [ 6349,  2752,  7883,  ..., 15280, 15280, 15280]], dtype=torch.int32)

In [52]:
glove = GloVe()

In [151]:
class CNN(nn.Module):
    def __init__(self, emb, emb_dim, pad_dim, num_cls, dropout=0.5, ker_size=[3, 4, 5, 6], num_ker=[100, 100, 100, 100]):
        super(CNN, self).__init__()
        self.emb = emb
        self.conv1 = nn.ModuleList([nn.Sequential(
                                    nn.Conv1d(emb_dim, n, k),
                                    nn.ReLU(),
                                    nn.MaxPool1d(kernel_size=pad_dim-k+1)
                                    ) for n, k in zip(num_ker, ker_size)])
        self.fc = nn.Linear(in_features=np.sum(num_ker),
                            out_features=num_cls)
        self.dropout = nn.Dropout(p=dropout)

        
    def forward(self, X):
        X = self.emb(X)
        X = X.permute(0, 2, 1)
        X = [conv(X) for conv in self.conv1]
        X = torch.cat(X, dim=1)
        X = X.view(-1, X.size(1))
        X = self.fc(self.dropout(X))
        return X
        

In [73]:
pretrained, num_size, num_feature = glove.createEmbedding(vocab)

In [152]:
dataloader = DataLoader(trainInd, batch_size=100, shuffle=True)

model = CNN(pretrained, num_feature, MAX_PADDING, 5).to(device)
loss = nn.CrossEntropyLoss()
optimizer = optim.Adadelta(model.parameters(), lr=0.05)

for i in tqdm(range(10)):
    totalLoss = 0
    model.train()
    for batch in dataloader:
        SELECT = batch.tolist()
        X, y = trainInput[SELECT].to(device), trainLabel[SELECT].to(device)
        model.zero_grad()
        logits = model(X)
        L = loss(logits, y)
        totalLoss += L.item()
        L.backward()
        optimizer.step()
    
    totalLoss /= len(dataloader)
    if not (i+1)%1:
        print(f"epoch {i+1}, training loss:{totalLoss}")

 10%|████████▎                                                                          | 1/10 [00:29<04:21, 29.02s/it]

epoch 1, training loss:1.3161859748966538


 20%|████████████████▌                                                                  | 2/10 [01:00<04:03, 30.40s/it]

epoch 2, training loss:1.2025967747959028


 30%|████████████████████████▉                                                          | 3/10 [01:31<03:34, 30.71s/it]

epoch 3, training loss:1.1680877704884989


 40%|█████████████████████████████████▏                                                 | 4/10 [02:02<03:04, 30.67s/it]

epoch 4, training loss:1.1423503851941454


 50%|█████████████████████████████████████████▌                                         | 5/10 [02:32<02:33, 30.70s/it]

epoch 5, training loss:1.1208051841948559


 60%|█████████████████████████████████████████████████▊                                 | 6/10 [03:04<02:04, 31.12s/it]

epoch 6, training loss:1.1035974214018474


 70%|██████████████████████████████████████████████████████████                         | 7/10 [03:34<01:32, 30.75s/it]

epoch 7, training loss:1.0885081354810944


 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [04:05<01:01, 30.80s/it]

epoch 8, training loss:1.0784949139953295


 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [04:36<00:30, 30.70s/it]

epoch 9, training loss:1.0717084618998949


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [05:06<00:00, 30.65s/it]

epoch 10, training loss:1.063072437664997





In [153]:
valInput, valLabel = valInput.to(device), valLabel.to(device)
with torch.no_grad():
    logits = model(valInput)
    preds = torch.argmax(logits, dim=1).flatten()
    accuracy = (preds == valLabel).cpu().numpy().mean() * 100
print(accuracy)

56.52953992054338


#### ref:
(1): https://chriskhanhtran.github.io/posts/cnn-sentence-classification/