In [1]:
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

from torch.autograd import Variable

from sklearn import preprocessing

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

import numpy as np
import pandas as pd

In [2]:
# Training hyper-parameters shared by the cells below.
EPOCHS = 10  # number of passes the training loop makes over the training set
BATCH_SIZE = 5  # default batch size of LSTMClassifier; also used when sizing the spam upsample
LEARNING_RATE = 0.01  # learning rate handed to optim.SGD

In [3]:
class TwoGramLanguageModeler(nn.Module):
    """N-gram language model: embedding lookup, one ReLU hidden layer, log-softmax over the vocabulary.

    Args:
        vocab_size: number of rows in the embedding table and size of the output layer.
        embedding_dim: width of each embedding vector.
        context_size: number of context tokens whose embeddings are concatenated.
    """

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(TwoGramLanguageModeler, self).__init__()
        hidden_units = 128
        # The attribute names below become state_dict keys, so they must stay as they are.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_units)
        self.linear2 = nn.Linear(hidden_units, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities of shape (1, vocab_size) for a single context."""
        # Flatten all context embeddings into one row vector.
        flat_context = self.embeddings(inputs).view(1, -1)
        hidden = F.relu(self.linear1(flat_context))
        scores = self.linear2(hidden)
        return F.log_softmax(scores, dim=1)

In [4]:
# Load the tab-separated SMS Spam Collection file. It has no header row, so the
# two columns are named at read time instead of being renamed afterwards.
data = pd.read_csv('../data/SMSSpamCollection', sep='\t', header=None,
                   names=['label', 'Full Text'])
# Strip punctuation: remove every character that is neither a word character
# nor whitespace. The pattern is a raw string (no invalid-escape warning) and
# regex=True is explicit because newer pandas releases default to literal,
# non-regex replacement, which would silently leave the text unchanged.
data['Text'] = data['Full Text'].str.replace(r'[^\w\s]', '', regex=True)
data['Lower Case Text'] = data['Text'].str.lower()

In [5]:
# Encode the string labels as integers in a new 'y' column.
labels, counts = np.unique(data['label'], return_counts=True)
# The labels are handed to fit() most-frequent first.
# NOTE(review): LabelEncoder appears to order its classes itself, so this
# ordering may not decide which label is encoded as 0 — check encoder.classes_.
encoder = preprocessing.LabelEncoder().fit(labels[(-counts).argsort()])
data['y'] = encoder.transform(data['label'])

In [6]:
# Seeded random split: each row lands in the training set with probability 0.8.
np.random.seed(42)
mask_train = np.random.random(len(data)) < 0.8
data_train = data[mask_train]
data_test = data[~mask_train]

# Upsample the y == 1 rows (with replacement) by the size of the gap to the
# y == 0 rows. When that gap is not a multiple of BATCH_SIZE it is rounded up
# to the next multiple and one more row is added.
# NOTE(review): presumably the extra +1 makes the final training-set size fit
# the batch size — the reason is not evident from this cell; confirm.
countToIncrease_word = len(data_train[data_train['y'] == 0]) - len(data_train[data_train['y'] == 1])
if countToIncrease_word % BATCH_SIZE:
    countToIncrease_word += (BATCH_SIZE - countToIncrease_word % BATCH_SIZE) + 1
spamupsampled_word = data_train[data_train['y'] == 1].sample(n=countToIncrease_word, replace=True)
data_train_upsample_word2vec = pd.concat([spamupsampled_word, data_train])

# Fit the vocabulary (English stop words excluded) on the upsampled training
# text and keep the vectorizer's tokenizer for later sentence splitting.
count_vect_sing_word = CountVectorizer(stop_words=ENGLISH_STOP_WORDS).fit(
    data_train_upsample_word2vec['Lower Case Text'])
tokenizer_word = count_vect_sing_word.build_tokenizer()

In [7]:
# Dimensions used to rebuild the embedding model before its weights are loaded.
EMBEDDING_SIZE = 300
CONTEXT_SIZE = 1
# Token -> column index mapping learned by the fitted CountVectorizer.
word_to_ix = count_vect_sing_word.vocabulary_
VOCAB_SIZE = len(word_to_ix)

In [8]:
# Build an untrained language model with the configured dimensions.
word2vec_model = TwoGramLanguageModeler(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_SIZE,
    context_size=CONTEXT_SIZE,
)

In [9]:
MODEL_PATH = '../data/word_2vec_model'
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
# map_location='cpu' lets a checkpoint that was saved on a GPU machine load on
# a CPU-only one; the model in this notebook is never moved off the CPU.
word2vec_model.load_state_dict(torch.load(MODEL_PATH, map_location='cpu'))
word2vec_model.eval()

# Take the embedding layer only after the trained weights are in place, and
# freeze it so it acts as a fixed feature extractor.
word_embeddings = word2vec_model.embeddings
word_embeddings.weight.requires_grad = False

In [10]:
# count_vect_sing_word is a CountVectorizer
def _indicesForSentence(input_str, tokenizer = tokenizer_word, count_vect = count_vect_sing_word):
    """Map a sentence to the vocabulary indices of its known tokens.

    Args:
        input_str: raw sentence text.
        tokenizer: callable that splits a string into a list of tokens.
        count_vect: fitted vectorizer whose ``vocabulary_`` maps token -> index.

    Returns:
        A ``torch.long`` tensor with one single-element row per token found in
        the vocabulary, i.e. shape (n_known_tokens, 1). A sentence with no
        known token yields an empty tensor.
    """
    # Filter AND look up through the same mapping, so a caller-supplied
    # vectorizer is honoured instead of silently using the global word_to_ix.
    vocabulary = count_vect.vocabulary_
    known_indices = [[vocabulary[token]] for token in tokenizer(input_str) if token in vocabulary]
    return torch.tensor(known_indices, dtype=torch.long)

def sentenceToNumpyInstance(input_str, embedder):
    """Embed one sentence and return the result as a NumPy array.

    Args:
        input_str: raw sentence text.
        embedder: callable applied to the index tensor built by
            ``_indicesForSentence`` (the frozen embedding layer in this notebook).

    Returns:
        The embedder output as a NumPy array with one leading entry per known
        token, or a 1-D zero vector of length ``EMBEDDING_SIZE`` when the
        sentence contains no known token.
    """
    embeddings = embedder(_indicesForSentence(input_str))
    # Test for "no elements" rather than an exact shape: depending on the torch
    # version an empty lookup comes back as torch.Size([0]) or as (0, dim), and
    # only the former matched the previous equality check.
    if embeddings.numel() == 0:
        return np.zeros(EMBEDDING_SIZE)
    return embeddings.detach().numpy()
    
def word2vec_transform(data, embeddings, field = 'Lower Case Text'):
    """Embed every sentence in ``data[field]``.

    Args:
        data: table whose ``field`` column holds the sentences.
        embeddings: embedder forwarded to ``sentenceToNumpyInstance``.
        field: name of the text column to embed.

    Returns:
        A 1-D NumPy object array with one entry per row, in row order; each
        entry is the array returned by ``sentenceToNumpyInstance`` for that row.
    """
    instances = [sentenceToNumpyInstance(text, embedder=embeddings) for text in data[field]]
    # Sentences have different token counts, so the per-row arrays are ragged.
    # Fill an object array element by element instead of calling np.array on
    # the list: newer NumPy rejects ragged input there, and equally-sized rows
    # would otherwise be merged into one dense array with a different layout.
    result = np.empty(len(instances), dtype=object)
    for position, instance in enumerate(instances):
        result[position] = instance
    return result

In [11]:
# Embed every sentence of the upsampled training set with the frozen embedding layer.
trans_data = word2vec_transform(
    data_train_upsample_word2vec,
    embeddings=word_embeddings,
)

In [12]:
def generateSentenceLengths(data):
    """Return the token count of every embedded sentence in ``data``.

    Args:
        data: iterable of NumPy arrays, one per sentence.

    Returns:
        A list of ints, one per entry of ``data``: the size of the first axis
        for entries with more than one dimension, and 1 for 1-D entries (the
        zero-vector placeholder used for sentences without known tokens).
    """
    # Read from the argument; the previous version ignored it and always
    # measured the global `trans_data`.
    return [e.shape[0] if e.ndim > 1 else 1 for e in data]

# Order the sentences longest-first. indices_rv is the permutation that was
# applied, so anything else aligned with the original row order can be
# reordered the same way.
sentence_lengths = np.array(generateSentenceLengths(trans_data))
indices_rv = (-sentence_lengths).argsort()
sentence_lengths, trans_data = sentence_lengths[indices_rv], trans_data[indices_rv]

In [13]:
# Longest token count among the multi-dimensional entries; 1-D entries (the
# zero-vector placeholders) are skipped. Stays at -1 if there is no such entry.
max_len_so_far = -1
for e in trans_data:
    if e.ndim > 1:
        max_len_so_far = max(max_len_so_far, e.shape[0])

In [14]:
# Length every sentence is padded to: the longest token count found above.
SENTENCE_LEN = max_len_so_far

In [15]:
def adjust_learning_rate(optimizer, epoch, base_lr=None):
    """Apply a step decay to every parameter group of ``optimizer``.

    The learning rate is the base rate multiplied by 0.1 for each completed
    block of 10 epochs.

    Args:
        optimizer: object exposing ``param_groups``, a list of dicts with an ``'lr'`` entry.
        epoch: zero-based epoch number.
        base_lr: starting learning rate; defaults to the notebook-level
            ``LEARNING_RATE`` when omitted.

    Returns:
        The same ``optimizer`` object, updated in place.
    """
    if base_lr is None:
        # The previous version read an undefined lowercase `learning_rate`
        # and raised NameError on the first call.
        base_lr = LEARNING_RATE
    lr = base_lr * (0.1 ** (epoch // 10))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return optimizer

In [16]:
class LSTMClassifier(nn.Module):
    """Single-layer LSTM over pre-computed word embeddings, followed by a linear classifier.

    Args:
        embedding_dim: width of each input embedding vector.
        hidden_dim: size of the LSTM hidden state.
        label_size: number of output classes.
        batch_size: batch size used when ``init_hidden`` is called without an argument.
    """
    def __init__(self, embedding_dim = EMBEDDING_SIZE, hidden_dim = 50, \
                 label_size = 2, batch_size = BATCH_SIZE):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2label = nn.Linear(hidden_dim, label_size)
        self.hidden = self.init_hidden()

    def init_hidden(self, batch_size=None):
        """Return a zeroed ``(h0, c0)`` pair, each of shape (1, batch, hidden_dim).

        Args:
            batch_size: batch dimension of the state; defaults to ``self.batch_size``.
        """
        if batch_size is None:
            batch_size = self.batch_size
        # new_zeros keeps the state on the same device/dtype as the model weights.
        reference = self.hidden2label.weight
        h0 = reference.new_zeros(1, batch_size, self.hidden_dim)
        c0 = reference.new_zeros(1, batch_size, self.hidden_dim)
        return (h0, c0)

    def forward(self, embeds, sentence_lengths):
        """Classify a batch of padded sentences.

        Args:
            embeds: float tensor of shape (batch, max_len, embedding_dim).
            sentence_lengths: true length of each sentence in the batch. The
                notebook feeds batches that are already ordered longest-first,
                which is what packing expects by default.

        Returns:
            Tensor of shape (batch, label_size) with unnormalised class scores.
        """
        # A stored state sized for a different batch (e.g. a short final batch)
        # would make the LSTM raise, so start from a matching zero state instead.
        batch = embeds.size(0)
        if self.hidden[0].size(1) != batch:
            self.hidden = self.init_hidden(batch)
        # Plain Python ints work whether the caller passes a list, array or tensor.
        lengths = [int(n) for n in sentence_lengths]
        packed = torch.nn.utils.rnn.pack_padded_sequence(embeds, lengths, batch_first=True)
        _, self.hidden = self.lstm(packed, self.hidden)
        # Use the final hidden state, which is taken at each sentence's own last
        # token. Indexing the re-padded output at [:, -1, :] instead reads the
        # zero padding for every sentence shorter than the longest in the batch.
        final_hidden = self.hidden[0][-1]
        y = self.hidden2label(final_hidden)
        return y

In [17]:
# Seed torch's RNG before the weights are drawn so the classifier's random
# initialisation (and therefore training) is repeatable; 42 matches the seed
# used for the NumPy train/test split.
torch.manual_seed(42)
model = LSTMClassifier()

In [18]:
def padSentences(data, sentence_len=None, embedding_size=None):
    """Zero-pad every embedded sentence to a common length.

    Args:
        data: sequence of NumPy arrays, one per sentence. Multi-dimensional
            entries are read as (n_tokens, 1, embedding_size); 1-D entries
            (the placeholder for sentences without known tokens) become
            all-zero rows.
        sentence_len: padded length; defaults to the notebook-level ``SENTENCE_LEN``.
        embedding_size: embedding width; defaults to the notebook-level ``EMBEDDING_SIZE``.

    Returns:
        Float array of shape (len(data), sentence_len, embedding_size).
        Sentences longer than ``sentence_len`` are truncated.
    """
    if sentence_len is None:
        sentence_len = SENTENCE_LEN
    if embedding_size is None:
        embedding_size = EMBEDDING_SIZE
    padded = np.zeros((len(data), sentence_len, embedding_size))
    # Iterate over the argument itself; the previous version indexed the global
    # `trans_data` and only worked when called on exactly that variable.
    for row, e in enumerate(data):
        if e.ndim > 1:
            n_tokens = min(e.shape[0], sentence_len)
            # e[:, 0] drops the singleton middle axis: (n_tokens, embedding_size).
            padded[row, :n_tokens] = e[:n_tokens, 0]
    return padded

In [19]:
# Replace the per-sentence arrays with one dense, zero-padded array.
# NOTE: this rebinds trans_data, so the cell is not safe to re-run on its own
# without first re-running the cells that build and sort trans_data.
trans_data = padSentences(trans_data)

In [20]:
def prediction(data, sen_len):
    """Run the notebook-level ``model`` on one batch.

    Args:
        data: array-like batch of padded sentence embeddings; converted to a float tensor.
        sen_len: per-sentence lengths, forwarded to the model unchanged.

    Returns:
        Whatever ``model`` returns for the batch.
    """
    batch = torch.tensor(data, dtype=torch.float)
    return model(batch, sen_len)

In [21]:
# trans_data was reordered longest-first with indices_rv, so the labels must be
# permuted the same way; otherwise each sentence is trained against the label
# of a different row.
train_y = data_train_upsample_word2vec['y'].iloc[indices_rv]

In [22]:
# Classification loss plus the per-epoch metric histories filled in by the training loop.
loss_function = nn.CrossEntropyLoss()
train_loss_, train_acc_ = [], []
test_loss_, test_acc_ = [], []

In [23]:
def chunks(l, n):
    """Yield consecutive slices of ``l`` holding ``n`` items each; the last slice may be shorter."""
    offsets = range(0, len(l), n)
    for lo in offsets:
        hi = lo + n
        yield l[lo:hi]

In [40]:
# Train the classifier with plain SGD, recording loss and accuracy per epoch.
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)
for epoch in range(EPOCHS):
    print(epoch)
    total_acc = 0.0
    total_loss = 0.0
    total = 0.0
    # Walk the (length-sorted) training set in consecutive batches.
    for this_batch in chunks(range(trans_data.shape[0]), BATCH_SIZE):
        this_batch = list(this_batch)
        inner_data = trans_data[this_batch]
        inner_y = torch.tensor(train_y.iloc[this_batch].values, dtype=torch.long)
        # Plain Python ints are accepted by sequence packing on every torch version.
        batch_lengths = sentence_lengths[this_batch].tolist()

        model.zero_grad()
        # Start each batch from a zero state sized to this batch, so a final
        # batch shorter than BATCH_SIZE does not hit a shape mismatch.
        # Side effect: model.batch_size keeps the size of the last batch seen.
        model.batch_size = len(this_batch)
        model.hidden = model.init_hidden()

        output = prediction(inner_data, batch_lengths)
        loss = loss_function(output, inner_y)
        loss.backward()
        optimizer.step()

        _, predicted = torch.max(output.detach(), 1)
        total_acc += (predicted == inner_y).sum().item()
        total += len(inner_y)
        # .item() extracts the Python float; indexing a 0-dim loss with [0]
        # fails on newer torch releases.
        total_loss += loss.item()

    # Both histories hold plain floats. As before, the loss entry is the sum of
    # per-batch mean losses divided by the number of samples.
    train_loss_.append(total_loss / total)
    train_acc_.append(total_acc / total)

0




1
2
3
4
5
6
7
8
9
