In [1]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_curve, auc


import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from sklearn.linear_model import LogisticRegression
import functools

In [2]:
# Load the tab-separated SMS Spam Collection: column 0 is the label, column 1 the message.
data = pd.read_csv('../data/SMSSpamCollection', sep='\t', header=None)
# Strip punctuation. regex=True is required: pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave the text untouched. The raw string
# keeps the \w / \s escapes from being interpreted by Python.
data['Text'] = data[1].str.replace(r'[^\w\s]', '', regex=True)
data.columns = ['label', 'Full Text', 'Text']
data['Lower Case Text'] = data['Text'].str.lower()

In [3]:
# Preview only the first rows rather than rendering the whole frame.
data.head(10)

Unnamed: 0,label,Full Text,Text,Lower Case Text
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,nah i dont think he goes to usf he lives aroun...
5,spam,FreeMsg Hey there darling it's been 3 week's n...,FreeMsg Hey there darling its been 3 weeks now...,freemsg hey there darling its been 3 weeks now...
6,ham,Even my brother is not like to speak with me. ...,Even my brother is not like to speak with me T...,even my brother is not like to speak with me t...
7,ham,As per your request 'Melle Melle (Oru Minnamin...,As per your request Melle Melle Oru Minnaminun...,as per your request melle melle oru minnaminun...
8,spam,WINNER!! As a valued network customer you have...,WINNER As a valued network customer you have b...,winner as a valued network customer you have b...
9,spam,Had your mobile 11 months or more? U R entitle...,Had your mobile 11 months or more U R entitled...,had your mobile 11 months or more u r entitled...


In [4]:
# Encode the string labels as integers in a new 'y' column.
labels, counts = np.unique(data['label'], return_counts=True)
# NOTE: LabelEncoder keeps its classes in sorted order, so fitting on the
# frequency-ordered labels does not change which integer each label receives.
encoder = preprocessing.LabelEncoder().fit(labels[np.argsort(-counts)])
data['y'] = encoder.transform(data['label'])

In [5]:
# Random row split (each row goes to train with probability 0.8);
# the fixed seed makes the split repeatable.
np.random.seed(42)
mask_train = np.random.random(len(data)) < 0.8
data_train = data.loc[mask_train]
data_test = data.loc[~mask_train]

In [6]:
# Unigram vocabulary, learned from the training messages only.
count_vect_sing = CountVectorizer().fit(data_train['Lower Case Text'])
# Bigram vocabulary; bigrams must occur in at least two training messages.
count_vect_ngram = CountVectorizer(ngram_range=(2, 2), min_df=2)
count_vect_ngram.fit(data_train['Lower Case Text'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=2,
        ngram_range=(2, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [7]:
# Tokenise one test message (row label 1, which must be in the test split)
# with the tokenizer of each vectorizer. Both print the same token list:
# build_tokenizer() only splits the text into words, it does not form n-grams.
for vectorizer in (count_vect_sing, count_vect_ngram):
    print(vectorizer.build_tokenizer()(data_test.loc[1, 'Lower Case Text']))


['ok', 'lar', 'joking', 'wif', 'oni']
['ok', 'lar', 'joking', 'wif', 'oni']


In [8]:
# Number of distinct bigrams kept by the bigram vectorizer (those with min_df=2).
print(len(count_vect_ngram.vocabulary_))

8238


In [9]:
# Word -> integer index mapping learned by the unigram vectorizer.
word_to_ix = count_vect_sing.vocabulary_
VOCAB_SIZE = len(word_to_ix)
# Length of each word-embedding vector.
EMBEDDING_SIZE = 300

### Replicate the n-gram language model on our corpus

### Understand the n-gram model here

In [10]:
# Number of context words fed to the language model per training example.
CONTEXT_SIZE = 1
# Alias of EMBEDDING_SIZE, passed to the model constructor as embedding_dim.
EMBEDDING_DIM = EMBEDDING_SIZE

In [11]:
class TwoGramLanguageModeler(nn.Module):
    """Feed-forward language model that predicts a word from its context words.

    The context word indices are embedded, concatenated, passed through one
    hidden ReLU layer of 128 units and projected onto the vocabulary.

    Args:
        vocab_size: number of distinct words (rows of the embedding table and
            width of the output layer).
        embedding_dim: length of each word vector.
        context_size: number of context words supplied per example.
    """

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(TwoGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary.

        Args:
            inputs: LongTensor of word indices, either ``(context_size,)`` for
                a single example or ``(batch, context_size)`` for a mini-batch.

        Returns:
            Tensor of shape ``(batch, vocab_size)`` holding log-probabilities
            (``batch`` is 1 for a single example).
        """
        # Flatten each example's context embeddings into one row. Using the
        # first linear layer's input width instead of a hard-coded batch of 1
        # lets the same code handle single examples and mini-batches.
        embeds = self.embeddings(inputs).view(-1, self.linear1.in_features)
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

In [12]:
# Seed PyTorch so weight initialisation (and hence training) is repeatable across runs.
torch.manual_seed(42)

losses = []  # one summed loss value per epoch, appended by the training cell
loss_function = nn.NLLLoss()  # pairs with the model's log_softmax output
model = TwoGramLanguageModeler(VOCAB_SIZE, EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)

In [59]:
# Word-level tokenizer of the bigram vectorizer (the stray vocabulary_ expression
# that preceded it had no effect and was removed).
ngram_tokenizer = count_vect_ngram.build_tokenizer()

def convertToBigramList(input_str, ngram_tokenizer):
    """Split ``input_str`` into tokens and return its consecutive word pairs.

    Args:
        input_str: text to tokenise.
        ngram_tokenizer: callable mapping a string to a list of tokens.

    Returns:
        NumPy array with one ``[previous_token, next_token]`` row per adjacent
        pair; an empty array when the text has fewer than two tokens.
    """
    tokens = ngram_tokenizer(input_str)
    pairs = [np.array([prev_word, next_word])
             for prev_word, next_word in zip(tokens, tokens[1:])]
    return np.array(pairs)

# Turn every training message into an array of (previous word, next word) pairs.
word2vec_train = data_train['Lower Case Text'].apply(convertToBigramList, ngram_tokenizer = ngram_tokenizer)

In [60]:
# Flatten the per-message bigram arrays into one flat list of [context, target] pairs.
# NOTE: this rebinds word2vec_train, so the cell is not idempotent - re-running it
# without first re-running the cell that builds word2vec_train flattens the pairs again.
word2vec_train_list = [pair for message_bigrams in word2vec_train for pair in message_bigrams]
word2vec_train = word2vec_train_list

In [68]:
def trainCycle():
    """Run one epoch of per-example SGD over ``word2vec_train``.

    Uses the notebook-level ``model``, ``optimizer``, ``loss_function`` and
    ``word_to_ix``. Each entry of ``word2vec_train`` is a (context, target)
    word pair; the model is updated after every pair.

    Returns:
        Sum of the individual example losses for the epoch.
    """
    running_loss = 0
    for pair in word2vec_train:
        context_word, target_word = pair[0], pair[1]
        context_idxs = torch.tensor([word_to_ix[context_word]], dtype=torch.long)

        # Gradients accumulate in PyTorch, so clear them before each example.
        model.zero_grad()

        log_probs = model(context_idxs)
        target_idx = torch.tensor([word_to_ix[target_word]], dtype=torch.long)
        loss = loss_function(log_probs, target_idx)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    return running_loss

In [69]:
# True: train the model and save its weights; False: load previously saved weights.
TRAIN_MODEL = True
# File the model's state_dict is saved to / loaded from.
MODEL_PATH = '../data/word_2vec_model'

In [70]:
if not TRAIN_MODEL:
    # Reuse previously saved weights and switch the model to evaluation mode.
    model.load_state_dict(torch.load(MODEL_PATH))
    model.eval()
else:
    # Train for ten epochs, recording the summed loss of each, then persist the weights.
    for epoch in range(10):
        print(epoch)
        losses.append(trainCycle())
    print(losses)
    torch.save(model.state_dict(), MODEL_PATH)

0
go ['go' 'until']
until ['until' 'jurong']
jurong ['jurong' 'point']
point ['point' 'crazy']
crazy ['crazy' 'available']
available ['available' 'only']
only ['only' 'in']
in ['in' 'bugis']
bugis ['bugis' 'great']
great ['great' 'world']
world ['world' 'la']
la ['la' 'buffet']
buffet ['buffet' 'cine']
cine ['cine' 'there']
there ['there' 'got']
got ['got' 'amore']
amore ['amore' 'wat']
free ['free' 'entry']
entry ['entry' 'in']
in ['in' 'wkly']
wkly ['wkly' 'comp']
comp ['comp' 'to']
to ['to' 'win']
win ['win' 'fa']
fa ['fa' 'cup']
cup ['cup' 'final']
final ['final' 'tkts']
tkts ['tkts' '21st']
21st ['21st' 'may']
may ['may' '2005']
2005 ['2005' 'text']
text ['text' 'fa']
fa ['fa' 'to']
to ['to' '87121']
87121 ['87121' 'to']
to ['to' 'receive']
receive ['receive' 'entry']
entry ['entry' 'questionstd']
questionstd ['questionstd' 'txt']
txt ['txt' 'ratetcs']
ratetcs ['ratetcs' 'apply']
apply ['apply' '08452810075over18s']
dun ['dun' 'say']
say ['say' 'so']
so ['so' 'early']
early ['earl

KeyboardInterrupt: 

In [None]:
# Embedding layer of the language model, used below to vectorise sentences.
embeddings = model.embeddings
# Tokenizer of the unigram vectorizer, used to split sentences into words.
tokenizer = count_vect_sing.build_tokenizer()

In [None]:
def _indicesForSentence(input_str):
    """Map a sentence to the vocabulary indices of its known words.

    Words absent from the unigram vocabulary are dropped.

    Returns:
        LongTensor with one ``[index]`` row per known word (shape ``(n, 1)``);
        an empty tensor when no word is known.
    """
    known_words = [word for word in tokenizer(input_str)
                   if word in count_vect_sing.vocabulary_]
    return torch.tensor([[word_to_ix[word]] for word in known_words], dtype=torch.long)

def getEmbedding(word_index_tensor, embedder):
    """Look up embeddings for the given word indices.

    Args:
        word_index_tensor: indices to look up, passed straight to ``embedder``.
        embedder: callable (e.g. an embedding layer) mapping indices to vectors.

    Returns:
        Whatever ``embedder`` returns for ``word_index_tensor``. The original
        version dropped this value and always returned None.
    """
    return embedder(word_index_tensor)
def sentenceToNumpyInstance(input_str, embedder):
    """Represent a sentence as the sum of its known words' embedding vectors.

    Args:
        input_str: sentence to vectorise.
        embedder: embedding layer mapping a LongTensor of indices to vectors.

    Returns:
        1-D NumPy array: the summed embeddings, or a zero vector of length
        ``EMBEDDING_SIZE`` when the sentence has no in-vocabulary word.
    """
    indices = _indicesForSentence(input_str)
    # Test the index tensor, not the embedding output: an embedding lookup on an
    # empty index tensor has shape (0, dim), so comparing the output shape with
    # Size([0]) never matched and empty sentences produced a scalar, not a vector.
    if indices.numel() == 0:
        return np.zeros(EMBEDDING_SIZE)
    # indices is (n, 1) -> embeddings (n, 1, dim); sum over words, then drop the
    # singleton axis.
    summed = torch.sum(embedder(indices), dim=0)
    return summed.detach().numpy()[0]

In [None]:
# Feature matrix for the training messages: one summed-embedding row per message.
data_train_embedding = np.array(
    data_train['Lower Case Text']
    .apply(sentenceToNumpyInstance, embedder=embeddings)
    .values
    .tolist()
)

In [None]:
# Feature matrix for the test messages: one summed-embedding row per message.
data_test_embedding = np.array(
    data_test['Lower Case Text']
    .apply(sentenceToNumpyInstance, embedder=embeddings)
    .values
    .tolist()
)

In [None]:
# Logistic-regression classifier with scikit-learn's default settings.
classifier = LogisticRegression()

In [None]:
# Fit on the summed-embedding features of the training messages against the encoded labels.
classifier.fit(data_train_embedding, data_train['y'])

In [None]:
# ROC curve and AUC on the training set, scored with the predicted
# probability of class 1.
word_2vec_fpr, word_2_vec_tpr, word_2vec_thresholds = roc_curve(
    data_train['y'],
    classifier.predict_proba(data_train_embedding)[:, 1],
)
auc(word_2vec_fpr, word_2_vec_tpr)

In [None]:
# ROC curve and AUC on the held-out test set, scored with the predicted
# probability of class 1. Reuses (overwrites) the curve variables.
word_2vec_fpr, word_2_vec_tpr, word_2vec_thresholds = roc_curve(
    data_test['y'],
    classifier.predict_proba(data_test_embedding)[:, 1],
)
auc(word_2vec_fpr, word_2_vec_tpr)