### SkipThought Vectorizer-Based KNN Classifier
This notebook creates SkipThought vectors from the provided dataset. TensorFlow has a pretrained skip-thought vector model, which we will use.

### Adding skip_thoughts library to path

In [1]:
import sys
sys.path.append("/mnt/magnetar/data/models/skip_thoughts") 

### Imports

In [2]:
import numpy as np
import pandas as pd
import csv
import os.path 
from skip_thoughts import configuration
from skip_thoughts import encoder_manager
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import sklearn.metrics as metrics

### Set paths to the model.

In [3]:
# Directory holding the pretrained bidirectional skip-thoughts release.
# Keeping it in one place means a relocated model only needs a single edit.
MODEL_DIR = "/mnt/magnetar/data/skip_thoughts_bi_2017_02_16"

# Files handed to the encoder's load_model call: vocabulary, embedding matrix, checkpoint.
VOCAB_FILE = MODEL_DIR + "/vocab.txt"
EMBEDDING_MATRIX_FILE = MODEL_DIR + "/embeddings.npy"
CHECKPOINT_PATH = MODEL_DIR + "/model.ckpt-500008"

### Setting up the encoder. Here we are using a bidirectional model. 

In [4]:
# Manager object that will hold the loaded skip-thoughts model.
encoder = encoder_manager.EncoderManager()

# Configuration for the bidirectional variant of the encoder.
bidirectional_config = configuration.model_config(bidirectional_encoder=True)

# Load vocabulary, embeddings and checkpoint weights from the paths set above.
encoder.load_model(
    bidirectional_config,
    vocabulary_file=VOCAB_FILE,
    embedding_matrix_file=EMBEDDING_MATRIX_FILE,
    checkpoint_path=CHECKPOINT_PATH,
)

INFO:tensorflow:Reading vocabulary from /mnt/magnetar/data/skip_thoughts_bi_2017_02_16/vocab.txt
INFO:tensorflow:Loaded vocabulary with 930914 words.
INFO:tensorflow:Loading embedding matrix from /mnt/magnetar/data/skip_thoughts_bi_2017_02_16/embeddings.npy
INFO:tensorflow:Loaded embedding matrix with shape (930914, 620)
INFO:tensorflow:Building model.
INFO:tensorflow:Loading model from checkpoint: /mnt/magnetar/data/skip_thoughts_bi_2017_02_16/model.ckpt-500008
INFO:tensorflow:Restoring parameters from /mnt/magnetar/data/skip_thoughts_bi_2017_02_16/model.ckpt-500008
INFO:tensorflow:Successfully loaded checkpoint: model.ckpt-500008


### Reading the dataset

In [5]:
# The dataset is tab-separated despite its .csv extension.
df = pd.read_csv(
    'data/bible_data_set_full.csv',
    sep='\t',
)

# Seed NumPy's global random generator.
np.random.seed(seed=45)

### Splitting the data into training and test sets

In [6]:
# Target labels come from the 'testment' column of the dataset.
y = df['testment']

# Hold out 30% of the rows for evaluation; the fixed random_state makes the split repeatable.
# The full frame is split, so X_train / X_test still carry every column (including 'text').
X_train, X_test, Y_train, Y_test = train_test_split(
    df,
    y,
    test_size=0.3,
    random_state=45,
)

### Creating KNN classifier

In [7]:
# 10-nearest-neighbour classifier with distance-weighted votes, using all CPU cores.
neigh = KNeighborsClassifier(
    n_neighbors=10,
    weights="distance",
    n_jobs=-1,
)

# Encode the training text into skip-thought vectors, then fit on them.
train_vectors = encoder.encode(X_train['text'])
neigh.fit(train_vectors, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=10, p=2,
           weights='distance')

### Cosine Distance between light and dark

In [8]:
# Encode each word on its own, then report cosine distance (1 - cosine similarity).
light_vector = encoder.encode(["light"])
dark_vector = encoder.encode(["dark"])
print(1 - cosine_similarity(light_vector, dark_vector))
# Alternative comparison, left disabled:
# print(cosine_similarity(encoder.encode(["light"]),encoder.encode(["god"])))

[[ 0.48266226]]


### Predicting and calculating accuracy 

In [9]:
# Encode the held-out text and classify it with the fitted KNN model.
test_vectors = encoder.encode(X_test['text'])
y_pred = neigh.predict(test_vectors)

# Sanity check: the two printed counts should match (one prediction per test label).
print(len(y_pred))
print(len(Y_test))

9331
9331


In [10]:
# Ground-truth labels and model predictions for the held-out split.
actual_labels = Y_test
predicted_labels = y_pred

# Precision, recall and F-score under each averaging scheme, printed in the
# order macro, micro, weighted. The fourth tuple element (support) is None
# whenever an average is requested.
for averaging in ('macro', 'micro', 'weighted'):
    print(metrics.precision_recall_fscore_support(
        actual_labels, predicted_labels, average=averaging))

# `labels` is passed by keyword: recent scikit-learn releases no longer accept
# it positionally and raise a TypeError. Rows are actual classes, columns are
# predicted classes, both in the order given by `labels`.
print(metrics.confusion_matrix(actual_labels, predicted_labels,
                               labels=np.unique(actual_labels)))

(0.84143222506393855, 0.6892254455649609, 0.7209215890896542, None)
(0.82949308755760365, 0.82949308755760365, 0.82949308755760365, None)
(0.83377306041631238, 0.82949308755760365, 0.80578271392310807, None)
[[ 960 1431]
 [ 160 6780]]
