## Tf-Idf Vectorizer Based KNN Classifier
This notebook creates tf-idf vectors out of the dataset provided. Scikit-learn has a built in Tf-Idf 
implementation. we will also use NLTK's tokenizer and stemmer to preprocess the text.

### Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np  
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import KNeighborsClassifier
import sklearn.metrics as metrics
%matplotlib inline

In [2]:
import nltk
import string
from nltk.corpus import stopwords 
from nltk.stem.porter import *
nltk.download('stopwords')
nltk.download('punkt')
stopWords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /usr/local/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/local/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
df = pd.read_csv('data/bible_data_set_full.csv',sep='\t')
np.random.seed(seed=45)

In [4]:
y = df['testment']
X_train,X_test,Y_train,Y_test = train_test_split(df,y,test_size = 0.3,random_state = 45)

In [5]:
print(X_train.head())

       Unnamed: 0       citation      book  chapter  verse  \
15313       15313    Psalms 88:5    Psalms       88      5   
6575         6575     Judges 3:7    Judges        3      7   
16270       16270   Psalms 140:7    Psalms      140      7   
9424         9424  1 Kings 20:16   1 Kings       20     16   
19445       19445  Jeremiah 21:5  Jeremiah       21      5   

                                                    text  testment  \
15313  Free among the dead, like the slain that lie i...         1   
6575   And the children of Israel did evil in the sig...         1   
16270  O GOD the Lord, the strength of my salvation, ...         1   
9424   And they went out at noon. But Benhadad was dr...         1   
19445  And I myself will fight against you with an ou...         1   

                                   nounTextBlob  \
15313  {'free', 'thou rememberest', 'thy hand'}   
6575        {'baalim', 'lord', 'god', 'israel'}   
16270                      {'thou hast', 'god'}   
94

### Flags

Flags to turn on and off removal of puntuations, removal of stopwords and stemming words using porter stemmer

In [6]:
PUNCTUATION_FLAG = True
STOPWORDS_FLAG = True
STEMMING_FLAG = True

In [7]:
simpleTokenize = lambda doc: doc.lower().split(" ")

### Helper funtions

In [8]:
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    
    lowers = text.lower() 
    
    if PUNCTUATION_FLAG:
        no_punctuation = lowers.translate(string.punctuation)
        tokens = nltk.word_tokenize(no_punctuation)
    else:
        tokens = nltk.word_tokenize(lowers)
    
    if STOPWORDS_FLAG:
        filtered = [w for w in tokens if not w in stopwords.words('english')]
    else:
        filtered = tokens
        
    if STEMMING_FLAG:
        stemmer = PorterStemmer()
        final_tokens = stem_tokens(filtered, stemmer) 
    else:
        final_tokens = token
        
    return final_tokens

### Read data and create TfIdf Matrix

In [9]:
text_tfidf = TfidfVectorizer(norm='l2',min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=True, tokenizer=tokenize)
tfidf_text_matrix = text_tfidf.fit_transform(X_train['text'])

### Finding cosine distance between light and dark

In [10]:
# print(text_tfidf.transform(["light"]))
print(1 - cosine_similarity(text_tfidf.transform(["light"]),text_tfidf.transform(["dark"])))

[[ 1.]]


### Creating KNN classifier

In [11]:
neigh = KNeighborsClassifier(n_neighbors=10,weights="distance",n_jobs=-1)
neigh.fit(text_tfidf.transform(X_train['text']), Y_train) 

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=10, p=2,
           weights='distance')

### Predicting and calculating accuracy

In [12]:
y_pred = neigh.predict(text_tfidf.transform(X_test['text']))
print(len(y_pred))  
print(len(Y_test))

9331
9331


In [13]:
actual_labels = Y_test
predicted_labels = y_pred 
print(metrics.precision_recall_fscore_support(actual_labels, predicted_labels, average='macro'))
print(metrics.precision_recall_fscore_support(actual_labels, predicted_labels, average='micro'))
print(metrics.precision_recall_fscore_support(actual_labels, predicted_labels, average='weighted'))
print(metrics.confusion_matrix(actual_labels, predicted_labels,np.unique(actual_labels))) 

(0.90324986807030827, 0.85200864312256452, 0.87336497947152658, None)
(0.90933447647626198, 0.90933447647626198, 0.90933447647626198, None)
(0.90837843646732186, 0.90933447647626198, 0.90626770255106426, None)
[[1756  635]
 [ 211 6729]]
