In [1]:
from sklearn import cross_validation
from operator import itemgetter
import numpy as np
import math
from collections import Counter
 
# 1) given two data points, calculate the euclidean distance between them
def get_distance(data1, data2):
    """Return the Euclidean distance between two points.

    The points are paired coordinate-by-coordinate with ``zip``, so when the
    two sequences differ in length the surplus coordinates of the longer one
    are ignored.
    """
    squared_total = 0
    for left, right in zip(data1, data2):
        squared_total = squared_total + pow(left - right, 2)
    return math.sqrt(squared_total)
 
# 2) given a training set and a test instance, use get_distance to calculate the distance from the test instance to every training instance and return the k nearest training instances
def get_neighbours(training_set, test_instance, k):
    """Return the ``k`` training instances closest to ``test_instance``.

    Every training instance is paired with its distance to ``test_instance``
    (via ``_get_tuple_distance``), the pairs are ordered by ascending
    distance, and the first ``k`` training instances are returned as a list.
    """
    scored = []
    for training_instance in training_set:
        scored.append(_get_tuple_distance(training_instance, test_instance))
    # position 1 of each pair holds the distance to test_instance
    scored.sort(key=lambda pair: pair[1])
    # keep the k closest pairs and drop the distances
    return [pair[0] for pair in scored[:k]]
 
def _get_tuple_distance(training_instance, test_instance):
    """Pair a training instance with its distance to ``test_instance``.

    The feature vector is read from position 0 of ``training_instance``; the
    result is the tuple ``(training_instance, distance)``.
    """
    features = training_instance[0]
    distance = get_distance(test_instance, features)
    return (training_instance, distance)
 
# 3) given an array of nearest neighbours for a test case, tally up their classes to vote on test case class
def get_majority_vote(neighbours):
    """Return the class label that occurs most often among ``neighbours``.

    The label is read from position 1 of each neighbour. An empty
    ``neighbours`` sequence raises ``IndexError``.
    """
    tally = Counter(neighbour[1] for neighbour in neighbours)
    # most_common(1) yields a one-element list holding (label, count) for the top label
    winner, _votes = tally.most_common(1)[0]
    return winner




In [2]:
import pandas as pd
import numpy as np
from numpy import zeros
#sklearn imports
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
from sklearn import cross_validation
from sklearn.metrics import classification_report

from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
#import for metrics
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

In [3]:
# setting up main executable method
def knn(k=5, test_size=0.4, random_state=1):
    """Evaluate the hand-written k-nearest-neighbours classifier on a hold-out split.

    Reads ``train_set.csv`` (tab-separated), encodes the ``Category`` column as
    the target, turns the ``Content`` column into TF-IDF features reduced to
    100 LSA components and L2-normalised, splits the rows into train/test
    sets, and classifies every test row by majority vote among its ``k``
    nearest training rows.

    Parameters
    ----------
    k : int, default 5
        Number of nearest neighbours that vote on each prediction.
    test_size : float, default 0.4
        Fraction of the rows held out for testing.
    random_state : int, default 1
        Seed used for the train/test split and for the randomized SVD, so
        that repeated runs produce the same split and the same projection.

    Returns
    -------
    dict
        ``'predictions'`` (list of predicted encoded labels, one per test row),
        ``'accuracy'``, and the macro-averaged ``'precision'``, ``'recall'``
        and ``'f1_score'``.
    """
    from sklearn.decomposition import TruncatedSVD
    from sklearn.feature_extraction.text import TfidfVectorizer
    # sklearn.cross_validation was removed from scikit-learn; the rest of this
    # file already imports from sklearn.model_selection, so use it here too
    from sklearn.model_selection import train_test_split

    data = pd.read_csv('train_set.csv', sep="\t")

    # encode the category names as integer class labels
    le = preprocessing.LabelEncoder()
    y = le.fit_transform(data["Category"])

    # NOTE(review): the TF-IDF vocabulary and the LSA projection are fitted on
    # the full dataset before the split, so the held-out rows influence the
    # features; fit them on the training rows only if a strict evaluation is needed.
    vectorizer = TfidfVectorizer(max_df=0.5, stop_words='english', use_idf=True)
    X = vectorizer.fit_transform(data['Content'])
    lsa = TruncatedSVD(n_components=100, random_state=random_state)
    X = lsa.fit_transform(X)
    X = preprocessing.Normalizer(copy=False).fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # (features, label) pairs, the layout get_neighbours/get_majority_vote expect.
    # A plain list is used because zip() is a one-shot iterator on Python 3 and
    # np.array() cannot turn it into an array of pairs.
    train = list(zip(X_train, y_train))

    # for each test row, find its k nearest training rows and take the majority class
    predictions = [
        get_majority_vote(
            get_neighbours(training_set=train, test_instance=test_instance, k=k))
        for test_instance in X_test
    ]

    # summarize performance of the classification
    accuracy = accuracy_score(y_test, predictions)
    # macro average: unweighted mean of the per-class scores
    precision, recall, f1_score, _support = precision_recall_fscore_support(
        y_test, predictions, average='macro')

    return {
        'predictions': predictions,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score,
    }
    
# NOTE(review): "__knn__" looks like a typo for "__main__" -- TODO confirm.
# When this code runs as a script or in a notebook cell, __name__ is
# "__main__", so this condition is false there and knn() is not called here.
if __name__ == "__knn__":
    knn()

In [4]:
# run the pipeline defined above and keep the returned dict of predictions and metrics
results=knn()

In [5]:
# pull each entry of the results dict into its own notebook-level variable
predictions, accuracy, precision, recall, f1_score = (
    results[key]
    for key in ('predictions', 'accuracy', 'precision', 'recall', 'f1_score')
)

In [6]:
# load the existing tab-separated metrics table; a later cell writes the KNN results into it
data = pd.read_csv('EvaluationMetric_10fold.csv',sep='\t')

In [9]:
# store the KNN metrics in rows 0-3 of the 'KNN' column, in the order
# accuracy, precision, recall, f1_score
knn_metrics = (accuracy, precision, recall, f1_score)
for row_label, metric_value in enumerate(knn_metrics):
    data.loc[row_label, 'KNN'] = metric_value
# overwrite the same tab-separated file, without writing the index column
data.to_csv('EvaluationMetric_10fold.csv', sep='\t', index=False)

In [10]:
# show the updated metrics table as the cell output
data

Unnamed: 0,Statistic Measure,Naive Bayes,Random Forest,SVM,KNN,My Method
0,Accuracy,0.939426,0.678787,0.959237,0.954962,
1,Precision,0.935177,0.648057,0.956933,0.953109,
2,Recall,0.937824,0.64748,0.957114,0.954454,
3,F-Measure,0.936162,0.647525,0.957014,0.95372,
