In [73]:
# code source from http://scikit-learn.org/stable/datasets/twenty_newsgroups.html#
# code source from http://www.nltk.org/book/ch02.html
# code source from http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.euclidean_distances.html

from sklearn.datasets import fetch_20newsgroups

import nltk
from nltk.corpus import stopwords
import string


# extracts the archive contents in the ~/scikit_learn_data/20news_home folder 
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

################# this part is for pre-processing the text file which isn't mandatory
# get rid of the stopwords and punctuation
nltk.download('stopwords')
stop_word =stopwords.words('english')
punctuation = string.punctuation
stopw_punctuation = list(stop_word) + list(punctuation)

from nltk.tokenize import word_tokenize

# to tokenize the word
for j in range(len(newsgroups_train.data)):
    newsgroups_train.data[j] = " ".join([w for w in word_tokenize(newsgroups_train.data[j]) if w not in stopw_punctuation])
   # print(newsgroups_train.data[])
for j in range(len(newsgroups_test.data)):
    newsgroups_test.data[j] = " ".join([w for w in word_tokenize(newsgroups_train.data[j]) if w not in stopw_punctuation])

#print(len(newsgroups_train.data))
#print(newsgroups_train.data[1])

##############

# from http://scikit-learn.org/stable/datasets/twenty_newsgroups.html
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectors_train = vectorizer.fit_transform(newsgroups_train.data)
# the test data set don't need to fit
vectors_test = vectorizer.transform(newsgroups_test.data)
print(vectors_train.shape)
print(vectors_test.shape)



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/deshenghu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
(11314, 130088)
(7532, 130088)


In [74]:
# code source from http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.euclidean_distances.html
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity

similarity_ED = euclidean_distances(vectors_train, vectors_test)
similarity_COSINE = cosine_similarity(vectors_train, vectors_test)

print(similarity_ED.shape)
print(similarity_COSINE.shape)





(11314, 7532)
(11314, 7532)


In [76]:
print(newsgroups_train.target)
print(newsgroups_train.target_names)
print(newsgroups_train.target[7])


[7 4 4 ..., 3 1 8]
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
3


In [80]:
import numpy as np

# learn code from this source: https://docs.python.org/2/library/collections.html#collections.Counter
# A Counter is a dict subclass for counting hashable objects. 
from collections import Counter

Y_train = newsgroups_train.target

def Find_KNN(idx, k, similarity_matrix):
    
    
    index_KNN = np.argsort(np.array(similarity_ED[idx, :]))[:k]
    print("The index of the K nearest neighbours are:",index_KNN)
    
   
    label_final , count = Counter(Y_train[index_KNN]).most_common()[0]
    
    print("The labels for the index before are:",Y_train[index_KNN])
    
    #print("The labels for the index before are:",Y_train[index_KNN].target_names)
    
    #convert the target names into array
    convert_array = np.array(newsgroups_train.target_names)
    #convert the lables into array 
    label_array = np.array(Y_train[index_KNN])
    
    # here convert_array[label_array] is the targetname corresponding to that label
    print("The KNN categories are :", convert_array[label_array])
    
    
   # print(list(Y_train[index_KNN])
   # print(np.array(newsgroups))
    print("The final label is:", label_final)
    
 
    
    
    return label_final

Find_KNN(10, 5, similarity_ED )



The index of the K nearest neighbours are: [  10 3543 6593 1143 4211]
The labels for the index before are: [8 8 8 6 8]
The KNN categories are : ['rec.motorcycles' 'rec.motorcycles' 'rec.motorcycles' 'misc.forsale'
 'rec.motorcycles']
The final label is: 8


8

In [78]:
seed_index = np.random.randint(7500, size=1200)

print(seed_index)

count_accuracy = 0

Y_train = newsgroups_test.target

for i in seed_index:
    train_label = Find_KNN(i, 5, similarity_ED)
    real_label = Y_train[i]
    if train_label== real_label:
        count_accuracy += 1
        
        
        
        
       
    #print(train_label, real_label)

Final_accuracy = float(count_accuracy)/float(len(seed_index))
print("Final Accuracy is:", Final_accuracy ) 
    
    #if train_label

[ 276 2051 4015 ...,  201 7198 5147]
The index of the K nearest neighbours are: [ 276 6573 6562 2203  204]
The labels for the index before are: [ 5 15 19 18 15]
The KNN categories are : ['comp.windows.x' 'soc.religion.christian' 'talk.religion.misc'
 'talk.politics.misc' 'soc.religion.christian']
The final label is: 15
The index of the K nearest neighbours are: [2051  138 4769 6494 6453]
The labels for the index before are: [12  3  3 11  1]
The KNN categories are : ['sci.electronics' 'comp.sys.ibm.pc.hardware' 'comp.sys.ibm.pc.hardware'
 'sci.crypt' 'comp.graphics']
The final label is: 3
The index of the K nearest neighbours are: [4015 2201 5658 4839 1966]
The labels for the index before are: [ 0 18 17  8 14]
The KNN categories are : ['alt.atheism' 'talk.politics.misc' 'talk.politics.mideast'
 'rec.motorcycles' 'sci.space']
The final label is: 0
The index of the K nearest neighbours are: [ 475 5611 2723 4135 4144]
The labels for the index before are: [13 12 13  4  6]
The KNN categories

The final label is: 8
The index of the K nearest neighbours are: [1198 1549 6809 2001  681]
The labels for the index before are: [14  0  2 13 14]
The KNN categories are : ['sci.space' 'alt.atheism' 'comp.os.ms-windows.misc' 'sci.med' 'sci.space']
The final label is: 14
The index of the K nearest neighbours are: [6597 6746 6500  982 6675]
The labels for the index before are: [10 10  8  0 17]
The KNN categories are : ['rec.sport.hockey' 'rec.sport.hockey' 'rec.motorcycles' 'alt.atheism'
 'talk.politics.mideast']
The final label is: 10
The index of the K nearest neighbours are: [3900 1974 5183 6730 5664]
The labels for the index before are: [15  3  0 14 13]
The KNN categories are : ['soc.religion.christian' 'comp.sys.ibm.pc.hardware' 'alt.atheism'
 'sci.space' 'sci.med']
The final label is: 15
The index of the K nearest neighbours are: [5472 4695 7434 2409 1230]
The labels for the index before are: [ 5  8  8  7 13]
The KNN categories are : ['comp.windows.x' 'rec.motorcycles' 'rec.motorcyc

The labels for the index before are: [ 9  3  8 14  6]
The KNN categories are : ['rec.sport.baseball' 'comp.sys.ibm.pc.hardware' 'rec.motorcycles'
 'sci.space' 'misc.forsale']
The final label is: 9
The index of the K nearest neighbours are: [1589 5439  375 3108 6711]
The labels for the index before are: [ 9  6 14 14 18]
The KNN categories are : ['rec.sport.baseball' 'misc.forsale' 'sci.space' 'sci.space'
 'talk.politics.misc']
The final label is: 14
The index of the K nearest neighbours are: [3816 1734 2080 4920 5535]
The labels for the index before are: [7 9 2 9 3]
The KNN categories are : ['rec.autos' 'rec.sport.baseball' 'comp.os.ms-windows.misc'
 'rec.sport.baseball' 'comp.sys.ibm.pc.hardware']
The final label is: 9
The index of the K nearest neighbours are: [3730 4740 1025  621 5205]
The labels for the index before are: [10  8  9 14 18]
The KNN categories are : ['rec.sport.hockey' 'rec.motorcycles' 'rec.sport.baseball' 'sci.space'
 'talk.politics.misc']
The final label is: 10
The i

The KNN categories are : ['sci.electronics' 'misc.forsale' 'sci.space' 'talk.religion.misc'
 'sci.electronics']
The final label is: 12
The index of the K nearest neighbours are: [4892 1909 4876 2555  916]
The labels for the index before are: [11 19  8  0  7]
The KNN categories are : ['sci.crypt' 'talk.religion.misc' 'rec.motorcycles' 'alt.atheism'
 'rec.autos']
The final label is: 11
The index of the K nearest neighbours are: [1565 6540 7056  913 6563]
The labels for the index before are: [ 7  1 16  2  1]
The KNN categories are : ['rec.autos' 'comp.graphics' 'talk.politics.guns' 'comp.os.ms-windows.misc'
 'comp.graphics']
The final label is: 1
The index of the K nearest neighbours are: [3800 4421 6621 3385 2315]
The labels for the index before are: [ 9 15  6 19 12]
The KNN categories are : ['rec.sport.baseball' 'soc.religion.christian' 'misc.forsale'
 'talk.religion.misc' 'sci.electronics']
The final label is: 9
The index of the K nearest neighbours are: [6637 1244 1782 2559 1414]
The 

 'misc.forsale']
The final label is: 12
The index of the K nearest neighbours are: [7090 6079    7 5489 1194]
The labels for the index before are: [ 8  0 15  4  5]
The KNN categories are : ['rec.motorcycles' 'alt.atheism' 'soc.religion.christian'
 'comp.sys.mac.hardware' 'comp.windows.x']
The final label is: 8
The index of the K nearest neighbours are: [3038 1434 5426 7457 5564]
The labels for the index before are: [ 8 10 14 14  3]
The KNN categories are : ['rec.motorcycles' 'rec.sport.hockey' 'sci.space' 'sci.space'
 'comp.sys.ibm.pc.hardware']
The final label is: 14
The index of the K nearest neighbours are: [6075 7449 3404 3929 6281]
The labels for the index before are: [ 0 11 17  7  3]
The KNN categories are : ['alt.atheism' 'sci.crypt' 'talk.politics.mideast' 'rec.autos'
 'comp.sys.ibm.pc.hardware']
The final label is: 0
The index of the K nearest neighbours are: [6044 1226 6384  320  834]
The labels for the index before are: [ 2  9  5 15 12]
The KNN categories are : ['comp.os.ms-

The index of the K nearest neighbours are: [6775 3049 7090 4232 2897]
The labels for the index before are: [16 12  8 12  1]
The KNN categories are : ['talk.politics.guns' 'sci.electronics' 'rec.motorcycles' 'sci.electronics'
 'comp.graphics']
The final label is: 12
The index of the K nearest neighbours are: [2812 2593 5467 4001 5120]
The labels for the index before are: [13  5 12  8  1]
The KNN categories are : ['sci.med' 'comp.windows.x' 'sci.electronics' 'rec.motorcycles'
 'comp.graphics']
The final label is: 13
The index of the K nearest neighbours are: [ 289 2439 6626 5076 5262]
The labels for the index before are: [18  5 13  2  8]
The KNN categories are : ['talk.politics.misc' 'comp.windows.x' 'sci.med' 'comp.os.ms-windows.misc'
 'rec.motorcycles']
The final label is: 18
The index of the K nearest neighbours are: [3761 2111 1680 3626 3018]
The labels for the index before are: [15  4 11 18  1]
The KNN categories are : ['soc.religion.christian' 'comp.sys.mac.hardware' 'sci.crypt'
 '

The labels for the index before are: [ 0 17  4 10  0]
The KNN categories are : ['alt.atheism' 'talk.politics.mideast' 'comp.sys.mac.hardware'
 'rec.sport.hockey' 'alt.atheism']
The final label is: 0
The index of the K nearest neighbours are: [1671 5922 3670 6780 2339]
The labels for the index before are: [18 19 14 11  7]
The KNN categories are : ['talk.politics.misc' 'talk.religion.misc' 'sci.space' 'sci.crypt'
 'rec.autos']
The final label is: 18
The index of the K nearest neighbours are: [2691 1321 3984 6081 2685]
The labels for the index before are: [16 10 13  9  3]
The KNN categories are : ['talk.politics.guns' 'rec.sport.hockey' 'sci.med' 'rec.sport.baseball'
 'comp.sys.ibm.pc.hardware']
The final label is: 16
The index of the K nearest neighbours are: [4833  394  332 3211 6323]
The labels for the index before are: [ 8  9  4 13  1]
The KNN categories are : ['rec.motorcycles' 'rec.sport.baseball' 'comp.sys.mac.hardware' 'sci.med'
 'comp.graphics']
The final label is: 8
The index of

The labels for the index before are: [ 1 17  6  1  7]
The KNN categories are : ['comp.graphics' 'talk.politics.mideast' 'misc.forsale' 'comp.graphics'
 'rec.autos']
The final label is: 1
The index of the K nearest neighbours are: [7405 7354 3266 2593  226]
The labels for the index before are: [12 10 19  5  7]
The KNN categories are : ['sci.electronics' 'rec.sport.hockey' 'talk.religion.misc' 'comp.windows.x'
 'rec.autos']
The final label is: 12
The index of the K nearest neighbours are: [ 551 4030  727 5746 3069]
The labels for the index before are: [ 4 11 12  9 19]
The KNN categories are : ['comp.sys.mac.hardware' 'sci.crypt' 'sci.electronics' 'rec.sport.baseball'
 'talk.religion.misc']
The final label is: 4
The index of the K nearest neighbours are: [3013  689 7036 1658 1956]
The labels for the index before are: [ 2  3 14 10 11]
The KNN categories are : ['comp.os.ms-windows.misc' 'comp.sys.ibm.pc.hardware' 'sci.space'
 'rec.sport.hockey' 'sci.crypt']
The final label is: 2
The index o

The KNN categories are : ['rec.sport.baseball' 'sci.crypt' 'sci.electronics' 'comp.sys.mac.hardware'
 'rec.sport.baseball']
The final label is: 9
The index of the K nearest neighbours are: [5063 3145 2686 3488 1945]
The labels for the index before are: [13  4  4 10  4]
The KNN categories are : ['sci.med' 'comp.sys.mac.hardware' 'comp.sys.mac.hardware'
 'rec.sport.hockey' 'comp.sys.mac.hardware']
The final label is: 4
The index of the K nearest neighbours are: [2812 2593 5467 4001 5120]
The labels for the index before are: [13  5 12  8  1]
The KNN categories are : ['sci.med' 'comp.windows.x' 'sci.electronics' 'rec.motorcycles'
 'comp.graphics']
The final label is: 13
The index of the K nearest neighbours are: [7274 1452 4097 5590 1767]
The labels for the index before are: [16  6  8  8  5]
The KNN categories are : ['talk.politics.guns' 'misc.forsale' 'rec.motorcycles' 'rec.motorcycles'
 'comp.windows.x']
The final label is: 8
The index of the K nearest neighbours are: [7375 4854 1955 531

The labels for the index before are: [15  3  5  9 18]
The KNN categories are : ['soc.religion.christian' 'comp.sys.ibm.pc.hardware' 'comp.windows.x'
 'rec.sport.baseball' 'talk.politics.misc']
The final label is: 15
The index of the K nearest neighbours are: [2306 5227 1345 2301 4986]
The labels for the index before are: [ 5  4  8 17 15]
The KNN categories are : ['comp.windows.x' 'comp.sys.mac.hardware' 'rec.motorcycles'
 'talk.politics.mideast' 'soc.religion.christian']
The final label is: 5
The index of the K nearest neighbours are: [3498  478 4398 6149 6918]
The labels for the index before are: [ 0  7 17  0 17]
The KNN categories are : ['alt.atheism' 'rec.autos' 'talk.politics.mideast' 'alt.atheism'
 'talk.politics.mideast']
The final label is: 0
The index of the K nearest neighbours are: [5962 2481 6447  485 4609]
The labels for the index before are: [14  1 12  5  4]
The KNN categories are : ['sci.space' 'comp.graphics' 'sci.electronics' 'comp.windows.x'
 'comp.sys.mac.hardware']
T