In [1]:
from bs4 import BeautifulSoup as bs
import numpy as np
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import re
import collections
import math

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Zahin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Zahin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Zahin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
topics = []
def getTopicnames():
    with open('Data/topics_all.txt', 'r') as reader:
        line = reader.readline()
        while line != '':  # The EOF char is an empty string
            if line[-1] == "\n":

              topics.append(line[:-1])
            else:
              topics.append(line)
            line = reader.readline()



getTopicnames()
print(topics)

['Anime', 'Arduino', 'Astronomy', 'Biology', 'Chess', 'Coffee', 'Cooking', 'Law', 'Space', 'Windows_Phone', 'Wood_Working']


In [6]:
def preprocessText(raw_text):
    #remove html tags
    cleanr = re.compile('<.*?>')
    text = re.sub(cleanr, '', raw_text)
    #remove any URLs
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    #Remove punctuations
    text=text.translate((str.maketrans('','',string.punctuation)))
    #Number Removal
    text = re.sub(r'[-+]?\d+', '', text)
    #removing unicode
    text = re.sub(r'[^\x00-\x7F]',' ', text) # 
    text = text.lower()
    #Tokenize
    text = word_tokenize(text)
    #Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = [word for word in text if not word in stop_words]
    #Lemmatize tokens
    lemmatizer=WordNetLemmatizer()
    text = [lemmatizer.lemmatize(word) for word in text]
    #Stemming tokens
    stemmer= PorterStemmer()
    text = [stemmer.stem(word) for word in text]
    return text

In [7]:
X_train = []
Y_train = []
X_validation = []
Y_validation = []
X_test = []
Y_test = []
topic_dict = {}
topic_words_dict = {} # dict()
import random
prefix_for_colab =  "" #"/content/drive/My Drive/CSE_472_Assignment2_dataset/"
def preprocessData():
    list_of_words = []
    iter = 1
    count_train = 0
    for topic in topics:
        print(topic)
        list_of_words_in_topic = []
        topic_dict[iter] = topic
        topic_words_dict[iter] = []

        file_name = prefix_for_colab+"Data/Training/" + topic + ".xml"
        with open(file_name,'r',encoding='utf-8') as file:
                content = file.read()
                soup = bs(content)
                rows = soup.findAll("row")
                train_list_to_iterate = rows[:500]
                validation_list_to_iterate = rows[500:700]
                test_list_to_iterate = rows[700:1200]
                for item in train_list_to_iterate:
                    text = item["body"]
                    if text == "":
                      continue
                    processed_text = preprocessText(text)
                    X_train.append(np.array(processed_text))
                    Y_train.append(iter)
                    list_of_words.append(processed_text)
                    list_of_words_in_topic.append(processed_text)
                for item in validation_list_to_iterate:
                    text = item["body"]
                    if text == "":
                      continue
                    processed_text = preprocessText(text)
                    X_validation.append(np.array(processed_text))
                    Y_validation.append(iter)
                for i in range(50):
                  temp_X = []
                  temp_Y = []
                  for item in test_list_to_iterate[i*10:(i+1)*10]:
                      text = item["body"]
                      if text == "":
                        continue
                      processed_text = preprocessText(text)
                      temp_X.append(np.array(processed_text))
                      temp_Y.append(iter)
                  
                  #print(temp_Y)
                  X_test.append(temp_X)
                  Y_test.append(temp_Y)
                  #print(Y_test)

        temp = sum(list_of_words_in_topic, []) 
        topic_words_dict[iter] = temp
        iter = iter + 1
    vocabulary_raw = sum(list_of_words, []) 
    return vocabulary_raw

      
vocabulary_raw = preprocessData()
print(topic_dict)

Anime
Arduino
Astronomy
Biology
Chess
Coffee
Cooking
Law
Space
Windows_Phone
Wood_Working
{1: 'Anime', 2: 'Arduino', 3: 'Astronomy', 4: 'Biology', 5: 'Chess', 6: 'Coffee', 7: 'Cooking', 8: 'Law', 9: 'Space', 10: 'Windows_Phone', 11: 'Wood_Working'}


In [9]:
def makeDictionary():
    import collections
    counts = collections.Counter(vocabulary_raw)
    return dict(counts)
feature_space = makeDictionary()
unique_words_in_feature_space = np.sort(np.array(list(feature_space.keys())).reshape(1,-1)[0])
print(len(unique_words_in_feature_space))
modV = len(unique_words_in_feature_space)
total_words = np.sum(np.array(list(feature_space.values())).reshape(1,-1)[0])

19725


In [10]:
Y_test_for_each_iteration = {}
for i in range(50):
  Y_test_for_each_iteration[i] = []
  temp = []
  for j in range(len(topics)):
    temp.append (Y_test[j*50:(j+1)*50][i])
  temp2 = sum(temp, []) 
  Y_test_for_each_iteration[i] = temp2
X_test_for_each_iteration = {}
for i in range(50):
  X_test_for_each_iteration[i] = []
  temp = []
  for j in range(len(topics)):
    temp.append (X_test[j*50:(j+1)*50][i])
  temp2 = sum(temp, []) 
  X_test_for_each_iteration[i] = temp2


In [11]:
import sys
def getBooleanVector(string_tokens):
  boolean_vector = np.zeros((len(string_tokens),len(feature_space)))
  words_in_feature_space = np.sort(np.array(list(feature_space.keys())).reshape(1,-1)[0])
 # print(words_in_feature_space.shape)
  for i in range(len(string_tokens)):
     boolean_vector[i] = np.in1d(words_in_feature_space, string_tokens[i])
  return boolean_vector
X_test_boolean_vector_reprenstation = {}
for i in range(50):
  X_test_boolean_vector_reprenstation[i]=getBooleanVector(X_test_for_each_iteration[i])

X_train_boolean_vector_reprenstation=getBooleanVector(X_train)
X_validation_boolean_vector_reprenstation=getBooleanVector(X_validation)    

In [12]:
def getNumericVector(string_tokens):
  numeric_vector = np.zeros((len(string_tokens),len(feature_space)))
  words_in_feature_space = np.sort(np.array(list(feature_space.keys())).reshape(1,-1)[0])
  for i in range(len(string_tokens)):
      arr1 = words_in_feature_space
      arr2 = np.array(string_tokens[i])
      temp = np.in1d(arr1,arr2)
      idx = np.searchsorted(arr1,arr2)
      idx[idx==len(arr1)] = 0
      mask = arr1[idx]==arr2
      out = np.bincount(idx[mask])
      temp = temp.astype('int64') # implicitly changes all the "False" values to 0
      np.putmask(temp, temp, out)
      numeric_vector[i] = temp
  return numeric_vector
X_test_numeric_vector_reprenstation = {}
for i in range(50):
  X_test_numeric_vector_reprenstation[i]=getNumericVector(X_test_for_each_iteration[i])

X_train_numeric_vector_reprenstation=getNumericVector(X_train)
X_validation_numeric_vector_reprenstation=getNumericVector(X_validation)   

MemoryError: 

In [50]:
def getTFIDFVector(string_tokens,boolean_vector,numeric_vector):
  #np.seterr(divide='ignore', invalid='ignore')
  tf_idf_vector = np.zeros((len(string_tokens),len(feature_space)))
  words_in_feature_space = np.sort(np.array(list(feature_space.keys())).reshape(1,-1)[0])
  for i in range(len(string_tokens)):
      D = len(string_tokens)
      C_w = np.sum(boolean_vector,axis = 0).reshape(1,-1)
      W_d = np.sum(numeric_vector[i])
      if W_d == 0:
        W_d = 0.0000000000000000000000001
      tf = numeric_vector[i]/W_d 
      alpha = 1 #0.000000000000000001
      beta  = 1 #0.000000000000000001
      idf = np.log( np.divide(D+alpha,C_w+beta))
      #print(tf)
      #print(idf) 

     # print(tf*idf)
      tf_idf_vector[i] = tf*idf
  return tf_idf_vector
X_test_TFIDF_vector_reprenstation = {}
for i in range(50):
  X_test_TFIDF_vector_reprenstation[i]=getTFIDFVector(X_test_for_each_iteration[i],X_test_boolean_vector_reprenstation[i],X_test_numeric_vector_reprenstation[i])

X_train_TFIDF_vector_reprenstation=getTFIDFVector(X_train,X_train_boolean_vector_reprenstation,X_train_numeric_vector_reprenstation)
X_validation_TFIDF_reprenstation=getTFIDFVector(X_validation,X_validation_boolean_vector_reprenstation,X_validation_numeric_vector_reprenstation)   

In [51]:
def hamming_distance(instance1, instance2):
  return np.sum(np.logical_xor(instance1,instance2).astype(int),axis=1)

In [52]:
def euclidean_distance(instance1, instance2):
  return np.linalg.norm(instance1 - instance2,axis=1)

In [53]:
def cosine_similarity(instance1, instance2):
  numerator = np.sum(np.multiply(instance1,instance2),axis=1)
  denominator = np.multiply(np.linalg.norm(instance1,axis=1),np.linalg.norm(instance2,axis=1))
  return numerator/denominator

In [54]:
def prediction_knn(X_train, Y_train, X_test, version, n_neighbors=3):
    allTestNeighbers=[]
    allPredictedOutputs =[]
    Y_train = np.array(Y_train)
    #Determine Number of unique class lebels
    if version == 'v2':
      distances = euclidean_distance(X_train,X_test).reshape(1,-1)
    elif version == 'v1':
      distances = hamming_distance(X_train,X_test).reshape(1,-1)
    elif version == 'v3':
      distances = cosine_similarity(X_train,X_test).reshape(1,-1)
    sorted_indices = np.argsort(distances) #,ascending=True)
    
  #  print(Y_train[sorted_indices].reshape(1,-1))
    sortedYtrain = Y_train[sorted_indices].reshape(1,-1)[0][0:n_neighbors]
    if version == 'v3':
      sortedYtrain = Y_train[sorted_indices].reshape(1,-1)[0][-n_neighbors:]
    (values,counts) = np.unique(sortedYtrain,return_counts=True)
    ind=np.argmax(counts)
    #print (values[ind])
    predOut = values[ind]
    return predOut
        

In [55]:
def performanceEvaluation_knn(X_train, Y_train, X_test, Y_test, version, n_neighbors=3):
    print("K=",n_neighbors)
    totalCount = 0
    correctCount = 0
    for testInput, testActualOutput in zip(X_test, Y_test):
      predictedOutput = prediction_knn(X_train, Y_train, [testInput],version, n_neighbors)
      if predictedOutput == testActualOutput:
            correctCount += 1
      totalCount += 1
    
    print("Total Correct Count: ",correctCount," Total Wrong Count: ",totalCount-correctCount," Accuracy: ",(correctCount*100)/(totalCount))


In [None]:
print("=========================Hamming============================================================")
performanceEvaluation_knn(X_train_boolean_vector_reprenstation, Y_train, X_validation_boolean_vector_reprenstation, Y_validation,'v1',1)
performanceEvaluation_knn(X_train_boolean_vector_reprenstation, Y_train, X_validation_boolean_vector_reprenstation, Y_validation,'v1',3)
performanceEvaluation_knn(X_train_boolean_vector_reprenstation, Y_train, X_validation_boolean_vector_reprenstation, Y_validation,'v1',5)
print("=========================Euclidean============================================================")
performanceEvaluation_knn(X_train_numeric_vector_reprenstation, Y_train, X_validation_numeric_vector_reprenstation, Y_validation,'v2',1) 
performanceEvaluation_knn(X_train_numeric_vector_reprenstation, Y_train, X_validation_numeric_vector_reprenstation, Y_validation,'v2',3) 
performanceEvaluation_knn(X_train_numeric_vector_reprenstation, Y_train, X_validation_numeric_vector_reprenstation, Y_validation,'v2',5) 
print("=========================TF-IDF============================================================")
performanceEvaluation_knn(X_train_TFIDF_vector_reprenstation, Y_train, X_validation_TFIDF_reprenstation, Y_validation,'v3',1) 
performanceEvaluation_knn(X_train_TFIDF_vector_reprenstation, Y_train, X_validation_TFIDF_reprenstation, Y_validation,'v3',3) 
performanceEvaluation_knn(X_train_TFIDF_vector_reprenstation, Y_train, X_validation_TFIDF_reprenstation, Y_validation,'v3',5) 

K= 1
Total Correct Count:  858  Total Wrong Count:  1249  Accuracy:  40.72140484100617
K= 3
Total Correct Count:  831  Total Wrong Count:  1276  Accuracy:  39.43996203132416
K= 5
Total Correct Count:  802  Total Wrong Count:  1305  Accuracy:  38.06359753203607
K= 1
Total Correct Count:  1189  Total Wrong Count:  918  Accuracy:  56.43094447081158
K= 3
Total Correct Count:  1146  Total Wrong Count:  961  Accuracy:  54.390128144280965
K= 5
Total Correct Count:  1130  Total Wrong Count:  977  Accuracy:  53.63075462743237
K= 1
Total Correct Count:  1705  Total Wrong Count:  402  Accuracy:  80.92074038917893
K= 3
Total Correct Count:  1726  Total Wrong Count:  381  Accuracy:  81.91741813004272
K= 5


In [42]:
for i in range(50):
  print("==========================FOR ",i+1,"th iteration")
  print("=========================Hamiltonian============================================================")
  #performanceEvaluation_knn(X_train_boolean_vector_reprenstation, Y_train, X_test_boolean_vector_reprenstation[i], Y_test_for_each_iteration[i],'v1',1)
  #performanceEvaluation_knn(X_train_boolean_vector_reprenstation, Y_train, X_test_boolean_vector_reprenstation[i], Y_test_for_each_iteration[i],'v1',3)
  performanceEvaluation_knn(X_train_boolean_vector_reprenstation, Y_train, X_test_boolean_vector_reprenstation[i], Y_test_for_each_iteration[i],'v1',5)

          
  print("=========================Euclidean============================================================")
  #performanceEvaluation_knn(X_train_numeric_vector_reprenstation, Y_train, X_test_numeric_vector_reprenstation[i], Y_test_for_each_iteration[i],'v2',1) 
  #performanceEvaluation_knn(X_train_numeric_vector_reprenstation, Y_train, X_test_numeric_vector_reprenstation[i], Y_test_for_each_iteration[i],'v2',3) 
  performanceEvaluation_knn(X_train_numeric_vector_reprenstation, Y_train, X_test_numeric_vector_reprenstation[i], Y_test_for_each_iteration[i],'v2',5) 

  print("=========================TF-IDF============================================================")
 #performanceEvaluation_knn(X_train_TFIDF_vector_reprenstation, Y_train, X_test_TFIDF_vector_reprenstation[i], Y_test_for_each_iteration[i],'v3',1) 
  #performanceEvaluation_knn(X_train_TFIDF_vector_reprenstation, Y_train, X_test_TFIDF_vector_reprenstation[i], Y_test_for_each_iteration[i],'v3',3) 
  performanceEvaluation_knn(X_train_TFIDF_vector_reprenstation, Y_train, X_test_TFIDF_vector_reprenstation[i], Y_test_for_each_iteration[i],'v3',5) 


Total Correct Count:  22  Total Wrong Count:  8  Accuracy:  73.33333333333333
K= 5
Total Correct Count:  26  Total Wrong Count:  4  Accuracy:  86.66666666666667
K= 5
Total Correct Count:  29  Total Wrong Count:  1  Accuracy:  96.66666666666667
K= 5
Total Correct Count:  24  Total Wrong Count:  6  Accuracy:  80.0
K= 5
Total Correct Count:  25  Total Wrong Count:  5  Accuracy:  83.33333333333333
K= 5
Total Correct Count:  29  Total Wrong Count:  1  Accuracy:  96.66666666666667
K= 5
Total Correct Count:  20  Total Wrong Count:  10  Accuracy:  66.66666666666667
K= 5
Total Correct Count:  19  Total Wrong Count:  11  Accuracy:  63.333333333333336
K= 5
Total Correct Count:  28  Total Wrong Count:  2  Accuracy:  93.33333333333333
K= 5
Total Correct Count:  22  Total Wrong Count:  8  Accuracy:  73.33333333333333
K= 5
Total Correct Count:  26  Total Wrong Count:  4  Accuracy:  86.66666666666667
K= 5
Total Correct Count:  29  Total Wrong Count:  1  Accuracy:  96.66666666666667
K= 5


KeyboardInterrupt: ignored

In [13]:
import math
def prediction_Naive_Bayes(X_train, Y_train, X_test,alpha):
  unique_labels, counts_labels = np.unique(Y_train, return_counts=True)
  predictedOutput = -1
  prob = -99999999999999
  for label in unique_labels:
   # print("For label: ",label)
    words_in_this_topic = topic_words_dict[label]
    counts = collections.Counter(words_in_this_topic)
    word_counts_in_this_topic = dict(counts)
    P_cm_dt = math.log(counts_labels[label-1]/len(Y_train))
    N_cm = len(words_in_this_topic)
    for word in X_test[0]:
      N_wj_cm = 0
      if word in words_in_this_topic:
        N_wj_cm = word_counts_in_this_topic[word]
      P_wj_given_cm = (N_wj_cm + alpha) /(N_cm + alpha*modV)
      P_cm_dt = P_cm_dt+ math.log(P_wj_given_cm)
    if P_cm_dt > prob:
      prob = P_cm_dt
      predictedOutput = label

  return predictedOutput


In [14]:
def performanceEvaluation_NB(X_train, Y_train, X_test, Y_test,alpha):
    print("Smoothing factor: ",alpha)
    totalCount = 0
    correctCount = 0
    for testInput, testActualOutput in zip(X_test, Y_test):
      predictedOutput = prediction_Naive_Bayes(X_train, Y_train, [testInput],alpha)
      
      if predictedOutput == testActualOutput:
            correctCount += 1
      totalCount += 1
    
    print("Total Correct Count: ",correctCount," Total Wrong Count: ",totalCount-correctCount," Accuracy: ",(correctCount*100)/(totalCount))

        
        

  

In [15]:
smoothing_factors = [.01,0.001,.0001,1]
for a in smoothing_factors:
   performanceEvaluation_NB(X_train, Y_train, X_validation, Y_validation,a)

Smoothing factor:  0.01
Total Correct Count:  1935  Total Wrong Count:  172  Accuracy:  91.83673469387755
Smoothing factor:  0.001
Total Correct Count:  1919  Total Wrong Count:  188  Accuracy:  91.07736117702895
Smoothing factor:  0.0001
Total Correct Count:  1898  Total Wrong Count:  209  Accuracy:  90.08068343616516
Smoothing factor:  1
Total Correct Count:  1919  Total Wrong Count:  188  Accuracy:  91.07736117702895
Smoothing factor:  10
Total Correct Count:  1798  Total Wrong Count:  309  Accuracy:  85.33459895586141
Smoothing factor:  100
Total Correct Count:  1656  Total Wrong Count:  451  Accuracy:  78.5951589938301


In [16]:
print("=========================Naive Bayes============================================================")
for i in range(50):
  print("==========================FOR ",i+1,"th iteration")
  performanceEvaluation_NB(X_train, Y_train, X_test_for_each_iteration[i], Y_test_for_each_iteration[i],0.01)

Smoothing factor:  0.01
Total Correct Count:  98  Total Wrong Count:  10  Accuracy:  90.74074074074075
Smoothing factor:  0.01
Total Correct Count:  95  Total Wrong Count:  10  Accuracy:  90.47619047619048
Smoothing factor:  0.01
Total Correct Count:  98  Total Wrong Count:  6  Accuracy:  94.23076923076923
Smoothing factor:  0.01
Total Correct Count:  97  Total Wrong Count:  7  Accuracy:  93.26923076923077
Smoothing factor:  0.01
Total Correct Count:  98  Total Wrong Count:  7  Accuracy:  93.33333333333333
Smoothing factor:  0.01
Total Correct Count:  101  Total Wrong Count:  5  Accuracy:  95.28301886792453
Smoothing factor:  0.01
Total Correct Count:  103  Total Wrong Count:  5  Accuracy:  95.37037037037037
Smoothing factor:  0.01
Total Correct Count:  102  Total Wrong Count:  7  Accuracy:  93.57798165137615
Smoothing factor:  0.01
Total Correct Count:  102  Total Wrong Count:  7  Accuracy:  93.57798165137615
Smoothing factor:  0.01
Total Correct Count:  99  Total Wrong Count:  7  Acc