In [4]:
import numpy as np
import pandas as pd
import sklearn
from sklearn import neighbors, svm, tree
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Answer to Question 1

In [21]:
# Question 1: compare four classifiers on the cancer train/test split.
#
# NOTE(review): read_csv treats the first line of each file as a header row, and
# the column assignment below then overwrites those labels. If the files have no
# header line, the first sample of each split is silently dropped -- confirm, and
# switch to header=None if so.
Train_df = pd.read_csv('CancerTrain.txt', sep=',')
Test_df = pd.read_csv('CancerTest.txt', sep=',')

Test_df.columns = Train_df.columns = ['ID', 'Clump_thickness', 'Unif_size', 'Unif_shape',
'Marginal_adhesion', 'Single_Epith_cellsize', 'Bare_nuclei', 'Bland_chromatin', 
'Normal_nucleoli', 'Mitoses', 'Class']

# Columns 1..9 are the features; column 0 ('ID') and column 10 ('Class') are excluded.
X_train = Train_df.iloc[:,1:10].values
Y_train = Train_df['Class']
X_test = Test_df.iloc[:,1:10].values
Y_test = Test_df['Class']

# scikit-learn permutes candidate features at every split, so an unseeded tree can
# resolve ties differently from run to run. Pinning the seed makes the reported
# accuracies reproducible under Restart & Run All.
RANDOM_STATE = 42

clf_id3 = tree.DecisionTreeClassifier(criterion="entropy", random_state=RANDOM_STATE)
clf_gini = tree.DecisionTreeClassifier(criterion="gini", random_state=RANDOM_STATE)
clf_kneighbor = neighbors.KNeighborsClassifier()
clf_naive = MultinomialNB()

# Fit every model on the same training matrix, in the original order.
for classifier in (clf_id3, clf_gini, clf_kneighbor, clf_naive):
    classifier.fit(X_train, Y_train)

# Mean accuracy on the held-out test split.
accuracy_id3 = clf_id3.score(X_test, Y_test)
accuracy_gini = clf_gini.score(X_test, Y_test)
accuracy_kneighbor = clf_kneighbor.score(X_test, Y_test)
accuracy_naive = clf_naive.score(X_test, Y_test)


print("ID3 Accuracy : " + str(accuracy_id3))
print("Decision Tree Gini Accuracy : " + str(accuracy_gini))
print("Kneighbor Accuracy : " + str(accuracy_kneighbor))
print("Naive Accuracy : " + str(accuracy_naive))

ID3 Accuracy : 0.942211055276
Decision Tree Gini Accuracy : 0.93216080402
Kneighbor Accuracy : 0.954773869347
Naive Accuracy : 0.879396984925


# Answer to Question 2

In [3]:
# Load the topic list and parse the raw training corpus into a DataFrame.
#
# NOTE(review): newer pandas releases reject sep="\n" in read_csv -- confirm
# against the pandas version this notebook targets.
Topics = pd.read_csv('topics.data', sep ="\n", encoding='latin-1', header=None)

# Read the whole file inside a context manager so the handle is closed promptly
# (the previous bare open(...).read() never closed it explicitly).
with open('training.data') as training_file:
    open_file = training_file.read().split("\n\n\n\n\n")

# Records are separated by runs of five or three newlines; split on the longer
# separator first, then on the shorter one, to obtain one string per article.
article = [news for chunk in open_file for news in chunk.split("\n\n\n")]

# Within an article, a blank line ("\n\n") separates the individual fields.
data = [news.split("\n\n") for news in article]

# Rows with fewer pieces are padded with None by the DataFrame constructor.
# Column 4 is discarded; the remaining four columns get descriptive names
# (the assignment raises if the frame does not have exactly four columns left).
train_df = pd.DataFrame(data)
del train_df[4]
train_df.columns = ["Topic","Subject","Time","Text"]

In [4]:
# Parse the raw test corpus into a DataFrame with the same layout as the
# training frame.

# Read the whole file inside a context manager so the handle is closed promptly
# (the previous bare open(...).read() never closed it explicitly).
with open('test.data') as test_handle:
    test_file = test_handle.read().split("\n\n\n\n\n")

# Records are separated by runs of five or three newlines; split on the longer
# separator first, then on the shorter one, to obtain one string per article.
test_article = [news for chunk in test_file for news in chunk.split("\n\n\n")]

# Within an article, a blank line ("\n\n") separates the individual fields.
test_data = [news.split("\n\n") for news in test_article]

# Rows with fewer pieces are padded with None by the DataFrame constructor.
# Column 4 is discarded; the remaining four columns get descriptive names
# (the assignment raises if the frame does not have exactly four columns left).
test_df = pd.DataFrame(test_data)
del test_df[4]
test_df.columns = ["Topic","Subject","Time","Text"]

In [5]:
# Replace missing cells with the literal string "empty". This is done in place
# because the later cells read the same train_df / test_df objects.
train_df.fillna(value="empty", inplace=True)
test_df.fillna(value="empty", inplace=True)

# CountVectorizer(binary=True) learns the vocabulary and encodes each document as
# a 0/1 presence vector. The KNN step is built without a metric argument, so it
# uses scikit-learn's default distance; on 0/1 vectors the squared Euclidean
# distance equals the number of differing positions, so the neighbour ranking
# matches the Hamming distance.
hamming_ks = (1, 3, 5)
Hamming_clf_1, Hamming_clf_3, Hamming_clf_5 = [
    Pipeline([('vect', CountVectorizer(binary=True)),
              ('clf', neighbors.KNeighborsClassifier(n_neighbors = k))])
    for k in hamming_ks
]
hamming_models = (Hamming_clf_1, Hamming_clf_3, Hamming_clf_5)

# Train every pipeline on the article text, with the topic as the label.
for hamming_model in hamming_models:
    hamming_model.fit(train_df['Text'].values, train_df['Topic'].values)

# Report test-set accuracy for each neighbourhood size.
for k, hamming_model in zip(hamming_ks, hamming_models):
    hamming_accuracy = hamming_model.score(test_df['Text'].values, test_df['Topic'].values)
    print("Hamming n=" + str(k) + " accuracy : " + str(hamming_accuracy))

Hamming n=1 accuracy : 0.622472783826
Hamming n=3 accuracy : 0.585147744946
Hamming n=5 accuracy : 0.622083981337


In [6]:
# With binary left at its default (False), CountVectorizer stores how many times
# each vocabulary word occurs in a document; KNN then runs on those raw counts.
euclidean_ks = (1, 3, 5)
Euclidean_clf_1, Euclidean_clf_3, Euclidean_clf_5 = [
    Pipeline([('vect', CountVectorizer()),
              ('clf', neighbors.KNeighborsClassifier(n_neighbors = k))])
    for k in euclidean_ks
]
euclidean_models = (Euclidean_clf_1, Euclidean_clf_3, Euclidean_clf_5)

# Train every pipeline on the article text, with the topic as the label.
for euclidean_model in euclidean_models:
    euclidean_model.fit(train_df['Text'].values, train_df['Topic'].values)

# Report test-set accuracy for each neighbourhood size.
for k, euclidean_model in zip(euclidean_ks, euclidean_models):
    euclidean_accuracy = euclidean_model.score(test_df['Text'].values, test_df['Topic'].values)
    print("Euclidean n=" + str(k) + " accuracy : " + str(euclidean_accuracy))

Euclidean n=1 accuracy : 0.720839813375
Euclidean n=3 accuracy : 0.703732503888
Euclidean n=5 accuracy : 0.727449455677


In [7]:
# Each pipeline builds the word-count matrix, re-weights it with TfidfTransformer,
# and fits a KNN classifier on the tf-idf vectors. The KNN step is built without a
# metric argument, so it uses its default distance on those vectors rather than an
# explicit similarity function.
tfidf_ks = (1, 3, 5)
tfidf_clf_1, tfidf_clf_3, tfidf_clf_5 = [
    Pipeline([('vect', CountVectorizer()),
              ('tfidf', TfidfTransformer()),
              ('clf', neighbors.KNeighborsClassifier(n_neighbors = k))])
    for k in tfidf_ks
]
tfidf_models = (tfidf_clf_1, tfidf_clf_3, tfidf_clf_5)

# Train every pipeline on the article text, with the topic as the label.
for tfidf_model in tfidf_models:
    tfidf_model.fit(train_df['Text'].values, train_df['Topic'].values)

# Report test-set accuracy for each neighbourhood size.
for k, tfidf_model in zip(tfidf_ks, tfidf_models):
    tfidf_accuracy = tfidf_model.score(test_df['Text'].values, test_df['Topic'].values)
    print("tfidf n=" + str(k) + " accuracy : " + str(tfidf_accuracy))

tfidf n=1 accuracy : 0.767496111975
tfidf n=3 accuracy : 0.791213063764
tfidf n=5 accuracy : 0.802099533437
