In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

import math
from scipy.sparse import hstack
from collections import Counter

In [2]:
def write(method, y_pred):
    with open("sample_submission_" + method + ".csv", "w") as f:
        f.write("article_id,category\n")
        for i, y in enumerate(y_pred):
            f.write(str(i + 1) + "," + str(y))
            f.write("\n")

In [3]:
def counter_cosine_similarity(y1, y2):
    y1 = Counter(y1)
    y2 = Counter(y2)
    y = set(y1).union(y2)
    dotprod = sum(y1.get(k, 0) * y2.get(k, 0) for k in y)
    magA = math.sqrt(sum(y1.get(k, 0)**2 for k in y))
    magB = math.sqrt(sum(y2.get(k, 0)**2 for k in y))
    return dotprod / (magA * magB)

In [4]:
trainData=pd.read_csv('train.csv', na_filter=False)
trainHeader = trainData.columns.values
trainData = trainData.values

X_train = trainData[:, range(6)]
y_train = trainData[:, 6]
y_train = y_train.astype('int')

In [5]:
testData=pd.read_csv('test.csv', na_filter=False)
testHeader = testData.columns.values
testData = testData.values

X_test = testData[:, range(6)]

In [9]:
X_combined = np.concatenate((X_train, X_test), axis=0)
X_train_size = X_train.shape[0]

vectorizer = TfidfVectorizer()
X_combined_tfidf = [vectorizer.fit_transform(X_combined[:, i + 1]) for i in range(4)]
X_train_tfidf = hstack([X_combined_tfidf[0][0:X_train_size], X_combined_tfidf[1][0:X_train_size], X_combined_tfidf[2][0:X_train_size], X_combined_tfidf[3][0:X_train_size]])
X_test_tfidf = hstack([X_combined_tfidf[0][X_train_size:], X_combined_tfidf[1][X_train_size:], X_combined_tfidf[2][X_train_size:], X_combined_tfidf[3][X_train_size:]])

transformer = TfidfTransformer() 
timestamp_tfidf = transformer.fit_transform(X_combined[:, [5]].astype('float'))
X_train_tfidf = hstack([X_train_tfidf, timestamp_tfidf[0:X_train_size]])
X_test_tfidf = hstack([X_test_tfidf, timestamp_tfidf[X_train_size:]])

In [10]:
#KNN-3
classifier = KNeighborsClassifier(3)

classifier.fit(X_train_tfidf, y_train)
print("Score : " + str(classifier.score(X_train_tfidf, y_train)))
print("Cosine Similarity : " + str(counter_cosine_similarity(list(y_train), list(classifier.predict(X_train_tfidf)))))

write("knn_3", classifier.predict(X_test_tfidf))

Score : 0.6464646464646465
Cosine Similarity : 0.9890027426517952


In [11]:
#KNN-5
classifier = KNeighborsClassifier(5)

classifier.fit(X_train_tfidf, y_train)
print("Score : " + str(classifier.score(X_train_tfidf, y_train)))
print("Cosine Similarity : " + str(counter_cosine_similarity(list(y_train), list(classifier.predict(X_train_tfidf)))))

write("knn_5", classifier.predict(X_test_tfidf))

Score : 0.6019283746556474
Cosine Similarity : 0.9920716234733631


In [8]:
#BBN

In [None]:
#Decision Tree
