In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import fbeta_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

import math
import os
from scipy.sparse import hstack, csr_matrix
from collections import Counter

In [2]:
def read(X, dir):
    """Attach page text to every row of ``X`` and stringify columns 0 and 5.

    Row ``i`` is paired with the file ``page/<dir>_<i>.txt`` (relative to the
    current working directory). The file's text is stored in column 2, unless
    the text contains the substring "404" anywhere, in which case column 2 is
    set to an empty string.

    Args:
        X: 2-D indexable array of rows; modified in place.
        dir: file-name prefix selecting which set of page files to read
            (the name shadows the builtin but is kept for existing callers).

    Returns:
        The same ``X`` object that was passed in.
    """
    prefix = "page" + os.sep + dir + "_"
    for idx, row in enumerate(X):
        row[0] = str(row[0])
        with open(prefix + str(idx) + ".txt", "r") as handle:
            content = handle.read()
        # Pages mentioning "404" are treated as having no usable text.
        row[2] = "" if "404" in content else content
        row[5] = str(row[5])
    return X

In [3]:
def write(method, y_pred):
    """Write predictions to ``sample_submission_<method>.csv`` in the working directory.

    The file starts with the header ``article_id,category`` followed by one
    line per prediction; article ids are assigned sequentially starting at 1
    in the order of ``y_pred``.

    Args:
        method: string inserted into the output file name.
        y_pred: iterable of predicted values; each is written via ``str()``.
    """
    out_path = "sample_submission_" + method + ".csv"
    with open(out_path, "w") as out:
        out.write("article_id,category\n")
        out.writelines(
            str(article_id) + "," + str(label) + "\n"
            for article_id, label in enumerate(y_pred, start=1)
        )

In [4]:
def counter_cosine_similarity(y1, y2):
    """Cosine similarity between the value-count vectors of two sequences.

    Each sequence is reduced to a ``Counter`` (value -> number of
    occurrences) and the two count vectors are compared. Element order and
    position are ignored, so this measures how similar the two *value
    distributions* are, not whether the sequences agree element by element.

    Args:
        y1: iterable of hashable values.
        y2: iterable of hashable values.

    Returns:
        A float in [0, 1] (up to rounding). Returns 0.0 when either input is
        empty, since a zero-length count vector has no direction.
    """
    counts1 = Counter(y1)
    counts2 = Counter(y2)
    # Only values present in both inputs contribute a non-zero product.
    dotprod = sum(n * counts2[k] for k, n in counts1.items() if k in counts2)
    magA = math.sqrt(sum(n * n for n in counts1.values()))
    magB = math.sqrt(sum(n * n for n in counts2.values()))
    denom = magA * magB
    if denom == 0:
        # Guard against ZeroDivisionError for empty input.
        return 0.0
    return dotprod / denom

In [5]:
# Load the first training CSV. Empty cells stay as empty strings
# (na_filter=False) rather than becoming NaN.
trainData = pd.read_csv(os.path.join("data", "train.csv"), na_filter=False)
trainHeader = trainData.columns.values
trainData = trainData.values

# Columns 0-5 are the features (fancy indexing yields a copy, so read() does
# not modify trainData); column 6 is used as the integer target.
X_train = read(trainData[:, range(6)], "train")
y_train = trainData[:, 6].astype("int")

In [6]:
# Load the second training CSV. Empty cells stay as empty strings
# (na_filter=False) rather than becoming NaN.
trainData_2 = pd.read_csv(os.path.join("data", "train_v2.csv"), na_filter=False)
trainHeader_2 = trainData_2.columns.values
trainData_2 = trainData_2.values

# Columns 0-5 are the features (fancy indexing yields a copy, so read() does
# not modify trainData_2); column 6 is used as the integer target.
X_train_2 = read(trainData_2[:, range(6)], "train_2")
y_train_2 = trainData_2[:, 6].astype("int")

In [7]:
# Load the second-version test CSV. Empty cells stay as empty strings
# (na_filter=False) rather than becoming NaN.
testData_2 = pd.read_csv(os.path.join("data", "test_v2.csv"), na_filter=False)
testHeader_2 = testData_2.columns.values
testData_2 = testData_2.values

# Only the first six columns are used as features; page text is filled in
# from the "test_2" page files.
X_test_2 = read(testData_2[:, range(6)], "test_2")

In [8]:
# Stack both training sets followed by the test set, so every column is
# vectorized over a single shared vocabulary.
X_combined = np.concatenate([X_train, X_train_2, X_test_2], axis=0)
# Rows [0, X_train_size) are training rows; the remaining rows are test rows.
X_train_size = len(X_train) + len(X_train_2)

# One TF-IDF matrix per feature column. The same vectorizer object is refit
# for each column, so afterwards it only holds the vocabulary of the last one.
vectorizer = TfidfVectorizer(stop_words="english", norm="l1")
X_combined_tfidf = [
    vectorizer.fit_transform(X_combined[:, column])
    for column in range(X_combined.shape[1])
]

In [9]:
# Sanity check: overall stacked shape, number of training rows, then the
# shape of each per-column TF-IDF matrix (one per line).
print(X_combined.shape)
print(X_train_size)

print(*(matrix.shape for matrix in X_combined_tfidf), sep="\n")

(14209, 6)
10383
(14209, 6018)
(14209, 5885)
(14209, 59706)
(14209, 1847)
(14209, 1833)
(14209, 2)


In [10]:
# Build the final feature matrices by placing the TF-IDF blocks of columns
# 1..5 side by side (column 0 is left out), then split rows into the
# training part and the test part at X_train_size.
X_train_tfidf = hstack([X_combined_tfidf[col][:X_train_size] for col in range(1, 6)])
X_test_tfidf = hstack([X_combined_tfidf[col][X_train_size:] for col in range(1, 6)])

print(X_train_tfidf.shape)
print(X_test_tfidf.shape)


(10383, 69273)
(3826, 69273)


In [11]:
# Targets in the same row order as the training part of the feature matrix:
# first training set, then second training set.
y_train_combined = np.concatenate([y_train, y_train_2])
print(y_train_combined.shape)

(10383,)


In [12]:
# KNN
classifier = KNeighborsClassifier(5)
classifier.fit(X_train_tfidf, y_train_combined)

# Predict on the training rows once and reuse the result for every metric;
# each KNN prediction pass over the full training matrix is expensive.
train_pred = classifier.predict(X_train_tfidf)

# NOTE: all three numbers are measured on the data the model was fitted on,
# so they are training scores, not estimates of generalization.
print("F2 Score : " + str(fbeta_score(y_train_combined, train_pred, average='macro', beta=2)))
# Mean accuracy, i.e. the value classifier.score() reports for these inputs.
print("Score : " + str(float(np.mean(train_pred == y_train_combined))))
# Compares the label-count distributions of truth and prediction, not
# per-sample agreement.
print("Cosine Similarity : " + str(counter_cosine_similarity(list(y_train_combined), list(train_pred))))

write("knn", classifier.predict(X_test_tfidf))

F2 Score : 0.6517697935160097
Score : 0.6999903688721949
Cosine Similarity : 0.9905958735418139


In [13]:
# Decision Tree
# random_state is fixed so that the fitted tree, the printed scores and the
# submission file are reproducible across runs; without it, ties between
# equally good splits are broken randomly.
classifier = DecisionTreeClassifier(max_depth=40, random_state=0)
classifier.fit(X_train_tfidf, y_train_combined)

# Predict on the training rows once and reuse the result for every metric.
train_pred = classifier.predict(X_train_tfidf)

# NOTE: all three numbers are measured on the data the model was fitted on,
# so they are training scores, not estimates of generalization.
print("F2 Score : " + str(fbeta_score(y_train_combined, train_pred, average='macro', beta=2)))
# Mean accuracy, i.e. the value classifier.score() reports for these inputs.
print("Score : " + str(float(np.mean(train_pred == y_train_combined))))
# Compares the label-count distributions of truth and prediction, not
# per-sample agreement.
print("Cosine Similarity : " + str(counter_cosine_similarity(list(y_train_combined), list(train_pred))))

write("decision_tree", classifier.predict(X_test_tfidf))

F2 Score : 0.7260958515104268
Score : 0.8212462679379755
Cosine Similarity : 0.9652637986878331
