In [193]:
import numpy as np
import pandas as pd

from sklearn.metrics import fbeta_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

import math
import os
from scipy.sparse import hstack, csr_matrix
from collections import Counter

In [194]:
def write(method, y_pred):
    with open("sample_submission_" + method + ".csv", "w") as f:
        f.write("article_id,category\n")
        for i, y in enumerate(y_pred):
            f.write(str(i + 1) + "," + str(y))
            f.write("\n")

In [195]:
def counter_cosine_similarity(y1, y2):
    y1 = Counter(y1)
    y2 = Counter(y2)
    y = set(y1).union(y2)
    dotprod = sum(y1.get(k, 0) * y2.get(k, 0) for k in y)
    magA = math.sqrt(sum(y1.get(k, 0)**2 for k in y))
    magB = math.sqrt(sum(y2.get(k, 0)**2 for k in y))
    return dotprod / (magA * magB)

In [212]:
trainData=pd.read_csv("data" + os.sep + "train.csv", na_filter=False)
trainHeader = trainData.columns.values
trainData = trainData.values

X_train = trainData[:, range(6)]
y_train = trainData[:, 6]
y_train = y_train.astype("int")

In [213]:
testData=pd.read_csv("data" + os.sep + "test.csv", na_filter=False)
testHeader = testData.columns.values
testData = testData.values

X_test = testData[:, range(6)]

In [72]:
X_combined = np.concatenate((X_train, X_test), axis=0)
X_train_size = X_train.shape[0]
X_train_test = 3500

vectorizer = TfidfVectorizer(stop_words="english", norm="l1")
X_combined_tfidf = [vectorizer.fit_transform(X_combined[:, i + 1]) for i in range(4)]

X_train_tfidf = hstack([X_combined_tfidf[0][0:X_train_size], X_combined_tfidf[1][0:X_train_size], X_combined_tfidf[2][0:X_train_size], X_combined_tfidf[3][0:X_train_size]])
X_train_train_tfidf = hstack([X_combined_tfidf[0][0:X_train_test], X_combined_tfidf[1][0:X_train_test], X_combined_tfidf[2][0:X_train_test], X_combined_tfidf[3][0:X_train_test]])
X_train_test_tfidf = hstack([X_combined_tfidf[0][X_train_test:X_train_size], X_combined_tfidf[1][X_train_test:X_train_size], X_combined_tfidf[2][X_train_test:X_train_size], X_combined_tfidf[3][X_train_test:X_train_size]])
X_test_tfidf = hstack([X_combined_tfidf[0][X_train_size:], X_combined_tfidf[1][X_train_size:], X_combined_tfidf[2][X_train_size:], X_combined_tfidf[3][X_train_size:]])
#X_train_tfidf = hstack([X_combined_tfidf[0][0:X_train_size], X_combined_tfidf[2][0:X_train_size]])
#X_train_train_tfidf = hstack([X_combined_tfidf[0][0:X_train_test], X_combined_tfidf[2][0:X_train_test]])
#X_train_test_tfidf = hstack([X_combined_tfidf[0][X_train_test:X_train_size], X_combined_tfidf[2][X_train_test:X_train_size]])
#X_test_tfidf = hstack([X_combined_tfidf[0][X_train_size:], X_combined_tfidf[2][X_train_size:]])

transformer = TfidfTransformer() 
timestamp_tfidf = transformer.fit_transform(X_combined[:, [5]].astype('float'))
X_train_tfidf = hstack([X_train_tfidf, timestamp_tfidf[0:X_train_size]])
X_train_train_tfidf = hstack([X_train_train_tfidf, timestamp_tfidf[0:X_train_test]])
X_train_test_tfidf = hstack([X_train_test_tfidf, timestamp_tfidf[X_train_test:X_train_size]])
X_test_tfidf = hstack([X_test_tfidf, timestamp_tfidf[X_train_size:]])



In [93]:
#KNN
classifier = KNeighborsClassifier(1)

classifier.fit(X_train_train_tfidf, y_train[0:X_train_test])
print("F2 Score : " + str(fbeta_score(y_train[X_train_test:X_train_size], classifier.predict(X_train_test_tfidf), average='macro', beta=2)))
print("Score : " + str(classifier.score(X_train_test_tfidf, y_train[X_train_test:X_train_size])))
print("Cosine Similarity : " + str(counter_cosine_similarity(list(y_train[X_train_test:X_train_size]), list(classifier.predict(X_train_test_tfidf)))))

write("knn", classifier.predict(X_test_tfidf))

F2 Score : 0.21501092358052182
Score : 0.3644859813084112
Cosine Similarity : 0.9912510478857984


In [81]:
#BBN


In [95]:
#Decision Tree
classifier = DecisionTreeClassifier(max_depth=40)

classifier.fit(X_train_train_tfidf, y_train[0:X_train_test])
print("F2 Score : " + str(fbeta_score(y_train[X_train_test:X_train_size], classifier.predict(X_train_test_tfidf), average='macro', beta=2)))
print("Score : " + str(classifier.score(X_train_test_tfidf, y_train[X_train_test:X_train_size])))
print("Cosine Similarity : " + str(counter_cosine_similarity(list(y_train[X_train_test:X_train_size]), list(classifier.predict(X_train_test_tfidf)))))

write("decision_tree", classifier.predict(X_test_tfidf))

F2 Score : 0.26907845340090164
Score : 0.477803738317757
Cosine Similarity : 0.9366658207133244


In [172]:

import numpy as np

def f2_score(y_true, y_pred):
    # fbeta_score throws a confusing error if inputs are not numpy arrays
    y_true, y_pred, = np.array(y_true), np.array(y_pred)
    # We need to use average='samples' here, any other average method will generate bogus results
    return fbeta_score(y_true, y_pred, beta=2, average='samples')
