In [115]:
import math
import os
import re
from collections import Counter

import numpy as np
import pandas as pd
from scipy.sparse import hstack, csr_matrix

from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.metrics import fbeta_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [116]:
def read(X, dir):
    """Fill column 2 of every row of ``X`` with cleaned page text.

    For row ``i`` the raw text is read from ``page/<dir>_<i>.txt``. If the raw
    text contains the substring "404" the page is treated as empty. Digits are
    then stripped and every word made of one to three word characters is
    removed. Columns 0 and 5 of the row are converted to ``str``.

    Each row object ``X[i]`` is modified in place, and ``X`` itself is returned.
    """
    short_word = re.compile(r'\b\w{1,3}\b')
    for idx in range(X.shape[0]):
        row = X[idx]
        row[0] = str(row[0])

        path = os.sep.join(["page", dir + "_" + str(idx) + ".txt"])
        with open(path, "r") as fh:
            page = fh.read()

        # The "404" check runs on the raw text, before digits are stripped.
        if "404" in page:
            page = ""

        page = ''.join(ch for ch in page if not ch.isdigit())
        row[2] = short_word.sub('', page)
        row[5] = str(row[5])
    return X

In [117]:
def write(method, y_pred):
    """Write predictions to ``sample_submission_<method>.csv``.

    The file starts with the header ``article_id,category`` and then holds one
    ``<id>,<prediction>`` line per element of ``y_pred``, with ids counted
    from 1 in iteration order.
    """
    out_name = "sample_submission_" + method + ".csv"
    with open(out_name, "w") as out:
        out.write("article_id,category\n")
        out.writelines(
            str(article_id) + "," + str(label) + "\n"
            for article_id, label in enumerate(y_pred, start=1)
        )

In [118]:
def counter_cosine_similarity(y1, y2):
    """Cosine similarity between the value-frequency histograms of two inputs.

    Each input is reduced to a ``Counter`` of its elements, so the result
    compares how often each value occurs in ``y1`` versus ``y2``. It does not
    look at element order or position-by-position agreement.

    Parameters
    ----------
    y1, y2 : iterable of hashable values

    Returns
    -------
    float
        ``dot(c1, c2) / (|c1| * |c2|)`` over the two count vectors, or ``0.0``
        when either count vector has zero magnitude (e.g. an empty input),
        instead of raising ``ZeroDivisionError``.
    """
    counts_a = Counter(y1)
    counts_b = Counter(y2)

    # Only values present in both inputs contribute to the dot product.
    dotprod = sum(counts_a[k] * counts_b[k] for k in counts_a.keys() & counts_b.keys())
    magA = math.sqrt(sum(c ** 2 for c in counts_a.values()))
    magB = math.sqrt(sum(c ** 2 for c in counts_b.values()))

    denom = magA * magB
    if denom == 0:
        # An empty input has no direction; report no similarity.
        return 0.0
    return dotprod / denom

In [119]:
# Load data/train.csv. na_filter=False turns off pandas' missing-value
# detection, so empty cells are not converted to NaN.
trainData = pd.read_csv(os.path.join("data", "train.csv"), na_filter=False)
trainHeader = trainData.columns.values
trainData = trainData.values

# Columns 0-5 become the feature matrix (index-list selection yields a copy);
# column 6 becomes the integer target vector.
X_train = trainData[:, list(range(6))]
y_train = trainData[:, 6].astype("int")

# Fill column 2 of each row with the cleaned text of page/train_<row>.txt.
X_train = read(X_train, "train")

In [120]:
# Load data/train_v2.csv. na_filter=False turns off pandas' missing-value
# detection, so empty cells are not converted to NaN.
trainData_2 = pd.read_csv(os.path.join("data", "train_v2.csv"), na_filter=False)
trainHeader_2 = trainData_2.columns.values
trainData_2 = trainData_2.values

# Columns 0-5 become the feature matrix (index-list selection yields a copy);
# column 6 becomes the integer target vector.
X_train_2 = trainData_2[:, list(range(6))]
y_train_2 = trainData_2[:, 6].astype("int")

# Fill column 2 of each row with the cleaned text of page/train_2_<row>.txt.
X_train_2 = read(X_train_2, "train_2")

In [121]:
# Load data/test_v2.csv. na_filter=False turns off pandas' missing-value
# detection, so empty cells are not converted to NaN.
testData_2 = pd.read_csv(os.path.join("data", "test_v2.csv"), na_filter=False)
testHeader_2 = testData_2.columns.values
testData_2 = testData_2.values

# Columns 0-5 become the feature matrix (index-list selection yields a copy),
# then column 2 of each row is filled from page/test_2_<row>.txt.
X_test_2 = read(testData_2[:, list(range(6))], "test_2")

In [122]:
# Build one TF-IDF block per feature column and stack the blocks side by side.
# Only columns 1-5 end up in the stacked matrices, so no vectorizer is fitted
# on column 0 (previously one was fitted there and its output thrown away).
train_blocks = []
test_blocks = []
for i in range(1, 6):
    vectorizer = TfidfVectorizer(analyzer='word', stop_words="english", norm="l1")
    # The vocabulary is learned from the training column only and then reused
    # unchanged for the matching test column.
    train_blocks.append(vectorizer.fit_transform(X_train_2[:, i]))
    test_blocks.append(vectorizer.transform(X_test_2[:, i]))

X_train_2_tfidf = hstack(train_blocks)
X_test_tfidf = hstack(test_blocks)

print(X_train_2_tfidf.shape)
print(X_test_tfidf.shape)

(6027, 46201)
(3826, 46201)


In [123]:
# Decision Tree
# random_state is fixed because scikit-learn permutes features randomly while
# growing the tree; without a seed the scores and the submission file change
# from run to run.
classifier = DecisionTreeClassifier(max_depth=40, random_state=0)
classifier.fit(X_train_2_tfidf, y_train_2)

# Predict on the training matrix once and reuse the result for the metrics.
# NOTE: every number printed here is measured on the data the tree was fitted
# on, so it is an optimistic estimate rather than a held-out score.
train_pred = classifier.predict(X_train_2_tfidf)
print("F2 Score : " + str(fbeta_score(y_train_2, train_pred, average='macro', beta=2)))
print("Score : " + str(classifier.score(X_train_2_tfidf, y_train_2)))
print("Cosine Similarity : " + str(counter_cosine_similarity(list(y_train_2), list(train_pred))))

write("decision_tree", classifier.predict(X_test_tfidf))

F2 Score : 0.7729908041766358
Score : 0.8395553343288535
Cosine Similarity : 0.9682862902142626


In [133]:
# Linear classifier trained with stochastic gradient descent.
# `linear_model` is imported in the notebook's top import cell.
# random_state is fixed because SGD shuffles the training data between epochs;
# without a seed the scores and the submission file change from run to run.
classifier = linear_model.SGDClassifier(max_iter=100, random_state=0)
classifier.fit(X_train_2_tfidf, y_train_2)

# Predict on the training matrix once and reuse the result for the metrics.
# NOTE: every number printed here is measured on the data the model was fitted
# on, so it is an optimistic estimate rather than a held-out score.
train_pred = classifier.predict(X_train_2_tfidf)
print("F2 Score : " + str(fbeta_score(y_train_2, train_pred, average='macro', beta=2)))
print("Score : " + str(classifier.score(X_train_2_tfidf, y_train_2)))
print("Cosine Similarity : " + str(counter_cosine_similarity(list(y_train_2), list(train_pred))))

write("sgd_no_outlier", classifier.predict(X_test_tfidf))



F2 Score : 0.768590466262044
Score : 0.8367346938775511
Cosine Similarity : 0.9901975223156287


In [128]:
# Decision Tree Ensemble
# Train 5 trees for every max_depth in 40..59 (100 trees in total) and take a
# per-sample majority vote over their test-set predictions.
tree_preds = []
for i in range(40, 60):
    for j in range(5):
        # A distinct fixed seed per tree keeps the ensemble members different
        # from each other while making the whole vote reproducible.
        classifier = DecisionTreeClassifier(max_depth=i, random_state=(i - 40) * 5 + j)
        classifier.fit(X_train_2_tfidf, y_train_2)
        tree_preds.append(classifier.predict(X_test_tfidf))

# One row per tree, one column per test sample. Stacking once at the end
# avoids re-copying the growing matrix on every iteration.
X_test_pred = np.vstack(tree_preds)

# Majority vote per test sample. np.bincount requires non-negative integer
# labels; np.argmax resolves ties in favour of the smallest label.
X_test_pred_decision_tree_en = np.array(
    [np.argmax(np.bincount(X_test_pred[:, col])) for col in range(X_test_pred.shape[1])]
).astype(int)

write("decision_tree_en", X_test_pred_decision_tree_en)