In [None]:
import numpy as np
import pandas as pd

from sklearn.metrics import fbeta_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

import math
import os
from scipy.sparse import hstack, csr_matrix
from collections import Counter

In [None]:
def read(X, dir):
    """Attach saved page text to every row of ``X`` (modified in place).

    Row ``i`` is matched with the file ``page/<dir>_<i>.txt``, resolved
    against the current working directory and opened with the platform's
    default text encoding.

    For each row:
      * column 0 and column 5 are replaced by their ``str()`` form;
      * column 2 is replaced by the file's text, or by ``""`` when the text
        contains the substring ``"404"`` anywhere.

    NOTE(review): the ``"404"`` check presumably marks pages that failed to
    download, but it also blanks any legitimate page that merely mentions
    "404" -- confirm this is intended.

    Parameters
    ----------
    X : indexable 2-D array with a ``shape`` attribute and writable rows
        of at least 6 columns (the callers pass object-dtype NumPy arrays).
    dir : str
        File-name prefix selecting the page set, e.g. ``"train"``.

    Returns
    -------
    The same ``X`` object that was passed in.
    """
    n_rows = X.shape[0]
    for row_idx in range(n_rows):
        row = X[row_idx]
        row[0] = str(row[0])
        page_path = "page" + os.sep + dir + "_" + str(row_idx) + ".txt"
        with open(page_path, "r") as page_file:
            page_text = page_file.read()
        # Any occurrence of "404" discards the whole page text.
        row[2] = "" if "404" in page_text else page_text
        row[5] = str(row[5])
    return X

In [None]:
def write(method, y_pred):
    """Write predictions to ``sample_submission_<method>.csv``.

    The file is created in (or overwritten in) the current working
    directory. It starts with the header ``article_id,category`` followed by
    one ``<id>,<prediction>`` line per element of ``y_pred``, where ids are
    numbered from 1 in iteration order.

    Parameters
    ----------
    method : str
        Tag inserted into the output file name.
    y_pred : iterable
        Predicted categories; each one is written using ``str()``.
    """
    out_path = "sample_submission_" + method + ".csv"
    with open(out_path, "w") as out_file:
        out_file.write("article_id,category\n")
        for article_id, label in enumerate(y_pred, start=1):
            out_file.write(",".join((str(article_id), str(label))) + "\n")

In [None]:
def counter_cosine_similarity(y1, y2):
    """Cosine similarity between the label-frequency vectors of two sequences.

    Each input is reduced to a ``Counter`` (label -> number of occurrences),
    so element order is ignored: the result measures how similar the two
    label *distributions* are, not whether the sequences agree position by
    position.

    Parameters
    ----------
    y1, y2 : iterable of hashable labels

    Returns
    -------
    float
        A value in ``[0, 1]`` for ordinary sequences. ``0.0`` is returned
        when either input is empty (zero-magnitude vector) instead of
        raising ``ZeroDivisionError``.
    """
    counts_a = Counter(y1)
    counts_b = Counter(y2)

    # A label missing from either side contributes 0 to the dot product, so
    # walking one side's keys is enough; Counter returns 0 for unknown keys
    # without inserting them.
    dotprod = sum(count * counts_b[label] for label, count in counts_a.items())
    mag_a = math.sqrt(sum(count * count for count in counts_a.values()))
    mag_b = math.sqrt(sum(count * count for count in counts_b.values()))

    denominator = mag_a * mag_b
    if denominator == 0:
        # At least one input had no labels; similarity is undefined, report 0.
        return 0.0
    return dotprod / denominator

In [None]:
# Load the first training CSV. na_filter=False switches off pandas' NA
# detection, so the values are kept as read instead of becoming NaN.
train_frame = pd.read_csv(os.path.join("data", "train.csv"), na_filter=False)
trainHeader = train_frame.columns.values
trainData = train_frame.values

# Columns 0-5 are the inputs (copied so read() can edit them without touching
# trainData); column 6 is the integer class label.
X_train = trainData[:, :6].copy()
y_train = trainData[:, 6].astype("int")

# Replace column 2 of each row with the saved page text for the "train" set.
X_train = read(X_train, "train")

In [None]:
# Load the second training CSV. na_filter=False switches off pandas' NA
# detection, so the values are kept as read instead of becoming NaN.
train_frame_2 = pd.read_csv(os.path.join("data", "train_v2.csv"), na_filter=False)
trainHeader_2 = train_frame_2.columns.values
trainData_2 = train_frame_2.values

# Columns 0-5 are the inputs (copied so read() can edit them without touching
# trainData_2); column 6 is the integer class label.
X_train_2 = trainData_2[:, :6].copy()
y_train_2 = trainData_2[:, 6].astype("int")

# Replace column 2 of each row with the saved page text for the "train_2" set.
X_train_2 = read(X_train_2, "train_2")

In [None]:
# Load the test CSV (no label column is read from it). na_filter=False
# switches off pandas' NA detection, so the values are kept as read.
test_frame_2 = pd.read_csv(os.path.join("data", "test_v2.csv"), na_filter=False)
testHeader_2 = test_frame_2.columns.values
testData_2 = test_frame_2.values

# Columns 0-5 are the inputs (copied so read() can edit them without touching
# testData_2); column 2 is then replaced by the saved "test_2" page text.
X_test_2 = read(testData_2[:, :6].copy(), "test_2")

In [None]:
# Stack both training sets and the test set row-wise. The first X_train_size
# rows are training rows; everything after them is test data.
X_combined = np.concatenate([X_train, X_train_2, X_test_2])
X_train_size = len(X_train) + len(X_train_2)

# One TF-IDF matrix per input column. The single vectorizer is re-fitted for
# every column, so each matrix has its own vocabulary and the vectorizer ends
# up fitted on the last column only. Because train and test rows are fitted
# together, test-set vocabulary and document frequencies influence the
# training features.
vectorizer = TfidfVectorizer(norm="l1", stop_words="english")
X_combined_tfidf = []
for column in range(6):
    X_combined_tfidf.append(vectorizer.fit_transform(X_combined[:, column]))

In [None]:
# Sanity check: overall row/column counts, the train/test split point, and
# the (rows, vocabulary size) of each per-column TF-IDF matrix.
print(X_combined.shape)
print(X_train_size)

for column in range(6):
    print(X_combined_tfidf[column].shape)

In [None]:
# Final feature matrices: the TF-IDF blocks of columns 1..5 placed side by
# side (column 0 is left out). Rows before X_train_size are training rows,
# the rest are test rows.
feature_columns = range(1, 6)
X_train_tfidf = hstack([X_combined_tfidf[column][:X_train_size] for column in feature_columns])
X_test_tfidf = hstack([X_combined_tfidf[column][X_train_size:] for column in feature_columns])

print(X_train_tfidf.shape)
print(X_test_tfidf.shape)

In [None]:
# Labels for the stacked training rows, in the same order the two training
# sets were concatenated (first set, then second set).
y_train_combined = np.concatenate([y_train, y_train_2])
print(y_train_combined.shape)

In [None]:
#KNN
# 5-nearest-neighbour baseline fitted on all training rows.
classifier = KNeighborsClassifier(5)
classifier.fit(X_train_tfidf, y_train_combined)

# Predict the training set once and reuse it for every metric below (it used
# to be recomputed for each one). All three numbers are measured on the data
# the model was fitted on, so they describe fit, not generalisation.
train_pred = classifier.predict(X_train_tfidf)
print("F2 Score : " + str(fbeta_score(y_train_combined, train_pred, average='macro', beta=2)))
# Mean accuracy: the same quantity classifier.score() reports.
print("Score : " + str(float(np.mean(train_pred == y_train_combined))))
# Compares label frequencies of truth vs. prediction (order-insensitive).
print("Cosine Similarity : " + str(counter_cosine_similarity(list(y_train_combined), list(train_pred))))

write("knn", classifier.predict(X_test_tfidf))

In [None]:
#Decision Tree
# Single decision tree capped at depth 40. random_state is fixed because
# scikit-learn permutes features at every split, so equally good splits would
# otherwise be chosen differently from run to run.
classifier = DecisionTreeClassifier(max_depth=40, random_state=42)
classifier.fit(X_train_tfidf, y_train_combined)

# Predict the training set once and reuse it for every metric below (it used
# to be recomputed for each one). All three numbers are measured on the data
# the model was fitted on, so they describe fit, not generalisation.
train_pred = classifier.predict(X_train_tfidf)
print("F2 Score : " + str(fbeta_score(y_train_combined, train_pred, average='macro', beta=2)))
# Mean accuracy: the same quantity classifier.score() reports.
print("Score : " + str(float(np.mean(train_pred == y_train_combined))))
# Compares label frequencies of truth vs. prediction (order-insensitive).
print("Cosine Similarity : " + str(counter_cosine_similarity(list(y_train_combined), list(train_pred))))

write("decision_tree", classifier.predict(X_test_tfidf))

In [None]:
#Decision Tree Ensemble
# Train 5 trees for every max_depth in 40..59 (100 trees in total) and keep
# each tree's test-set predictions as one row of votes.
tree_votes = []
for depth in range(40, 60):
    for repeat in range(5):
        # A distinct but fixed seed per tree: the repeats at one depth still
        # differ from each other, yet a rerun reproduces the same ensemble.
        classifier = DecisionTreeClassifier(max_depth=depth, random_state=depth * 5 + repeat)
        classifier.fit(X_train_tfidf, y_train_combined)
        tree_votes.append(classifier.predict(X_test_tfidf))
# Shape: (number of trees, number of test rows).
X_test_pred = np.vstack(tree_votes)

# Majority vote per test row. argmax over bincount returns the smallest label
# on ties; bincount requires the labels to be non-negative integers.
X_test_pred_decision_tree_en = np.asarray(
    [np.argmax(np.bincount(votes)) for votes in X_test_pred.T]
).astype(int)
write("decision_tree_en", X_test_pred_decision_tree_en)

In [None]:
#KNN Ensemble
# One k-NN model for each neighbour count 5..9; every model's test-set
# predictions become one row of votes.
knn_votes = []
for n_neighbors in range(5, 10):
    classifier = KNeighborsClassifier(n_neighbors)
    classifier.fit(X_train_tfidf, y_train_combined)
    knn_votes.append(classifier.predict(X_test_tfidf))
# Shape: (number of models, number of test rows).
X_test_pred = np.vstack(knn_votes)

# Majority vote per test row. argmax over bincount returns the smallest label
# on ties; bincount requires the labels to be non-negative integers.
majority_labels = [
    np.argmax(np.bincount(X_test_pred[:, column]))
    for column in range(X_test_pred.shape[1])
]
X_test_pred_knn_en = np.asarray(majority_labels).astype(int)
write("knn_en", X_test_pred_knn_en)

In [None]:
# How closely the label frequencies of the two ensembles' test predictions
# match (order-insensitive comparison).
ensemble_similarity = counter_cosine_similarity(
    list(X_test_pred_decision_tree_en),
    list(X_test_pred_knn_en),
)
print("Cosine Similarity : " + str(ensemble_similarity))

In [None]:
# Detect outliers

In [None]:
from sklearn import svm
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

# Share of training rows each detector is configured to flag as outliers.
outliers_fraction = 0.15
anomaly_algorithms = [
    ("Local Outlier Factor", LocalOutlierFactor(n_neighbors=35, contamination=outliers_fraction)),
    ("Isolation Forest", IsolationForest(contamination=outliers_fraction, random_state=42)),
    ("One-Class SVM", svm.OneClassSVM(nu=outliers_fraction, kernel="rbf", gamma=0.1))]

# Fit every detector exactly once (each one used to be fitted twice).
# fit_predict is used for all three because it is the one call they share:
# LocalOutlierFactor in its default mode has no separate predict(), and for
# the other two it is equivalent to fit(X).predict(X).
detector_votes = []
for name, algorithm in anomaly_algorithms:
    detector_votes.append(algorithm.fit_predict(X_train_tfidf))

# One row per detector, one column per training row.
outlier_pred = np.vstack(detector_votes)

In [None]:
# Add up the detectors' votes for every training row and collect the indices
# whose total is negative. Assuming outlier_pred holds scikit-learn's usual
# +1 (inlier) / -1 (outlier) labels from three detectors, a negative total
# means at least two of them flagged the row -- TODO confirm against the
# cell that builds outlier_pred.
vote_totals = np.sum(outlier_pred, axis=0)
outlier = [index for index, total in enumerate(vote_totals) if total < 0]
print(len(outlier))

In [None]:
# Number of training rows left after the flagged ones are removed.
X_train_size_no_outlier = X_train_size - len(outlier)

# Drop the flagged rows with a boolean mask instead of testing every row
# index against the outlier list, which costs rows x outliers comparisons.
# Indices that are not in `outlier` are kept unchanged and in order.
keep_rows = np.isin(np.arange(X_combined.shape[0]), outlier, invert=True)
X_combined_no_outlier = X_combined[keep_rows]

# Re-fit TF-IDF per column on the reduced data; the vocabularies can differ
# from the ones built on the full data.
X_combined_no_outlier_tfidf = [vectorizer.fit_transform(X_combined_no_outlier[:, i]) for i in range(6)]

In [None]:
# Feature matrices after outlier removal: the TF-IDF blocks of columns 1..5
# side by side (column 0 is left out). Rows before X_train_size_no_outlier
# are training rows, the rest are test rows.
feature_columns = range(1, 6)
X_train_no_outlier_tfidf = hstack(
    [X_combined_no_outlier_tfidf[column][:X_train_size_no_outlier] for column in feature_columns]
)
X_test_no_outlier_tfidf = hstack(
    [X_combined_no_outlier_tfidf[column][X_train_size_no_outlier:] for column in feature_columns]
)

print(X_train_no_outlier_tfidf.shape)
print(X_test_no_outlier_tfidf.shape)

In [None]:
# Keep the labels of the training rows that were not flagged as outliers.
# A boolean mask replaces the per-index lookup in the outlier list, which
# costs rows x outliers comparisons; order and values are unchanged.
keep_labels = np.isin(np.arange(y_train_combined.shape[0]), outlier, invert=True)
y_train_combined_no_outlier = y_train_combined[keep_labels]

print(y_train_combined_no_outlier.shape)

In [None]:
#Decision Tree Ensemble
# Same ensemble recipe as before, trained on the outlier-free data: 5 trees
# for every max_depth in 40..59 (100 trees), each contributing one row of
# test-set votes.
tree_votes = []
for depth in range(40, 60):
    for repeat in range(5):
        # A distinct but fixed seed per tree: the repeats at one depth still
        # differ from each other, yet a rerun reproduces the same ensemble.
        classifier = DecisionTreeClassifier(max_depth=depth, random_state=depth * 5 + repeat)
        classifier.fit(X_train_no_outlier_tfidf, y_train_combined_no_outlier)
        tree_votes.append(classifier.predict(X_test_no_outlier_tfidf))
# Shape: (number of trees, number of test rows).
X_test_pred = np.vstack(tree_votes)

# Majority vote per test row. argmax over bincount returns the smallest label
# on ties; bincount requires the labels to be non-negative integers.
X_test_pred_no_outlier_decision_tree_en = np.asarray(
    [np.argmax(np.bincount(votes)) for votes in X_test_pred.T]
).astype(int)
write("decision_tree_en_no_outlier", X_test_pred_no_outlier_decision_tree_en)

In [None]:
# How closely the label frequencies of the tree ensemble's test predictions
# match with and without outlier removal (order-insensitive comparison).
outlier_effect_similarity = counter_cosine_similarity(
    list(X_test_pred_decision_tree_en),
    list(X_test_pred_no_outlier_decision_tree_en),
)
print("Cosine Similarity : " + str(outlier_effect_similarity))