In [17]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

In [2]:
# Read the space-delimited training feature matrix.
with open('data/train.dat') as features_file:
    train_data = np.loadtxt(features_file, delimiter=' ')

# Read the matching labels (one value per row of the feature matrix is assumed).
with open('data/train.labels') as labels_file:
    train_labels = np.loadtxt(labels_file)

In [3]:
# np.loadtxt yields floats by default; cast the class labels to integers.
train_labels = train_labels.astype(np.int_)

In [4]:
# Fit a 30-component PCA on the full training matrix and report how much of
# the variance those components retain.
pca = PCA(n_components=30)
pca.fit(train_data)
print(f'Explained Variance Ratios: {pca.explained_variance_ratio_}')
print(f'Total Explained Variance: {pca.explained_variance_ratio_.sum()*100:.2f}%')

Explained Variance Ratios: [3.93769773e-01 2.20172894e-01 1.19471691e-01 9.81370704e-02
 4.83872297e-02 3.75871112e-02 2.38757707e-02 1.28982616e-02
 1.01015819e-02 9.50935153e-03 4.89691254e-03 3.89743014e-03
 3.13858295e-03 2.88417632e-03 1.97199250e-03 1.76998103e-03
 1.41329759e-03 1.00695243e-03 9.18924288e-04 6.82589135e-04
 6.33042612e-04 5.53197798e-04 4.79835474e-04 3.82989286e-04
 3.63376462e-04 2.87765312e-04 2.39014137e-04 2.12160245e-04
 1.09258452e-04 7.32474911e-05]
Total Explained Variance: 99.98%


In [5]:
# Project the training matrix with the PCA fitted in the previous cell.
# `pca` is already fitted on `train_data`, so refitting here is redundant work;
# reusing the fitted model also guarantees the projection matches the
# explained-variance figures reported above.
X = pca.transform(train_data)

In [6]:
# Hold out 20% of the PCA-projected samples for validation; the split is seeded
# so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    train_labels,
    test_size=0.20,
    random_state=40,
)

In [7]:
# Gaussian naive Bayes baseline: fit on the training split, score on the hold-out split.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
print(f'accuracy score = {accuracy_score(y_test, y_pred):.4f}')

accuracy score = 0.4070


In [8]:
# Decision-tree baseline: fit on the training split, score on the hold-out split.
# The tree is seeded (same seed as the train/validation split) because
# scikit-learn permutes candidate features randomly at each split; without a
# seed the reported accuracy can change from run to run.
clf = DecisionTreeClassifier(random_state=40)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(f'accuracy score = {accuracy_score(y_test, y_pred):.4f}')

accuracy score = 0.5418


In [35]:
# Multi-layer perceptron with three hidden layers of 62 units each, trained on
# standardised PCA features.
# NOTE(review): per the scikit-learn documentation, `learning_rate` and
# `power_t` are only consulted by the 'sgd' solver, so with 'adam' they appear
# to have no effect — confirm before tuning them.
mlpc = MLPClassifier(
    # architecture
    hidden_layer_sizes=(62, 62, 62),
    activation='relu',
    # optimiser
    solver='adam',
    learning_rate='adaptive',
    learning_rate_init=0.001,
    power_t=0.5,
    batch_size=500,
    # regularisation and stopping
    alpha=0.001,
    max_iter=1000,
    tol=0.005,
    random_state=10,
)

# Scale the features before they reach the network, then fit on the training
# split and report accuracy on the hold-out split.
pipe = Pipeline([('scaler', StandardScaler()), ('mlp', mlpc)])
score = pipe.fit(X_train, y_train).score(X_test, y_test)
print(f'accuracy score = {score:.4f}')

accuracy score = 0.6789


In [34]:
# k-nearest-neighbours classifier: 10 neighbours, distance-weighted votes,
# Manhattan metric, searched with a ball tree.
knn = KNeighborsClassifier(
    n_neighbors=10,
    weights='distance',
    algorithm='ball_tree',
    leaf_size=100,
    metric='manhattan'
)

# Standardise the PCA features before the distance computation.
# The estimator step is labelled 'knn'; it was previously labelled 'mlp', a
# leftover from the MLP cell that made `pipe.named_steps` misleading.
# NOTE: this cell rebinds `pipe` and `score`, which the MLP cell also assigns,
# so later cells use whichever of the two model cells was executed last.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', knn),
])
pipe.fit(X_train, y_train)
score = pipe.score(X_test, y_test)
print(f'accuracy score = {score:.4f}')

accuracy score = 0.7131


In [10]:
# Read the space-delimited, unlabelled test feature matrix.
with open('data/test.dat') as test_file:
    test_data = np.loadtxt(test_file, delimiter=' ')

In [165]:
# Project the test matrix with the PCA that was fitted on the training data.
# Calling `fit_transform` here would re-learn the components from the test set,
# so the test features would be expressed in a different basis from the one the
# classifier was trained on and the predictions would be unreliable.
# NOTE: this overwrites `test_data` with its projection, so run the cell once
# per load of 'data/test.dat'.
test_data = pca.transform(test_data)

In [166]:
# Predict labels for the PCA-projected test matrix using the pipeline currently
# bound to `pipe`.
# NOTE(review): both the MLP cell and the KNN cell assign `pipe`, so the model
# used here depends on which of them was executed last — confirm before submitting.
test_labels = pipe.predict(test_data)

In [169]:
# Write one integer label per line. The number embedded in the file name is
# `score`, i.e. the hold-out accuracy printed by the last model cell that ran.
# NOTE(review): the name says "f1" but the value is an accuracy — verify the
# intended tag.
np.savetxt(
    f'predictions_f1_{score:.4f}.txt',
    test_labels,
    fmt='%i',
)