In [78]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import sklearn.manifold as manifold
from numpy import genfromtxt
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.manifold import SpectralEmbedding
from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier

In [79]:
def give_me_some_data(path_to_file, headers = None, separator = ',',float_precision = None):
    """Read a delimited text file into a pandas DataFrame.

    Parameters
    ----------
    path_to_file : path or file-like object
        Forwarded to ``pd.read_csv`` as ``filepath_or_buffer``.
    headers : forwarded to ``pd.read_csv`` as ``header`` (default ``None``).
    separator : forwarded to ``pd.read_csv`` as ``sep`` (default ``','``).
    float_precision : forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    The DataFrame produced by ``pd.read_csv``, or ``None`` when
    ``path_to_file`` is falsy (e.g. ``None`` or an empty string).
    """
    if not path_to_file:
        # Nothing to read from: callers get None rather than an exception.
        return None

    read_options = {
        "header": headers,
        "sep": separator,
        "float_precision": float_precision,
    }
    return pd.read_csv(path_to_file, **read_options)

In [80]:
def give_me_a_dr(dr_tech = "PCA", n_components = 150, n_neighbours = 150):
    """Build the dimensionality-reduction estimator selected by ``dr_tech``.

    Parameters
    ----------
    dr_tech : str
        One of ``"PCA"``, ``"TruncatedSVD"``, ``"LLE-Standard"``,
        ``"LLE-Hessian"``, ``"spectral_embedding"`` or ``"tsne"``.
    n_components : int
        Passed as ``n_components`` to whichever estimator is built.
    n_neighbours : int
        Passed as ``n_neighbors`` to the two LocallyLinearEmbedding variants;
        the other estimators do not receive it.

    Returns
    -------
    A new, unfitted estimator instance.

    Raises
    ------
    KeyError
        If ``dr_tech`` is not one of the names listed above.
    """
    # Each entry is a zero-argument factory, so only the requested estimator
    # is instantiated instead of all six on every call.
    builders = {
        "PCA": lambda: PCA(n_components=n_components),
        "TruncatedSVD": lambda: TruncatedSVD(n_components=n_components),
        "LLE-Standard": lambda: LocallyLinearEmbedding(
            n_neighbors=n_neighbours, n_components=n_components, method='standard'),
        "LLE-Hessian": lambda: LocallyLinearEmbedding(
            n_neighbors=n_neighbours, n_components=n_components, method='hessian'),
        "spectral_embedding": lambda: SpectralEmbedding(
            n_components=n_components, random_state=0, eigen_solver="arpack"),
        "tsne": lambda: TSNE(n_components=n_components, init='pca', random_state=0),
    }
    if dr_tech not in builders:
        # Still a KeyError (as a plain dict lookup would raise), but with the
        # list of valid names so a typo is easy to spot.
        raise KeyError(
            "Unknown dr_tech %r; expected one of %s" % (dr_tech, sorted(builders))
        )
    return builders[dr_tech]()

In [81]:
def give_me_a_classifier(classifier = "SVC", neighbours = None):
    """Build the classifier selected by ``classifier``.

    Parameters
    ----------
    classifier : str
        One of ``"SVC"`` (linear-kernel SVC), ``"SGD"`` (SGDClassifier with
        ``random_state=42``) or ``"KNN"`` (KNeighborsClassifier).
    neighbours : int or None
        Used only for ``"KNN"``, where it is passed as ``n_neighbors``.
        When ``None``, KNeighborsClassifier is built with its own default
        instead of being handed ``n_neighbors=None``.

    Returns
    -------
    A new, unfitted classifier instance.

    Raises
    ------
    KeyError
        If ``classifier`` is not one of the names listed above.
    """
    def _build_knn():
        # n_neighbors=None is not a usable setting, so fall back to the
        # library default when the caller did not choose a value.
        if neighbours is None:
            return KNeighborsClassifier()
        return KNeighborsClassifier(n_neighbors=neighbours)

    # Zero-argument factories: only the requested classifier is instantiated.
    builders = {
        "SVC": lambda: SVC(
            C=1.0, cache_size=200, class_weight=None, coef0=0.0,
            decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
            max_iter=-1, probability=False, random_state=None, shrinking=True,
            tol=0.001, verbose=False),
        "SGD": lambda: SGDClassifier(random_state=42, max_iter=1000, tol=1e-3),
        "KNN": _build_knn,
    }
    if classifier not in builders:
        # Same exception type a plain dict lookup raises, with a usable message.
        raise KeyError(
            "Unknown classifier %r; expected one of %s" % (classifier, sorted(builders))
        )
    return builders[classifier]()

In [82]:
def give_me_a_pipeline(dr = None, classifier = None):
    """Chain a dimensionality reducer and a classifier into one Pipeline.

    ``dr`` becomes the step named ``"DR"`` and ``classifier`` the step named
    ``'classify'``, in that order. Both are passed through as given.
    """
    stages = [
        ("DR", dr),
        ('classify', classifier),
    ]
    return Pipeline(steps=stages)

In [55]:
# Load the training features, training labels and test features.
# The two .dat files are read with a single-space separator, no header row,
# and pandas' 'high' float_precision setting.
training_data = give_me_some_data(
    '../data/train.dat',
    separator=' ',
    float_precision='high',
    headers=None,
)

# The labels file uses the loader's default ',' separator and no header row.
training_labels = give_me_some_data('../data/train.labels', headers=None)

test_data = give_me_some_data(
    '../data/test.dat',
    separator=' ',
    float_precision='high',
    headers=None,
)

# test_data

# df.to_csv("../data/aaj.csv", sep='\t', encoding='utf-8')
# df.nunique().to_csv("../data/unique_vals.csv", sep='\t', encoding='utf-8')
# cols = df.columns
# for col in cols:
#     df[col] = df[col].astype(float) 
# print(df)

# extract the vectors from the Pandas data file
# X = df.iloc[:,1:].values
# df
# my_data = np.loadtxt('../data/train.dat', delimiter=' ',dtype = np.float64)
# np.set_printoptions(precision = 20)
# my_data
# df1 = pd.DataFrame(my_data)
# df1

# standardise the data
# X_std = StandardScaler().fit_transform(X)

In [74]:
# Hold out 20% of the labelled rows for evaluation. The split is shuffled and
# seeded (random_state=42) so re-running the cell yields the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    training_data, training_labels, test_size=0.2, random_state=42, shuffle=True
)


In [85]:
# Fit a PCA -> linear SVC pipeline on the training split and score it on the
# held-out split. y_pred is computed in this cell so the report does not rely
# on a value left over from an earlier kernel session.
compute_pipeline = give_me_a_pipeline(
    dr=give_me_a_dr(dr_tech="PCA"),
    classifier=give_me_a_classifier(classifier="SVC"),
)
# The labels were loaded as a one-column DataFrame; flatten them to 1-D,
# which is the shape scikit-learn classifiers expect for y.
compute_pipeline.fit(X_train, np.ravel(y_train))
y_pred = compute_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          1       0.60      0.80      0.68      2116
          2       0.45      0.43      0.44      1500
          3       0.00      0.00      0.00       362
          4       0.00      0.00      0.00        58
          5       0.00      0.00      0.00        41
          6       0.00      0.00      0.00         4
          7       0.00      0.00      0.00         2
          8       0.00      0.00      0.00       153
         10       0.00      0.00      0.00         1
         11       0.00      0.00      0.00         1

avg / total       0.46      0.55      0.50      4238



  'precision', 'predicted', average, warn_for)


In [None]:
# print("Computing LLE embedding")
# X_r, err = manifold.locally_linear_embedding(training_data, n_neighbors=5000,
#                                              n_components=50, method = 'hessian')
# print("Done. Reconstruction error: %g" % err)


In [None]:
# Scratch cell: the stray text previously here was an undefined name and
# raised NameError when the cell was run.