In [1]:
import tensorflow as tf
import numpy as np
import sklearn
import pandas as pd

In [2]:
# Load the Fashion-MNIST training CSV from the current working directory.
df = pd.read_csv('./fashion-mnist_train.csv')

In [3]:
# Split the frame into the target column and the feature columns.
# The features are obtained by dropping 'label' by name instead of slicing
# by position, so the split stays correct even when 'label' is not the
# first column of the CSV.
label = df['label']
X = df.drop('label', axis=1)

In [4]:
# Target vector as a 32-bit integer NumPy array.
y = np.asarray(label, dtype=np.int32)

In [5]:
# Rebind X to a 32-bit float NumPy array built from the feature columns.
X = np.asarray(X, dtype=np.float32)

# create a fixed validation set

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. The seed pins the shuffle so the
# split is identical on every run; without it the "fixed" validation set
# would change each time the cell is executed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# baseline: directly fit a logistic regression model

## validation accuracy: 84.26%

In [11]:
from sklearn.linear_model import LogisticRegression as LR

# Baseline classifier: L-BFGS solver, parallelised with n_jobs=-1;
# every other hyper-parameter is left at the library default.
lr = LR(solver='lbfgs', n_jobs=-1)

In [12]:
# Train the baseline on the unscaled training features.
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

In [13]:
# Mean accuracy of the baseline on the held-out validation split.
lr.score(X_test, y_test)

0.84258333333333335

In [23]:
from sklearn.decomposition import PCA

# Per-feature statistics come from the training split only, and are then
# applied unchanged to both splits.
col_mean = np.mean(X_train, axis=0)
col_std = np.std(X_train, axis=0)

# Standardise both splits with one expression. The standard deviation is
# floored at 1e-5 so that constant columns do not divide by zero.
scaled_X_train, scaled_X_test = (
    (split - col_mean) / np.maximum(col_std, 1e-5)
    for split in (X_train, X_test)
)

pca = PCA(n_components=256)

In [25]:
# Fit the PCA on the standardised training split and project that split.
X_train_trans = pca.fit_transform(scaled_X_train)

In [26]:
# Project the validation split with the already-fitted PCA (no refit here).
X_test_trans = pca.transform(scaled_X_test)

## 256 principal components: validation accuracy 85.29%

In [28]:
# Same logistic-regression settings as before, trained on the PCA features.
# fit() returns the estimator itself, so construction and training are chained.
lr = LR(solver='lbfgs', n_jobs=-1).fit(X_train_trans, y_train)
lr.score(X_test_trans, y_test)

0.85291666666666666

## 64 principal components: validation accuracy 82.97%


In [31]:
# Keep only the first 64 columns of the PCA projection and repeat the fit.
# fit() returns the estimator itself, so construction and training are chained.
lr = LR(solver='lbfgs', n_jobs=-1).fit(X_train_trans[:, :64], y_train)
lr.score(X_test_trans[:, :64], y_test)

0.82966666666666666

# Grid search parameters for SVM

## Cross-validation to evaluate model performance

In [9]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
pca = PCA(n_components=128)
svc = svm.SVC()
X_trans = pca.fit_transform(X)

%time svc.fit(X_trans, y)

CPU times: user 24min 33s, sys: 2.28 s, total: 24min 35s
Wall time: 24min 35s


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [7]:


# Candidate SVC settings: 3 kernels x 4 values of C = 12 combinations.
parameters = {'kernel': ('linear', 'rbf', 'sigmoid'), 'C': [0.1, 1, 10, 100]}

# Search on X_trans, the 128-component PCA projection that the single timed
# SVC fit used. The search previously ran on the raw, higher-dimensional X,
# which ignores that projection and makes every one of the
# 12 x n_folds fits more expensive than the timed run.
clf = GridSearchCV(svc, parameters, n_jobs=4)
clf.fit(X_trans, y)

KeyboardInterrupt: 

# Grid search parameters for [xgboost](https://github.com/dmlc/xgboost) model

In [13]:
from sklearn.grid_search import GridSearchCV
import xgboost as xgb
## estimate the time needed to fit a random forest consisting 50 trees
xgb_model = xgb.XGBClassifier(objective='multi:softmax', learning_rate=0.01, max_depth=6, \
                              subsample=0.8, colsample_bytree=0.5, n_estimators=50, min_child_weight=1)
%time xgb_model.fit(X,y)

CPU times: user 13min 37s, sys: 1.74 s, total: 13min 38s
Wall time: 1min 46s


XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.5,
       gamma=0, learning_rate=0.01, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=50, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.8)

In [11]:
# IPython help lookup: displays the XGBClassifier docstring (interactive aid only).
?xgb.XGBClassifier

In [None]:

xgb_model = xgb.XGBClassifier()

## create a reasonable parameter grid based on the time it takes to fit one model.
# Notes on the grid:
# - The dict literal was missing two commas, which made the cell a SyntaxError.
# - 'num_class' and 'eval_metric' are not searched. They are not among the
#   constructor parameters listed in the XGBClassifier repr printed earlier
#   in this notebook, so GridSearchCV could not set them on the estimator.
# - Single-valued entries pin a setting; only learning_rate, min_child_weight
#   and colsample_bytree vary (2 x 2 x 2 = 8 candidates).
parameters = {'nthread': [1],
              'objective': ['multi:softmax'],
              'learning_rate': [0.01, 0.05],
              'max_depth': [6],
              'min_child_weight': [1, 5],
              'silent': [1],
              'subsample': [0.5],
              'colsample_bytree': [0.3, 0.5],
              'n_estimators': [50],
             }

# 'roc_auc' is a binary-classification scorer and fails on this multi-class
# target. Accuracy is the metric used for every other model in this notebook,
# so it is used here as well.
clf = GridSearchCV(xgb_model, parameters, n_jobs=4,
                   cv=3,
                   scoring='accuracy',
                   verbose=2, refit=True)