In [21]:
import pandas as pd
import numpy as np

from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

from sklearn import metrics

import time

In [22]:
# Load the Fashion-MNIST training and test splits from CSV files
# located in the current working directory.
df_train, df_test = (
    pd.read_csv(csv_name)
    for csv_name in ('fashion-mnist_train.csv', 'fashion-mnist_test.csv')
)

In [23]:
# Separate the 'label' target column from the remaining feature columns
# for both the training and the test split.
train_labels, test_labels = df_train['label'], df_test['label']
train_x = df_train.drop(columns='label')
test_x = df_test.drop(columns='label')

# LightGBM

In [24]:
# If LightGBM is not installed yet, run: conda install -c conda-forge lightgbm

In [25]:
from lightgbm import LGBMClassifier

In [28]:
# Multiclass LightGBM classifier with path smoothing set to 0.5;
# every other hyperparameter is left at the library default.
lgb_model = LGBMClassifier(
    path_smooth=0.5,
    objective='multiclass',
)

In [29]:
# Train LightGBM on the full training split and report the wall-clock
# training time in minutes.
# NOTE(review): categorical_feature=[0, 3] tells LightGBM to treat feature
# columns 0 and 3 as categorical — confirm this is intended for this data.
start1 = time.time()
lgb_model.fit(
    train_x,
    train_labels,
    categorical_feature=[0, 3],
)
end1 = time.time()

lightGBM_time = end1 - start1
print(f"lightGBM Time: {lightGBM_time / 60.0:0.2f} minute")



lightGBM Time: 4.43 minute


In [30]:
# Predict the test split with LightGBM, keep the predictions under a
# model-specific name for the later comparison, and print per-class metrics.
lightGBM_pred_y = lgb_model.predict(test_x)
predicted_y = lightGBM_pred_y
expected_y = test_labels
print(metrics.classification_report(y_true=expected_y, y_pred=predicted_y))

              precision    recall  f1-score   support

           0       0.84      0.87      0.85      1000
           1       0.99      0.98      0.99      1000
           2       0.84      0.84      0.84      1000
           3       0.91      0.92      0.92      1000
           4       0.85      0.87      0.86      1000
           5       0.99      0.96      0.98      1000
           6       0.75      0.71      0.73      1000
           7       0.95      0.97      0.96      1000
           8       0.98      0.98      0.98      1000
           9       0.96      0.97      0.97      1000

    accuracy                           0.91     10000
   macro avg       0.91      0.91      0.91     10000
weighted avg       0.91      0.91      0.91     10000



In [10]:
# Light GBM
# accuracy changed 

In [None]:
# Work in progress (still tuning): sweep LightGBM's path_smooth over
# 0.0, 0.1, ..., 0.9 and record the misclassification rate for each value.
# The cell uses train_x / train_labels / test_x / test_labels, the names
# created when the CSV data was split earlier in this notebook.
# NOTE(review): the error is measured on the test split; a held-out
# validation split or cross-validation would avoid tuning on test data.
err_list = []
for i in np.arange(0.0, 1.0, 0.1):
    model = LGBMClassifier(objective='multiclass', path_smooth=i)
    model.fit(train_x, train_labels, categorical_feature=[0, 3])
    predictions = model.predict(test_x)
    # Fraction of test samples whose predicted label differs from the truth.
    error = np.mean(predictions != test_labels)
    err_list.append(error)
err_list

In [None]:
# Work in progress (still tuning). This cell repeats the path_smooth sweep
# of the previous cell: for each value in 0.0, 0.1, ..., 0.9 it fits a
# LightGBM model and records the misclassification rate.
# The cell uses train_x / train_labels / test_x / test_labels, the names
# created when the CSV data was split earlier in this notebook.
# NOTE(review): the error is measured on the test split; a held-out
# validation split or cross-validation would avoid tuning on test data.
err_list = []
for i in np.arange(0.0, 1.0, 0.1):
    model = LGBMClassifier(objective='multiclass', path_smooth=i)
    model.fit(train_x, train_labels, categorical_feature=[0, 3])
    predictions = model.predict(test_x)
    # Fraction of test samples whose predicted label differs from the truth.
    error = np.mean(predictions != test_labels)
    err_list.append(error)
err_list

# LDA

In [31]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [32]:
# Linear discriminant analysis classifier configured with 9 components.
# NOTE(review): presumably 9 = (number of classes - 1) for the 10 labels
# seen in the reports — confirm.
lda = LDA(
    n_components=9,
)

In [33]:
# Fit LDA, predict the test split and print per-class metrics.
# The timer covers fitting, prediction and printing the report.
start2 = time.time()

lda_model = lda.fit(train_x, train_labels)
lda_pred_y = lda_model.predict(test_x)
predicted_y = lda_pred_y
expected_y = test_labels
print(metrics.classification_report(expected_y, predicted_y))

end2 = time.time()
LDA_time = end2 - start2
print(f"LDA Time: {LDA_time / 60.0:0.2f} minute")

              precision    recall  f1-score   support

           0       0.80      0.77      0.78      1000
           1       0.99      0.94      0.97      1000
           2       0.76      0.71      0.73      1000
           3       0.82      0.87      0.85      1000
           4       0.75      0.79      0.77      1000
           5       0.86      0.89      0.88      1000
           6       0.59      0.60      0.60      1000
           7       0.87      0.85      0.86      1000
           8       0.94      0.92      0.93      1000
           9       0.90      0.91      0.90      1000

    accuracy                           0.83     10000
   macro avg       0.83      0.83      0.83     10000
weighted avg       0.83      0.83      0.83     10000

LDA Time: 0.28 minute


In [34]:
# Same classification report as a dict, shown as a table with one row per
# class / summary entry. It uses whatever expected_y / predicted_y currently
# hold (the LDA results when the notebook is run top to bottom).
report = metrics.classification_report(
    expected_y,
    predicted_y,
    output_dict=True,
)
pd.DataFrame(report).T

Unnamed: 0,precision,recall,f1-score,support
0,0.80167,0.768,0.784474,1000.0
1,0.994709,0.94,0.966581,1000.0
2,0.755814,0.715,0.734841,1000.0
3,0.821698,0.871,0.845631,1000.0
4,0.746212,0.788,0.766537,1000.0
5,0.862403,0.89,0.875984,1000.0
6,0.585673,0.605,0.59518,1000.0
7,0.87156,0.855,0.8632,1000.0
8,0.937564,0.916,0.926657,1000.0
9,0.897233,0.908,0.902584,1000.0


# Random Forest

In [35]:
# Train a 100-tree random forest (entropy criterion, depth capped at 70) and
# report the wall-clock fit time in minutes, like the other model sections.
from sklearn.ensemble import RandomForestClassifier

start3 = time.time()

# random_state fixes the forest's internal randomness so that re-running the
# notebook reproduces the same model and metrics.
random_forest = RandomForestClassifier(
    criterion='entropy',
    max_depth=70,
    n_estimators=100,
    random_state=42,
)
rf_fit = random_forest.fit(train_x, train_labels)

end3 = time.time()
RanForest_time = end3 - start3
print("Random Forest Time: {:0.2f} minute".format(RanForest_time/60.0))

In [36]:

# Score the random forest on the test split: overall accuracy first,
# then the per-class precision / recall / F1 breakdown.
pred_forest = random_forest.predict(test_x)
random_forest_accuracy = metrics.accuracy_score(test_labels, pred_forest)
print(f"Test Accuracy score: {random_forest_accuracy}")
print(metrics.classification_report(y_true=test_labels, y_pred=pred_forest))


Test Accuracy score: 0.8845
              precision    recall  f1-score   support

           0       0.81      0.87      0.83      1000
           1       0.99      0.98      0.98      1000
           2       0.79      0.81      0.80      1000
           3       0.90      0.93      0.91      1000
           4       0.80      0.85      0.83      1000
           5       0.98      0.94      0.96      1000
           6       0.76      0.60      0.67      1000
           7       0.92      0.94      0.93      1000
           8       0.95      0.98      0.96      1000
           9       0.94      0.95      0.95      1000

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



# SVM

In [37]:
from sklearn import svm
import sklearn.model_selection as model_selection
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [39]:
# Train an SVM with a degree-3 polynomial kernel (C=10) on the training
# split and report the wall-clock fit time in minutes.
startsvm = time.time()

svmpoly = svm.SVC(kernel='poly', degree=3, C=10).fit(train_x, train_labels)

endsvm = time.time()
SVMsvm_time = endsvm - startsvm
# The label names the SVM fit explicitly so this timing cannot be mistaken
# for the LDA timing printed earlier in the notebook.
print("SVM fit Time: {:0.2f} minute".format(SVMsvm_time/60.0))

In [40]:
# Predict the test split with the polynomial-kernel SVM, keep the predictions
# under a model-specific name, and print per-class metrics.
# The timer covers prediction and printing the report.
start4 = time.time()

expected_y = test_labels
predicted_y = svmpoly.predict(test_x)
SVM_pred_y = predicted_y
print(metrics.classification_report(expected_y, predicted_y))

end4 = time.time()
SVM_time = end4 - start4
# The label names the SVM prediction step explicitly so this timing cannot
# be mistaken for the LDA timing printed earlier in the notebook.
print("SVM predict Time: {:0.2f} minute".format(SVM_time/60.0))

              precision    recall  f1-score   support

           0       0.81      0.88      0.84      1000
           1       0.98      0.98      0.98      1000
           2       0.82      0.80      0.81      1000
           3       0.90      0.90      0.90      1000
           4       0.85      0.82      0.83      1000
           5       0.93      0.96      0.94      1000
           6       0.75      0.72      0.73      1000
           7       0.94      0.94      0.94      1000
           8       0.98      0.96      0.97      1000
           9       0.95      0.95      0.95      1000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

LDA Time: 1.84 minute


In [41]:
# Same classification report as a dict, shown as a table with one row per
# class / summary entry. It uses whatever expected_y / predicted_y currently
# hold (the SVM results when the notebook is run top to bottom).
report = metrics.classification_report(
    expected_y,
    predicted_y,
    output_dict=True,
)
pd.DataFrame(report).T

Unnamed: 0,precision,recall,f1-score,support
0,0.809041,0.877,0.841651,1000.0
1,0.984985,0.984,0.984492,1000.0
2,0.820513,0.8,0.810127,1000.0
3,0.898507,0.903,0.900748,1000.0
4,0.851197,0.818,0.834268,1000.0
5,0.929126,0.957,0.942857,1000.0
6,0.746362,0.718,0.731906,1000.0
7,0.939122,0.941,0.94006,1000.0
8,0.977665,0.963,0.970277,1000.0
9,0.954865,0.952,0.95343,1000.0


In [42]:
# Map each class label 0-9 to its precision from the current `report`,
# rounded to two decimals.
out = {
    label: round(report[str(label)]['precision'], 2)
    for label in range(10)
}

In [43]:
# Collect every model's test-set predictions side by side, one column per model.
y_pred_test = pd.DataFrame({
    'LightGBM': lightGBM_pred_y,
    'LDA': lda_pred_y,
    'SVM': SVM_pred_y,
    'RF': pred_forest,
})

In [44]:
# Persist the per-model test predictions to a CSV file in the working directory.
y_pred_test.to_csv(path_or_buf='y_pred_test.csv')

## User defined function

In [None]:
# Reference tables: one dict per model mapping class label (0-9) to a
# hard-coded per-class precision. Position k in each list is the value for
# class k.
# NOTE(review): presumably ref1/ref2/ref3 correspond to LightGBM/LDA/SVM from
# an earlier run — confirm against the reports above.
ref1 = dict(enumerate([0.84, 1.0, 0.8, 0.89, 0.8, 0.99, 0.72, 0.95, 0.98, 0.97]))
ref2 = dict(enumerate([0.82, 1.0, 0.71, 0.81, 0.71, 0.89, 0.56, 0.88, 0.94, 0.92]))
ref3 = dict(enumerate([0.79, 0.99, 0.8, 0.88, 0.83, 0.84, 0.66, 0.94, 0.97, 0.96]))
ref_df = [ref1, ref2, ref3]

In [None]:
# Keep only the models that have a reference-precision dict in ref_df.
# y_pred_test also carries an 'RF' column, but ref_df holds three dicts, and
# the cells below expect exactly one prediction column per dict.
# NOTE(review): column order is assumed to line up with ref1/ref2/ref3 — confirm.
ypred_org = y_pred_test[['LightGBM', 'LDA', 'SVM']]
# Independent working copy so the original predictions stay untouched.
ypred = ypred_org.copy()

In [None]:
# Represent every predicted label by the reference precision of that class
# for the model that made the prediction.
# The frame is rebuilt from ypred_org on each run, so re-executing the cell
# is safe, and only the first len(ref_df) prediction columns are used, so an
# extra model column without a reference dict cannot break the lookup.
# Columns are numbered 1..len(ref_df); (column label - 1) is therefore the
# positional index of the matching model column in ypred_org.
# Labels missing from a reference dict would map to NaN.
ypred = pd.DataFrame({
    k + 1: ypred_org.iloc[:, k].map(ref)
    for k, ref in enumerate(ref_df)
})
ypred

Unnamed: 0,1,2,3
0,0.97,0.92,0.96
1,0.80,0.71,0.80
2,1.00,1.00,0.99
3,1.00,1.00,0.99
4,0.72,0.56,0.66
...,...,...,...
9995,0.97,0.92,0.96
9996,1.00,1.00,0.99
9997,0.98,0.56,0.97
9998,1.00,1.00,0.99


In [None]:
# For every sample, find the model column holding the highest reference
# precision and keep the label that model predicted for the sample.
# Column labels in ypred are 1-based, hence the "- 1" to get a position.
n = ypred.idxmax(axis='columns')
result = [
    ypred_org.iloc[row, n[row] - 1]
    for row in range(len(n))
]

In [None]:
# Per-class metrics of the precision-based selection rule, shown as a table.
udf_report = metrics.classification_report(
    test_labels,
    result,
    output_dict=True,
)
pd.DataFrame(udf_report).T

Unnamed: 0,precision,recall,f1-score,support
0,0.79375,0.889,0.838679,1000.0
1,0.989837,0.974,0.981855,1000.0
2,0.80426,0.793,0.79859,1000.0
3,0.846435,0.926,0.884432,1000.0
4,0.78658,0.844,0.814279,1000.0
5,0.951866,0.969,0.960357,1000.0
6,0.824615,0.536,0.649697,1000.0
7,0.949704,0.963,0.956306,1000.0
8,0.930806,0.982,0.955718,1000.0
9,0.95825,0.964,0.961117,1000.0


# Ensemble the models

In [None]:
import lightgbm as ltg
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn import svm
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import StackingClassifier

In [None]:
# (name, model) pairs shared by the voting and stacking ensembles below.
estimator = [
    ('LDA', LDA(n_components=9)),
    ('SVC', svm.SVC(kernel='poly', degree=3, C=1)),
    ('LGBM', ltg.LGBMClassifier(objective='multiclass', path_smooth=0.2)),
]

## Voting

In [None]:
# Hard-voting ensemble over the estimators listed above, fitted on the
# training split.
vot_hard = VotingClassifier(
    estimators=estimator,
    voting='hard',
)
vot_hard.fit(train_x, train_labels)

VotingClassifier(estimators=[('LDA',
                              LinearDiscriminantAnalysis(n_components=9)),
                             ('SVC', SVC(C=1, kernel='poly')),
                             ('LGBM',
                              LGBMClassifier(objective='multiclass',
                                             path_smooth=0.2))])

In [None]:
# Predict the test split with the hard-voting ensemble and show its accuracy.
# NOTE(review): this rebinds `ypred`, which the precision-selection cells
# above use for a DataFrame — re-run those cells from their start if needed.
ypred = vot_hard.predict(test_x)
accuracy_score(y_true=test_labels, y_pred=ypred)

0.8723

## Stacking

In [None]:
# Stacking ensemble: the base estimators listed above feed a logistic
# regression meta-learner (liblinear solver, at most 200 iterations).
clf = StackingClassifier(
    estimators=estimator,
    final_estimator=LogisticRegression(solver='liblinear', max_iter=200),
)

In [None]:
# Fit the stacking ensemble on the training split, then show its accuracy on
# the test split as the cell output.
clf.fit(train_x, train_labels)
clf.score(test_x, test_labels)