Leaf Classification

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.gridspec as gridspec

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import KFold,GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, log_loss
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVR
from sklearn.svm import SVC
from sklearn import neighbors
from sklearn.neighbors import NearestNeighbors

from scipy.stats.stats import pearsonr   

# Read the training table (features + species label) and the unlabeled test table.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./input/train.csv', './input/test.csv')
)

**Data Preparation**

In [2]:
# Turn the species names into integer class ids. A single fit_transform both
# learns the mapping and encodes the column.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(train.species)
# fit() returns the encoder itself, so `classLabel` is just another handle on it.
classLabel = label_encoder
classes = list(classLabel.classes_)

# Keep the test ids aside; they become the index of the submission file.
test_ids = test.id

# Strip the non-feature columns so only numeric descriptors remain.
train = train.drop(columns=['species', 'id'])
test = test.drop(columns=['id'])

# Standardize the training features. `scaler` is kept so the same statistics
# can be applied to the test features later; `train` becomes a NumPy array.
scaler = StandardScaler()
train = scaler.fit_transform(train)

In [None]:
# Number of distinct species classes learned by the label encoder.
len(classes)

In [3]:
# Shared 5-fold splitter. Shuffling with a fixed seed means every model below
# is evaluated on exactly the same folds.
kfold = KFold(
    n_splits=5,
    shuffle=True,
    random_state=4,
)

In [4]:
# Gaussian naive Bayes, scored with the shared K-fold splitter.
# `train` is already standardized by the data-preparation cell.
nb = GaussianNB()
naiveBayseScore = []
for train_index, test_index in kfold.split(train):
    X_train = train[train_index]
    y_train = labels[train_index]
    X_test = train[test_index]
    y_test = labels[test_index]
    # fit() returns the estimator, so `naiveBayse` is the freshly fitted `nb`.
    naiveBayse = nb.fit(X_train, y_train)
    naiveBayseScore += [naiveBayse.score(X_test, y_test)]
nb_validation = naiveBayseScore

# Mean held-out accuracy across the folds.
np.mean(nb_validation)

0.48686868686868684

In [5]:
# Linear discriminant analysis, scored with the shared K-fold splitter.
ld = LinearDiscriminantAnalysis(priors=None)
linearDiscScore = []

for train_index, test_index in kfold.split(train):
    X_train = train[train_index]
    y_train = labels[train_index]
    X_test = train[test_index]
    y_test = labels[test_index]
    # Refit on this fold's training rows, then record held-out accuracy.
    linearDisc = ld.fit(X_train, y_train)
    linearDiscScore += [linearDisc.score(X_test, y_test)]
ld_validation = linearDiscScore

# Mean held-out accuracy across the folds.
np.mean(ld_validation)



0.97171717171717165

In [6]:
# k-nearest neighbours (ball-tree search, distance-weighted votes),
# scored with the shared K-fold splitter.
knn = neighbors.KNeighborsClassifier(weights='distance', algorithm='ball_tree')
kNNScore = []
for train_index, test_index in kfold.split(train):
    X_train = train[train_index]
    y_train = labels[train_index]
    X_test = train[test_index]
    y_test = labels[test_index]
    KnnCl = knn.fit(X_train, y_train)
    kNNScore += [KnnCl.score(X_test, y_test)]
knn_validation = kNNScore

# Mean held-out accuracy across the folds.
np.mean(knn_validation)

0.96262626262626261

In [7]:
# Multinomial logistic regression (L-BFGS solver), scored with the shared
# K-fold splitter.
logreg = LogisticRegression(multi_class='multinomial', solver='lbfgs')
logregScore = []

for train_index, test_index in kfold.split(train):
    X_train = train[train_index]
    y_train = labels[train_index]
    X_test = train[test_index]
    y_test = labels[test_index]
    LogReg = logreg.fit(X_train, y_train)
    logregScore += [LogReg.score(X_test, y_test)]
ls_validation = logregScore

# Mean held-out accuracy across the folds.
np.mean(ls_validation)

0.98181818181818203

In [8]:
# Logistic regression tuned by a grid search (nested cross-validation):
# the outer loop uses the shared K-fold splitter for scoring, while the inner
# 3-fold GridSearchCV picks C and tol on each outer training split.
params = {'C': [100, 1000], 'tol': [0.001, 0.0001]}
lr = LogisticRegression(solver='lbfgs', multi_class='multinomial')
# refit must be the boolean True. The previous string 'True' only worked
# because any non-empty string is truthy (so 'False' would have refit as well).
gs = GridSearchCV(lr, params, scoring=None, refit=True, cv=3)
gridSearchScore = list()

for train_index, test_index in kfold.split(train):
    X_train, X_test = train[train_index], train[test_index]
    y_train, y_test = labels[train_index], labels[test_index]
    # fit() runs the inner search and refits the best model on X_train;
    # score() then evaluates that best model on the outer held-out fold.
    GridSearch = gs.fit(X_train, y_train)
    gridSearchScore.append(GridSearch.score(X_test, y_test))
gs_validation = gridSearchScore

# Mean held-out accuracy across the outer folds.
np.mean(gs_validation)


0.98383838383838373

In [None]:
# Random forest (500 trees), scored with the shared K-fold splitter.
# random_state is fixed so the fold scores, and the feature-importance plot
# that reads `rf` afterwards, are reproducible across runs. This matches the
# seeded ExtraTreesClassifier used elsewhere in the notebook.
rf = RandomForestClassifier(n_estimators=500, random_state=0)
randomForestScore = list()

for train_index, test_index in kfold.split(train):
    X_train, X_test = train[train_index], train[test_index]
    y_train, y_test = labels[train_index], labels[test_index]
    # Refit on this fold's training rows, then record held-out accuracy.
    # After the loop `rf` holds the model fitted on the last fold only.
    randomForest = rf.fit(X_train, y_train)
    randomForestScore.append(randomForest.score(X_test, y_test))

rf_validation = randomForestScore

# Mean held-out accuracy across the folds.
np.mean(rf_validation)

In [None]:
# Extremely randomized trees (500 trees, seeded), scored with the shared
# K-fold splitter.
etcl = ExtraTreesClassifier(random_state=0, n_estimators=500)
ExtraTreeScore = []

for train_index, test_index in kfold.split(train):
    X_train = train[train_index]
    y_train = labels[train_index]
    X_test = train[test_index]
    y_test = labels[test_index]
    # Refit on this fold's training rows, then record held-out accuracy.
    ExtraTree = etcl.fit(X_train, y_train)
    ExtraTreeScore += [ExtraTree.score(X_test, y_test)]

etcl_validation = ExtraTreeScore

# Mean held-out accuracy across the folds.
np.mean(etcl_validation)

In [None]:
# Feature importances of the fitted random forest `rf`, sorted from most to
# least important, with the spread across the individual trees as error bars.
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]
imp_std = np.std([est.feature_importances_ for est in rf.estimators_], axis=0)

fig = plt.figure(figsize=(12, 4))
gs1 = gridspec.GridSpec(1, 2, height_ratios=[1])
ax1 = fig.add_subplot(gs1[0])
ax2 = fig.add_subplot(gs1[1])

# One entry per panel: (axis, slice of the ranking, bar colour, error colour, title).
panels = [
    (ax1, slice(None, 10), "#6480e5", '#31427e', 'First 10 Important Features'),
    (ax2, slice(-10, None), "#e56464", '#7e3131', 'Last 10 Important Features'),
]
for ax, window, bar_color, err_color, panel_title in panels:
    ax.margins(0.05)
    ax.bar(range(10), importances[indices][window],
           color=bar_color, yerr=imp_std[indices][window],
           ecolor=err_color, align="center")
    # Tick labels are the original feature column numbers.
    ax.set_xticks(range(10))
    ax.set_xticklabels(indices[window])
    # Same axis limits on both panels so the bar heights are comparable.
    ax.set_xlim([-1, 10])
    ax.set_ylim([0, 0.035])
    ax.set_xlabel('Feature numbers')
    ax.set_ylabel('Random Forest Normalized Importance')
    ax.set_title(panel_title)

gs1.tight_layout(fig)
plt.show()

In [None]:
#correlation analysis (TBD)


In [None]:
# Feature-selection exploration: recursive feature elimination with
# cross-validation (RFECV) around a linear SVM, dropping one feature per step
# and scoring each subset by 5-fold accuracy.
svc = SVC(kernel="linear")
ld = LinearDiscriminantAnalysis()  # not used by this cell

featureSelector = RFECV(estimator=svc, step=1, cv=5, scoring='accuracy')

# fit() returns the selector itself; transform() keeps only the selected columns.
rfecv = featureSelector.fit(train, labels)
train_rfecv = rfecv.transform(train)

print("Optimal number of features : %d" % featureSelector.n_features_)

# `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer the mean CV score from `cv_results_` and fall back to the legacy
# attribute only on old versions that do not have it.
if hasattr(featureSelector, "cv_results_"):
    cv_scores = featureSelector.cv_results_["mean_test_score"]
else:
    cv_scores = featureSelector.grid_scores_

plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
# With step=1 and the default minimum of one feature, entry i of the scores
# corresponds to a subset of i + 1 features.
plt.plot(range(1, len(cv_scores) + 1), cv_scores)
plt.show()

In [None]:
# Recursive feature elimination with a fixed target: keep the 25 features
# ranked best by a linear SVM, removing one feature per iteration.
svc = SVC(kernel="linear")
rfe = RFE(estimator=svc, step=1, n_features_to_select=25)
train_rfe = rfe.fit(train, labels).transform(train)

# Peek at the second sample's reduced feature vector.
train_rfe[1]

In [None]:
# Gaussian naive Bayes on the reduced feature set. The features used here are
# `train_rfecv` (the RFECV selection); the folds are generated from `train`,
# which has the same rows, so the indices line up.
naiveBayseScore = []
for train_index, test_index in kfold.split(train):
    X_train = train_rfecv[train_index]
    y_train = labels[train_index]
    X_test = train_rfecv[test_index]
    y_test = labels[test_index]
    # Reuses the `nb` estimator created earlier, refitting it per fold.
    naiveBayse = nb.fit(X_train, y_train)
    naiveBayseScore += [naiveBayse.score(X_test, y_test)]

nb_validation = naiveBayseScore

# Mean held-out accuracy across the folds.
np.mean(nb_validation)

In [9]:
# Principal Component Analysis: the number of components is chosen
# automatically ('mle') using the full SVD solver.
pca = PCA(
    n_components='mle',
    svd_solver='full',
    iterated_power='auto',
)
# fit() returns the PCA object itself; `pca_fit` is kept because later cells
# use it to project the test set.
pca_fit = pca.fit(train)
train_pca = pca_fit.transform(train)

In [10]:
# Gaussian naive Bayes on the PCA-projected features, scored with the shared
# K-fold splitter.
nb_pca = GaussianNB()
naiveBayseScore = []
for train_index, test_index in kfold.split(train_pca):
    X_train = train_pca[train_index]
    y_train = labels[train_index]
    X_test = train_pca[test_index]
    y_test = labels[test_index]
    naiveBayse = nb_pca.fit(X_train, y_train)
    naiveBayseScore += [naiveBayse.score(X_test, y_test)]

nb_validation = naiveBayseScore
# Mean held-out accuracy across the folds.
np.mean(nb_validation)

0.90909090909090895

In [11]:
# Linear discriminant analysis (least-squares solver, no shrinkage) on the
# PCA-projected features, scored with the shared K-fold splitter.
ld_pca = LinearDiscriminantAnalysis(solver='lsqr')
linearDiscScore = []

for train_index, test_index in kfold.split(train_pca):
    X_train = train_pca[train_index]
    y_train = labels[train_index]
    X_test = train_pca[test_index]
    y_test = labels[test_index]
    # Refit on this fold's training rows, then record held-out accuracy.
    linearDisc = ld_pca.fit(X_train, y_train)
    linearDiscScore += [linearDisc.score(X_test, y_test)]

ld_validation = linearDiscScore

# Mean held-out accuracy across the folds.
np.mean(ld_validation)



0.97171717171717165

In [12]:
# k-nearest neighbours (ball-tree search, distance-weighted votes) on the
# PCA-projected features, scored with the shared K-fold splitter.
knn_pca = neighbors.KNeighborsClassifier(weights='distance', algorithm='ball_tree')
kNNScore = []
for train_index, test_index in kfold.split(train_pca):
    X_train = train_pca[train_index]
    y_train = labels[train_index]
    X_test = train_pca[test_index]
    y_test = labels[test_index]
    KnnCl = knn_pca.fit(X_train, y_train)
    kNNScore += [KnnCl.score(X_test, y_test)]

knn_validation = kNNScore

# Mean held-out accuracy across the folds.
np.mean(knn_validation)

0.96262626262626261

In [13]:
# Grid-searched logistic regression on the PCA-projected features (nested
# cross-validation): the outer loop uses the shared K-fold splitter for
# scoring, the inner 3-fold GridSearchCV picks C and tol per outer split.
params = {'C': [100, 1000], 'tol': [0.001, 0.0001]}
lr_pca = LogisticRegression(solver='lbfgs', multi_class='multinomial')
# refit must be the boolean True. The previous string 'True' only worked
# because any non-empty string is truthy (so 'False' would have refit as well).
gs_pca = GridSearchCV(lr_pca, params, scoring=None, refit=True, cv=3)
gridSearchScore = list()

for train_index, test_index in kfold.split(train_pca):
    X_train, X_test = train_pca[train_index], train_pca[test_index]
    y_train, y_test = labels[train_index], labels[test_index]
    # fit() runs the inner search and refits the best model on X_train.
    # After the loop `gs_pca` holds the model fitted on the last outer split.
    GridSearch = gs_pca.fit(X_train, y_train)
    gridSearchScore.append(GridSearch.score(X_test, y_test))
gs_validation = gridSearchScore

# Mean held-out accuracy across the outer folds.
np.mean(gs_validation)

0.98383838383838373

In [14]:
# Extremely randomized trees (500 trees, seeded) on the PCA-projected
# features, scored with the shared K-fold splitter.
etcl_pca = ExtraTreesClassifier(random_state=0, n_estimators=500)
ExtraTreeScore = []

for train_index, test_index in kfold.split(train_pca):
    X_train = train_pca[train_index]
    y_train = labels[train_index]
    X_test = train_pca[test_index]
    y_test = labels[test_index]
    # Refit on this fold's training rows, then record held-out accuracy.
    ExtraTree = etcl_pca.fit(X_train, y_train)
    ExtraTreeScore += [ExtraTree.score(X_test, y_test)]

etcl_validation = ExtraTreeScore

# Mean held-out accuracy across the folds.
np.mean(etcl_validation)

0.97575757575757582

In [None]:
# Soft-voting ensemble of the PCA-based classifiers.
# The LDA model (`ld_pca`) is deliberately left out of the ensemble.
from sklearn.ensemble import VotingClassifier

ensemble_members = [
    ('nb_pca', nb_pca),
    ('knn_pca', knn_pca),
    ('gs_pca', gs_pca),
    ('etcl_pca', etcl_pca),
]
eclf1 = VotingClassifier(estimators=ensemble_members, voting='soft')

VotingClassifierScore = []
for train_index, test_index in kfold.split(train_pca):
    X_train = train_pca[train_index]
    y_train = labels[train_index]
    X_test = train_pca[test_index]
    y_test = labels[test_index]
    # Refit the whole ensemble on this fold, then record held-out accuracy.
    VotingClass = eclf1.fit(X_train, y_train)
    VotingClassifierScore += [VotingClass.score(X_test, y_test)]

# Mean held-out accuracy across the folds.
np.mean(VotingClassifierScore)

In [None]:
# Apply the voting ensemble to the test set and report its quality.
#
# The test features go through the same pipeline as the training features
# (standardize, then PCA). The results are stored under new names so `test`
# itself is left untouched and other cells can still scale it themselves.
test_scaled = scaler.transform(test)
test_pca = pca_fit.transform(test_scaled)
test_probabilities = eclf1.predict_proba(test_pca)
test_predictions = eclf1.predict(test_pca)

# The Kaggle test set has no labels, so accuracy / log loss cannot be computed
# on it. Evaluate instead on the held-out rows of the last CV fold, which is
# the split `eclf1` was most recently fitted on. KFold with a fixed integer
# random_state reproduces the same folds on every split() call.
_, holdout_index = list(kfold.split(train_pca))[-1]
X_holdout, y_holdout = train_pca[holdout_index], labels[holdout_index]

acc = accuracy_score(y_holdout, eclf1.predict(X_holdout))
# Pass the ensemble's class list explicitly: the hold-out fold is small and
# may not contain every class, which log_loss would otherwise reject.
ll = log_loss(y_holdout, eclf1.predict_proba(X_holdout), labels=eclf1.classes_)

log_cols = ["Classifier", "Accuracy", "Log Loss"]
log = pd.DataFrame([["VotingClassifier", acc, ll]], columns=log_cols)

acc

In [15]:

# Build the Kaggle submission from the grid-searched logistic regression.
#
# The test features go through the same pipeline as the training features
# (standardize, then PCA). The scaled data is stored under a new name so that
# re-running this cell does not scale `test` a second time.
# NOTE(review): `gs_pca` was last fitted inside the CV loop, i.e. on one
# training split rather than on all training rows. Consider refitting on the
# full `train_pca` before predicting.
test_scaled = scaler.transform(test)
test_pca = pca_fit.transform(test_scaled)
test_predictions = gs_pca.predict_proba(test_pca)

# One probability column per species, indexed by the original test ids.
submission = pd.DataFrame(test_predictions, columns=classes)
submission.insert(0, 'id', test_ids)
submission = submission.set_index('id')

# Let pandas open, write and close the file; the previous manual open()
# never closed its handle.
submission.to_csv('submit.csv')
print('Finished writing submission')
submission.tail()

Finished writing submission


Unnamed: 0_level_0,Acer_Capillipes,Acer_Circinatum,Acer_Mono,Acer_Opalus,Acer_Palmatum,Acer_Pictum,Acer_Platanoids,Acer_Rubrum,Acer_Rufinerve,Acer_Saccharinum,...,Salix_Fragilis,Salix_Intergra,Sorbus_Aria,Tilia_Oliveri,Tilia_Platyphyllos,Tilia_Tomentosa,Ulmus_Bergmanniana,Viburnum_Tinus,Viburnum_x_Rhytidophylloides,Zelkova_Serrata
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1576,8.570116e-07,0.9998349,1.491306e-07,2.367666e-08,8.240294e-05,7.842066e-08,3.390336e-09,1.839186e-06,5.145595e-06,9.248885e-07,...,3.526926e-08,7.638954e-09,2.176534e-08,1.856065e-09,8.649973e-11,2.058003e-09,1.1244e-06,1.499914e-11,2.453847e-09,2.271927e-05
1577,2.488508e-07,9.220734e-08,1.271193e-09,1.042362e-06,9.857609e-09,4.736718e-10,2.317026e-08,1.06071e-05,8.001536e-05,8.654126e-09,...,1.390747e-08,6.374147e-09,4.393712e-06,8.712163e-10,1.003759e-05,1.025632e-05,1.60285e-07,2.873007e-11,2.523514e-09,2.382522e-05
1579,4.198415e-08,8.418455e-09,2.161047e-09,7.980462e-11,9.189209e-08,2.276649e-07,7.560791e-11,1.875409e-08,1.017468e-09,5.781226e-07,...,7.700392e-13,3.637695e-12,8.703355e-09,3.443338e-10,2.509468e-09,1.865638e-14,2.058288e-12,1.944001e-10,2.157379e-09,1.884388e-07
1580,3.757027e-10,1.785899e-10,9.808644e-09,5.898435e-09,2.391792e-09,1.155581e-10,2.127765e-08,3.995165e-08,1.677714e-11,9.435058e-13,...,2.021303e-08,1.70939e-07,4.398426e-12,8.780395e-08,2.148123e-13,7.722929e-09,4.900621e-11,2.568975e-10,5.571697e-13,1.249406e-10
1583,1.14128e-11,1.613197e-07,3.154049e-08,8.637998e-08,6.435937e-09,1.281374e-06,1.792546e-07,1.01132e-08,1.451156e-09,8.100506e-10,...,2.57166e-10,7.685028e-11,1.454397e-10,1.087026e-09,3.969906e-10,1.039783e-09,2.806976e-11,1.082223e-10,1.151716e-11,1.452147e-07
