# leaf classification problem

### The training set contains 99 species and 10 samples for each

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectFromModel

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import accuracy_score, log_loss

In [13]:
df = pd.read_csv('train_sort.csv')
df.info()
df_copy = df.copy()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 990 entries, 0 to 989
Columns: 194 entries, id to texture64
dtypes: float64(192), int64(1), object(1)
memory usage: 1.5+ MB


In [3]:
type_label = LabelEncoder().fit(df['species'])
y = type_label.transform(df['species'])

df1 = df.drop(['species','id'],axis = 1)
X = df1.values

In [4]:
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

In [31]:
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2)

In [32]:
classifiers = [
    MLPClassifier(hidden_layer_sizes = (150,),solver='lbfgs',activation='logistic'),
    KNeighborsClassifier(1),
    NuSVC(probability=True),
    #RandomForestClassifier(),
    #AdaBoostClassifier(),
    #GradientBoostingClassifier(),
    #GaussianNB(),
    LinearDiscriminantAnalysis(),
    #QuadraticDiscriminantAnalysis()
]

# Logging for Visual Comparison
log_cols=["Classifier", "Accuracy", "Log Loss"]
log = pd.DataFrame(columns=log_cols)

for clf in classifiers:
    clf.fit(xtrain, ytrain)
    name = clf.__class__.__name__
    
    print("="*30)
    print(name)
    
    print('****Results****')
    train_predictions = clf.predict(xtest)
    acc = accuracy_score(ytest, train_predictions)
    print("Accuracy: {:.4%}".format(acc))
    
    train_predictions = clf.predict_proba(xtest)
    #ll = log_loss(ytest, train_predictions)
    #print("Log Loss: {}".format(ll))
    
    #log_entry = pd.DataFrame([[name, acc*100, ll]], columns=log_cols)
    #log = log.append(log_entry)
    
print("="*30)

MLPClassifier
****Results****
Accuracy: 95.9596%
KNeighborsClassifier
****Results****
Accuracy: 95.9596%
NuSVC
****Results****
Accuracy: 98.4848%
LinearDiscriminantAnalysis
****Results****
Accuracy: 96.4646%


  np.exp(prob, prob)


In [33]:
classifiers = [
    MLPClassifier(hidden_layer_sizes = (150,),solver='lbfgs'),
    KNeighborsClassifier(1),
    NuSVC(probability=True),
    RandomForestClassifier(),
    #AdaBoostClassifier(),
    #GradientBoostingClassifier(),
    #GaussianNB(),
    LinearDiscriminantAnalysis(),
    #QuadraticDiscriminantAnalysis()
]

j = [str(j) for j in range(len(classifiers))]
est = zip(j,classifiers)

clf = VotingClassifier(estimators=list(est),voting='soft')

clf.fit(xtrain,ytrain)
    #clf.fit(x_train,y_train)
    
y_val_l = clf.predict_proba(xtest)
cv_score = sum(pd.DataFrame(y_val_l, columns=type_label.classes_).idxmax(axis=1).values
                                   == type_label.inverse_transform(ytest))/len(ytest)
print("Validation accuracy: ", cv_score)

Validation accuracy:  0.984848484848


  np.exp(prob, prob)


## Let's go predict, shall we?

In [34]:
test_df = pd.read_csv('test.csv')
index = test_df.pop('id')
test_data = test_df.values

test_X = test_data
test_X = scaler.transform(test_X)

In [44]:
predict_proba = clf.predict_proba(test_X)
predict_proba

  np.exp(prob, prob)


array([[ 0.00128417,  0.00151247,  0.00116013, ...,  0.00109032,
         0.00300038,  0.00173114],
       [ 0.00116789,  0.00122341,  0.00256458, ...,  0.02399054,
         0.00134046,  0.00204386],
       [ 0.00194943,  0.60111544,  0.00142099, ...,  0.00121909,
         0.0016016 ,  0.00458772],
       ..., 
       [ 0.00233964,  0.00194029,  0.00140562, ...,  0.00140563,
         0.00185203,  0.00371039],
       [ 0.00126721,  0.00153896,  0.00211605, ...,  0.02214102,
         0.00205882,  0.00219123],
       [ 0.00094733,  0.00143698,  0.00232145, ...,  0.00195149,
         0.00147669,  0.00229023]])

In [49]:
'''a,b = predict_proba.shape

for i in range(a):
    for j in range(b):
        p = predict_proba[i,j]
        if p>=0.5:
            predict_proba[i,j] = 1.0
        else:
            predict_proba[i,j] = 0'''

'a,b = predict_proba.shape\n\nfor i in range(a):\n    for j in range(b):\n        p = predict_proba[i,j]\n        if p>=0.5:\n            predict_proba[i,j] = 1.0\n        else:\n            predict_proba[i,j] = 0'

## Creating submission

In [46]:
result = pd.DataFrame(predict_proba,index=index,columns=np.sort(df_copy['species'].unique()))

In [48]:
result.to_csv('submission_1.csv')

## Playground

In [16]:
df1 = pd.read_csv('submission_1.csv')
df2 = pd.read_csv('submission_6.csv')

In [25]:
df2 = df2.drop(['Unnamed: 0'],axis=1)

In [26]:
df1.equals(df2)

False

In [27]:
ne = (df1 != df2).any(1)
ne_stacked = (df1 != df2).stack()
changed = ne_stacked[ne_stacked]
changed.index.names = ['id', 'col']

In [28]:
difference_locations = np.where(df1 != df2)
changed_from = df1.values[difference_locations]
changed_to = df2.values[difference_locations]
pd.DataFrame({'from': changed_from, 'to': changed_to}, index=changed.index)

Unnamed: 0_level_0,Unnamed: 1_level_0,from,to
id,col,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Acer_Capillipes,0.001124,0.0
0,Acer_Circinatum,0.001405,0.0
0,Acer_Mono,0.000978,0.0
0,Acer_Opalus,0.001217,0.0
0,Acer_Palmatum,0.001058,0.0
0,Acer_Pictum,0.001526,0.0
0,Acer_Platanoids,0.000919,0.0
0,Acer_Rubrum,0.001009,0.0
0,Acer_Rufinerve,0.001164,0.0
0,Acer_Saccharinum,0.002225,0.0


In [None]:
df2.to_csv('submission_6.csv')

In [None]:
df2