In [64]:
"""
This program takes in binary leaf images and accurately classifies 99 species of plants
"""
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [65]:
def loadData(): 
    """
    Load the training set from 'train.csv' in the current working directory.

    The 'species' column is used as the target and every column other than
    'id' and 'species' is used as a feature. No transformation is applied.
    The shapes of both arrays are printed.

    returns x, y: 2-D array of features (one row per sample), 1-D array of
        the raw 'species' values (not label-encoded)
    raises KeyError: if 'train.csv' has no 'id' or 'species' column
    """
    leaf = pd.read_csv('train.csv')
    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
    y = leaf['species'].to_numpy()
    # Name the axis via columns=: the positional axis argument of drop()
    # was removed in pandas 2.0.
    x = leaf.drop(columns=['id', 'species']).to_numpy()
    print(x.shape)
    print(y.shape)
    return x, y

In [66]:
def transformData(x,y):
    """
    Fit a label encoder on the target values.

    Neither x nor y is modified and no encoded copy of y is returned; only
    the fitted encoder is handed back.

    x: input data for model (unused; kept so existing callers keep working)
    y: target data for the model
    return le: LabelEncoder fitted on y
    """
    # The earlier version also computed le.transform(y) into a local that was
    # immediately discarded; that dead work is dropped here.
    le = LabelEncoder()
    le.fit(y)
    return le

In [67]:
def buildModel(x=None, y=None, n_estimators=2, cv=10):
    """
    Cross-validate a random forest, print the mean score, then fit it on
    all of the data.

    x: input data for the model; when omitted, the module-level ``x`` is used
    y: target data for the model; when omitted, the module-level ``y`` is used
    n_estimators: number of trees in the forest (default 2)
    cv: number of cross-validation folds (default 10)
    return model: RandomForestClassifier fitted on x and y
    raises NameError: if x or y is omitted and no module-level value exists
    """
    # Backward compatibility: buildModel() used to take no arguments and read
    # x and y straight from module scope, so keep that as the fallback.
    module_scope = globals()
    if x is None:
        if 'x' not in module_scope:
            raise NameError("buildModel() needs x: pass it in or define it at module level")
        x = module_scope['x']
    if y is None:
        if 'y' not in module_scope:
            raise NameError("buildModel() needs y: pass it in or define it at module level")
        y = module_scope['y']

    model = RandomForestClassifier(n_estimators=n_estimators)
    # cross_val_score is given the unfitted estimator; the final fit on the
    # full data set happens separately below.
    score = np.mean(cross_val_score(model, x, y, cv=cv))
    print(score)
    model.fit(x, y)
    return model

In [68]:
def fitModel(model): 
    """
    Predict class probabilities for the samples in 'test.csv'.

    Despite its name, this function does not fit anything: the model passed
    in must already be fitted. The prediction array is printed.

    model: fitted classifier exposing predict_proba
    return answer: whatever model.predict_proba returns for the test features
    return test_ids: values of the 'id' column of 'test.csv', in file order
    raises KeyError: if 'test.csv' has no 'id' column
    """
    leaf_test = pd.read_csv('test.csv')
    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
    test_ids = leaf_test['id'].to_numpy()
    # Name the axis via columns=: the positional axis argument of drop()
    # was removed in pandas 2.0.
    features = leaf_test.drop(columns=['id']).to_numpy()
    answer = model.predict_proba(features)
    print(answer)
    return answer, test_ids

In [69]:
def createSubmission(answer, le, test_ids=None):
    """
    Write the predictions to 'output.csv' in the current working directory.

    The file has one row per test sample, indexed by an 'id' column, and one
    column per entry of le.classes_.

    answer: 2-D array of predictions, one row per test sample
        (assumes its columns are in the same order as le.classes_ -- the
        caller is responsible for that)
    le: fitted label encoder; only its classes_ attribute is read
    test_ids: ids used as the row index; when omitted, the module-level
        ``test_ids`` is used
    raises NameError: if test_ids is omitted and no module-level value exists
    """
    if test_ids is None:
        # Backward compatibility: this function used to read test_ids straight
        # from module scope, so keep that as the fallback.
        module_scope = globals()
        if 'test_ids' not in module_scope:
            raise NameError("createSubmission() needs test_ids: pass it in or define it at module level")
        test_ids = module_scope['test_ids']

    sub = pd.DataFrame(answer, index=test_ids, columns=le.classes_)
    sub.index.name = 'id'
    sub.to_csv('output.csv')

In [70]:
# Run the whole pipeline: load -> fit label encoder -> train -> predict -> write CSV.
# NOTE: the names bound here live at module level and are relied on by the
# functions above: buildModel() is called without arguments and uses x and y,
# and createSubmission() is not passed test_ids. Do not rename them or
# reorder these statements without updating those functions.
if __name__ == "__main__":
    x,y = loadData()
    # Only the fitted encoder is kept; y itself stays as loaded.
    le = transformData(x,y)
    model = buildModel()
    # test_ids must be bound before createSubmission() runs (see note above).
    answer, test_ids = fitModel(model)
    createSubmission(answer, le)

(990, 192)
(990,)
0.542424242424
[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  1.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]
