In [54]:
from sklearn import svm
import pandas as pd
import math


## Make a simple sklearn classifier
First, read the data in using `pandas.read_csv()`.
Note that the final column contains the `class_type` field that we are interested in.

In [55]:
data = pd.read_csv("../input/zoo.csv")
data.head(6)

Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class_type
0,aardvark,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1
5,buffalo,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1


## Preprocess the data
Split the data up for training and evaluation.

In [56]:
def preprocess(data):
    X = data.iloc[:, 1:17]  # all rows, all the features and no labels
    y = data.iloc[:, 17]  # all rows, label only

    return X, y

In [57]:
# Shuffle and split the dataset
# We don't need to use this any more, thanks to scikit-learn!

data = data.sample(frac=1).reset_index(drop=True)
data_total_len = data[data.columns[0]].size

data_train_frac = 0.6
split_index = math.floor(data_total_len*data_train_frac)

train_data = data.iloc[:split_index]
eval_data = data.iloc[split_index:]

Split the data using scikit-learn instead, using fewer lines!

In [58]:
from sklearn.model_selection import train_test_split

all_X, all_y = preprocess(data)
X_train, X_test, y_train, y_test = train_test_split(all_X, all_y)

In [59]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(75, 16) (26, 16) (75,) (26,)


## Train and Evaluate the model
It's easy to swap in a different model of your choice.

In [60]:
clf = svm.SVC()
clf.fit(X_train, y_train)  



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [61]:
clf.score(X_test, y_test)

0.9615384615384616

## Predict on some new data
We can predict new values with a one line call.

In [62]:
clf.predict(X_test[15:25])

array([1, 2, 1, 6, 2, 2, 1, 7, 1, 1])

In [63]:
# Show what the correct answer is
y_test[10:15]

41    1
5     1
36    7
58    1
62    4
Name: class_type, dtype: int64