In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.cross_validation import train_test_split
from sklearn import neighbors
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier



In [2]:
data = pd.read_csv('balance-scale.data')
df = pd.DataFrame(data)
df_columns = ['class','Left-weighted','Left-distace','Right-weighted','Right-distace']
df.columns = df_columns
df.head()

Unnamed: 0,class,Left-weighted,Left-distace,Right-weighted,Right-distace
0,R,1,1,1,2
1,R,1,1,1,3
2,R,1,1,1,4
3,R,1,1,1,5
4,R,1,1,2,1


In [3]:
y = df['class']
X = df.ix[:,1:5]
y.head(2)
X.head(2)

Unnamed: 0,Left-weighted,Left-distace,Right-weighted,Right-distace
0,1,1,1,2
1,1,1,1,3


In [4]:
le = LabelEncoder()
le.fit(y)
y_std = le.transform(y)

In [5]:
X_std = X.as_matrix()

In [6]:
X_train , X_test , y_train , y_test = train_test_split(X_std,y_std,test_size=0.2)

In [7]:
clf = neighbors.KNeighborsClassifier(n_jobs=-1,n_neighbors=15,algorithm='kd_tree')
#clf = DecisionTreeClassifier()
clf.fit(X_train,y_train)

KNeighborsClassifier(algorithm='kd_tree', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=15, p=2,
           weights='uniform')

In [8]:
y_pred = clf.predict(X_test)

In [9]:
score = accuracy_score(y_test,y_pred)
score*100

91.200000000000003

In [30]:
clf.predict([1,1,1,1])



array([2])

In [29]:
y_pred,y_test

(array([2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 1, 1,
        2, 1, 2, 1, 2, 1, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, 1, 2, 2, 2,
        1, 2, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 0, 2, 2,
        2, 1, 2, 1, 2, 1, 1, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1,
        2, 1, 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
        2, 1, 1, 2, 1, 2, 1, 1, 2, 1]),
 array([2, 1, 2, 2, 1, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2, 1, 2, 2, 2, 1, 1,
        2, 1, 2, 1, 2, 1, 1, 2, 1, 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, 1, 2, 2, 2,
        1, 2, 1, 1, 1, 2, 1, 2, 2, 0, 0, 2, 1, 1, 1, 2, 0, 2, 1, 2, 0, 2, 2,
        2, 1, 0, 1, 0, 0, 1, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1,
        2, 0, 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 2, 1, 1, 1, 2, 0, 2, 2, 2, 2, 2,
        2, 1, 1, 2, 1, 2, 1, 1, 2, 1]))

In [12]:
y_pred_std = le.inverse_transform(y_pred)
y_pred_std

array(['R', 'L', 'R', 'R', 'L', 'R', 'R', 'R', 'R', 'L', 'R', 'L', 'L',
       'R', 'R', 'R', 'R', 'L', 'R', 'R', 'R', 'L', 'L', 'R', 'L', 'R',
       'L', 'R', 'L', 'L', 'R', 'L', 'R', 'R', 'R', 'R', 'L', 'L', 'L',
       'R', 'R', 'L', 'L', 'R', 'R', 'R', 'L', 'R', 'L', 'L', 'L', 'R',
       'R', 'R', 'R', 'L', 'L', 'R', 'L', 'L', 'L', 'R', 'L', 'R', 'L',
       'L', 'B', 'R', 'R', 'R', 'L', 'R', 'L', 'R', 'L', 'L', 'R', 'L',
       'R', 'R', 'R', 'R', 'R', 'L', 'R', 'L', 'L', 'L', 'L', 'L', 'L',
       'L', 'R', 'L', 'R', 'L', 'R', 'R', 'R', 'R', 'L', 'R', 'L', 'R',
       'R', 'L', 'L', 'L', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'L',
       'L', 'R', 'L', 'R', 'L', 'L', 'R', 'L'], dtype=object)

In [13]:
with open('balanceScale.pickle','wb') as w:
    pickle.dump(clf,w)

In [14]:
clf.kneighbors_graph(X).toarray()

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  1.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])