### 전처리
1. 결측치 확인
2. 결측치 처리(KNN)
3. 인코딩
4. 데이터 분석

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook

- 데이터 불러오기

In [2]:
df = pd.read_csv('mushrooms.csv')
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


- 결측치 확인

In [3]:
df.isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

- 일단은 없어 보인다
- 인코딩을 하기 위해 값 확인

In [4]:
for i in list(df):
    print(i, df[i].unique())

class ['p' 'e']
cap-shape ['x' 'b' 's' 'f' 'k' 'c']
cap-surface ['s' 'y' 'f' 'g']
cap-color ['n' 'y' 'w' 'g' 'e' 'p' 'b' 'u' 'c' 'r']
bruises ['t' 'f']
odor ['p' 'a' 'l' 'n' 'f' 'c' 'y' 's' 'm']
gill-attachment ['f' 'a']
gill-spacing ['c' 'w']
gill-size ['n' 'b']
gill-color ['k' 'n' 'g' 'p' 'w' 'h' 'u' 'e' 'b' 'r' 'y' 'o']
stalk-shape ['e' 't']
stalk-root ['e' 'c' 'b' 'r' '?']
stalk-surface-above-ring ['s' 'f' 'k' 'y']
stalk-surface-below-ring ['s' 'f' 'y' 'k']
stalk-color-above-ring ['w' 'g' 'p' 'n' 'b' 'e' 'o' 'c' 'y']
stalk-color-below-ring ['w' 'p' 'g' 'b' 'n' 'e' 'y' 'o' 'c']
veil-type ['p']
veil-color ['w' 'n' 'o' 'y']
ring-number ['o' 't' 'n']
ring-type ['p' 'e' 'l' 'f' 'n']
spore-print-color ['k' 'n' 'u' 'h' 'w' 'r' 'o' 'y' 'b']
population ['s' 'n' 'a' 'v' 'y' 'c']
habitat ['u' 'g' 'm' 'd' 'p' 'w' 'l']


- stalk-root 에 ?가 보인다 왠지 불길하다.

In [5]:
len(df.loc[df['stalk-root']=='?'])

2480

- ?가 2480개나 된다.

In [6]:
len(df)

8124

- 결측치 비율이 30%나 됨으로 최빈값이나 KNN을 통해 결측치를 처리할 것이다.
- 최빈값을 넣으면 편하겠지만 KNN을 써보도록 하겠다.
- 그래도 일단 최빈값 확인이나 해보자.

In [7]:
df.groupby('stalk-root').size()

stalk-root
?    2480
b    3776
c     556
e    1120
r     192
dtype: int64

- KNN분석을 하려면 문자형 데이터를 전부 숫자형으로 바꿔줘야한다....

In [8]:
for i in list(df):
    print(i, df[i].unique())

class ['p' 'e']
cap-shape ['x' 'b' 's' 'f' 'k' 'c']
cap-surface ['s' 'y' 'f' 'g']
cap-color ['n' 'y' 'w' 'g' 'e' 'p' 'b' 'u' 'c' 'r']
bruises ['t' 'f']
odor ['p' 'a' 'l' 'n' 'f' 'c' 'y' 's' 'm']
gill-attachment ['f' 'a']
gill-spacing ['c' 'w']
gill-size ['n' 'b']
gill-color ['k' 'n' 'g' 'p' 'w' 'h' 'u' 'e' 'b' 'r' 'y' 'o']
stalk-shape ['e' 't']
stalk-root ['e' 'c' 'b' 'r' '?']
stalk-surface-above-ring ['s' 'f' 'k' 'y']
stalk-surface-below-ring ['s' 'f' 'y' 'k']
stalk-color-above-ring ['w' 'g' 'p' 'n' 'b' 'e' 'o' 'c' 'y']
stalk-color-below-ring ['w' 'p' 'g' 'b' 'n' 'e' 'y' 'o' 'c']
veil-type ['p']
veil-color ['w' 'n' 'o' 'y']
ring-number ['o' 't' 'n']
ring-type ['p' 'e' 'l' 'f' 'n']
spore-print-color ['k' 'n' 'u' 'h' 'w' 'r' 'o' 'y' 'b']
population ['s' 'n' 'a' 'v' 'y' 'c']
habitat ['u' 'g' 'm' 'd' 'p' 'w' 'l']


In [9]:
type(df['class'].unique())

numpy.ndarray

- 먼저 Label 컬럼과 Data 컬럼을 분리해준다
- label이 시리즈로 바껴서.ipynb_checkpoints/다시 데이터프레임으로

In [10]:
data = df.iloc[:,1:]
label = df.iloc[:,0]
# label = pd.DataFrame({'class':label_data.values})

In [11]:
data.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,x,s,n,t,p,f,c,n,k,e,...,s,w,w,p,w,o,p,k,s,u
1,x,s,y,t,a,f,c,b,k,e,...,s,w,w,p,w,o,p,n,n,g
2,b,s,w,t,l,f,c,b,n,e,...,s,w,w,p,w,o,p,n,n,m
3,x,y,w,t,p,f,c,n,n,e,...,s,w,w,p,w,o,p,k,s,u
4,x,s,g,f,n,f,w,b,k,t,...,s,w,w,p,w,o,e,n,a,g


In [12]:
label.head()

0    p
1    e
2    e
3    p
4    e
Name: class, dtype: object

- 결측치를 처리하기 위해 결측치가 없는 행을 따로 분리해서 KNN분석 후에 예측값을 결측치에 넣어줄 것이다.

In [13]:
not_null = data.loc[df['stalk-root']!='?']
null = data.loc[df['stalk-root']=='?']

In [14]:
len(not_null), len(null)

(5644, 2480)

In [15]:
not_null.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,x,s,n,t,p,f,c,n,k,e,...,s,w,w,p,w,o,p,k,s,u
1,x,s,y,t,a,f,c,b,k,e,...,s,w,w,p,w,o,p,n,n,g
2,b,s,w,t,l,f,c,b,n,e,...,s,w,w,p,w,o,p,n,n,m
3,x,y,w,t,p,f,c,n,n,e,...,s,w,w,p,w,o,p,k,s,u
4,x,s,g,f,n,f,w,b,k,t,...,s,w,w,p,w,o,e,n,a,g


In [16]:
null.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
3984,x,y,b,t,n,f,c,b,e,e,...,s,e,w,p,w,t,e,w,c,w
4023,x,y,e,f,y,f,c,n,b,t,...,s,w,w,p,w,o,e,w,v,p
4076,f,y,u,f,n,f,c,n,h,e,...,f,w,w,p,w,o,f,h,y,d
4100,x,y,e,f,y,f,c,n,b,t,...,s,p,p,p,w,o,e,w,v,d
4104,x,y,n,f,f,f,c,n,b,t,...,s,p,p,p,w,o,e,w,v,l


- not_null의 인코딩 시작
- encoding 함수 생성

In [17]:
def encoding(df):
    for col in list(df):
        for i,j in enumerate(df[col].unique()):
#         print(col, i,j)
#         print(not_null[col].head())
            df.loc[df[col]==j,col]=i
    return df

In [18]:
encoding(not_null)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,1,1,1
2,1,0,2,0,2,0,0,1,1,0,...,0,0,0,0,0,0,0,1,1,2
3,0,1,2,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,3,1,3,0,1,1,0,1,...,0,0,0,0,0,0,1,1,2,1
5,0,1,1,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,1,1
6,1,0,2,0,1,0,0,1,2,0,...,0,0,0,0,0,0,0,0,1,2
7,1,1,2,0,2,0,0,1,1,0,...,0,0,0,0,0,0,0,1,0,2
8,0,1,2,0,0,0,0,0,3,0,...,0,0,0,0,0,0,0,0,3,1
9,1,0,1,0,1,0,0,1,2,0,...,0,0,0,0,0,0,0,0,0,2


In [19]:
not_null.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,1,1,1
2,1,0,2,0,2,0,0,1,1,0,...,0,0,0,0,0,0,0,1,1,2
3,0,1,2,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,3,1,3,0,1,1,0,1,...,0,0,0,0,0,0,1,1,2,1


In [20]:
len(not_null)

5644

In [21]:
for i in list(not_null):
    print(i,not_null[i].unique())

cap-shape [0 1 2 3 4 5]
cap-surface [0 1 2 3]
cap-color [0 1 2 3 4 5 6 7]
bruises [0 1]
odor [0 1 2 3 4 5 6]
gill-attachment [0 1]
gill-spacing [0 1]
gill-size [0 1]
gill-color [0 1 2 3 4 5 6 7 8]
stalk-shape [0 1]
stalk-root [0 1 2 3]
stalk-surface-above-ring [0 1 2 3]
stalk-surface-below-ring [0 1 2 3]
stalk-color-above-ring [0 1 2 3 4 5 6]
stalk-color-below-ring [0 1 2 3 4 5 6]
veil-type [0]
veil-color [0 1]
ring-number [0 1 2]
ring-type [0 1 2 3]
spore-print-color [0 1 2 3 4 5]
population [0 1 2 3 4 5]
habitat [0 1 2 3 4 5]


- encoding 된 것을 확인했고 이제 KNN 분석 시작

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

- stalk-root를 y값으로 주기 위해 분리

In [23]:
not_null.columns

Index(['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

In [24]:
len(not_null.columns)

22

In [25]:
X = not_null.iloc[:,[0,1,2,3,4,5,6,7,8,9,11,12,13,14,15,16,17,18,19,20,21]]
Y = not_null.iloc[:,10]
# Y = pd.DataFrame({'stalk-root':Y_data})

In [26]:
X.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,1,1,1
2,1,0,2,0,2,0,0,1,1,0,...,0,0,0,0,0,0,0,1,1,2
3,0,1,2,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,3,1,3,0,1,1,0,1,...,0,0,0,0,0,0,1,1,2,1


In [27]:
X.columns

Index(['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-surface-above-ring', 'stalk-surface-below-ring',
       'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type',
       'veil-color', 'ring-number', 'ring-type', 'spore-print-color',
       'population', 'habitat'],
      dtype='object')

In [28]:
Y.head()

0    0
1    1
2    1
3    0
4    0
Name: stalk-root, dtype: int64

In [29]:
x_train, x_test, y_train, y_test = train_test_split(X, Y,
                                                   test_size=0.1,                                                    
                                                   random_state=1)

In [30]:
len(x_train),len(x_test),len(y_train),len(y_test)

(5079, 565, 5079, 565)

In [31]:
x_train = x_train.values
x_test = x_test.values
y_train = y_train.values
y_test = y_test.values

In [32]:
x_train.shape, y_train.shape

((5079, 21), (5079,))

In [33]:
model = KNeighborsClassifier(n_neighbors=1)

In [34]:
model.fit(x_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [35]:
pred = model.predict(x_test)

In [36]:
len(pred)

565

In [37]:
len(pred), len(y_test)

(565, 565)

In [38]:
y_test.data

<memory at 0x000001F5504CFDC8>

In [39]:
(pred == y_test).sum()

565

In [40]:
pred

array([2, 0, 0, 2, 0, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 3, 0, 2, 0, 2, 2,
       2, 1, 0, 1, 2, 1, 2, 2, 2, 0, 0, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1,
       2, 2, 2, 2, 2, 0, 2, 2, 0, 0, 2, 0, 2, 0, 2, 0, 2, 2, 2, 0, 1, 2,
       2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 0, 0, 2, 2,
       2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2, 0, 2, 2, 2, 2, 1, 0,
       2, 0, 2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2,
       2, 2, 2, 2, 1, 1, 3, 2, 1, 3, 2, 2, 2, 2, 3, 1, 2, 0, 2, 0, 2, 2,
       2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 0,
       2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 1, 2, 2, 2, 0,
       2, 2, 2, 1, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 3,
       0, 2, 2, 2, 0, 0, 2, 1, 0, 2, 2, 2, 2, 2, 2, 1, 2, 3, 0, 0, 2, 2,
       0, 2, 2, 0, 1, 2, 0, 2, 1, 2, 2, 2, 0, 1, 2, 0, 1, 2, 2, 1, 3, 3,
       2, 2, 1, 2, 0, 3, 3, 3, 2, 2, 2, 2, 2, 2, 1, 0, 2, 2, 0, 0, 2, 3,
       0, 0, 0, 3, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,

In [41]:
y_test

array([2, 0, 0, 2, 0, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 3, 0, 2, 0, 2, 2,
       2, 1, 0, 1, 2, 1, 2, 2, 2, 0, 0, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1,
       2, 2, 2, 2, 2, 0, 2, 2, 0, 0, 2, 0, 2, 0, 2, 0, 2, 2, 2, 0, 1, 2,
       2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 0, 0, 2, 2,
       2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 2, 0, 2, 2, 2, 2, 1, 0,
       2, 0, 2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2,
       2, 2, 2, 2, 1, 1, 3, 2, 1, 3, 2, 2, 2, 2, 3, 1, 2, 0, 2, 0, 2, 2,
       2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 0,
       2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 1, 2, 2, 2, 0,
       2, 2, 2, 1, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 3,
       0, 2, 2, 2, 0, 0, 2, 1, 0, 2, 2, 2, 2, 2, 2, 1, 2, 3, 0, 0, 2, 2,
       0, 2, 2, 0, 1, 2, 0, 2, 1, 2, 2, 2, 0, 1, 2, 0, 1, 2, 2, 1, 3, 3,
       2, 2, 1, 2, 0, 3, 3, 3, 2, 2, 2, 2, 2, 2, 1, 0, 2, 2, 0, 0, 2, 3,
       0, 0, 0, 3, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,

In [42]:
for k in range(1,21):
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(x_train,y_train)
    pred = model.predict(x_test)
    print(k,'정확도',(pred == y_test).sum() / len(pred))

1 정확도 1.0
2 정확도 1.0
3 정확도 1.0
4 정확도 1.0
5 정확도 1.0
6 정확도 1.0
7 정확도 1.0
8 정확도 1.0
9 정확도 1.0
10 정확도 1.0
11 정확도 0.9982300884955753
12 정확도 0.9982300884955753
13 정확도 0.9982300884955753
14 정확도 0.9982300884955753
15 정확도 0.9982300884955753
16 정확도 0.9982300884955753
17 정확도 0.9982300884955753
18 정확도 0.9964601769911504
19 정확도 0.9964601769911504
20 정확도 0.9964601769911504


- 뭔가 정확도가 이상한거 같음

In [43]:
null_test = encoding(null)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [44]:
XX = null_test.iloc[:,[0,1,2,3,4,5,6,7,8,9,11,12,13,14,15,16,17,18,19,20,21]]

In [45]:
XX_test = XX.values

In [46]:
len(XX_test)

2480

In [47]:
model = KNeighborsClassifier(n_neighbors=3)
model.fit(x_train,y_train)
pred = model.predict(XX_test)
pred

array([0, 1, 1, ..., 2, 1, 2], dtype=int64)

In [48]:
len(pred)

2480

- 이제 다시 조립

In [49]:
null.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
3984,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4023,0,0,1,1,1,0,0,1,1,1,...,0,1,0,0,0,1,0,0,1,1
4076,1,0,2,1,0,0,0,1,2,0,...,1,1,0,0,0,1,1,1,2,2
4100,0,0,1,1,1,0,0,1,1,1,...,0,2,1,0,0,1,0,0,1,2
4104,0,0,3,1,2,0,0,1,1,1,...,0,2,1,0,0,1,0,0,1,3


In [50]:
null_test.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
3984,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4023,0,0,1,1,1,0,0,1,1,1,...,0,1,0,0,0,1,0,0,1,1
4076,1,0,2,1,0,0,0,1,2,0,...,1,1,0,0,0,1,1,1,2,2
4100,0,0,1,1,1,0,0,1,1,1,...,0,2,1,0,0,1,0,0,1,2
4104,0,0,3,1,2,0,0,1,1,1,...,0,2,1,0,0,1,0,0,1,3


In [51]:
null_test.columns

Index(['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

In [52]:
null_test.loc[:,'stalk-root'] = list(pred)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [53]:
null_test.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
3984,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4023,0,0,1,1,1,0,0,1,1,1,...,0,1,0,0,0,1,0,0,1,1
4076,1,0,2,1,0,0,0,1,2,0,...,1,1,0,0,0,1,1,1,2,2
4100,0,0,1,1,1,0,0,1,1,1,...,0,2,1,0,0,1,0,0,1,2
4104,0,0,3,1,2,0,0,1,1,1,...,0,2,1,0,0,1,0,0,1,3


In [54]:
df_train = pd.concat([not_null, null_test])

In [55]:
len(df_train)

8124

In [56]:
df_train.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,1,1,1
2,1,0,2,0,2,0,0,1,1,0,...,0,0,0,0,0,0,0,1,1,2
3,0,1,2,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,3,1,3,0,1,1,0,1,...,0,0,0,0,0,0,1,1,2,1


In [57]:
df_train.sort_index()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,1,1,1
2,1,0,2,0,2,0,0,1,1,0,...,0,0,0,0,0,0,0,1,1,2
3,0,1,2,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,3,1,3,0,1,1,0,1,...,0,0,0,0,0,0,1,1,2,1
5,0,1,1,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,1,1
6,1,0,2,0,1,0,0,1,2,0,...,0,0,0,0,0,0,0,0,1,2
7,1,1,2,0,2,0,0,1,1,0,...,0,0,0,0,0,0,0,1,0,2
8,0,1,2,0,0,0,0,0,3,0,...,0,0,0,0,0,0,0,0,3,1
9,1,0,1,0,1,0,0,1,2,0,...,0,0,0,0,0,0,0,0,0,2


In [58]:
df_train.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,1,1,1
2,1,0,2,0,2,0,0,1,1,0,...,0,0,0,0,0,0,0,1,1,2
3,0,1,2,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,3,1,3,0,1,1,0,1,...,0,0,0,0,0,0,1,1,2,1


In [59]:
label.head()

0    p
1    e
2    e
3    p
4    e
Name: class, dtype: object

In [60]:
x_train, x_test, y_train, y_test = train_test_split(df_train, label,
                                                    test_size=0.1,
                                                   random_state=0)

In [61]:
model = KNeighborsClassifier(n_neighbors=3)

In [62]:
model.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [63]:
pred = model.predict(x_test)
pred

array(['p', 'e', 'e', 'e', 'e', 'e', 'p', 'p', 'e', 'e', 'p', 'p', 'e',
       'e', 'e', 'p', 'e', 'p', 'e', 'e', 'p', 'e', 'p', 'p', 'p', 'e',
       'p', 'e', 'e', 'e', 'p', 'p', 'p', 'p', 'p', 'e', 'p', 'e', 'p',
       'p', 'p', 'p', 'e', 'p', 'e', 'p', 'e', 'e', 'p', 'p', 'e', 'e',
       'p', 'e', 'e', 'p', 'p', 'p', 'e', 'p', 'p', 'e', 'e', 'p', 'e',
       'e', 'p', 'e', 'p', 'e', 'e', 'p', 'e', 'p', 'e', 'e', 'e', 'p',
       'e', 'e', 'p', 'e', 'e', 'p', 'p', 'e', 'e', 'p', 'e', 'e', 'e',
       'p', 'e', 'p', 'e', 'p', 'p', 'e', 'e', 'p', 'p', 'e', 'e', 'e',
       'p', 'e', 'p', 'e', 'p', 'p', 'p', 'p', 'e', 'e', 'e', 'e', 'p',
       'p', 'e', 'e', 'e', 'p', 'e', 'e', 'e', 'p', 'e', 'e', 'p', 'p',
       'e', 'p', 'p', 'p', 'p', 'p', 'e', 'p', 'p', 'p', 'p', 'e', 'e',
       'p', 'p', 'p', 'p', 'e', 'p', 'p', 'e', 'p', 'p', 'p', 'p', 'p',
       'p', 'p', 'e', 'e', 'p', 'p', 'e', 'p', 'e', 'p', 'e', 'e', 'e',
       'p', 'p', 'p', 'p', 'e', 'p', 'e', 'p', 'e', 'p', 'e', 'e

In [64]:
for k in range(1,11):
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(x_train, y_train)
    pred = model.predict(x_test)
    print("K가 {}일때 정확도는 : {}%".format(k,(pred==y_test).sum()/len(pred)))

K가 1일때 정확도는 : 0.8204182041820418%
K가 2일때 정확도는 : 0.7712177121771218%
K가 3일때 정확도는 : 0.8573185731857319%
K가 4일때 정확도는 : 0.8499384993849939%
K가 5일때 정확도는 : 0.8683886838868389%
K가 6일때 정확도는 : 0.8536285362853628%
K가 7일때 정확도는 : 0.8782287822878229%
K가 8일때 정확도는 : 0.8646986469864698%
K가 9일때 정확도는 : 0.8782287822878229%
K가 10일때 정확도는 : 0.8757687576875769%
