## KNN 분류 분석 실습

In [2]:
import pandas as pd
import seaborn as sns

In [3]:
# Load the Titanic example dataset bundled with seaborn.
df = sns.load_dataset(name='titanic')
df.info()  # 891 rows, 15 variables

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
survived       891 non-null int64
pclass         891 non-null int64
sex            891 non-null object
age            714 non-null float64
sibsp          891 non-null int64
parch          891 non-null int64
fare           891 non-null float64
embarked       889 non-null object
class          891 non-null category
who            891 non-null object
adult_male     891 non-null bool
deck           203 non-null category
embark_town    889 non-null object
alive          891 non-null object
alone          891 non-null bool
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB


In [4]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


### NaN 값이 많은 deck열(변수) 삭제

### embarked와 embark_town 열(변수)은 의미가 동일하므로 embark_town 열 삭제

In [5]:
# drop() removes row labels by default; `columns=` (same effect as axis=1)
# makes it remove the listed columns instead.
ndf = df.drop(columns=['deck', 'embark_town'])
ndf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
survived      891 non-null int64
pclass        891 non-null int64
sex           891 non-null object
age           714 non-null float64
sibsp         891 non-null int64
parch         891 non-null int64
fare          891 non-null float64
embarked      889 non-null object
class         891 non-null category
who           891 non-null object
adult_male    891 non-null bool
alive         891 non-null object
alone         891 non-null bool
dtypes: bool(2), category(1), float64(2), int64(4), object(4)
memory usage: 72.4+ KB


### age변수의 값이 NaN인 행을 삭제

In [6]:
# Remove every row whose 'age' is NaN.
# how='any' drops a row as soon as any of the subset columns is missing.
ndf = ndf.dropna(axis='index', how='any', subset=['age'])
ndf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 13 columns):
survived      714 non-null int64
pclass        714 non-null int64
sex           714 non-null object
age           714 non-null float64
sibsp         714 non-null int64
parch         714 non-null int64
fare          714 non-null float64
embarked      712 non-null object
class         714 non-null category
who           714 non-null object
adult_male    714 non-null bool
alive         714 non-null object
alone         714 non-null bool
dtypes: bool(2), category(1), float64(2), int64(4), object(4)
memory usage: 63.6+ KB


### embarked 열의 NaN값을 승선도시 중에서 가장 많이 출현한 데이터 값으로 치환하기

In [7]:
# Frequency of each embarkation port; NaN is excluded by default.
ndf['embarked'].value_counts()

S    554
C    130
Q     28
Name: embarked, dtype: int64

In [8]:
# Label of the most frequent embarkation port (NaN excluded by default).
most_freq = ndf['embarked'].value_counts().idxmax()
print(most_freq)

S


In [9]:
# Replace missing 'embarked' values with the most frequent port.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the chained in-place form warns on pandas 2.x and leaves `ndf`
# unchanged once Copy-on-Write is enabled.
ndf['embarked'] = ndf['embarked'].fillna(most_freq)
print(ndf.describe(include='all'))

          survived      pclass   sex         age       sibsp       parch  \
count   714.000000  714.000000   714  714.000000  714.000000  714.000000   
unique         NaN         NaN     2         NaN         NaN         NaN   
top            NaN         NaN  male         NaN         NaN         NaN   
freq           NaN         NaN   453         NaN         NaN         NaN   
mean      0.406162    2.236695   NaN   29.699118    0.512605    0.431373   
std       0.491460    0.838250   NaN   14.526497    0.929783    0.853289   
min       0.000000    1.000000   NaN    0.420000    0.000000    0.000000   
25%       0.000000    1.000000   NaN   20.125000    0.000000    0.000000   
50%       0.000000    2.000000   NaN   28.000000    0.000000    0.000000   
75%       1.000000    3.000000   NaN   38.000000    1.000000    1.000000   
max       1.000000    3.000000   NaN   80.000000    5.000000    6.000000   

              fare embarked  class  who adult_male alive alone  
count   714.000000    

### 분류 분석에서 사용할 변수 선택

In [10]:
# Candidate variables: survived (target) plus pclass, sex, age, sibsp, parch, embarked.
X = ndf.loc[:, ['pclass', 'sex', 'age', 'sibsp', 'parch', 'embarked']]
Y = ndf.loc[:, 'survived']

### 범주형 데이터를 모델이 인식할 수 있는 숫자형 데이터로 변환
- one-hot encoding

In [11]:
# One-hot encode 'sex' into 'female' / 'male' indicator columns and append them.
onehot_sex = pd.get_dummies(data=ndf['sex'])
ndf = pd.concat(objs=[ndf, onehot_sex], axis='columns')
ndf.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,alive,alone,female,male
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,no,False,0,1
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,yes,False,1,0
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,yes,True,1,0
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,yes,False,1,0
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,no,True,0,1


In [12]:
# One-hot encode 'embarked' into town_C / town_Q / town_S columns and append them.
onehot_embarked = pd.get_dummies(data=ndf['embarked'], prefix='town')
ndf = pd.concat(objs=[ndf, onehot_embarked], axis='columns')
ndf.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,alive,alone,female,male,town_C,town_Q,town_S
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,no,False,0,1,0,0,1
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,yes,False,1,0,1,0,0
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,yes,True,1,0,0,0,1
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,yes,False,1,0,0,0,1
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,no,True,0,1,0,0,1


In [13]:
# Final design matrix: numeric columns plus the one-hot indicators; target is 'survived'.
X = ndf.loc[
    :,
    ['pclass', 'female', 'male', 'age', 'sibsp', 'parch',
     'town_C', 'town_Q', 'town_S'],
]
Y = ndf.loc[:, 'survived']

### KNN 분류 분석을 수행하려면 설명변수를 표준화(평균 0, 표준편차 1)해야 함

In [14]:
from sklearn import preprocessing
# Preview the feature matrix before scaling.
X.head(n=5)

Unnamed: 0,pclass,female,male,age,sibsp,parch,town_C,town_Q,town_S
0,3,0,1,22.0,1,0,0,0,1
1,1,1,0,38.0,1,0,1,0,0
2,3,1,0,26.0,0,0,0,0,1
3,1,1,0,35.0,1,0,0,0,1
4,3,0,1,35.0,0,0,0,0,1


In [17]:
# Standardize every feature column to zero mean and unit variance.
# X becomes a NumPy array from here on.
# NOTE(review): the scaler is fit on all rows before the train/test split
# further down, so test rows contribute to the mean/std. Consider fitting on
# the training split only.
X = preprocessing.StandardScaler().fit_transform(X)

### train data : test data를 7:3으로 분리

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=10
)
print(X_train.shape, X_test.shape, sep='\n')

(499, 9)
(215, 9)


### KNN 분류 분석으로  모델 생성

In [20]:
from sklearn.neighbors import KNeighborsClassifier

In [21]:
# Choosing k matters a lot; 5 is an arbitrary choice for this exercise.
# fit() returns the estimator itself, so construction and training are chained.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, Y_train)
knn

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

### 학습 데이터로 생성한 모델로 테스트 데이터의 예측값 생성

In [22]:
# Predict survival for the held-out rows, then compare the first ten
# predictions with the true labels.
y_predict = knn.predict(X_test)

print(y_predict[:10])
print(Y_test.iloc[:10].values)

[0 0 1 0 0 1 1 1 0 0]
[0 0 1 0 0 1 1 1 0 0]


In [24]:
from sklearn import metrics
# Confusion matrix: rows are true classes, columns are predicted classes.
knn_matrix = metrics.confusion_matrix(y_true=Y_test, y_pred=y_predict)
print(knn_matrix)

[[110  15]
 [ 25  65]]


In [25]:
# Per-class precision / recall / f1 summary for the test predictions.
knn_report = metrics.classification_report(y_true=Y_test, y_pred=y_predict)
print(knn_report)

              precision    recall  f1-score   support

           0       0.81      0.88      0.85       125
           1       0.81      0.72      0.76        90

    accuracy                           0.81       215
   macro avg       0.81      0.80      0.81       215
weighted avg       0.81      0.81      0.81       215



--- 
- 생존 여부 분류에 영향을 주는 변수를 선택하고, k(최근접 이웃을 몇 개까지 볼 것인지 지정)는 되도록 작은 홀수로 설정해서 분류 분석을 수행합니다.
- 데이터셋에서 생존자 클래스(생존자, 비생존자)의 데이터 수가 동일하다면 정확률(accuracy)로, 클래스별 데이터 수가 상이하다면 f1 통계량으로 모델의 성능을 판단합니다.
- 통상적으로 k=1일때 overfitting 발생할 가능성이 높습니다
