In [97]:
import seaborn as sns
import pandas as pd
import numpy as np
df = sns.load_dataset('titanic')

In [98]:
#Nan값이 많은 deck 열을 삭제, embarked와 내용이 겹치는 embark_town 열을 삭제
rdf = df.drop(['deck', 'embark_town'], axis = 1) #axis=1은 열기준
print(rdf.columns.values)
print('\n')

['survived' 'pclass' 'sex' 'age' 'sibsp' 'parch' 'fare' 'embarked' 'class'
 'who' 'adult_male' 'alive' 'alone']




In [99]:
#age열에 나이 데이터가 없는 모든 행을 삭제 - age열(891개 중 177개의 NaN 값)
rdf = rdf.dropna(subset=['age'], how='any', axis=0)
print(len(rdf))
print('\n')

714




In [100]:
#value의 개수 세기
rdf['embarked'].value_counts()

S    554
C    130
Q     28
Name: embarked, dtype: int64

In [101]:
#NaN값 제거하기
rdf['embarked'].value_counts(dropna=True)

S    554
C    130
Q     28
Name: embarked, dtype: int64

In [102]:
#NaN값을 가장 많이 나온 값으로 치환
rdf['embarked'].value_counts(dropna=True).idxmax()

'S'

In [103]:
#embarked열의 NaN값을 승선도시 중에서 가장 많이 출현한 값으로 치환(매우 중요!)
most_freq = rdf['embarked'].value_counts(dropna=True).idxmax()
print(most_freq)
print('\n')

S




In [104]:
rdf.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,714.0,714.0,714.0,714.0,714.0,714.0
mean,0.406162,2.236695,29.699118,0.512605,0.431373,34.694514
std,0.49146,0.83825,14.526497,0.929783,0.853289,52.91893
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,1.0,20.125,0.0,0.0,8.05
50%,0.0,2.0,28.0,0.0,0.0,15.7417
75%,1.0,3.0,38.0,1.0,1.0,33.375
max,1.0,3.0,80.0,5.0,6.0,512.3292


In [105]:
#include='all'로 모든 인덱스 표시
rdf.describe(include='all')

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,alive,alone
count,714.0,714.0,714,714.0,714.0,714.0,714.0,712,714,714,714,714,714
unique,,,2,,,,,3,3,3,2,2,2
top,,,male,,,,,S,Third,man,True,no,True
freq,,,453,,,,,554,355,413,413,424,404
mean,0.406162,2.236695,,29.699118,0.512605,0.431373,34.694514,,,,,,
std,0.49146,0.83825,,14.526497,0.929783,0.853289,52.91893,,,,,,
min,0.0,1.0,,0.42,0.0,0.0,0.0,,,,,,
25%,0.0,1.0,,20.125,0.0,0.0,8.05,,,,,,
50%,0.0,2.0,,28.0,0.0,0.0,15.7417,,,,,,
75%,1.0,3.0,,38.0,1.0,1.0,33.375,,,,,,


In [106]:
#NaN값 때문에 평균값을 구할 수 없으니 fillna활용,
#가장 많이 나온 수로 대체, inplace=True로 바로 적용
rdf['embarked'].fillna(most_freq, inplace=True)

In [107]:
rdf['embarked']

0      S
1      C
2      S
3      S
4      S
      ..
885    Q
886    S
887    S
889    C
890    Q
Name: embarked, Length: 714, dtype: object

In [108]:
rdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   survived    714 non-null    int64   
 1   pclass      714 non-null    int64   
 2   sex         714 non-null    object  
 3   age         714 non-null    float64 
 4   sibsp       714 non-null    int64   
 5   parch       714 non-null    int64   
 6   fare        714 non-null    float64 
 7   embarked    714 non-null    object  
 8   class       714 non-null    category
 9   who         714 non-null    object  
 10  adult_male  714 non-null    bool    
 11  alive       714 non-null    object  
 12  alone       714 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(4), object(4)
memory usage: 63.6+ KB


In [109]:
# 분석에 사용할 속성을 선택

#분석에 활용할 열(속성)을 선택
ndf = rdf[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'embarked']]
print(ndf.head())
print('\n')

   survived  pclass     sex   age  sibsp  parch embarked
0         0       3    male  22.0      1      0        S
1         1       1  female  38.0      1      0        C
2         1       3  female  26.0      0      0        S
3         1       1  female  35.0      1      0        S
4         0       3    male  35.0      0      0        S




In [110]:
#object(sex, embarked)를 숫자형 데이터로 변환
onehot_sex = pd.get_dummies(ndf['sex']) #더미변수로 변환(남자=1, 여자=0)
onehot_sex

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1
...,...,...
885,1,0
886,0,1
887,1,0
889,0,1


In [111]:
#concat함수로 ndf에 onehot_sex를 넣는다.(빈도수로 표시)
ndf = pd.concat([ndf, onehot_sex], axis=1)#concat: outer join
ndf

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,embarked,female,male
0,0,3,male,22.0,1,0,S,0,1
1,1,1,female,38.0,1,0,C,1,0
2,1,3,female,26.0,0,0,S,1,0
3,1,1,female,35.0,1,0,S,1,0
4,0,3,male,35.0,0,0,S,0,1
...,...,...,...,...,...,...,...,...,...
885,0,3,female,39.0,0,5,Q,1,0
886,0,2,male,27.0,0,0,S,0,1
887,1,1,female,19.0,0,0,S,1,0
889,1,1,male,26.0,0,0,C,0,1


In [112]:
onehot_embarked = pd.get_dummies(ndf['embarked'], prefix='town')
onehot_embarked

Unnamed: 0,town_C,town_Q,town_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
885,0,1,0
886,0,0,1
887,0,0,1
889,1,0,0


In [113]:
ndf = pd.concat([ndf, onehot_embarked], axis=1)
ndf

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,embarked,female,male,town_C,town_Q,town_S
0,0,3,male,22.0,1,0,S,0,1,0,0,1
1,1,1,female,38.0,1,0,C,1,0,1,0,0
2,1,3,female,26.0,0,0,S,1,0,0,0,1
3,1,1,female,35.0,1,0,S,1,0,0,0,1
4,0,3,male,35.0,0,0,S,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
885,0,3,female,39.0,0,5,Q,1,0,0,1,0
886,0,2,male,27.0,0,0,S,0,1,0,0,1
887,1,1,female,19.0,0,0,S,1,0,0,0,1
889,1,1,male,26.0,0,0,C,0,1,1,0,0


In [114]:
#새로 만든 열, 기존 열 겹치니까 삭제
ndf.drop(['sex', 'embarked'], axis = 1, inplace=True)
ndf

Unnamed: 0,survived,pclass,age,sibsp,parch,female,male,town_C,town_Q,town_S
0,0,3,22.0,1,0,0,1,0,0,1
1,1,1,38.0,1,0,1,0,1,0,0
2,1,3,26.0,0,0,1,0,0,0,1
3,1,1,35.0,1,0,1,0,0,0,1
4,0,3,35.0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...
885,0,3,39.0,0,5,1,0,0,1,0
886,0,2,27.0,0,0,0,1,0,0,1
887,1,1,19.0,0,0,1,0,0,0,1
889,1,1,26.0,0,0,0,1,1,0,0


In [115]:
#데이터 불러와서-전처리(정체)-데이터 분리(훈련용, 테스트용 분리)

In [117]:
y = ndf['survived']
X = ndf[['survived','pclass','age','sibsp','parch','female','male','town_C','town_Q','town_S']]


In [118]:
#설명 변수 데이터를 정규화(normalization)
from sklearn import preprocessing
X = preprocessing.StandardScaler().fit(X).transform(X)

#train data, test data 구분 (7:3 비율)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state=10)

In [119]:
X_train.shape

(499, 10)

In [120]:
X_test.shape

(215, 10)

In [121]:
## knn (k: 군집의 개수: 연구자가 설정)

from sklearn.neighbors import KNeighborsClassifier#분류


In [122]:
#모형 객체 생성(k=5설정, k는 3~6이 적당)
knn = KNeighborsClassifier(n_neighbors=5)

In [123]:
knn.fit(X_train, y_train) #모델링 할 때는 훈련용 데이터만 쓴다. 중요!

KNeighborsClassifier()

In [124]:
y_hat = knn.predict(X_test) #예측한 모델 결과 y_hat은 테스트용(새로운 데이터라고 간주한 데이터) 데이터만 쓴다. 중요!

In [126]:
print(y_hat[0:10]) #y_hat은 예측한 결과 데이터

[0 0 1 0 0 1 1 1 0 0]


In [128]:
print(y_test.values[0:10]) #y_test는 실데이터

[0 0 1 0 0 1 1 1 0 0]


In [133]:
# 모형 성능 평가 (혼동 행렬_confusion matrix: 오분류표)

from sklearn import metrics
knn_metrics = metrics.confusion_matrix(y_test, y_hat)
print(knn_metrics)

[[123   2]
 [  2  88]]


In [136]:
print(metrics.classification_report(y_test, y_hat))


              precision    recall  f1-score   support

           0       0.98      0.98      0.98       125
           1       0.98      0.98      0.98        90

    accuracy                           0.98       215
   macro avg       0.98      0.98      0.98       215
weighted avg       0.98      0.98      0.98       215



In [139]:
#accuracy: 정확도
(109+65)/215
#약 81%의 정확도가 나왔다.

0.8093023255813954

In [140]:
y_test.value_counts()

0    125
1     90
Name: survived, dtype: int64

In [143]:
#knn 분류 알고리즘

#sklearn 라이브러리에서 KNN 분류 모형 가져오기
from sklearn.neighbors import KNeighborsClassifier

#모형 객체 생성(k=5로 설정)
knn = KNeighborsClassifier(n_neighbors=5)

#train data를 가지고 모형 학습
knn.fit(X_train, y_train)

#test data를 가지고 y_hat을 예측(분류)
y_hat = knn.predict(X_test)

print(y_hat[0:10])
print(y_test.values[0:10])

[0 0 1 0 0 1 1 1 0 0]
[0 0 1 0 0 1 1 1 0 0]


In [144]:
#모형 성능 평가 - Confusion Matrix 계산
from sklearn import metrics
knn_matrix = metrics.confusion_matrix(y_test, y_hat)
print(knn_matrix)

[[123   2]
 [  2  88]]


In [145]:
#모형 성능 평가
knn_report = metrics.classification_report(y_test, y_hat)
print(knn_report)

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       125
           1       0.98      0.98      0.98        90

    accuracy                           0.98       215
   macro avg       0.98      0.98      0.98       215
weighted avg       0.98      0.98      0.98       215



In [146]:
#svm 모형

#sklearn 라이브러리에서 SVM 분류 모형 가져오기
from sklearn import svm

#모형 객체 생성(kernel='rbf' 적용)
svm_model = svm.SVC(kernel='rbf')

#train data를 가지고 모형 학습
svm_model.fit(X_train, y_train)

#test data를 가지고 y_hat 예측(분류)
y_hat = svm_model.predict(X_test)

print(y_hat[0:10])
print(y_test.values[0:10])

[0 0 1 0 0 1 1 1 0 0]
[0 0 1 0 0 1 1 1 0 0]


In [148]:
#모형 성능 평가 - Confusion Matrix 계산
from sklearn import metrics
svm_matrix = metrics.confusion_matrix(y_test, y_hat)
print(svm_matrix)
print('\n')

#모형 성능 평가 - 평가 지표 계산
svm_report = metrics.classification_report(y_test, y_hat) #실제값, 예측값 비교(성능 비교)
print(svm_report)

[[125   0]
 [  1  89]]


              precision    recall  f1-score   support

           0       0.99      1.00      1.00       125
           1       1.00      0.99      0.99        90

    accuracy                           1.00       215
   macro avg       1.00      0.99      1.00       215
weighted avg       1.00      1.00      1.00       215



In [150]:
uci_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/\
00292/Wholesale%20customers%20data.csv'

df = pd.read_csv(uci_path, header=None)
df

Unnamed: 0,0,1,2,3,4,5,6,7
0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
1,2,3,12669,9656,7561,214,2674,1338
2,2,3,7057,9810,9568,1762,3293,1776
3,2,3,6353,8808,7684,2405,3516,7844
4,1,3,13265,1196,4221,6404,507,1788
...,...,...,...,...,...,...,...,...
436,1,3,29703,12051,16027,13135,182,2204
437,1,3,39228,1431,764,4510,93,2346
438,2,3,14531,15488,30243,437,14841,1867
439,1,3,10290,1981,2232,1038,168,2125


In [None]:
# 열 이름 지정
df.columns = ['id','clump','cell_size','cell_shape', 'adhesion','epithlial',
              'bare_nuclei','chromatin','normal_nucleoli', 'mitoses', 'class']


In [None]:
#  IPython 디스플레이 설정 - 출력할 열의 개수 한도 늘리기
pd.set_option('display.max_columns', 15)


In [None]:
##데이터 탐색 (EDA)

df.head()

In [None]:
df.info()

In [None]:
# 속성(변수) 선택
X=df[['clump','cell_size','cell_shape', 'adhesion','epithlial',
      'bare_nuclei','chromatin','normal_nucleoli', 'mitoses']]  #설명 변수 X
y=df['class']


In [152]:
# Wholesale customers 데이터셋 가져오기 (출처: UCI ML Repository)
uci_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/\
00292/Wholesale%20customers%20data.csv'
df = pd.read_csv(uci_path, header=0)
df

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,2,3,12669,9656,7561,214,2674,1338
1,2,3,7057,9810,9568,1762,3293,1776
2,2,3,6353,8808,7684,2405,3516,7844
3,1,3,13265,1196,4221,6404,507,1788
4,2,3,22615,5410,7198,3915,1777,5185
...,...,...,...,...,...,...,...,...
435,1,3,29703,12051,16027,13135,182,2204
436,1,3,39228,1431,764,4510,93,2346
437,2,3,14531,15488,30243,437,14841,1867
438,1,3,10290,1981,2232,1038,168,2125
