## ML Titanic data

< 모듈 불러오기>

In [50]:
import pandas as pd
import numpy as np

# encoding
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# divid data
from sklearn.model_selection import train_test_split

# validation 4개 중 선택
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold #-- 비율 유지 k-fold
from sklearn.model_selection import RepeatedKFold #-- 복원추출 k-fold
from sklearn.model_selection import RepeatedStratifiedKFold #-- 복원추출 비율 유지 k-fold

# 평가
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# 파라미터
from sklearn.model_selection import GridSearchCV  #--> 직접 다 체크
from sklearn.model_selection import RandomizedSearchCV #--> 랜덤하게 서치

# ML
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

###  **< 데이터 확인하기>**

**데이터 불러오기**

In [31]:
data = pd.read_csv("train.csv")

In [32]:
data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


**데이터 정보 확인하기**

In [33]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


### 결측치 확인

**결측값이 있는 것으로 확인됨.**  
(대부분 891개 데이터이나 age, cabin, embarked 의 결측치 조정이 필요함)

**결측치 비율로 확인하기**

In [34]:
for col in data.columns:
    print(str(col)+":", round(data.loc[:,col].isnull().sum()/len(data)*100,2),"%")

PassengerId: 0.0 %
Survived: 0.0 %
Pclass: 0.0 %
Name: 0.0 %
Sex: 0.0 %
Age: 19.87 %
SibSp: 0.0 %
Parch: 0.0 %
Ticket: 0.0 %
Fare: 0.0 %
Cabin: 77.1 %
Embarked: 0.22 %


### **<전처리>**

In [35]:
# 결측치 처리
def check_fillna(df):
    df.loc[:, "Age"].fillna(df.loc[:,"Age"].mean(), inplace=True)
    df.loc[:, "Cabin"].fillna("N", inplace = True)
    df.loc[:, "Embarked"].fillna("N", inplace = True)
    df.loc[:, "Fare"].fillna(0, inplace = True) #--> 결측치 대체 값은 변수유형도 확인할 것.

    return df


# 불필요한 컬럼 제거
def drop_feature(df):
    df.drop(["PassengerId","Name","Ticket"], axis=1, inplace = True)

    return df


# 인코딩 : 범주형은 숫자로 바꿔줘야함. (라벨인코딩: 1,2,3... / 원핫 인코딩 : 남,여 = 0,1 / 더미변수 : 남 = 0, 여 = 1)
def encode_fetures(df):
    # Cabin의 앞자리 1개만 가져올 거임. ex) C85 => "C"
    df.loc[:, "Cabin"] = df.loc[:, "Cabin"].apply(lambda x: str(x)[:1]) 

    #라벨 인코딩 
    cols = ["Cabin", "Sex", "Embarked"]

    for col in cols:
        le = LabelEncoder()
        le.fit(df.loc[:, col])

        df.isetitem( df.columns.get_loc(col), le.transform(df.loc[:, col]) ) #****

    return df

**** df.isetitem( df.columns.get_loc(col), le.transform(df.loc[:, col]) ) 

< old version >  
df.loc[:, col] = le.transform(df.loc[:,col])

컬럼 하나만 보면 int 나 float 로 바껴있는데 data.info()로 확인하면  
해당 컬럼이 여전히 object 형으로 나타남  

info에서 명시적으로 int 로 바꿔주기 위해 isetitem 을 사용

In [36]:
# 한 번에 표현
def titanic_preprocessing(df):
    df = check_fillna(df)
    df = drop_feature(df)
    df = encode_fetures(df)

    return df

In [37]:
data = titanic_preprocessing(data)
data.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,1,22.0,1,0,7.25,7,3
1,1,1,0,38.0,1,0,71.2833,2,0
2,1,3,0,26.0,0,0,7.925,7,3
3,1,1,0,35.0,1,0,53.1,2,3
4,0,3,1,35.0,0,0,8.05,7,3


### < 독립변수 종속변수 분리 & valid data 분리 >

독립변수 종속변수 분리

In [38]:
y_titanic = data.loc[:,"Survived"]
X_titanic = data.drop("Survived", axis = 1)

In [39]:
y_titanic

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

data 분리

In [40]:
X_train, X_val, y_train, y_val = train_test_split(
    X_titanic,
    y_titanic,
    test_size = 0.2, #test 비율이 20%
    random_state = 1234, #꼭 1234 아니고 고정시키면 됨
    stratify = y_titanic
)

In [41]:
y_train.value_counts(normalize = True)

Survived
0    0.616573
1    0.383427
Name: proportion, dtype: float64

**k-fold (validation k개 만들기)**

In [53]:
# 방법 1
kfold = KFold (n_splits = 5, 
               random_state=1234, 
               shuffle =True)

# 방법 2
str_kfold = StratifiedKFold(n_splits = 5,
                           random_state = 1234,
                           shuffle = True)

# 방법 3
rfold  = RepeatedKFold(n_splits = 5,
                       random_state = 1234,
                       n_repeats = 10)

# 방법 4
rsfold = RepeatedStratifiedKFold(n_splits =5,
                                 random_state=1234,
                                 n_repeats = 10)


### **< ML >**

### **KNN**

In [55]:
knn = KNeighborsClassifier(n_jobs = -1) #컴퓨터가 가지고 있는 팔다리 다 쓸 거임!

score = cross_val_score( knn, 
                        X_train, 
                        y_train, 
                        scoring = "accuracy" )
score

array([0.72027972, 0.75524476, 0.66901408, 0.72535211, 0.71126761])

In [57]:
for iter_count, acc in enumerate(score):
    print(f"KNN의 {iter_count} 시도 acc:{acc}")
print("KNN의 모델의 Acc Mean: ", score.mean())
print("KNN의 모델의 Acc std: ", score.std()) 

KNN의 0 시도 acc:0.7202797202797203
KNN의 1 시도 acc:0.7552447552447552
KNN의 2 시도 acc:0.6690140845070423
KNN의 3 시도 acc:0.7253521126760564
KNN의 4 시도 acc:0.7112676056338029
KNN의 모델의 Acc Mean:  0.7162316556682755
KNN의 모델의 Acc std:  0.027841452263861546


평균적으로 한 71% 정도는 성능이 나오는 것 같은데 더 좋게는 안되는지 찾아봐야할 듯!

1) RGS : 랜덤으로 최적의 파라미터 찾을거임!

In [58]:
parameters = {
    "n_neighbors" : [1,3,5,7,9,11,13,15,17,19], #이웃 개수 설정 10개 / 컴프리헨션으로 더 많이 지정 가능
    "algorithm" : ["auto", "ball_tree", "kd_tree"] # 방법 3개 
} # 총 30번 돌려볼거임.

In [59]:
knn = KNeighborsClassifier(n_jobs = -1)
n_iter = 10 #test 파라미터 조합보다 경우의 수가 작으면 에러남
knn_kf_rgs = RandomizedSearchCV(
    knn, 
    param_distributions = parameters,
    cv = kfold,
    scoring = "accuracy",
    n_jobs= -1,
    random_state = 1234,
    n_iter = n_iter
)

knn_kf_rgs.fit(X_train, y_train)

In [60]:
knn_kf_rgs.cv_results_

{'mean_fit_time': array([0.00852599, 0.00937643, 0.01186171, 0.00910811, 0.00801902,
        0.00922737, 0.00660129, 0.00753083, 0.00684443, 0.00570092]),
 'std_fit_time': array([0.00276631, 0.00263589, 0.00202536, 0.00138954, 0.00254446,
        0.00218797, 0.0016592 , 0.00179464, 0.00071085, 0.00134313]),
 'mean_score_time': array([0.0810236 , 0.06943083, 0.06589251, 0.04247651, 0.04226308,
        0.04296751, 0.03713226, 0.03826351, 0.03847985, 0.030549  ]),
 'std_score_time': array([0.0370367 , 0.00583675, 0.00605315, 0.00370649, 0.0052124 ,
        0.00786207, 0.00163041, 0.0067802 , 0.00663222, 0.00300755]),
 'param_n_neighbors': masked_array(data=[15, 1, 9, 3, 17, 17, 7, 7, 9, 7],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_algorithm': masked_array(data=['auto', 'ball_tree', 'auto', 'auto', 'kd_tree', 'auto',
                    'auto', 'kd_tree', 'ball_

In [62]:
# 가장 좋았떤 지표는?
knn_kf_rgs.best_params_

{'n_neighbors': 9, 'algorithm': 'auto'}

In [64]:
# 가장 좋은 조합의 평균 성능은?
knn_kf_rgs.best_score_

0.7191273515217178

모델 정확도 평가하기

In [66]:
knn_kf_rgs_best = knn_kf_rgs.best_estimator_
knn_kf_rgs_ypred = knn_kf_rgs_best.predict(X_val)

knn_kf_rgs_acc = accuracy_score(y_val, knn_kf_rgs_ypred)
print("KNN kfold RGS Acc:",knn_kf_rgs_acc)

KNN kfold RGS Acc: 0.6871508379888268


RGS 바탕으로 디테일하게 탐색!

In [72]:
parameters ={
    "n_neighbors" : [7,9,11,13] 
    # 별도의 방법 지정 없으면 auto
}
knn = KNeighborsClassifier(n_jobs = -1)

knn_kf_gs = GridSearchCV( knn,
                         param_grid = parameters,
                         cv = kfold,
                         scoring = "accuracy",
                         n_jobs = -1
                        )

knn_kf_gs.fit(X_train, y_train)

In [73]:
knn_kf_gs.cv_results_

{'mean_fit_time': array([0.01156578, 0.01236749, 0.00793943, 0.00601997]),
 'std_fit_time': array([2.32873885e-03, 1.35169986e-03, 1.17370540e-03, 7.15784196e-05]),
 'mean_score_time': array([0.07519803, 0.06123438, 0.04100966, 0.03143115]),
 'std_score_time': array([0.00663405, 0.01913291, 0.005682  , 0.00729494]),
 'param_n_neighbors': masked_array(data=[7, 9, 11, 13],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_neighbors': 7},
  {'n_neighbors': 9},
  {'n_neighbors': 11},
  {'n_neighbors': 13}],
 'split0_test_score': array([0.72727273, 0.72027972, 0.71328671, 0.70629371]),
 'split1_test_score': array([0.6993007 , 0.6993007 , 0.70629371, 0.70629371]),
 'split2_test_score': array([0.67605634, 0.6971831 , 0.70422535, 0.71126761]),
 'split3_test_score': array([0.69014085, 0.71126761, 0.70422535, 0.66901408]),
 'split4_test_score': array([0.77464789, 0.76760563, 0.74647887, 0.74647887]),
 'mean_test_score': array([0.

In [74]:
print(knn_kf_gs.best_params_)
print(knn_kf_gs.best_score_)

{'n_neighbors': 9}
0.7191273515217178


### **< RandomForest >**

In [75]:
rf = RandomForestClassifier(n_jobs = -1,
                            random_state=1234)

score = cross_val_score(rf, 
                        X_train, 
                        y_train, 
                        cv = kfold,
                        scoring = "accuracy")
score

array([0.81118881, 0.84615385, 0.78873239, 0.76056338, 0.83098592])

In [76]:
for iter_count, acc in enumerate(score):
    print(f"RF의 {iter_count} 시도 acc:{acc}")
print("RF 모델의 Acc Mean: ", score.mean())
print("RF 모델의 Acc std: ", score.std())

RF의 0 시도 acc:0.8111888111888111
RF의 1 시도 acc:0.8461538461538461
RF의 2 시도 acc:0.7887323943661971
RF의 3 시도 acc:0.7605633802816901
RF의 4 시도 acc:0.8309859154929577
RF 모델의 Acc Mean:  0.8075248694967005
RF 모델의 Acc std:  0.030379537772527453


In [78]:
#RGS
rf = RandomForestClassifier(n_jobs= -1,
                            random_state = 1234)
parameters = {
    "n_estimators" : [10,30,50,70,100,200,300, 500,1000,2000],
    "max_features":[3,4,5,6,7], # 분할 시 고려할 최대 feature 개수
    "max_depth":[2,3,4,5,6,7,8,10,20], # 트리 최대 깊이
    "min_samples_split": [1,3,5,7,9] # 노드 분할을 위한 최소 샘플 수
}
n_iter = 20

rf_kfold_rgs = RandomizedSearchCV(
    rf,
    param_distributions = parameters,
    cv = kfold,
    scoring = "accuracy",
    n_jobs = -1,
    random_state = 1234,
    n_iter = n_iter
)

rf_kfold_rgs.fit(X_train, y_train)

15 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\NT551XCJ\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\NT551XCJ\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py", line 340, in fit
    self._validate_params()
  File "C:\Users\NT551XCJ\anaconda3\Lib\site-packages\sklearn\base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\NT551XCJ\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 97, in validate_parameter_constraints
    raise I

In [79]:
rf_kfold_rgs.best_score_

0.8299911356249385

In [80]:
rf_kfold_rgs.best_params_

{'n_estimators': 50, 'min_samples_split': 7, 'max_features': 6, 'max_depth': 6}

In [81]:
rf_kf_rgs_best = rf_kfold_rgs.best_estimator_
rf_kf_rgs_ypred = rf_kf_rgs_best.predict(X_val)
rf_kf_rgs_acc = accuracy_score( y_val, rf_kf_rgs_ypred)
rf_kf_rgs_acc

0.8324022346368715

In [84]:
# GS
rf = RandomForestClassifier(n_jobs=-1, random_state=1234)
parameters ={
    "n_estimators":[50, 100,500],
    "max_features":[6, 5,7],
    "max_depth" : [6, 5, 7]
} 
rf_kf_gs = GridSearchCV( rf,
                        param_grid=parameters,
                         cv=kfold,scoring="accuracy",
                         n_jobs=-1)
rf_kf_gs.fit(X_train, y_train)

In [85]:
# val으로 다시 체크
rf_kf_gs_best = rf_kf_gs.best_estimator_
rf_kf_gs_ypred = rf_kf_gs_best.predict(X_val)
rf_kf_gs_acc = accuracy_score(y_val,rf_kf_gs_ypred )
rf_kf_gs_acc

0.8379888268156425