In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

#### 문제 : feature = pclass, who, fare, age    target data = survived

< Information >

- age null 값은 평균값으로 전처리하여 실행할 것.
- Train data set, Test data set으로 나누어 학습할 것.
- gridsearchcv로 적합한 parameter 값을 구하고 Accuracy를 구할 것. ( penalty, C, solver )
- pclass: 3, age: 14, who: child, fare: 11 인 경우 생존유무를 예측하시오.

In [2]:
df = pd.read_excel('data5/titanic.xlsx')
df.head()

Unnamed: 0.1,Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town
0,0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton
1,1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg
2,2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton
3,3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton
4,4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton


- step1.1. age 결측값 개수 확인.

In [3]:
df['age'].isna().sum()

177

- step1.2. 결측값을 age의 평균으로 채우기.

In [4]:
df['age'].fillna( df['age'].mean(), inplace=True )  

- step1.3. who 문자열 Encoding.

In [5]:
# df['who'].unique()
df['who'].replace( ['man', 'woman', 'child'] , [1, 2, 3], inplace=True )

- step2. Train data set, Test data set으로 나누기.

In [6]:
x_data = df.iloc[ :, [2, 4, 10, 7] ]
y_data = df['survived']
x_train, x_test, y_train, y_test = train_test_split( x_data, y_data,  stratify=y_data, test_size=0.3, random_state=2021 )
print(f'x_train shape: { x_train.shape } ')
print(f'x_test shape:  { x_test.shape  } ')
print(f'y_train shape: { x_train.shape } ')
print(f'y_test shape:  { x_test.shape  } ')

x_train shape: (623, 4) 
x_test shape:  (268, 4) 
y_train shape: (623, 4) 
y_test shape:  (268, 4) 


In [7]:
x_data.head()

Unnamed: 0,pclass,age,who,fare
0,3,22.0,1,7.25
1,1,38.0,2,71.2833
2,3,26.0,2,7.925
3,1,35.0,2,53.1
4,3,35.0,1,8.05


- step3 GridSearchCV로 적합한 parameter 값을 구하고, Train, Test 각각의 Accuracy를 구하기. ( penalty, C, solver )

In [8]:
model_logit = make_pipeline( StandardScaler(),  LogisticRegression() )
param_value = { 'logisticregression__penalty': [ 'l1','l2' ],
                'logisticregression__C'      : np.logspace( -5, 5, 11 ),
                'logisticregression__solver' : ['liblinear', 'saga',]
              }
grid_search_logit = GridSearchCV( model_logit , param_grid = param_value ) 
grid_search_logit.fit( x_train, y_train )

print(f'parameter : { grid_search_logit.best_params_}')
print(f'Accuracy of train data : { round(grid_search_logit.score( x_train , y_train ), 3)*100 }%' )
print(f'Accuracy of test data  : { round(grid_search_logit.score( x_test , y_test   ), 3)*100 }%' )

parameter : {'logisticregression__C': 0.1, 'logisticregression__penalty': 'l1', 'logisticregression__solver': 'saga'}
Accuracy of train data : 78.7%
Accuracy of test data  : 80.2%


- step4. pclass: 3, age: 14, who: child(3), fare: 11 인 경우 생존유무를 예측하기.

In [9]:
float( grid_search_logit.predict( [ [3, 14, 3, 11] ] ) ) #  grid_search_logit.predict_proba( [[3, 14, 3, 11]] ).argmax( axis= 1) 

1.0