In [3]:
import pandas as pd
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## 데이터 전처리
1. 인코딩
2. 스케일링


*인코딩 스케일링 둘다 ndarray형태로 돌려준다고 생각하자*

### 1.인코딩
기본적으로 sklearn의 알고리즘은 문자열 인식하지 못한다 => 숫자로 바꿔줘야한다\
대표적인 두가지 방법 존재
- label encoding 
    - 1d array로만 가능하다
    - 문제점 1: 모델에 따라서 ordering을 수치로 해석할수 있음
    - 문제점 2: unseen data 처리 불가능
- one-hot encoding
    - - matrix형태로 데이터가 들어가야한다 => 즉 단일 컬럼은 (-1,1) 형태여야한다 
    - 두개 이상의 column도 가능하다 => 즉 Dataframe형태로도 가능하다
    - 문제점 : 차원의 저주
    - 장점 : unknown handle 가능 
**그냥 닥치고 one-hot encoding 쓴다**



#### Label Encoding

In [43]:
# label encoding
items = np.array(['TV','냉장고','세탁기','선풍기','선풍기','믹서','세탁기'])

from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
label_encoder.fit(items)
label_encoder.transform(items)

label_encoder.classes_

array([0, 1, 4, 3, 3, 2, 4])

array(['TV', '냉장고', '믹서', '선풍기', '세탁기'], dtype='<U3')

In [3]:
# 애초에 label encoding은 unknown handle 옵션도 없다
# 대처할수 없다
label_encoder.transform(['핸드폰'])

ValueError: y contains previously unseen labels: '핸드폰'

#### one-hot encoder

In [44]:
# error => reshape(1,-1)
from sklearn.preprocessing import OneHotEncoder
items = np.array(['TV','냉장고','세탁기','선풍기','선풍기','믹서','세탁기'])

onehot_encoder = OneHotEncoder()
onehot_encoder.fit_transform(items)


ValueError: Expected 2D array, got 1D array instead:
array=['TV' '냉장고' '세탁기' '선풍기' '선풍기' '믹서' '세탁기'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [47]:
# one hot encodr 예시
from sklearn.preprocessing import OneHotEncoder
items = np.array(['TV','냉장고','세탁기','선풍기','선풍기','믹서','세탁기']).reshape(-1,1)

onehot_encoder = OneHotEncoder()
onehot_encoder.fit_transform(items)
onehot_encoder.fit_transform(items).toarray()

onehot_encoder.categories_


<7x5 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

array([[1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1.]])

[array(['TV', '냉장고', '믹서', '선풍기', '세탁기'], dtype='<U3')]

In [46]:
# toarray() 귀찮으면 그냥, sparse=Fasle 옵션주면된다 
from sklearn.preprocessing import OneHotEncoder
items = np.array(['TV','냉장고','세탁기','선풍기','선풍기','믹서','세탁기']).reshape(-1,1)

onehot_encoder = OneHotEncoder(sparse=False)
onehot_encoder.fit_transform(items)

array([[1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1.]])

In [50]:
# one hot encoder 결과 dataframe으로 받아보자
from sklearn.preprocessing import OneHotEncoder
items = np.array(['TV','냉장고','세탁기','선풍기','선풍기','믹서','세탁기']).reshape(-1,1)

onehot_encoder = OneHotEncoder(sparse=False)
result = onehot_encoder.fit_transform(items)
onehot_encoder.categories_

pd.DataFrame(result,columns=onehot_encoder.categories_,dtype=int)

[array(['TV', '냉장고', '믹서', '선풍기', '세탁기'], dtype='<U3')]

Unnamed: 0,TV,냉장고,믹서,선풍기,세탁기
0,1,0,0,0,0
1,0,1,0,0,0
2,0,0,0,0,1
3,0,0,0,1,0
4,0,0,0,1,0
5,0,0,1,0,0
6,0,0,0,0,1


In [68]:
# 데이터 프레임 형태로 input 가능
# return은 still ndarray

# 데이터 프레임 생성 only consist of categorical
items1 = np.array(['TV','냉장고','세탁기','선풍기','선풍기','믹서','세탁기'])
items2 = np.array(['samsung','apple','samsung','apple','samsung','apple','samsung'])
cat_df =pd.DataFrame({'cat1':items1,'cat2':items2})
cat_df

onehot_encoder = OneHotEncoder(sparse=False)
onehot_encoder.fit_transform(cat_df)

col_names = np.hstack([onehot_encoder.categories_[0],onehot_encoder.categories_[1]])

pd.DataFrame(onehot_encoder.fit_transform(cat_df),columns=col_names,dtype=int)

Unnamed: 0,cat1,cat2
0,TV,samsung
1,냉장고,apple
2,세탁기,samsung
3,선풍기,apple
4,선풍기,samsung
5,믹서,apple
6,세탁기,samsung


array([[1., 0., 0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0., 1.],
       [0., 0., 0., 1., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0., 1.],
       [0., 0., 1., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0., 1.]])

Unnamed: 0,TV,냉장고,믹서,선풍기,세탁기,apple,samsung
0,1,0,0,0,0,0,1
1,0,1,0,0,0,1,0
2,0,0,0,0,1,0,1
3,0,0,0,1,0,1,0
4,0,0,0,1,0,0,1
5,0,0,1,0,0,1,0
6,0,0,0,0,1,0,1


In [None]:
# 주의 shape(-1,1) 원하니깐 Dataframe에서 col단위로 넣을때 [['colname]] 으로 넣어야지 에러가 나지 않겠지

In [70]:
# pd.get_dummies() 도 가능하다 
# 훨씬 쉬어보이는데 => depreicated due to unseen data and cross validation scheme

items1 = np.array(['TV','냉장고','세탁기','선풍기','선풍기','믹서','세탁기'])
items2 = np.array(['samsung','apple','samsung','apple','samsung','apple','samsung'])
cat_df =pd.DataFrame({'cat1':items1,'cat2':items2})
pd.get_dummies(cat_df,columns=['cat1'])

pd.get_dummies(cat_df,columns=['cat1','cat2'])

Unnamed: 0,cat2,cat1_TV,cat1_냉장고,cat1_믹서,cat1_선풍기,cat1_세탁기
0,samsung,1,0,0,0,0
1,apple,0,1,0,0,0
2,samsung,0,0,0,0,1
3,apple,0,0,0,1,0
4,samsung,0,0,0,1,0
5,apple,0,0,1,0,0
6,samsung,0,0,0,0,1


Unnamed: 0,cat1_TV,cat1_냉장고,cat1_믹서,cat1_선풍기,cat1_세탁기,cat2_apple,cat2_samsung
0,1,0,0,0,0,0,1
1,0,1,0,0,0,1,0
2,0,0,0,0,1,0,1
3,0,0,0,1,0,1,0
4,0,0,0,1,0,0,1
5,0,0,1,0,0,1,0
6,0,0,0,0,1,0,1


### 2.스케일링
- 정규화(0과 1사이의 값으로 변환 - MinMaxScalar) (x - x최소값) / (x최대값 - x최소값) 만약 음수일 경우 1 대체됨, 
- 표준화(정규분포를 따르는 형식으로 평균이 0 분산 1 가우시안분포 변환 - StandardScalar)(x - x 평균값) / (x표준편차)

In [73]:
# iris data로 진행하겠다
from sklearn.datasets import load_iris

iris = load_iris()

#### Standarad Scaler

In [75]:
from sklearn.preprocessing import StandardScaler
s_scaler = StandardScaler()

s_iris   = s_scaler.fit_transform(iris.data)
s_iris_df = pd.DataFrame(data    = s_iris , 
                          columns = iris.feature_names)
s_iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444
...,...,...,...,...
145,1.038005,-0.131979,0.819596,1.448832
146,0.553333,-1.282963,0.705921,0.922303
147,0.795669,-0.131979,0.819596,1.053935
148,0.432165,0.788808,0.933271,1.448832


#### MinMax Scaler

In [77]:
from sklearn.preprocessing import MinMaxScaler
m_scaler = MinMaxScaler()

m_iris   = m_scaler.fit_transform(iris.data)
m_iris_df = pd.DataFrame(data    = m_iris , 
                          columns = iris.feature_names)
m_iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,0.222222,0.625000,0.067797,0.041667
1,0.166667,0.416667,0.067797,0.041667
2,0.111111,0.500000,0.050847,0.041667
3,0.083333,0.458333,0.084746,0.041667
4,0.194444,0.666667,0.067797,0.041667
...,...,...,...,...
145,0.666667,0.416667,0.711864,0.916667
146,0.555556,0.208333,0.677966,0.750000
147,0.611111,0.416667,0.711864,0.791667
148,0.527778,0.583333,0.745763,0.916667


> #### **인코딩 스케일링 언제해야되냐**?

항상 굉장히 헷갈리는 질문이다.\
당연하게도 train 에서만 학습이 되어야하고, test에서는 변형만 시켜야한다
- for train set => fit() and transform()\
- for test set => transform()

Scaling에서는 이게 별 문제가 없을수 있는데 Encoding에서는 문제가 발생할 수 있다.\
Why? train set에는 없던 cateogry가 test set에서 튀어나올 수 있다 => 이를 대처하기 위해서는 one-hot encoding을 사용해줘야한다\
이에 대해 조금 후에 자세히 보겠다

검색해봐도 너무 의견이 다양하고 심지어 책에서도 의견이 다양하다. 그렇기 때문에 아래 offical document 참고하자
![picture 1](images/0050116989a1eef3ab1262cb317246918f21667daa2c0fa427db094bc5ec903e.png)  
![picture 2](images/54de8867cccc976d2fa7769a81a0c0cb9ea131a7e7f059d64bbc1a8bbb032f5f.png)  



> 직접 pipeline을 만들어보자 for data with both numeric and categorical \
> numerical -> standardscaler\
> categorical -> one-hot encoder 사용할거다
> 의도적으로 categorical data in test set에 unseen data 집어넣을거다

In [22]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# train, test dataset 생성
train_ex_df = pd.DataFrame({'num1':np.random.randint(1,10,size=(10,)),'num2': np.random.randint(10,20,size=(10,)),'cat': [s for s in 'baabbbccbb']})
train_ex_df
test_ex_df = pd.DataFrame({'num1':np.random.randint(1,10,size=(4,)),'num2': np.random.randint(10,20,size=(4,)),'cat': [s for s in 'abcd']})
test_ex_df

numeric_features = ["num1","num2"]

numeric_transformer = StandardScaler()

categorical_features = ["cat"]
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ] # 원한다면 여기서 추가해서 특정 numeric 혹은 categorical column은 또 다른 scaling 적용가능하다
)

result_train = preprocessor.fit_transform(train_ex_df)
result_test = preprocessor.transform(test_ex_df) # 맨 마지막 row를 보면 'd'가 unseen 이기때문에 0 0 0으로 들어갔다


Unnamed: 0,num1,num2,cat
0,4,13,b
1,2,13,a
2,8,14,a
3,1,18,b
4,6,18,b
5,9,17,b
6,9,15,c
7,7,18,c
8,2,11,b
9,2,10,b


Unnamed: 0,num1,num2,cat
0,6,12,a
1,9,13,b
2,3,12,c
3,2,15,d


In [16]:
# preprocessor attribute 예시
preprocessor.feature_names_in_
preprocessor.transformers_
preprocessor.get_feature_names_out()


array(['num1', 'num2', 'cat'], dtype=object)

[('num', StandardScaler(), ['num1', 'num2']),
 ('cat', OneHotEncoder(handle_unknown='ignore'), ['cat'])]

array(['num__num1', 'num__num2', 'cat__cat_a', 'cat__cat_b', 'cat__cat_c'],
      dtype=object)

In [23]:
# preprocessor attribute이용해서 원래 데이터 프레임형태로 복원해보자

pd.DataFrame(result_train,columns=preprocessor.get_feature_names_out() )

pd.DataFrame(result_test,columns=preprocessor.get_feature_names_out() )


Unnamed: 0,num__num1,num__num2,cat__cat_a,cat__cat_b,cat__cat_c
0,-0.333333,-0.600665,0.0,1.0,0.0
1,-1.0,-0.600665,1.0,0.0,0.0
2,1.0,-0.247333,1.0,0.0,0.0
3,-1.333333,1.165998,0.0,1.0,0.0
4,0.333333,1.165998,0.0,1.0,0.0
5,1.333333,0.812665,0.0,1.0,0.0
6,1.333333,0.106,0.0,0.0,1.0
7,0.666667,1.165998,0.0,0.0,1.0
8,-1.0,-1.307331,0.0,1.0,0.0
9,-1.0,-1.660663,0.0,1.0,0.0


Unnamed: 0,num__num1,num__num2,cat__cat_a,cat__cat_b,cat__cat_c
0,0.333333,-0.953998,1.0,0.0,0.0
1,1.333333,-0.600665,0.0,1.0,0.0
2,-0.666667,-0.953998,0.0,0.0,1.0
3,-1.0,0.106,0.0,0.0,0.0


### 3.타이타닉 생존자 ML 에측 구현 [실습]

- 데이터 전처리(null 처리, 불필요한 속성 제거, 인코딩 수행)
- 모델학습 및 검증/예측/평가


우선 먼저 앞에서 배운 내용들을 좀 복습하기 위해서 StratifiedKFold, cross_val 사용 후에 pipeline 이용하겠다.

##### 1.데이터 로드

In [7]:
titanic_df = pd.read_csv('Data/titanic_train.csv',header=0)

In [8]:
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [15]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [33]:
# 일단 요것만 사용하겟다
# pclass 는 그냥 numeric으로 둬도 상관없다
titanic_df = titanic_df[['Survived' , 'Pclass' , 'Sex' , 'Age' , 'SibSp' , 'Parch' , 'Cabin' , 'Embarked']]
titanic_df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Cabin,Embarked
0,0,3,male,22.0,1,0,,S
1,1,1,female,38.0,1,0,C85,C
2,1,3,female,26.0,0,0,,S
3,1,1,female,35.0,1,0,C123,S
4,0,3,male,35.0,0,0,,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,,S
887,1,1,female,19.0,0,0,B42,S
888,0,3,female,,1,2,,S
889,1,1,male,26.0,0,0,C148,C


##### 2.결측지 처리

In [34]:
# column 별 결측지 갯수
titanic_df.isna().sum(axis=0)

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Cabin       687
Embarked      2
dtype: int64

- age, cabin, embarked에서 결측지가 존재한다
- 사실 모두다 train할때 impute 가능하다
- 근데 예를들어 numerical feature에서 평균을 na fill을 위해 사용하게 되면 -> 이건 미리 test data 정복 사용하면 leakge가 있는거다
- categorical feature도 마찬가지다 -> 최빈값을 사용하게 되면 이것도 미리 test data의 정보가 사용된 것이다
- 근데 만약에 그냥 categorical feature에서 모르는값을 다 'N'으로 채우거나, numerical feature에서 다 0으로 채우는건 딱히 문제가 없다

In [38]:
# Cabin column fillNA with 'N'
titanic_df['Cabin'].replace(np.nan,'N',inplace=True)
# 아래도 동일하다
# titanic_df['Cabin'].fillna('N')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  titanic_df['Cabin'].replace(np.nan,'N',inplace=True)


In [40]:
# Embarked column fillna with 'N'
titanic_df['Embarked'].fillna('N',inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  titanic_df['Embarked'].fillna('N',inplace=True)


In [47]:
titanic_df.isna().sum(axis=0) # 아직 age는 그대로 na존재 => Imputer로 바꿀거다

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Cabin         0
Embarked      0
dtype: int64

#### 학습을 위한 준비
- step01 : feature  , label 데이터 셋 추출
- step02 : 데이터 분리
- step03 : 분류모델 선정 후 학습
- step04 : 예측
- step05 : 평가

- additinal 교차검증 추가(KFold , coross_val_score , GridSearchCV 를 이용한 하이퍼 파라미터 튜닝!!)
- 피처 중요도 확인


In [57]:
# step01. feature, target 추출
titanic_features = titanic_df.iloc[:,~titanic_df.columns.isin(['Survived'])]
titanic_targets = titanic_df.loc[:,['Survived']]
titanic_features
titanic_targets


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Cabin,Embarked
0,3,male,22.0,1,0,N,S
1,1,female,38.0,1,0,C85,C
2,3,female,26.0,0,0,N,S
3,1,female,35.0,1,0,C123,S
4,3,male,35.0,0,0,N,S
...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,N,S
887,1,female,19.0,0,0,B42,S
888,3,female,,1,2,N,S
889,1,male,26.0,0,0,C148,C


Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0
...,...
886,0
887,1
888,0
889,1


In [122]:
# stpe02. split train/test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(titanic_features,titanic_targets,test_size=0.2)
X_train.shape
X_test.shape
y_train.shape
y_test.shape


(712, 7)

(179, 7)

(712, 1)

(179, 1)

In [59]:
# step03 model selection

# 먼저 모델 생성하자
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


dt_model = DecisionTreeClassifier(random_state = 200)
rf_model = RandomForestClassifier(random_state = 200)
lr_model = LogisticRegression(random_state = 200)

In [114]:
# split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(titanic_features,titanic_targets,test_size=0.2,random_state=11)


# Simple Impute for Age
from sklearn.impute import SimpleImputer
simple_imputer = SimpleImputer(strategy='mean')
X_train[['Age']] = simple_imputer.fit_transform(X_train[['Age']])
X_test[['Age']] = simple_imputer.transform(X_test[['Age']])


# 이게 one-hot coding 할려면 좀 복잡할거다 참자
# Encoding for categorical ['Sex','Cabin','Embarked']

cat_features = ['Sex','Cabin','Embarked']
numeric_features = titanic_features.columns.difference(['Sex','Cabin','Embarked'])

from sklearn.preprocessing import OneHotEncoder
onehot_encoder = OneHotEncoder(sparse=False,handle_unknown='ignore')


result = onehot_encoder.fit_transform(X_train[cat_features])
result_columns = onehot_encoder.get_feature_names_out()
X_train.drop(columns=cat_features,inplace=True)
X_train[result_columns] = result

result = onehot_encoder.transform(X_test[cat_features])
result_columns = onehot_encoder.get_feature_names_out()
X_test.drop(columns=cat_features,inplace=True)
X_test[result_columns] = result


# Scaling 해주자 for all numeric features
from sklearn.preprocessing import StandardScaler
standard_scaler = StandardScaler()
X_train[numeric_features] = standard_scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = standard_scaler.transform(X_test[numeric_features])


# decision tree로 학습 해보자
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

dt_model = DecisionTreeClassifier(random_state = 200)

dt_model.fit(X_train,y_train)
y_predict = dt_model.predict(X_test)
accuracy_score(y_test,y_predict)


# random forest로 학습 해보자

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier

rf_model.fit(X_train,y_train)
y_predict = rf_model.predict(X_test)
accuracy_score(y_test,y_predict)

  X_train[result_columns] = result
  X_train[result_columns] = result
  X_train[result_columns] = result
  X_train[result_columns] = result
  X_train[result_columns] = result
  X_train[result_columns] = result
  X_train[result_columns] = result
  X_train[result_columns] = result
  X_train[result_columns] = result
  X_train[result_columns] = result
  X_train[result_columns] = result
  X_train[result_columns] = result
  X_train[result_columns] = result
  X_train[result_columns] = result
  X_train[result_columns] = result
  X_train[result_columns] = result
  X_train[result_columns] = result
  X_train[result_columns] = result
  X_train[result_columns] = result
  X_train[result_columns] = result
  X_train[result_columns] = result
  X_train[result_columns] = result
  X_train[result_columns] = result
  X_train[result_columns] = result
  X_train[result_columns] = result
  X_train[result_columns] = result
  X_train[result_columns] = result
  X_train[result_columns] = result
  X_train[result_col

0.7877094972067039

  rf_model.fit(X_train,y_train)


0.8324022346368715

In [118]:
# split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(titanic_features,titanic_targets,test_size=0.2,random_state=11)


# Simple Impute for Age
from sklearn.impute import SimpleImputer
simple_imputer = SimpleImputer(strategy='mean')
X_train[['Age']] = simple_imputer.fit_transform(X_train[['Age']])
X_test[['Age']] = simple_imputer.transform(X_test[['Age']])


# 이게 one-hot coding 할려면 좀 복잡할거다 참자
# Encoding for categorical ['Sex','Cabin','Embarked']

cat_features = ['Sex','Cabin','Embarked']
numeric_features = titanic_features.columns.difference(['Sex','Cabin','Embarked'])

from sklearn.preprocessing import OneHotEncoder
onehot_encoder = OneHotEncoder(sparse=False,handle_unknown='ignore')


result = onehot_encoder.fit_transform(X_train[cat_features])
result_columns = onehot_encoder.get_feature_names_out()
X_train



# # Scaling 해주자 for all numeric features
# from sklearn.preprocessing import StandardScaler
# standard_scaler = StandardScaler()
# X_train[numeric_features] = standard_scaler.fit_transform(X_train[numeric_features])
# X_test[numeric_features] = standard_scaler.transform(X_test[numeric_features])


# # decision tree로 학습 해보자
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.metrics import accuracy_score

# dt_model = DecisionTreeClassifier(random_state = 200)

# dt_model.fit(X_train,y_train)
# y_predict = dt_model.predict(X_test)
# accuracy_score(y_test,y_predict)


# # random forest로 학습 해보자

# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score

# from sklearn.ensemble import RandomForestClassifier

# rf_model.fit(X_train,y_train)
# y_predict = rf_model.predict(X_test)
# accuracy_score(y_test,y_predict)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Cabin,Embarked
333,3,male,16.0,2,0,N,S
662,1,male,47.0,0,0,E58,S
382,3,male,32.0,0,0,N,S
331,1,male,45.5,0,0,C124,S
149,2,male,42.0,0,0,N,S
...,...,...,...,...,...,...,...
269,1,female,35.0,0,0,C99,S
337,1,female,41.0,0,0,E40,C
91,3,male,20.0,0,0,N,S
80,3,male,22.0,0,0,N,S


In [130]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score


titanic_df = pd.read_csv('Data/titanic_train.csv',header=0)
titanic_df = titanic_df[['Survived' , 'Pclass' , 'Sex' , 'Age' , 'SibSp' , 'Parch' , 'Cabin' , 'Embarked']]
titanic_df['Embarked'].fillna('N',inplace=True)
titanic_df['Cabin'].replace(np.nan,'N',inplace=True)


titanic_features = titanic_df.iloc[:,~titanic_df.columns.isin(['Survived'])]
titanic_targets = titanic_df.loc[:,['Survived']]

X_train, X_test, y_train, y_test = train_test_split(titanic_features,titanic_targets,test_size=0.2,random_state=11)



cat_features = ['Sex','Cabin','Embarked']
numeric_features = titanic_features.columns.difference(['Sex','Cabin','Embarked'])

cv_accuracy = []

stratified_k_fold = StratifiedKFold(n_splits=5)

features = X_train.copy()
targets = y_train.copy()

for train_index, val_index in stratified_k_fold.split(features,targets):
    X_train, y_train = features.iloc[train_index] , targets.iloc[train_index]
    X_val , y_val = features.iloc[val_index] , targets.iloc[val_index]
    X_train

    # simple impute for age
    simple_imputer = SimpleImputer(strategy='mean')
    X_train[['Age']] = simple_imputer.fit_transform(X_train[['Age']])
    X_val[['Age']] = simple_imputer.transform(X_val[['Age']])

    # one-hot encoding for categorical features
    onehot_encoder = OneHotEncoder(sparse=False,handle_unknown='ignore')
    result = onehot_encoder.fit_transform(X_train[cat_features])
    result_columns = onehot_encoder.get_feature_names_out()
    X_train.drop(columns=cat_features,inplace=True)
    X_train[result_columns] = result

    result = onehot_encoder.transform(X_val[cat_features])
    result_columns = onehot_encoder.get_feature_names_out()
    X_val.drop(columns=cat_features,inplace=True)
    X_val[result_columns] = result

    # standradscaler for numeric features

    standard_scaler = StandardScaler()
    X_train[numeric_features] = standard_scaler.fit_transform(X_train[numeric_features])
    X_val[numeric_features] = standard_scaler.transform(X_val[numeric_features])

    # rf model
    rf_model = RandomForestClassifier(random_state=200)
    rf_model.fit(X_train,y_train)
    y_predict = rf_model.predict(X_val)
    score = accuracy_score(y_val,y_predict)
    cv_accuracy.append(score)

    

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Cabin,Embarked
646,3,male,19.0,0,0,N,S
223,3,male,,0,0,N,S
807,3,female,18.0,0,0,N,S
288,2,male,42.0,0,0,N,S
624,3,male,21.0,0,0,N,S
...,...,...,...,...,...,...,...
269,1,female,35.0,0,0,C99,S
337,1,female,41.0,0,0,E40,C
91,3,male,20.0,0,0,N,S
80,3,male,22.0,0,0,N,S


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[['Age']] = simple_imputer.fit_transform(X_train[['Age']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val[['Age']] = simple_imputer.transform(X_val[['Age']])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.drop(columns=cat_features,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_inde

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Cabin,Embarked
333,3,male,16.0,2,0,N,S
662,1,male,47.0,0,0,E58,S
382,3,male,32.0,0,0,N,S
331,1,male,45.5,0,0,C124,S
149,2,male,42.0,0,0,N,S
...,...,...,...,...,...,...,...
269,1,female,35.0,0,0,C99,S
337,1,female,41.0,0,0,E40,C
91,3,male,20.0,0,0,N,S
80,3,male,22.0,0,0,N,S


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[['Age']] = simple_imputer.fit_transform(X_train[['Age']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val[['Age']] = simple_imputer.transform(X_val[['Age']])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.drop(columns=cat_features,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_inde

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Cabin,Embarked
333,3,male,16.0,2,0,N,S
662,1,male,47.0,0,0,E58,S
382,3,male,32.0,0,0,N,S
331,1,male,45.5,0,0,C124,S
149,2,male,42.0,0,0,N,S
...,...,...,...,...,...,...,...
269,1,female,35.0,0,0,C99,S
337,1,female,41.0,0,0,E40,C
91,3,male,20.0,0,0,N,S
80,3,male,22.0,0,0,N,S


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[['Age']] = simple_imputer.fit_transform(X_train[['Age']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val[['Age']] = simple_imputer.transform(X_val[['Age']])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.drop(columns=cat_features,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_inde

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Cabin,Embarked
333,3,male,16.0,2,0,N,S
662,1,male,47.0,0,0,E58,S
382,3,male,32.0,0,0,N,S
331,1,male,45.5,0,0,C124,S
149,2,male,42.0,0,0,N,S
...,...,...,...,...,...,...,...
269,1,female,35.0,0,0,C99,S
337,1,female,41.0,0,0,E40,C
91,3,male,20.0,0,0,N,S
80,3,male,22.0,0,0,N,S


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[['Age']] = simple_imputer.fit_transform(X_train[['Age']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val[['Age']] = simple_imputer.transform(X_val[['Age']])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.drop(columns=cat_features,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_inde

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Cabin,Embarked
333,3,male,16.0,2,0,N,S
662,1,male,47.0,0,0,E58,S
382,3,male,32.0,0,0,N,S
331,1,male,45.5,0,0,C124,S
149,2,male,42.0,0,0,N,S
...,...,...,...,...,...,...,...
241,3,female,,1,0,N,Q
529,2,male,23.0,2,1,N,S
343,2,male,25.0,0,0,N,S
748,1,male,19.0,1,0,D30,S


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[['Age']] = simple_imputer.fit_transform(X_train[['Age']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val[['Age']] = simple_imputer.transform(X_val[['Age']])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.drop(columns=cat_features,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_inde

In [131]:
cv_accuracy

[0.7622377622377622,
 0.7832167832167832,
 0.8028169014084507,
 0.8098591549295775,
 0.704225352112676]

In [125]:
np.mean(cv_accuracy)

0.8032896680784004

In [None]:
from sklearn.impute import SimpleImputer # also works with categorical when setting 'strategy' = 'fequent' or 'constant'
simple_imputer = SimpleImputer(strategy='constant',fill_value='N')
simple_imputer.fit_transform(titanic_df[['Cabin']])

array([['N'],
       ['C85'],
       ['N'],
       ['C123'],
       ['N'],
       ['N'],
       ['E46'],
       ['N'],
       ['N'],
       ['N'],
       ['G6'],
       ['C103'],
       ['N'],
       ['N'],
       ['N'],
       ['N'],
       ['N'],
       ['N'],
       ['N'],
       ['N'],
       ['N'],
       ['D56'],
       ['N'],
       ['A6'],
       ['N'],
       ['N'],
       ['N'],
       ['C23 C25 C27'],
       ['N'],
       ['N'],
       ['N'],
       ['B78'],
       ['N'],
       ['N'],
       ['N'],
       ['N'],
       ['N'],
       ['N'],
       ['N'],
       ['N'],
       ['N'],
       ['N'],
       ['N'],
       ['N'],
       ['N'],
       ['N'],
       ['N'],
       ['N'],
       ['N'],
       ['N'],
       ['N'],
       ['N'],
       ['D33'],
       ['N'],
       ['B30'],
       ['C52'],
       ['N'],
       ['N'],
       ['N'],
       ['N'],
       ['N'],
       ['B28'],
       ['C83'],
       ['N'],
       ['N'],
       ['N'],
       ['F33'],
       ['N'],
       ['N

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Cabin,Embarked
0,0,3,male,22.0,1,0,,S
1,1,1,female,38.0,1,0,C85,C
2,1,3,female,26.0,0,0,,S
3,1,1,female,35.0,1,0,C123,S
4,0,3,male,35.0,0,0,,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,,S
887,1,1,female,19.0,0,0,B42,S
888,0,3,female,,1,2,,S
889,1,1,male,26.0,0,0,C148,C


In [None]:
# 해줘야할것들
# 먼저 결측지 채운다 
# categorical 에서 'Cabin','Embarked' simple imputer 'N'으로 채운다
# numerical 에서 'age' simple imputer 'mean'으로 채운다

# numerical sacler
# categorical encoding

cat_features = ['Sex','Cabin','Embarked']
numeric_features = titanic_features.columns.difference(['Sex','Cabin','Embarked'])


numeric_transformer = StandardScaler()

categorical_features = ["cat"]
categorical_transformer = 
 OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ] # 원한다면 여기서 추가해서 특정 numeric 혹은 categorical column은 또 다른 scaling 적용가능하다
)




In [132]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


titanic_df = pd.read_csv('Data/titanic_train.csv',header=0)
titanic_df = titanic_df[['Survived' , 'Pclass' , 'Sex' , 'Age' , 'SibSp' , 'Parch' , 'Cabin' , 'Embarked']]
titanic_df
titanic_features = titanic_df.iloc[:,~titanic_df.columns.isin(['Survived'])]
titanic_targets = titanic_df.loc[:,['Survived']]

X_train, X_test, y_train, y_test = train_test_split(titanic_features,titanic_targets,test_size=0.2,random_state=11)


categorical_features = ['Sex','Cabin','Embarked']
numeric_features = titanic_features.columns.difference(['Sex','Cabin','Embarked'])

numeric_transformer =  Pipeline(
    steps=[("imputer", SimpleImputer(strategy="mean")), ("scaler", StandardScaler())]
)


categorical_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="constant",fill_value='N')), ("encoder", OneHotEncoder(handle_unknown="ignore"))]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ] 
)

clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", RandomForestClassifier(random_state=200))]
)


from sklearn.model_selection import cross_val_score
cross_val_score(clf,X_train,y_train,scoring='accuracy',cv=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Cabin,Embarked
0,0,3,male,22.0,1,0,,S
1,1,1,female,38.0,1,0,C85,C
2,1,3,female,26.0,0,0,,S
3,1,1,female,35.0,1,0,C123,S
4,0,3,male,35.0,0,0,,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,,S
887,1,1,female,19.0,0,0,B42,S
888,0,3,female,,1,2,,S
889,1,1,male,26.0,0,0,C148,C


  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)


array([0.74825175, 0.77622378, 0.78873239, 0.81690141, 0.6971831 ])

In [None]:
[0.7622377622377622,
 0.7832167832167832,
 0.8028169014084507,
 0.8098591549295775,
 0.704225352112676]