# Connect to google account

In [1]:
from google.colab import drive
drive.mount('/content/data')

Drive already mounted at /content/data; to attempt to forcibly remount, call drive.mount("/content/data", force_remount=True).


In [2]:
# 코렙 한글깨짐 방지
!apt -qq -y install fonts-nanum > /dev/null

# 데이터 시각화에 사용할 라이브러리
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import seaborn as sns

fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font_name = fm.FontProperties(fname=fontpath).get_name() 
fm._rebuild()  


%config InlineBackend.figure_format = 'retina'

plt.rc('font', family=font_name)  
plt.rcParams['axes.unicode_minus'] = False 





# Global Variables

In [3]:
import easydict
args = easydict.EasyDict()

# path 정보
args.default_path = '/content/data/MyDrive/lecture/data/titanic/'
args.train_csv = args.default_path+'train.csv'
args.save_path = args.default_path+"model_results.json"

# 데이터 분석을 위한 변수들
args.random_state = 42
args.results = []

# Load Tritanic

- Surived:0=사망, 1=생존
- Pclass: 1=1등석, 2=2등석, 3=3등석
- Sex:male=남성, female=여성
- Age: 나이
- SibSp: 타이타닉 호에 동승한 자매/배우자의 수
- Parch: 타이타닉 호에 동승한 부모/자식의 수
- Ticket: 티켓 번호
- Fare: 승객 요금
- Cabin: 방 호수
- Embarked: 탑승지; C=셰르부르, Q=퀴즈타운, S=사우샘프턴

In [4]:
import numpy as np 
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('fivethirtyeight')
plt.ion()

import warnings
warnings.filterwarnings('ignore')

In [5]:
original = pd.read_csv(args.train_csv)

original.shape

(891, 12)

In [6]:
original.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [7]:
original.columns = [col.lower() for col in original.columns] # 컬럼명 소문자로 변환
original.columns

Index(['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

In [8]:
original['passengerid'].nunique(), original.shape[0]

(891, 891)

In [9]:
original.drop('passengerid', axis=1, inplace=True)
original.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# train_test_split

In [10]:
new_survived = pd.Categorical(original["survived"])
new_survived = new_survived.rename_categories(["Died","Survived"])              

new_survived.describe()

Unnamed: 0_level_0,counts,freqs
categories,Unnamed: 1_level_1,Unnamed: 2_level_1
Died,549,0.616162
Survived,342,0.383838


In [11]:
from sklearn.model_selection import train_test_split

In [12]:
ori_train, ori_test = train_test_split(original, test_size=0.3, stratify=original['survived'], random_state=args.random_state)

ori_train.shape, ori_test.shape

((623, 11), (268, 11))

# Base ModelV0

In [13]:
train = ori_train.copy() 
test = ori_test.copy()

train.shape, test.shape

((623, 11), (268, 11))

## Data Preprocessing

In [14]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 623 entries, 748 to 136
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  623 non-null    int64  
 1   pclass    623 non-null    int64  
 2   name      623 non-null    object 
 3   sex       623 non-null    object 
 4   age       504 non-null    float64
 5   sibsp     623 non-null    int64  
 6   parch     623 non-null    int64  
 7   ticket    623 non-null    object 
 8   fare      623 non-null    float64
 9   cabin     136 non-null    object 
 10  embarked  621 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 58.4+ KB


drop columns

In [15]:
print(f'before: {train.shape} / {test.shape}')
drop_cols = ['name', 'ticket', 'cabin']

train.drop(drop_cols, axis=1, inplace=True)
test.drop(drop_cols, axis=1, inplace=True)

print(f'after: {train.shape} / {test.shape}')
train.info()

before: (623, 11) / (268, 11)
after: (623, 8) / (268, 8)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 623 entries, 748 to 136
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  623 non-null    int64  
 1   pclass    623 non-null    int64  
 2   sex       623 non-null    object 
 3   age       504 non-null    float64
 4   sibsp     623 non-null    int64  
 5   parch     623 non-null    int64  
 6   fare      623 non-null    float64
 7   embarked  621 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 43.8+ KB


missing value

In [16]:
train.isnull().sum()

survived      0
pclass        0
sex           0
age         119
sibsp         0
parch         0
fare          0
embarked      2
dtype: int64

In [17]:
test.isnull().sum()

survived     0
pclass       0
sex          0
age         58
sibsp        0
parch        0
fare         0
embarked     0
dtype: int64

In [18]:
age_median = train['age'].median()
embarked_mode = train['embarked'].mode().values[0]

age_median, embarked_mode

(29.0, 'S')

In [19]:
train['age'].fillna(age_median, inplace=True)
test['age'].fillna(age_median, inplace=True)

train['embarked'].fillna(embarked_mode, inplace=True)

train.isnull().sum().sum(), test.isnull().sum().sum()

(0, 0)

data encoding

In [20]:
from sklearn.preprocessing import OneHotEncoder

In [21]:
enc_cols = ['sex', 'embarked']
normal_cols = list(set(train.columns) - set(enc_cols))
normal_cols

['parch', 'fare', 'survived', 'pclass', 'age', 'sibsp']

In [22]:
print(f'before: {train.shape} / {test.shape}')

enc = OneHotEncoder()
tmp_tr = pd.DataFrame(
    enc.fit_transform(train[enc_cols]).toarray(), 
    columns = enc.get_feature_names_out()
)
enc_tr = pd.concat(
    [train[normal_cols].reset_index(drop=True), tmp_tr.reset_index(drop=True)]
    , axis=1
)

tmp_te = pd.DataFrame(
    enc.transform(test[enc_cols]).toarray(), 
    columns = enc.get_feature_names_out()
)
enc_te = pd.concat(
    [test[normal_cols].reset_index(drop=True), tmp_te.reset_index(drop=True)]
    , axis=1
)

print(f'after: {enc_tr.shape} / {enc_te.shape}')
enc_tr.head()

before: (623, 8) / (268, 8)
after: (623, 11) / (268, 11)


Unnamed: 0,parch,fare,survived,pclass,age,sibsp,sex_female,sex_male,embarked_C,embarked_Q,embarked_S
0,0,53.1,0,1,19.0,1,0.0,1.0,0.0,0.0,1.0
1,0,8.05,0,3,29.0,0,0.0,1.0,0.0,0.0,1.0
2,0,7.8792,1,3,29.0,0,1.0,0.0,0.0,1.0,0.0
3,0,0.0,0,1,29.0,0,0.0,1.0,0.0,0.0,1.0
4,0,15.85,0,3,28.0,1,0.0,1.0,0.0,0.0,1.0


## Training

In [23]:
enc_tr.isnull().sum().sum(), enc_te.isnull().sum().sum()

(0, 0)

In [24]:
y_tr = enc_tr['survived']
X_tr = enc_tr.drop(['survived'], axis=1)

y_te = enc_te['survived']
X_te = enc_te.drop(['survived'], axis=1)

X_tr.shape, y_tr.shape

((623, 10), (623,))

In [25]:
from sklearn.tree import DecisionTreeClassifier

In [26]:
modelV0 = DecisionTreeClassifier(random_state=args.random_state)

modelV0.fit(X_tr, y_tr)

DecisionTreeClassifier(random_state=42)

In [27]:
score_tr = modelV0.score(X_tr, y_tr)
score_te = modelV0.score(X_te, y_te) 

score_tr, score_te  

(0.9759229534510433, 0.7611940298507462)

In [28]:
args.results.append(
    {
        'model': 'modelV0',
        'score_tr': score_tr,
        'score_te': score_te,
        'len_features': X_tr.shape[1],
        'dt': '0217'
    }
)

args.results[0]

{'model': 'modelV0',
 'score_tr': 0.9759229534510433,
 'score_te': 0.7611940298507462,
 'len_features': 10,
 'dt': '0217'}

# Base ModelV1

In [29]:
train = ori_train.copy() 
test = ori_test.copy()

train.shape, test.shape

((623, 11), (268, 11))

## Data Preprocessing

In [30]:
print(f'before: {train.shape} / {test.shape}')
drop_cols = ['name', 'ticket', 'cabin']

train.drop(drop_cols, axis=1, inplace=True)
test.drop(drop_cols, axis=1, inplace=True)

print(f'after: {train.shape} / {test.shape}')
train.info()

before: (623, 11) / (268, 11)
after: (623, 8) / (268, 8)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 623 entries, 748 to 136
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  623 non-null    int64  
 1   pclass    623 non-null    int64  
 2   sex       623 non-null    object 
 3   age       504 non-null    float64
 4   sibsp     623 non-null    int64  
 5   parch     623 non-null    int64  
 6   fare      623 non-null    float64
 7   embarked  621 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 43.8+ KB


In [31]:
age_median = train['age'].median()
embarked_mode = train['embarked'].mode().values[0]

age_median, embarked_mode

(29.0, 'S')

In [32]:
train['age'].fillna(age_median, inplace=True)
test['age'].fillna(age_median, inplace=True)

train['embarked'].fillna(embarked_mode, inplace=True)

train.isnull().sum().sum(), test.isnull().sum().sum()

(0, 0)

In [33]:
from sklearn.preprocessing import OneHotEncoder

In [34]:
enc_cols = ['sex', 'embarked']
normal_cols = list(set(train.columns) - set(enc_cols))
normal_cols

['parch', 'fare', 'survived', 'pclass', 'age', 'sibsp']

In [35]:
print(f'before: {train.shape} / {test.shape}')

enc = OneHotEncoder()
tmp_tr = pd.DataFrame(
    enc.fit_transform(train[enc_cols]).toarray(), 
    columns = enc.get_feature_names_out()
)
enc_tr = pd.concat(
    [train[normal_cols].reset_index(drop=True), tmp_tr.reset_index(drop=True)]
    , axis=1
)

tmp_te = pd.DataFrame(
    enc.transform(test[enc_cols]).toarray(), 
    columns = enc.get_feature_names_out()
)
enc_te = pd.concat(
    [test[normal_cols].reset_index(drop=True), tmp_te.reset_index(drop=True)]
    , axis=1
)

print(f'after: {enc_tr.shape} / {enc_te.shape}')
enc_tr.head()

before: (623, 8) / (268, 8)
after: (623, 11) / (268, 11)


Unnamed: 0,parch,fare,survived,pclass,age,sibsp,sex_female,sex_male,embarked_C,embarked_Q,embarked_S
0,0,53.1,0,1,19.0,1,0.0,1.0,0.0,0.0,1.0
1,0,8.05,0,3,29.0,0,0.0,1.0,0.0,0.0,1.0
2,0,7.8792,1,3,29.0,0,1.0,0.0,0.0,1.0,0.0
3,0,0.0,0,1,29.0,0,0.0,1.0,0.0,0.0,1.0
4,0,15.85,0,3,28.0,1,0.0,1.0,0.0,0.0,1.0


Scaler

In [36]:
enc_tr.isnull().sum().sum(), enc_te.isnull().sum().sum()

(0, 0)

In [37]:
y_tr = enc_tr['survived']
X_tr = enc_tr.drop(['survived'], axis=1)

y_te = enc_te['survived']
X_te = enc_te.drop(['survived'], axis=1)

X_tr.shape, y_tr.shape

((623, 10), (623,))

In [38]:
X_tr.columns

Index(['parch', 'fare', 'pclass', 'age', 'sibsp', 'sex_female', 'sex_male',
       'embarked_C', 'embarked_Q', 'embarked_S'],
      dtype='object')

In [39]:
scaling_cols = ['age', 'fare']
not_scaling_cols = list(set(X_tr.columns) - set(scaling_cols))
not_scaling_cols 

['embarked_S',
 'sex_female',
 'embarked_Q',
 'sex_male',
 'parch',
 'embarked_C',
 'pclass',
 'sibsp']

In [40]:
from sklearn.preprocessing import StandardScaler

In [41]:
std = StandardScaler()
std.fit(X_tr[scaling_cols])

X_train_scaled = std.transform(X_tr[scaling_cols])
X_test_scaled = std.transform(X_te[scaling_cols])

In [42]:
print(f'before: {X_tr.shape} / {X_te.shape}')

tmp_tr = pd.DataFrame(
    X_train_scaled, 
    columns = scaling_cols
)
X_tr = pd.concat(
    [X_tr[not_scaling_cols].reset_index(drop=True), tmp_tr.reset_index(drop=True)]
    , axis=1
)

tmp_te = pd.DataFrame(
    X_test_scaled, 
    columns = scaling_cols
)
X_te = pd.concat(
    [X_te[not_scaling_cols].reset_index(drop=True), tmp_te.reset_index(drop=True)]
    , axis=1
)

print(f'after: {X_tr.shape} / {X_te.shape}')
X_tr.head()

before: (623, 10) / (268, 10)
after: (623, 10) / (268, 10)


Unnamed: 0,embarked_S,sex_female,embarked_Q,sex_male,parch,embarked_C,pclass,sibsp,age,fare
0,1.0,0.0,0.0,1.0,0,0.0,1,1,-0.83189,0.465738
1,1.0,0.0,0.0,1.0,0,0.0,3,0,-0.064873,-0.478269
2,0.0,1.0,1.0,0.0,0,0.0,3,0,-0.064873,-0.481848
3,1.0,0.0,0.0,1.0,0,0.0,1,0,-0.064873,-0.646954
4,1.0,0.0,0.0,1.0,0,0.0,3,1,-0.141574,-0.314823


## Training

In [43]:
X_tr.isnull().sum().sum(), X_te.isnull().sum().sum()

(0, 0)

In [44]:
X_tr.shape, y_tr.shape, X_te.shape, y_te.shape

((623, 10), (623,), (268, 10), (268,))

In [45]:
from sklearn.tree import DecisionTreeClassifier

In [46]:
modelV1 = DecisionTreeClassifier(random_state=args.random_state)

modelV1.fit(X_tr, y_tr)

DecisionTreeClassifier(random_state=42)

In [47]:
score_tr = modelV1.score(X_tr, y_tr)
score_te = modelV1.score(X_te, y_te) 

score_tr, score_te  

(0.9759229534510433, 0.7611940298507462)

In [48]:
args.results.append(
    {
        'model': 'modelV1',
        'score_tr': score_tr,
        'score_te': score_te,
        'len_features': X_tr.shape[1],
        'dt': '0217'
    }
)

args.results

[{'model': 'modelV0',
  'score_tr': 0.9759229534510433,
  'score_te': 0.7611940298507462,
  'len_features': 10,
  'dt': '0217'},
 {'model': 'modelV1',
  'score_tr': 0.9759229534510433,
  'score_te': 0.7611940298507462,
  'len_features': 10,
  'dt': '0217'}]

# Base ModelV2

In [49]:
train = ori_train.copy() 
test = ori_test.copy()

train.shape, test.shape

((623, 11), (268, 11))

## Data Preprocessing

In [50]:
print(f'before: {train.shape} / {test.shape}')
drop_cols = ['name', 'ticket', 'cabin']

train.drop(drop_cols, axis=1, inplace=True)
test.drop(drop_cols, axis=1, inplace=True)

print(f'after: {train.shape} / {test.shape}')

before: (623, 11) / (268, 11)
after: (623, 8) / (268, 8)


In [51]:
age_median = train['age'].median()
embarked_mode = train['embarked'].mode().values[0]

train['age'].fillna(age_median, inplace=True)
test['age'].fillna(age_median, inplace=True)

train['embarked'].fillna(embarked_mode, inplace=True)

train.isnull().sum().sum(), test.isnull().sum().sum()

(0, 0)

In [52]:
from sklearn.preprocessing import OneHotEncoder

enc_cols = ['sex', 'embarked']
normal_cols = list(set(train.columns) - set(enc_cols))

print(f'before: {train.shape} / {test.shape}')

enc = OneHotEncoder()
tmp_tr = pd.DataFrame(
    enc.fit_transform(train[enc_cols]).toarray(), 
    columns = enc.get_feature_names_out()
)
enc_tr = pd.concat(
    [train[normal_cols].reset_index(drop=True), tmp_tr.reset_index(drop=True)]
    , axis=1
)

tmp_te = pd.DataFrame(
    enc.transform(test[enc_cols]).toarray(), 
    columns = enc.get_feature_names_out()
)
enc_te = pd.concat(
    [test[normal_cols].reset_index(drop=True), tmp_te.reset_index(drop=True)]
    , axis=1
)

print(f'after: {enc_tr.shape} / {enc_te.shape}')

before: (623, 8) / (268, 8)
after: (623, 11) / (268, 11)


In [53]:
scaling_cols = ['age', 'fare']
not_scaling_cols = list(set(enc_tr.columns) - set(scaling_cols))

from sklearn.preprocessing import RobustScaler

rbs = RobustScaler()
scaled_tr = rbs.fit_transform(enc_tr[scaling_cols])
scaled_te = rbs.transform(enc_te[scaling_cols])

print(f'before: {enc_tr.shape} / {enc_te.shape}')

tmp_tr = pd.DataFrame(
    scaled_tr, 
    columns = scaling_cols
)
pre_tr = pd.concat(
    [enc_tr[not_scaling_cols].reset_index(drop=True), tmp_tr.reset_index(drop=True)]
    , axis=1
)

tmp_te = pd.DataFrame(
    scaled_te, 
    columns = scaling_cols
)
pre_te = pd.concat(
    [enc_te[not_scaling_cols].reset_index(drop=True), tmp_te.reset_index(drop=True)]
    , axis=1
)

print(f'after: {pre_tr.shape} / {pre_te.shape}')

before: (623, 11) / (268, 11)
after: (623, 11) / (268, 11)


In [54]:
pre_tr.columns

Index(['embarked_S', 'sex_female', 'embarked_Q', 'sex_male', 'parch',
       'embarked_C', 'pclass', 'survived', 'sibsp', 'age', 'fare'],
      dtype='object')

## Training

In [55]:
pre_tr.isnull().sum().sum(), pre_te.isnull().sum().sum()

(0, 0)

In [56]:
y_tr = pre_tr['survived']
X_tr = pre_tr.drop(['survived'], axis=1)

y_te = pre_te['survived']
X_te = pre_te.drop(['survived'], axis=1)

X_tr.shape, y_tr.shape

((623, 10), (623,))

In [57]:
from sklearn.tree import DecisionTreeClassifier

modelV0 = DecisionTreeClassifier(random_state=args.random_state)

modelV0.fit(X_tr, y_tr)

score_tr = modelV0.score(X_tr, y_tr)
score_te = modelV0.score(X_te, y_te) 

score_tr, score_te 

(0.9759229534510433, 0.7574626865671642)

In [58]:
args.results.append(
    {
        'model': 'modelV2',
        'score_tr': score_tr,
        'score_te': score_te,
        'len_features': X_tr.shape[1],
        'dt': '0217'
    }
)

args.results

[{'model': 'modelV0',
  'score_tr': 0.9759229534510433,
  'score_te': 0.7611940298507462,
  'len_features': 10,
  'dt': '0217'},
 {'model': 'modelV1',
  'score_tr': 0.9759229534510433,
  'score_te': 0.7611940298507462,
  'len_features': 10,
  'dt': '0217'},
 {'model': 'modelV2',
  'score_tr': 0.9759229534510433,
  'score_te': 0.7574626865671642,
  'len_features': 10,
  'dt': '0217'}]

# Save Results

In [59]:
args.save_path

'/content/data/MyDrive/lecture/data/titanic/model_results.json'

In [60]:
import json

with open(args.save_path, 'w', encoding='utf-8') as file:
    json.dump(args.results, file)

In [61]:
load_results = None
with open(args.save_path, 'r') as file:
    load_results = json.load(file)

load_results

[{'model': 'modelV0',
  'score_tr': 0.9759229534510433,
  'score_te': 0.7611940298507462,
  'len_features': 10,
  'dt': '0217'},
 {'model': 'modelV1',
  'score_tr': 0.9759229534510433,
  'score_te': 0.7611940298507462,
  'len_features': 10,
  'dt': '0217'},
 {'model': 'modelV2',
  'score_tr': 0.9759229534510433,
  'score_te': 0.7574626865671642,
  'len_features': 10,
  'dt': '0217'}]