# titanic호 탑승객의 생존유무(survived) 예측

In [1]:
import pandas as pd
import numpy as np

In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
%pwd

'C:\\python\\titanic'

## global variable 

In [4]:
!pip install easydict



In [5]:
import easydict
# Notebook-wide settings container; later cells read these attributes.
args = easydict.EasyDict()

# Path settings
args.train_csv = 'train.csv'
args.test_csv = 'test.csv'
# Template submission file that is read back before predictions are filled in.
args.default_submission_csv = 'submission.csv'

# Output file the final submission is written to.
args.submission_csv ='submission_0220.csv'
# JSON file the per-model result records are saved to.
args.save_results = "model_results.json"

# Variables used for data analysis
# Seed shared by train_test_split and the decision-tree models.
args.random_state = 21
# One dict per trained model is appended here.
args.results = []

## 데이터 불러오기 
- survived : 생존=1, 죽음=0
- pclass : 승객 등급. 1등급=1, 2등급=2, 3등급=3
- sibsp : 함께 탑승한 형제 또는 배우자 수
- parch : 함께 탑승한 부모 또는 자녀 수
- ticket : 티켓 번호
- cabin : 선실 번호
- embarked : 탑승장소 S=Southampton, C=Cherbourg, Q=Queenstown

In [6]:
raw_train = pd.read_csv(args.train_csv)
raw_test = pd.read_csv(args.test_csv)

raw_train.shape, raw_test.shape   

((916, 12), (393, 11))

In [7]:
raw_train.columns  

Index(['passengerid', 'survived', 'pclass', 'name', 'gender', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

In [8]:
raw_train.head()

Unnamed: 0,passengerid,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
0,0,0,2,"Wheeler, Mr. Edwin Frederick""""",male,,0,0,SC/PARIS 2159,12.875,,S
1,1,0,3,"Henry, Miss. Delia",female,,0,0,382649,7.75,,Q
2,2,1,1,"Hays, Mrs. Charles Melville (Clara Jennings Gr...",female,52.0,1,1,12749,93.5,B69,S
3,3,1,3,"Andersson, Mr. August Edvard (""Wennerstrom"")",male,27.0,0,0,350043,7.7958,,S
4,4,0,2,"Hold, Mr. Stephen",male,44.0,1,0,26707,26.0,,S


In [9]:
raw_train['passengerid'].nunique()  
#모두 다른 데이터로 불필요한 col

916

In [10]:
#passengerid 삭제
del raw_train['passengerid']


In [11]:
raw_train.columns

Index(['survived', 'pclass', 'name', 'gender', 'age', 'sibsp', 'parch',
       'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

In [12]:
raw_test.columns

Index(['passengerid', 'pclass', 'name', 'gender', 'age', 'sibsp', 'parch',
       'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

In [13]:
raw_test.set_index(['passengerid'],inplace =True)

In [14]:
raw_test.shape

(393, 10)

## train_test_split

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
y = raw_train['survived']  #타겟 데이터
X = raw_train.drop(['survived'], axis =1) #Feature 데이터

In [19]:
# Hold out 30% of the labelled rows for validation, keeping the class ratio of
# the target identical in both splits (stratified on y).
X_tr, X_te, y_tr, y_te = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=args.random_state,
)
tuple(part.shape for part in (X_tr, X_te, y_tr, y_te))

((641, 10), (275, 10), (641,), (275,))

### model1 

In [20]:
train = X_tr.copy()
test = X_te.copy()
raw_te = raw_test.copy()

train.shape , test.shape ,raw_te.shape

((641, 10), (275, 10), (393, 10))

In [21]:
#수치형 자료 확인

train.describe(include=np.number)   

Unnamed: 0,pclass,age,sibsp,parch,fare
count,641.0,512.0,641.0,641.0,641.0
mean,2.305772,30.344082,0.466459,0.368175,31.654446
std,0.829221,14.492658,0.961386,0.89714,49.827609
min,1.0,0.17,0.0,0.0,0.0
25%,2.0,21.0,0.0,0.0,7.8958
50%,3.0,28.0,0.0,0.0,14.4
75%,3.0,38.0,1.0,0.0,30.0
max,3.0,80.0,8.0,9.0,512.3292


- pclass : 1<2<3 순으로 많은 탑승객  
- age : 탑승객의 나이는 평균 20-30대 , 결측치 존재
- sibsp : 동승한 형제, 배우자의 수는 평균 0~1명이 많음 
- parch : 동승한 부모, 자녀의 수는 대부분 0명 (중앙값·75% 분위수 모두 0, 평균 약 0.37)
- fare : 비용의 중앙값과 평균의 수가 차이나는 것으로 보아 데이터 쏠림 현상 존재

In [22]:
#문자형 자료 확인
train.describe(exclude=np.number)   

Unnamed: 0,name,gender,ticket,cabin,embarked
count,641,641,641,135,641
unique,641,2,529,110,3
top,"Ohman, Miss. Velin",male,3101295,B96 B98,S
freq,1,407,5,3,455


#### Data Preprocessing

In [23]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 641 entries, 812 to 277
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    641 non-null    int64  
 1   name      641 non-null    object 
 2   gender    641 non-null    object 
 3   age       512 non-null    float64
 4   sibsp     641 non-null    int64  
 5   parch     641 non-null    int64  
 6   ticket    641 non-null    object 
 7   fare      641 non-null    float64
 8   cabin     135 non-null    object 
 9   embarked  641 non-null    object 
dtypes: float64(2), int64(3), object(5)
memory usage: 55.1+ KB


##### drop column

In [24]:
# Drop the name, ticket and cabin columns from every frame used by model1.
print(f'before: {train.shape} / {test.shape}')

drop_col = ['name', 'ticket', 'cabin']
for frame in (train, test, raw_te):
    frame.drop(columns=drop_col, inplace=True)

print(f'after: {train.shape} / {test.shape}')

before: (641, 10) / (275, 10)
after: (641, 7) / (275, 7)


In [25]:
train.columns

Index(['pclass', 'gender', 'age', 'sibsp', 'parch', 'fare', 'embarked'], dtype='object')

##### missing data

In [26]:
train.isnull().sum()

pclass        0
gender        0
age         129
sibsp         0
parch         0
fare          0
embarked      0
dtype: int64

In [27]:
test.isnull().sum()

pclass       0
gender       0
age         51
sibsp        0
parch        0
fare         0
embarked     1
dtype: int64

In [28]:
raw_te.isnull().sum()

pclass       0
gender       0
age         83
sibsp        0
parch        0
fare         1
embarked     1
dtype: int64

age, embarked, fare에 결측치 존재

In [29]:
train[['age','embarked','fare']].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 641 entries, 812 to 277
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       512 non-null    float64
 1   embarked  641 non-null    object 
 2   fare      641 non-null    float64
dtypes: float64(2), object(1)
memory usage: 20.0+ KB


- 결측치 대체 (train 데이터 기준) - age : mean, embarked : mode, fare : median

In [30]:
#train data 기준
age_mean = train['age'].mean()
embarked_mode = train['embarked'].mode().values[0]
fare_median = train['fare'].median()

age_mean,embarked_mode,fare_median

(30.34408203125, 'S', 14.4)

In [31]:
# Impute missing values with statistics computed on the training split only.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on a column selection: that chained in-place form is
# deprecated in pandas 2.x and leaves the parent frame unchanged under
# copy-on-write.
fill_values = {
    'age': age_mean,
    'embarked': embarked_mode,
    'fare': fare_median,
}
for frame in (train, test, raw_te):
    for col, value in fill_values.items():
        frame[col] = frame[col].fillna(value)

# Remaining null counts per column for each frame (expected to be all zero).
train.isnull().sum(), test.isnull().sum(), raw_te.isnull().sum()

(pclass      0
 gender      0
 age         0
 sibsp       0
 parch       0
 fare        0
 embarked    0
 dtype: int64,
 pclass      0
 gender      0
 age         0
 sibsp       0
 parch       0
 fare        0
 embarked    0
 dtype: int64,
 pclass      0
 gender      0
 age         0
 sibsp       0
 parch       0
 fare        0
 embarked    0
 dtype: int64)

##### data encoding

In [32]:
from sklearn.preprocessing import OneHotEncoder

In [33]:
train.describe(exclude=np.number)

Unnamed: 0,gender,embarked
count,641,641
unique,2,3
top,male,S
freq,407,455


In [34]:
enc_cols = ['gender','embarked']
normal_cols = train.columns.difference(['gender','embarked'])
#normal_cols = list(set(train.columns)-set(enc_cols))

In [35]:
def _one_hot_frame(encoder, frame, cat_cols, num_cols, fit=False):
    """Return a copy of ``frame`` with ``cat_cols`` one-hot encoded.

    Args:
        encoder: A OneHotEncoder; fitted on ``frame`` when ``fit`` is True,
            otherwise it must already be fitted.
        frame: DataFrame containing both ``cat_cols`` and ``num_cols``.
        cat_cols: Columns to one-hot encode.
        num_cols: Columns copied through unchanged.
        fit: Fit the encoder on ``frame`` before transforming.

    Returns:
        DataFrame with a fresh 0..n-1 index: ``num_cols`` first, followed by
        the indicator columns named by ``encoder.get_feature_names_out()``.
    """
    if fit:
        matrix = encoder.fit_transform(frame[cat_cols])
    else:
        matrix = encoder.transform(frame[cat_cols])
    encoded = pd.DataFrame(
        matrix.toarray(),
        columns=encoder.get_feature_names_out(),
    )
    # The index is reset so the positional concat lines up row-for-row.
    return pd.concat([frame[num_cols].reset_index(drop=True), encoded], axis=1)


print(f'before: {train.shape} / {test.shape}')

# handle_unknown='ignore' encodes a category that was not seen in the training
# split as an all-zero row instead of raising at transform time.
enc = OneHotEncoder(handle_unknown='ignore')

# Fit on the training split only; the validation and submission frames reuse
# the fitted categories.
enc_tr = _one_hot_frame(enc, train, enc_cols, normal_cols, fit=True)
enc_te = _one_hot_frame(enc, test, enc_cols, normal_cols)
enc_raw_te = _one_hot_frame(enc, raw_te, enc_cols, normal_cols)

print(f'after: {enc_tr.shape} / {enc_te.shape}')
enc_tr.head()

before: (641, 7) / (275, 7)
after: (641, 10) / (275, 10)


Unnamed: 0,age,fare,parch,pclass,sibsp,gender_female,gender_male,embarked_C,embarked_Q,embarked_S
0,22.0,7.775,0,3,0,1.0,0.0,0.0,0.0,1.0
1,21.0,7.8208,0,3,0,0.0,1.0,0.0,1.0,0.0
2,32.0,7.8542,0,3,0,0.0,1.0,0.0,0.0,1.0
3,11.0,18.7875,0,3,0,0.0,1.0,1.0,0.0,0.0
4,30.0,8.05,0,3,0,0.0,1.0,0.0,0.0,1.0


#### Training

In [36]:
enc_tr.shape, enc_te.shape, enc_raw_te.shape

((641, 10), (275, 10), (393, 10))

In [37]:
from sklearn.tree import DecisionTreeClassifier

In [38]:
model1 = DecisionTreeClassifier(random_state=args.random_state)     #base model 

print(f'{enc_tr.shape} / {y_tr.shape}')
model1.fit(enc_tr, y_tr)

(641, 10) / (641,)


DecisionTreeClassifier(random_state=21)

#### Evaluation

In [39]:
score_tr = model1.score(enc_tr, y_tr)    # 학습한 내용에서 나온경우
score_te = model1.score(enc_te, y_te)    # 학습하지 않은 내용에서 나온 경우

score_tr, score_te  

(0.982839313572543, 0.7818181818181819)

##### ROC Curve

In [40]:
from sklearn.metrics import roc_curve, auc 

In [41]:
# Score the validation split with predicted probabilities and compute the area
# under the ROC curve from them.
y_pred = model1.predict_proba(enc_te)[:,1]   # second predict_proba column: a probability between 0 and 1 (presumably P(survived=1)); this is the kind of value that goes into the submission file
fpr, tpr, thresholds = roc_curve(y_te,y_pred)
auc_te = auc(fpr, tpr)
print(f'model: {auc_te}')

model: 0.7613866396761133


In [42]:
raw_te_pred = model1.predict_proba(enc_raw_te)[:,1]
raw_te_pred.shape

(393,)

In [43]:
model1.feature_importances_

array([0.16750426, 0.19441649, 0.00751647, 0.06597642, 0.04243071,
       0.49269336, 0.        , 0.0120556 , 0.        , 0.01740669])

In [44]:
enc_tr.columns

Index(['age', 'fare', 'parch', 'pclass', 'sibsp', 'gender_female',
       'gender_male', 'embarked_C', 'embarked_Q', 'embarked_S'],
      dtype='object')

In [45]:
# Pair each encoded feature name with the fitted tree's importance score and
# rank them from most to least important. The feature names end up in the
# 'index' column and the scores in column 0.
importance_by_feature = pd.DataFrame(
    model1.feature_importances_,
    index=enc_tr.columns,
)
df_feature_importances = (
    importance_by_feature
    .sort_values(by=[0], ascending=False)
    .reset_index()
)

print(f'{df_feature_importances.shape}')
df_feature_importances

(10, 2)


Unnamed: 0,index,0
0,gender_female,0.492693
1,fare,0.194416
2,age,0.167504
3,pclass,0.065976
4,sibsp,0.042431
5,embarked_S,0.017407
6,embarked_C,0.012056
7,parch,0.007516
8,gender_male,0.0
9,embarked_Q,0.0


In [46]:
# Record this run. The feature count is taken from the encoded matrix the
# model was actually fitted on; X_tr still contains the raw columns (including
# the dropped name/ticket/cabin), so its width only matches by coincidence.
n_features = enc_tr.shape[1]
args.results.append(
    {
        'model': 'model1',
        'score_tr': score_tr,
        'score_te': score_te,
        'auc_te': auc_te,
        # Predicted probabilities for the unlabeled submission rows.
        'ori_te_pred': raw_te_pred,
        'len_features': n_features,
        # Key spelling is kept as-is because the results table and the saved
        # JSON use this exact key.
        'feaute_importances': list(df_feature_importances['index'].values[:n_features]),
        'create_dt': '0217'
    }
)

args.results

[{'model': 'model1',
  'score_tr': 0.982839313572543,
  'score_te': 0.7818181818181819,
  'auc_te': 0.7613866396761133,
  'ori_te_pred': array([1.        , 1.        , 1.        , 0.        , 1.        ,
         1.        , 0.        , 0.125     , 1.        , 0.        ,
         0.        , 0.        , 1.        , 0.        , 1.        ,
         0.        , 1.        , 0.        , 0.        , 0.125     ,
         0.        , 0.        , 0.        , 0.        , 0.125     ,
         0.        , 0.        , 1.        , 0.        , 0.        ,
         1.        , 0.        , 0.        , 0.        , 0.        ,
         0.5       , 1.        , 0.        , 0.        , 0.        ,
         0.        , 1.        , 0.        , 1.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         1.        , 0.        , 1.        , 0.        , 0.        ,
         1.        , 0.        , 0.        , 0.        , 0.        ,
         1.        , 1.        , 1.  

### model2

In [84]:
train = X_tr.copy()
test = X_te.copy()
raw_te = raw_test.copy()

train.shape , test.shape ,raw_te.shape

((641, 10), (275, 10), (393, 10))

In [85]:
train.head()

Unnamed: 0,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
812,3,"Ohman, Miss. Velin",female,22.0,0,0,347085,7.775,,S
134,3,"Buckley, Mr. Daniel",male,21.0,0,0,330920,7.8208,,Q
390,3,"Jonsson, Mr. Carl",male,32.0,0,0,350417,7.8542,,S
305,3,"Hassan, Mr. Houssein G N",male,11.0,0,0,2699,18.7875,,C
203,3,"Corn, Mr. Harry",male,30.0,0,0,SOTON/OQ 392090,8.05,,S


In [None]:
# groupby() needs at least one key (calling it bare raises TypeError), so group
# on passenger class and show how many training rows fall in each class.
train.groupby('pclass').size()

##### drop column

In [86]:
# Drop the ticket and cabin columns from every frame used by model2; 'name' is
# kept at this point and removed in a later cell.
print(f'before: {train.shape} / {test.shape}')

drop_col = ['ticket', 'cabin']
for frame in (train, test, raw_te):
    frame.drop(columns=drop_col, inplace=True)

print(f'after: {train.shape} / {test.shape}')

before: (641, 10) / (275, 10)
after: (641, 8) / (275, 8)


##### missing data 

age : mean으로 대체 (train 기준) — random 대체는 아래 코드에 적용되어 있지 않음

In [87]:
train.isnull().sum()

pclass        0
name          0
gender        0
age         129
sibsp         0
parch         0
fare          0
embarked      0
dtype: int64

In [88]:
test.isnull().sum()

pclass       0
name         0
gender       0
age         51
sibsp        0
parch        0
fare         0
embarked     1
dtype: int64

In [89]:
raw_te.isnull().sum()

pclass       0
name         0
gender       0
age         83
sibsp        0
parch        0
fare         1
embarked     1
dtype: int64

In [90]:
train.columns

Index(['pclass', 'name', 'gender', 'age', 'sibsp', 'parch', 'fare',
       'embarked'],
      dtype='object')

In [134]:
#pclass별 fare 평균
pclass = train.groupby(['pclass']).agg({'fare':'mean'})
pclass

Unnamed: 0_level_0,fare
pclass,Unnamed: 1_level_1
1,84.02522
2,20.599437
3,13.259039


In [136]:
age_mean = train['age'].mean()
embarked_mode = train['embarked'].mode().values[0]
fare_mean = train['fare'].mean()

age_mean,embarked_mode,fare_mean

(30.34408203125, 'S', 31.654445709828376)

In [137]:
# Impute with training-split statistics. Each entry lists the column, its fill
# value and the frames that receive it. Values are assigned back to the column
# instead of using fillna(inplace=True) on a column selection, which is
# deprecated in pandas 2.x and leaves the parent frame unchanged under
# copy-on-write.
fill_plan = (
    ('age', age_mean, (train, test, raw_te)),
    ('embarked', embarked_mode, (test, raw_te)),
    ('fare', fare_mean, (raw_te,)),
)
for col, value, frames in fill_plan:
    for frame in frames:
        frame[col] = frame[col].fillna(value)

# Remaining null counts per column for each frame (expected to be all zero).
train.isnull().sum(), test.isnull().sum(), raw_te.isnull().sum()

(pclass      0
 name        0
 gender      0
 age         0
 sibsp       0
 parch       0
 fare        0
 embarked    0
 dtype: int64,
 pclass      0
 name        0
 gender      0
 age         0
 sibsp       0
 parch       0
 fare        0
 embarked    0
 dtype: int64,
 pclass      0
 name        0
 gender      0
 age         0
 sibsp       0
 parch       0
 fare        0
 embarked    0
 dtype: int64)

##### Feature Extraction

In [61]:
# dict_designation = {
#     'Mr': '남성',
#     'Master': '남성',
#     'Miss': '미혼 여성',
#     'Ms': '미혼/기혼 여성'
# }
# def add_designation(name): # 호칭 함수
#   designation = "unknown"
#   for key in dict_designation.keys():
#     if key in name:
#       designation = key
#       break
#   return designation

# train['designation'] = train['name'].map(lambda x: add_designation(x))
# test['designation'] = test['name'].map(lambda x: add_designation(x))
# raw_te['designation'] = raw_te['name'].map(lambda x: add_designation(x))

# train.head()

Unnamed: 0,pclass,name,gender,age,sibsp,parch,fare,embarked,designation
812,3,"Ohman, Miss. Velin",female,22.0,0,0,7.775,S,Miss
134,3,"Buckley, Mr. Daniel",male,21.0,0,0,7.8208,Q,Mr
390,3,"Jonsson, Mr. Carl",male,32.0,0,0,7.8542,S,Mr
305,3,"Hassan, Mr. Houssein G N",male,11.0,0,0,18.7875,C,Mr
203,3,"Corn, Mr. Harry",male,30.0,0,0,8.05,S,Mr


In [138]:
del train['name']
del test['name']
del raw_te['name']

In [139]:
train.columns

Index(['pclass', 'gender', 'age', 'sibsp', 'parch', 'fare', 'embarked'], dtype='object')

##### data encoding

In [144]:
train.describe(exclude=np.number)

Unnamed: 0,gender,embarked
count,641,641
unique,2,3
top,male,S
freq,407,455


In [145]:
#train.designation.value_counts()

In [146]:
enc_cols = ['gender','embarked']
normal_cols = train.columns.difference(['gender','embarked'])


In [147]:
#train.designation

In [148]:
print(f'before: {train.shape} / {test.shape}')

# Fit the encoder on the training split only. handle_unknown='ignore' encodes a
# category that was not seen during fitting as an all-zero row instead of
# raising at transform time.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(train[enc_cols])

# Apply the same transform to the train, validation and submission frames.
encoded_frames = []
for frame in (train, test, raw_te):
    dummies = pd.DataFrame(
        enc.transform(frame[enc_cols]).toarray(),
        columns=enc.get_feature_names_out(),
    )
    # The index is reset so the positional concat lines up row-for-row.
    encoded_frames.append(
        pd.concat([frame[normal_cols].reset_index(drop=True), dummies], axis=1)
    )
enc_tr, enc_te, enc_raw_te = encoded_frames

print(f'after: {enc_tr.shape} / {enc_te.shape}')
enc_tr.head()

before: (641, 7) / (275, 7)
after: (641, 10) / (275, 10)


Unnamed: 0,age,fare,parch,pclass,sibsp,gender_female,gender_male,embarked_C,embarked_Q,embarked_S
0,22.0,7.775,0,3,0,1.0,0.0,0.0,0.0,1.0
1,21.0,7.8208,0,3,0,0.0,1.0,0.0,1.0,0.0
2,32.0,7.8542,0,3,0,0.0,1.0,0.0,0.0,1.0
3,11.0,18.7875,0,3,0,0.0,1.0,1.0,0.0,0.0
4,30.0,8.05,0,3,0,0.0,1.0,0.0,0.0,1.0


#### Training

In [149]:
enc_tr.shape, enc_te.shape, enc_raw_te.shape

((641, 10), (275, 10), (393, 10))

In [150]:
from sklearn.tree import DecisionTreeClassifier

In [151]:
model2 = DecisionTreeClassifier(random_state=args.random_state)     
print(f'{enc_tr.shape} / {y_tr.shape}')
model2.fit(enc_tr, y_tr)

(641, 10) / (641,)


DecisionTreeClassifier(random_state=21)

#### Evaluation

In [152]:
score_tr = model2.score(enc_tr, y_tr)    # 학습한 내용에서 나온경우
score_te = model2.score(enc_te, y_te)    # 학습하지 않은 내용에서 나온 경우

score_tr, score_te  

(0.982839313572543, 0.7818181818181819)

## submission

In [73]:
df_results = pd.DataFrame(args.results).sort_values(by=['auc_te'], ascending=False)
df_results

Unnamed: 0,model,score_tr,score_te,auc_te,ori_te_pred,len_features,feaute_importances,create_dt
0,model1,0.982839,0.781818,0.761387,"[1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.125, 1.0...",10,"[gender_female, fare, age, pclass, sibsp, emba...",217


In [74]:
submission = pd.read_csv(args.default_submission_csv)
submission.head()

Unnamed: 0,passengerid,survived
0,916,0.5
1,917,0.5
2,918,0.5
3,919,0.5
4,920,0.5


In [75]:
# df_results is sorted by auc_te in descending order, so the best model is the
# first row by position. .loc[0] would instead pick the row whose index label
# is 0 (the first model appended), whatever its rank.
best_run = df_results.iloc[0]
submission['survived'] = best_run['ori_te_pred']
# Total count of missing values in the submission frame (expected 0).
print(f'{submission.isnull().sum().sum()}')
submission.head(10)

0


Unnamed: 0,passengerid,survived
0,916,1.0
1,917,1.0
2,918,1.0
3,919,0.0
4,920,1.0
5,921,1.0
6,922,0.0
7,923,0.125
8,924,1.0
9,925,0.0


In [76]:
submission.to_csv(args.submission_csv, header=True, index=False)

In [77]:
args.save_results

'model_results.json'

In [78]:
df_results.columns

Index(['model', 'score_tr', 'score_te', 'auc_te', 'ori_te_pred',
       'len_features', 'feaute_importances', 'create_dt'],
      dtype='object')

In [79]:
df_results.drop(['ori_te_pred'], axis=1, inplace=True) 

In [80]:
df_results.to_json(args.save_results, orient="records")

In [81]:
df_results.head()

Unnamed: 0,model,score_tr,score_te,auc_te,len_features,feaute_importances,create_dt
0,model1,0.982839,0.781818,0.761387,10,"[gender_female, fare, age, pclass, sibsp, emba...",217


In [82]:
import json

# Read the saved results back as plain Python objects. The encoding is given
# explicitly so decoding does not depend on the platform's locale codec.
with open(args.save_results, 'r', encoding='utf-8') as file:
    load_results = json.load(file)

load_results

[{'model': 'model1',
  'score_tr': 0.9828393136,
  'score_te': 0.7818181818,
  'auc_te': 0.7613866397,
  'len_features': 10,
  'feaute_importances': ['gender_female',
   'fare',
   'age',
   'pclass',
   'sibsp',
   'embarked_S',
   'embarked_C',
   'parch',
   'gender_male',
   'embarked_Q'],
  'create_dt': '0217'}]