In [138]:
import pandas as pd
import numpy as np

In [139]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Data 확인

In [140]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [141]:
train.info() #데이터 형태 확인

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [142]:
train.isnull().sum() #train셋의 결측값 확인

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [143]:
test.isnull().sum() #test셋의 결측값 확인

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [144]:
survived_by_sex = train[train['Survived']==1]["Sex"].value_counts() #성별에 따른 생존자
dead_by_sex = train[train['Survived']==0]["Sex"].value_counts() #성별에 따른 사망자

In [145]:
survived_by_sex

female    233
male      109
Name: Sex, dtype: int64

In [146]:
dead_by_sex

male      468
female     81
Name: Sex, dtype: int64

In [147]:
survived_by_pclass = train[train['Survived']==1]["Pclass"].value_counts() #좌석등급에 따른 생존자
dead_by_pclass = train[train['Survived']==0]["Pclass"].value_counts() #좌석등급에 따른 사망자

In [148]:
survived_by_pclass

1    136
3    119
2     87
Name: Pclass, dtype: int64

In [149]:
dead_by_pclass

3    372
2     97
1     80
Name: Pclass, dtype: int64

# feature engineering

In [150]:
train_test = [train, test]
for data in train_test:
    data['Title'] = data['Name'].str.extract(' ([A-za-z]+)\.', expand=False)

In [151]:
train['Title'].value_counts()

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Mlle          2
Major         2
Col           2
Countess      1
Capt          1
Ms            1
Sir           1
Lady          1
Mme           1
Don           1
Jonkheer      1
Name: Title, dtype: int64

In [152]:
title_mapping = {"Mr":0, "Miss":1, "Mrs": 2, "Master":0,
                "Dr":3, "Rev":3, "Mlle":3, "Major":3, "Col":3, "Countess":3, "Capt": 4, "Ms":1,
                "Sir":0, "Lady":1, "Mme":3, "Don":3, "Jonkheer":3}
for data in train_test:
    data['Title'] = data['Title'].map(title_mapping)

In [153]:
train.head() #Title이 추가된 상태

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [154]:
train.drop('Name', axis = 1, inplace = True)
test.drop('Name', axis = 1, inplace = True) #Name은 이제 필요없으니 드랍

In [155]:
sex_mapping = {'male': 0, 'female': 1} #성별을 수치형으로 변환
for data in train_test:
    data['Sex'] = data['Sex'].map(sex_mapping)

## 결측값 채우기

In [156]:
train["Age"].fillna(train.groupby('Title')["Age"].transform("median"), inplace = True)
test["Age"].fillna(train.groupby('Title')["Age"].transform("median"), inplace = True)

In [157]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
Title            0
dtype: int64

In [158]:
train['Embarked'].value_counts() #S가 제일 많으니까 빈곳은 S로 채워준다

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [159]:
train['Embarked'].fillna('S', inplace=True)

In [160]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
Title            0
dtype: int64

In [161]:
#Embarked도 숫자형 데이터로 변환
embarked_mapping = {"S":0, "C":1, "Q": 2}
for data in train_test:
    data['Embarked'] = data['Embarked'].map(embarked_mapping)

In [162]:
#티켓 가격을 각 선실의 등급별로 나눠서 등급마다 티켓 가격의 중간값으로 대체
train["Fare"].fillna(train.groupby('Pclass')["Fare"].transform("median"), inplace = True)
test["Fare"].fillna(train.groupby('Pclass')["Fare"].transform("median"), inplace = True)

### Cabin, Ticket은 딱히 생존과 관계 없을 것 같아 그냥 드랍하려고 한다

In [163]:
train.drop('Cabin', axis = 1, inplace = True)
test.drop('Cabin', axis = 1, inplace = True) #Name은 이제 필요없으니 드랍

In [164]:
train.drop('Ticket', axis = 1, inplace = True)
test.drop('Ticket', axis = 1, inplace = True) #Name은 이제 필요없으니 드랍

In [165]:
train.drop('PassengerId', axis = 1, inplace = True)#승객Id는 이제 필요없으니 드랍

In [166]:
train.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
Title       0
dtype: int64

In [167]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,22.0,1,0,7.25,0,0
1,1,1,1,38.0,1,0,71.2833,1,2
2,1,3,1,26.0,0,0,7.925,0,1
3,1,1,1,35.0,1,0,53.1,0,2
4,0,3,0,35.0,0,0,8.05,0,0


In [168]:
corr_matrix = train.corr()
corr_matrix["Survived"].sort_values(ascending = True)

Pclass     -0.338481
Age        -0.066127
SibSp      -0.035322
Parch       0.081629
Embarked    0.106811
Fare        0.257307
Title       0.441509
Sex         0.543351
Survived    1.000000
Name: Survived, dtype: float64

강의에서는 SibSp와 Parch를 합쳐서 Family Size를 정의하는데 상관관계를 보면 둘이 반대 성향을 띄므로 합치는건 좋지 않은 방법일 것 같아 합치지 않는다

In [169]:
test[test['Title'].isnull()]

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
414,1306,1,1,39.0,0,0,108.9,1,


In [170]:
test['Title'].fillna(2.0, inplace=True)

In [171]:
test.isnull().sum()

PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
Title          0
dtype: int64

In [172]:
train.head(20)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,22.0,1,0,7.25,0,0
1,1,1,1,38.0,1,0,71.2833,1,2
2,1,3,1,26.0,0,0,7.925,0,1
3,1,1,1,35.0,1,0,53.1,0,2
4,0,3,0,35.0,0,0,8.05,0,0
5,0,3,0,28.0,0,0,8.4583,2,0
6,0,1,0,54.0,0,0,51.8625,0,0
7,0,3,0,2.0,3,1,21.075,0,0
8,1,3,1,27.0,0,2,11.1333,0,2
9,1,2,1,14.0,1,0,30.0708,1,2


### 남겨둔 여지
수치의 정규화를 추가로 해야할까? 나이와 티켓가격의 범주가 다른 특성에 비해서 범위가 넓어서 일단은 한번 결과를 보고 추가로 정규화를 수행한 결과도 봐보겠다

## cross validation

In [173]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
k_fold = KFold(n_splits=10, shuffle=True, random_state=42)

# DecisionTree

In [174]:
from sklearn.tree import DecisionTreeClassifier

In [175]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,22.0,1,0,7.25,0,0
1,1,1,1,38.0,1,0,71.2833,1,2
2,1,3,1,26.0,0,0,7.925,0,1
3,1,1,1,35.0,1,0,53.1,0,2
4,0,3,0,35.0,0,0,8.05,0,0


In [176]:
train_data = train.drop("Survived", axis=1)
target = train['Survived']

In [177]:
clf = DecisionTreeClassifier()
scoring = 'accuracy'
score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)
print(round(np.mean(score)*100, 2))

[0.83333333 0.79775281 0.78651685 0.76404494 0.78651685 0.7752809
 0.74157303 0.7752809  0.7752809  0.83146067]
78.67


# Random Forest

In [178]:
from sklearn.ensemble import RandomForestClassifier

In [179]:
clf = RandomForestClassifier(n_estimators = 13)
scoring = 'accuracy'
score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)
print(round(np.mean(score)*100, 2))

[0.86666667 0.82022472 0.71910112 0.78651685 0.80898876 0.85393258
 0.76404494 0.82022472 0.79775281 0.86516854]
81.03


# SVC

In [180]:
from sklearn.svm import SVC

In [181]:
clf = SVC()
scoring = 'accuracy'
score = cross_val_score(clf, train_data, target, cv = k_fold, n_jobs=1, scoring=scoring)
print(score)
print(round(np.mean(score)*100, 2))

[0.7        0.62921348 0.6741573  0.75280899 0.68539326 0.68539326
 0.65168539 0.64044944 0.61797753 0.7752809 ]
68.12


# Testing

In [184]:
clf = RandomForestClassifier(n_estimators = 13)
clf.fit(train_data, target)
test_data = test.drop("PassengerId", axis=1).copy()
prediction = clf.predict(test_data)

In [187]:
submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": prediction
})
submission.to_csv("submission.csv", index=False)

In [188]:
submission = pd.read_csv("submission.csv")
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,0
