# Titanic 풀어보기
Kaggle계의 Hello Wor

In [None]:
import pandas as pd
pd.options.mode.chained_assignment = None 

## 파일 읽기
test = pd.read_csv('./test.csv')
train = pd.read_csv('./train.csv')

## 트레이닝 데이터 확인하기
### 성별 확인

In [2]:
print('여자')
print(train["Survived"][train["Sex"] == 'female'].value_counts(normalize = True))
print('남자')
print(train["Survived"][train["Sex"] == 'male'].value_counts(normalize = True))

여자
1    0.742038
0    0.257962
Name: Survived, dtype: float64
남자
0    0.811092
1    0.188908
Name: Survived, dtype: float64


### 나이 확인

In [3]:
train["Child"] = 0;
train["Child"][train["Age"] < 18] = 1

print('아이들')
print(train["Survived"][train["Child"] == 1].value_counts(normalize = True))
print('성인')
print(train["Survived"][train["Child"] == 0].value_counts(normalize = True))

아이들
1    0.539823
0    0.460177
Name: Survived, dtype: float64
성인
0    0.638817
1    0.361183
Name: Survived, dtype: float64


## 예측
### 여자는 무조건 산다
In[2] 에서 여자의 생존확률이 더 높았기 때문

In [4]:
gender_test = test.copy()
gender_test["Survived"] = 0
gender_test["Survived"][gender_test["Sex"] == 'female'] = 1

pd.DataFrame(gender_test).to_csv('gender_predict.csv', columns=['PassengerId', 'Survived'], index=False)

### 아이들은 무조건 산다
In[3] 에서 아이들의 생존 확률이 더 높았기 때문

In [5]:
age_test = test.copy()
age_test["Child"] = 0
age_test["Child"][age_test["Age"] < 18] = 1

age_test["Survived"] = 0
age_test["Survived"][age_test["Child"] == 1] = 1

pd.DataFrame(gender_test).to_csv('age_predict.csv', columns=['PassengerId', 'Survived'], index=False)

## 예측의 질 높히기
결정트리 (decision tree)를 이용하여 질을 높혀보자

In [6]:
### 라이브러리

In [7]:
# Import the Numpy library
import numpy as np

# Import 'tree' from scikit-learn library
from sklearn import tree

### 피쳐 정리하기
- 결정트리가 결정하기 쉽도록 String을 integer로 변경하자
- 비어있는 값에 대해서도 처리하는 로직을 추가하자

In [8]:
# Convert the male and female groups to integer form
train["Sex"][train["Sex"] == "male"] = 0
train["Sex"][train["Sex"] == "female"] = 1

# Impute the Embarked variable
train["Embarked"] = train["Embarked"].fillna('S')

# Convert the Embarked classes to integer form
train["Embarked"][train["Embarked"] == "S"] = 0
train["Embarked"][train["Embarked"] == "C"] = 1
train["Embarked"][train["Embarked"] == "Q"] = 2

# NaN값이 많은 Age필드는 중간값으로 대충 넣어둔다
train["Age"] = train["Age"].fillna(train["Age"].median())

#Print the Sex and Embarked columns
print(train["Sex"])
print(train["Embarked"])

0      0
1      1
2      1
3      1
4      0
5      0
6      0
7      0
8      1
9      1
10     1
11     1
12     0
13     0
14     1
15     1
16     0
17     0
18     1
19     1
20     0
21     0
22     1
23     0
24     1
25     1
26     0
27     0
28     1
29     0
      ..
861    0
862    1
863    1
864    0
865    1
866    1
867    0
868    0
869    0
870    0
871    1
872    0
873    0
874    1
875    1
876    0
877    0
878    0
879    1
880    1
881    0
882    1
883    0
884    0
885    1
886    0
887    1
888    1
889    0
890    0
Name: Sex, Length: 891, dtype: object
0      0
1      1
2      0
3      0
4      0
5      2
6      0
7      0
8      0
9      1
10     0
11     0
12     0
13     0
14     0
15     0
16     2
17     0
18     0
19     1
20     0
21     0
22     2
23     0
24     0
25     0
26     1
27     0
28     2
29     0
      ..
861    0
862    0
863    0
864    0
865    0
866    1
867    0
868    0
869    0
870    0
871    0
872    0
873    0
874    1
875    1

### 결정트리 만들기
1. 학습에 쓸 피쳐들을 선정한다
2. `DecisionTreeClassifier()` 메소드로 트리를 만든다
3. `fit()` 메소드로 학습시킨다
4. `feature_importances_`는 각 피쳐들의 중요도를 리턴한다
5. `score`는 학습이 최종적으로 얼마나 잘 되었는지 점수로 나타낸다

In [9]:
# Print the train data to see the available features
print(train)

# Create the target and features numpy arrays: target, features_one
target = train["Survived"].values
features_one = train[["Pclass", "Sex", "Age", "Fare"]].values

# Fit your first decision tree: my_tree_one
my_tree_one = tree.DecisionTreeClassifier()
my_tree_one = my_tree_one.fit(X=features_one, y=target)

# Look at the importance and score of the included features
print(my_tree_one.feature_importances_)
print(my_tree_one.score(features_one, target))

     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
5              6         0       3   
6              7         0       1   
7              8         0       3   
8              9         1       3   
9             10         1       2   
10            11         1       3   
11            12         1       1   
12            13         0       3   
13            14         0       3   
14            15         0       3   
15            16         1       2   
16            17         0       3   
17            18         1       2   
18            19         0       3   
19            20         1       3   
20            21         0       2   
21            22         1       2   
22            23         1       3   
23            24         1       1   
24            25         0       3   
25          

## 테스트 해보기
### 테스트 데이터 정제하기
테스트 데이터 중 누락 값들을 채워넣자

In [10]:
# Impute the missing value with the median
test["Fare"] = test["Fare"].fillna(test["Fare"].median())

# Convert the male and female groups to integer form
test["Sex"][test["Sex"] == "male"] = 0
test["Sex"][test["Sex"] == "female"] = 1

# Impute the Embarked variable
test["Embarked"] = test["Embarked"].fillna('S')

test["Age"] = test["Age"].fillna(test["Age"].median())

# Convert the Embarked classes to integer form
test["Embarked"][test["Embarked"] == "S"] = 0
test["Embarked"][test["Embarked"] == "C"] = 1
test["Embarked"][test["Embarked"] == "Q"] = 2

### 테스트 돌려보기

In [11]:
# Extract the features from the test set: Pclass, Sex, Age, and Fare.
test_features = test[["Pclass","Sex", "Age", "Fare"]].values

# Make your prediction using the test set
my_prediction = my_tree_one.predict(X=test_features)

# Create a data frame with two columns: PassengerId & Survived. Survived contains your predictions
PassengerId = np.array(test["PassengerId"]).astype(int)
my_solution = pd.DataFrame(my_prediction, PassengerId, columns = ["Survived"])
print(my_solution)

      Survived
892          0
893          0
894          1
895          1
896          1
897          0
898          0
899          0
900          1
901          0
902          0
903          0
904          1
905          1
906          1
907          1
908          0
909          1
910          1
911          0
912          0
913          1
914          1
915          0
916          1
917          0
918          1
919          1
920          1
921          0
...        ...
1280         0
1281         0
1282         0
1283         1
1284         1
1285         0
1286         0
1287         1
1288         0
1289         1
1290         0
1291         0
1292         1
1293         0
1294         1
1295         0
1296         0
1297         0
1298         0
1299         0
1300         1
1301         1
1302         1
1303         1
1304         0
1305         0
1306         1
1307         0
1308         0
1309         0

[418 rows x 1 columns]


In [12]:
# Check that your data frame has 418 entries
print(my_solution.shape)

(418, 1)


### 파일로 출력해서 캐글에 올려보기
아래와 같이 csv로 내보내서 캐글에 올려보자.
점수는 굉장히 낮을 것이다ㅎㅎㅎ... 그 이유는 오버피팅때문

In [13]:
# Write your solution to a csv file with the name my_solution.csv
my_solution.to_csv("first_decision_tree.csv", index_label = ["PassengerId"])

## Overfitting 제거하기
트레이닝 데이터에 너무 과하게 맞춰져서 예측률이 떨어진 것이다!
Decision Tree의 설정 값들을 조절하여 오버피팅을 줄여보자.

-  [의사결정나무(Decision Tree)](https://ratsgo.github.io/machine%20learning/2017/03/26/tree/)

In [14]:
# Create a new array with the added features: features_two
features_two = train[["Pclass","Age","Sex","Fare", "SibSp", "Parch", "Embarked"]].values
print(train.describe())

#Control overfitting by setting "max_depth" to 10 and "min_samples_split" to 5 : my_tree_two
max_depth = 10
min_samples_split = 5
my_tree_two = tree.DecisionTreeClassifier(max_depth = max_depth, min_samples_split = min_samples_split, random_state = 1)
my_tree_two = my_tree_two.fit(features_two, target)

#Print the score of the new decison tree
print(my_tree_two.feature_importances_)
print(my_tree_two.score(features_two, target))

       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  891.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.361582    0.523008   
std     257.353842    0.486592    0.836071   13.019697    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   22.000000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   35.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare       Child  
count  891.000000  891.000000  891.000000  
mean     0.381594   32.204208    0.126824  
std      0.806057   49.693429    0.332962  
min      0.000000    0.000000    0.000000  
25%      0.000000    7.910400    0.000000  
50%      0.000000   14.454200    0.000000  
75%      0.000000   31.000000    0.000000  
max      6.000000

## Feature Engineering
데이터를 보고 가설을 하나 세우고 그 가설을 실험할 수 있도록 피쳐들을 조합해서 새로운 피쳐를 만드는 일련의 행위를 뜻한다.

이번에는 '가족의 단위가 크면 살아남기 힘들었을 것이다' 라는 가정을 가지고 학습을 시켜보자

In [15]:
# Create train_two with the newly defined feature
train_two = train.copy()
train_two["family_size"] = train_two["SibSp"] + train_two["Parch"] + 1

# Create a new feature set and add the new feature
features_three = train_two[["Pclass", "Sex", "Age", "Fare", "SibSp", "Parch", "family_size"]].values

# Define the tree classifier, then fit the model
my_tree_three = tree.DecisionTreeClassifier()
my_tree_three = my_tree_three.fit(features_three, target)

# Print the score of this decision tree
print(my_tree_three.score(features_three, target))

my_solution.to_csv("decision_tree_feature_engineering_family_size.csv", index_label = ["PassengerId"])

0.9797979797979798


이를 캐글에 업로드해보니 `first_decision_tree.csv` 보다는 점수가 올라간 것을 확인할 수 있었다.
하지만 여전히 이분법적으로 (성별/나이) 처리했을때보다 점수가 낮다.