In [2]:
from pycaret.classification import *

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [7]:
# 데이터셋 불러오기

titanic = pd.read_csv('week4/train.csv')
titanic.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [8]:
# 결측치 확인

titanic.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

### 참고한 Kaggle Notebook: [여기](https://www.kaggle.com/ldfreeman3/a-data-science-framework-to-achieve-99-accuracy)

In [49]:
# 결측치 채우기

titanic['Age'].fillna(titanic['Age'].median(), inplace = True) # 중위수
titanic['Embarked'].fillna(titanic['Embarked'].mode()[0], inplace = True) # 최빈값
titanic['Fare'].fillna(titanic['Fare'].median(), inplace = True) # 중위수

In [50]:
# 의미없는 column 제거

drop_column = ['PassengerId', 'Cabin', 'Ticket']
titanic.drop(drop_column, axis=1, inplace = True)

In [53]:
# 새로운 feature 도출

# 총 가족 구성원 수
titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch'] + 1
# 혼자 탑승했는지 아닌지
titanic['IsAlone'] = 1 
titanic['IsAlone'].loc[titanic['FamilySize'] > 1] = 0

In [54]:
# AutoML

clf1 = setup(data = titanic, target = 'Survived', ignore_features=['Name'],\
            numeric_features=['SibSp', 'Parch', 'FamilySize'])

Unnamed: 0,Description,Value
0,session_id,152
1,Target,Survived
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(891, 11)"
5,Missing Values,False
6,Numeric Features,5
7,Categorical Features,4
8,Ordinal Features,False
9,High Cardinality Features,False


In [55]:
top3 = compare_models(n_select = 3)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8187,0.852,0.7183,0.7847,0.7465,0.6063,0.6108,0.057
ada,Ada Boost Classifier,0.8059,0.843,0.7399,0.7469,0.7412,0.5861,0.5883,0.059
ridge,Ridge Classifier,0.7946,0.0,0.6797,0.7593,0.7111,0.5533,0.5607,0.009
lda,Linear Discriminant Analysis,0.7946,0.8434,0.6797,0.7593,0.7111,0.5533,0.5607,0.012
lightgbm,Light Gradient Boosting Machine,0.793,0.8504,0.6969,0.737,0.7119,0.5515,0.5558,0.149
lr,Logistic Regression,0.7914,0.8434,0.6712,0.7562,0.7057,0.5458,0.5526,0.66
rf,Random Forest Classifier,0.7882,0.8419,0.7134,0.7242,0.7142,0.5466,0.5511,0.155
et,Extra Trees Classifier,0.7592,0.8124,0.7089,0.6758,0.6881,0.4928,0.4972,0.143
nb,Naive Bayes,0.748,0.7957,0.6491,0.6696,0.6578,0.4587,0.4599,0.009
dt,Decision Tree Classifier,0.7385,0.7305,0.662,0.6545,0.6538,0.4445,0.4486,0.011


In [56]:
top3[0]

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=152, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [59]:
gbc = finalize_model(top3[0])

In [63]:
# 테스트셋 불러오기

test_data = pd.read_csv('week4/test.csv')
test_data

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [64]:
# 트레인셋과 같은 처리

drop_column = ['PassengerId', 'Cabin', 'Ticket']
test_data.drop(drop_column, axis=1, inplace = True)

# 새로운 feature 도출

test_data['FamilySize'] = test_data['SibSp'] + test_data['Parch'] + 1
test_data['IsAlone'] = 1 
test_data['IsAlone'].loc[test_data['FamilySize'] > 1] = 0

In [65]:
predictions = predict_model(gbc, data=test_data)

In [66]:
predictions

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone,Label,Score
0,3,"Kelly, Mr. James",male,34.5,0,0,7.8292,Q,1,1,0,0.9522
1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,7.0000,S,2,0,0,0.6015
2,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,9.6875,Q,1,1,0,0.6929
3,3,"Wirz, Mr. Albert",male,27.0,0,0,8.6625,S,1,1,0,0.8287
4,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,12.2875,S,3,0,0,0.6239
...,...,...,...,...,...,...,...,...,...,...,...,...
413,3,"Spector, Mr. Woolf",male,,0,0,8.0500,S,1,1,0,0.8829
414,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,108.9000,C,1,1,1,0.9601
415,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,7.2500,S,1,1,0,0.9554
416,3,"Ware, Mr. Frederick",male,,0,0,8.0500,S,1,1,0,0.8829


In [68]:
sample_submssion = pd.read_csv('submission2.csv')
# sample_submssion['PassengerId'] = test_data['PassengerId']
sample_submssion['Survived'] = predictions['Label']
sample_submssion.to_csv("submission3.csv", index = False)

In [48]:
sample_submssion

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
