In [1]:
# Standard library first, then third-party packages.
import warnings

import numpy as np
import pandas as pd

# Silence every warning category for the rest of the session so that library
# notices do not clutter the cell outputs below.
warnings.filterwarnings("ignore")

## 데이터 불러오기

In [2]:
# Read the Titanic training and test CSVs from the local ./titanic folder
# (training file first, then test file).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("./titanic/train.csv", "./titanic/test.csv")
)

# Preview the first five training rows.
train_data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Show the column labels of the training frame to pick features from.
train_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

## 데이터셋 만들기

In [4]:
# Predictor columns shared by the training and test frames.
features = ["Pclass", "Sex", "SibSp", "Parch"]

# Full training matrix and the matching test matrix, then the target column.
X_train, x_test = train_data[features], test_data[features]
Y_train = train_data["Survived"]

In [5]:
# One-hot encode the non-numeric feature columns of the training matrix.
X_train = pd.get_dummies(X_train)

# Encoding the two frames independently can yield different column sets when a
# category value is absent from one of them. Align the test frame to the
# training columns: dummies missing from the test set are filled with 0,
# columns unseen during training are dropped, and the column order matches.
x_test = pd.get_dummies(x_test).reindex(columns=X_train.columns, fill_value=0)

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation. Stratifying on the target
# keeps the class ratio the same in both parts; the seed makes the split
# reproducible.
x_train, x_valid, y_train, y_valid = train_test_split(
    X_train,
    Y_train,
    test_size=0.2,
    stratify=Y_train,
    random_state=22,
)

# NOTE: there is deliberately no manual 'Sex' -> {female: 1, male: 0} mapping
# here. pd.get_dummies() has already replaced 'Sex' with indicator columns in
# both X_train and x_test, so x_train['Sex'] / x_test['Sex'] do not exist and
# indexing them raises KeyError. Adding a 'Sex' column to x_train/x_test only
# would also leave x_valid and the full X_train (used to fit the ensemble)
# with a different set of columns.

## 모델 학습

#### RandomForestClassifier

In [7]:
from sklearn.ensemble import RandomForestClassifier

# 100 trees capped at depth 5 with a fixed seed; fit() returns the estimator
# itself, so construction and training are chained.
rfmodel = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=1,
).fit(x_train, y_train)

# Score on the held-out validation split.
rfmodel.score(x_valid, y_valid)

0.8044692737430168

#### LGBMClassifier

In [8]:
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

lgbm = LGBMClassifier(n_estimators=200, learning_rate=0.1)

# The validation split is monitored during boosting.
evals = [(x_valid, y_valid)]

# `early_stopping_rounds=` and `verbose=` are no longer accepted by fit() in
# LightGBM 4.x (TypeError). The callbacks below are the supported equivalents
# (available since LightGBM 3.3): stop when the validation metric has not
# improved for 100 rounds, and log the metrics after every boosting round.
lgbm.fit(
    x_train,
    y_train,
    eval_metric='auc',
    eval_set=evals,
    callbacks=[
        early_stopping(stopping_rounds=100),
        log_evaluation(period=1),
    ],
)

# Score on the held-out validation split.
lgbm.score(x_valid, y_valid)

[1]	valid_0's auc: 0.853953	valid_0's binary_logloss: 0.625293
Training until validation scores don't improve for 100 rounds
[2]	valid_0's auc: 0.853953	valid_0's binary_logloss: 0.592298
[3]	valid_0's auc: 0.853953	valid_0's binary_logloss: 0.565443
[4]	valid_0's auc: 0.853953	valid_0's binary_logloss: 0.543275
[5]	valid_0's auc: 0.855731	valid_0's binary_logloss: 0.523451
[6]	valid_0's auc: 0.856785	valid_0's binary_logloss: 0.50643
[7]	valid_0's auc: 0.857839	valid_0's binary_logloss: 0.492187
[8]	valid_0's auc: 0.861001	valid_0's binary_logloss: 0.480001
[9]	valid_0's auc: 0.85863	valid_0's binary_logloss: 0.470512
[10]	valid_0's auc: 0.861001	valid_0's binary_logloss: 0.461388
[11]	valid_0's auc: 0.85863	valid_0's binary_logloss: 0.45449
[12]	valid_0's auc: 0.854677	valid_0's binary_logloss: 0.448653
[13]	valid_0's auc: 0.858103	valid_0's binary_logloss: 0.44254
[14]	valid_0's auc: 0.858169	valid_0's binary_logloss: 0.437311
[15]	valid_0's auc: 0.859223	valid_0's binary_logloss: 0

0.8435754189944135

#### ExtraTreesClassifier

In [9]:
from sklearn.ensemble import ExtraTreesClassifier

# Same size, depth and seed settings as the random forest cell.
etree = ExtraTreesClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=1,
).fit(x_train, y_train)

# Score on the held-out validation split.
etree.score(x_valid, y_valid)

0.7988826815642458

#### GradientBoostingClassifier

In [10]:
from sklearn.ensemble import GradientBoostingClassifier

# Library-default hyperparameters; fit() returns the fitted estimator.
gboost = GradientBoostingClassifier().fit(x_train, y_train)

# Score on the held-out validation split.
gboost.score(x_valid, y_valid)

0.7877094972067039

### Voting

In [11]:
from sklearn.ensemble import VotingClassifier

# (name, estimator) pairs that take part in the vote.
base_estimators = [
    ('rfc', rfmodel),
    ('lgbc', lgbm),
    ('extc', etree),
    ('gbc', gboost),
]

# Soft-voting ensemble trained on the full training set (X_train / Y_train),
# not only on the x_train split used for the validation scores above.
votingC = VotingClassifier(
    estimators=base_estimators,
    voting='soft',
    n_jobs=4,
).fit(X_train, Y_train)

## 예측 및 제출

In [12]:
# Class labels predicted by the voting ensemble for every test row.
predictions = votingC.predict(X=x_test)
predictions

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [13]:
# Load the sample submission file; its PassengerId column is reused as-is.
submit = pd.read_csv(filepath_or_buffer="./titanic/gender_submission.csv")
submit

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [14]:
# Replace the template's Survived column with the ensemble's predictions.
submit = submit.assign(Survived=predictions)
submit

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [None]:
# Write the submission file without the DataFrame index column.
submit.to_csv(path_or_buf='submission1.csv', index=False)