In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
df = pd.read_csv('./ml_data/titanic_data.csv')
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [7]:
df.drop(['PassengerId', 'Cabin', 'Name', 'SibSp', 'Parch', 'Ticket'], axis=1, inplace=True)

In [9]:
df['Age'] = df.Age.fillna(df.Age.mean())

In [10]:
df.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
Fare        0
Embarked    2
dtype: int64

In [11]:
df.dropna(inplace=True)

In [12]:
y = df['Survived']

In [15]:
x = df.drop('Survived', axis=1)

In [17]:
x = pd.get_dummies(x)
x.head()

Unnamed: 0,Pclass,Age,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,22.0,7.25,0,1,0,0,1
1,1,38.0,71.2833,1,0,1,0,0
2,3,26.0,7.925,1,0,0,0,1
3,1,35.0,53.1,1,0,0,0,1
4,3,35.0,8.05,0,1,0,0,1


In [26]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3, random_state=111)

In [18]:
# VotindClassifier

In [20]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from sklearn.ensemble import VotingClassifier

In [22]:
tree = DecisionTreeClassifier()
log = LogisticRegression()
knn = KNeighborsClassifier()
nb = GaussianNB()
svc = SVC(probability=True)   # predict_proba() : 확률값을 출력가능하도록 하는 옵션..

vot_h = VotingClassifier(estimators = [('log', log), ('tree', tree), ('knn', knn), ('nb', nb), \
                               ('svc', svc)], voting='hard')

vot_s = VotingClassifier(estimators = [('log', log), ('tree', tree), ('knn', knn), ('nb', nb), \
                               ('svc', svc)], voting='soft')

In [23]:
models = [tree, log, knn, nb, svc, vot_h, vot_s]

In [30]:
import warnings
warnings.filterwarnings(action='ignore')

In [31]:
from sklearn.metrics import accuracy_score

In [32]:
for m in models:
    m.fit(x_train, y_train)
    preds = m.predict(x_test)
    acc = accuracy_score(y_test, preds)
    print(m.__class__.__name__, ':', acc)
    print('------------------------')

DecisionTreeClassifier : 0.7940074906367042
------------------------
LogisticRegression : 0.8389513108614233
------------------------
KNeighborsClassifier : 0.700374531835206
------------------------
GaussianNB : 0.8127340823970037
------------------------
SVC : 0.6629213483146067
------------------------
VotingClassifier : 0.8426966292134831
------------------------
VotingClassifier : 0.846441947565543
------------------------


In [33]:
from sklearn.datasets import load_breast_cancer

In [34]:
cancer = load_breast_cancer()

In [36]:
xc = cancer.data
yc = cancer.target

In [37]:
xc_train, xc_test, yc_train, yc_test = train_test_split(xc, yc, test_size=0.2, random_state=77)

In [38]:
for m in models:
    m.fit(xc_train, yc_train)
    preds = m.predict(xc_test)
    acc = accuracy_score(yc_test, preds)
    print(m.__class__.__name__, ':', acc)
    print('------------------------')

DecisionTreeClassifier : 0.9385964912280702
------------------------
LogisticRegression : 0.9385964912280702
------------------------
KNeighborsClassifier : 0.9385964912280702
------------------------
GaussianNB : 0.9298245614035088
------------------------
SVC : 0.9122807017543859
------------------------
VotingClassifier : 0.9473684210526315
------------------------
VotingClassifier : 0.9385964912280702
------------------------


In [39]:
vot_h.fit(x_train, y_train)

In [40]:
import pickle

In [41]:
with open('vot_har_cls.pickle', 'wb') as f:
    pickle.dump(vot_h, f)

In [42]:
with open('vot_har_cls.pickle', 'rb') as f:
    vot_model = pickle.load(f)

In [43]:
vot_model.predict(x_test)

array([1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 1], dtype=int64)

In [45]:
# bagging 

In [46]:
from sklearn.ensemble import BaggingClassifier

In [47]:
bag = BaggingClassifier()

In [48]:
bag.fit(x_train, y_train)

In [49]:
preds_bag = bag.predict(x_test)

In [50]:
accuracy_score(y_test, preds_bag)

0.7902621722846442

In [54]:
bag = BaggingClassifier()
bag.fit(xc_train, yc_train)
preds_bag_c = bag.predict(xc_test)
accuracy_score(yc_test, preds_bag_c)

0.9385964912280702

In [55]:
# randomforest

In [56]:
from sklearn.ensemble import RandomForestClassifier

In [61]:
rfc = RandomForestClassifier(n_estimators=500)

In [62]:
rfc.fit(x_train, y_train)
preds = rfc.predict(x_test)
accuracy_score(y_test, preds)

0.8239700374531835

In [63]:
rfc = RandomForestClassifier(n_estimators=500)
rfc.fit(xc_train, yc_train)
preds = rfc.predict(xc_test)
accuracy_score(yc_test, preds)

0.9385964912280702

In [64]:
# boosting : AdaBoost    
# - 모델이 순차적으로 생성되므로 학습속도가 느리다..
# - 앞 모델이 틀린 데이터를 다음 모델이 학습을 통해서 예측하도록 학습..

In [65]:
from sklearn.ensemble import AdaBoostClassifier

In [72]:
ada = AdaBoostClassifier(n_estimators = 1000)

In [73]:
ada.fit(x_train, y_train)
preds = ada.predict(x_test)
accuracy_score(y_test, preds)

0.8052434456928839

In [75]:
ada = AdaBoostClassifier(n_estimators = 1000)
ada.fit(xc_train, yc_train)
preds = ada.predict(xc_test)
accuracy_score(yc_test, preds)

0.9649122807017544

In [None]:
# boosting : GradientBoosting  

# - 모델이 순차적으로 생성되므로 학습속도가 느리다..
# - 앞 모델의 확률오차 크기를 다음 모델이 예측하도록 학습한다..
# - 모델의 예측확률값을 더하면 예측오차가 보정이 된다..

In [76]:
from sklearn.ensemble import GradientBoostingClassifier

In [80]:
grad = GradientBoostingClassifier(n_estimators = 2000)

In [81]:
grad.fit(x_train, y_train)
preds = grad.predict(x_test)
accuracy_score(y_test, preds)

0.8277153558052435

In [82]:
grad = GradientBoostingClassifier(n_estimators = 2000)
grad.fit(xc_train, yc_train)
preds = grad.predict(xc_test)
accuracy_score(yc_test, preds)

0.9473684210526315

In [None]:
# XGBoostClassifier
# LGBMClassifier

# gradientboosting의 속도를 개선한 알고리즘
# 성능 면에서도 몇 가지 보완을 해서 나온 알고리즘

In [83]:
# anaconda prompt에서
# pip install xgboost
# pip install lightgbm

In [85]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [86]:
xgb = XGBClassifier()
xgb.fit(x_train, y_train)
preds = xgb.predict(x_test)
accuracy_score(y_test, preds)

0.8239700374531835

In [87]:
xgb = XGBClassifier()
xgb.fit(xc_train, yc_train)
preds = xgb.predict(xc_test)
accuracy_score(yc_test, preds)

0.9473684210526315

In [88]:
lgbm = LGBMClassifier()
lgbm.fit(x_train, y_train)
preds = lgbm.predict(x_test)
accuracy_score(y_test, preds)

0.8164794007490637

In [89]:
lgbm = LGBMClassifier()
lgbm.fit(xc_train, yc_train)
preds = lgbm.predict(xc_test)
accuracy_score(yc_test, preds)

0.9649122807017544

In [90]:
from sklearn.metrics import classification_report, roc_auc_score

In [92]:
print(classification_report(yc_test, preds))

              precision    recall  f1-score   support

           0       0.97      0.92      0.95        39
           1       0.96      0.99      0.97        75

    accuracy                           0.96       114
   macro avg       0.97      0.95      0.96       114
weighted avg       0.97      0.96      0.96       114



In [97]:
lgbm.predict_proba(xc_test)[:, 1]

array([9.99976213e-01, 2.66332654e-05, 9.99968441e-01, 2.69773241e-05,
       3.77579594e-05, 8.13221730e-05, 9.98591572e-01, 9.98624529e-01,
       9.96005462e-01, 5.41450820e-05, 5.57835418e-05, 9.30653997e-01,
       9.99893358e-01, 9.99978885e-01, 9.99983656e-01, 9.99976069e-01,
       9.99958459e-01, 9.99938325e-01, 9.99948875e-01, 9.99970394e-01,
       2.61425996e-05, 8.19264237e-01, 4.62599770e-05, 6.66469650e-04,
       9.99979029e-01, 9.99903564e-01, 9.99958600e-01, 9.99973252e-01,
       9.99981929e-01, 3.03624679e-05, 9.99953985e-01, 3.92555788e-01,
       9.99978979e-01, 1.82772581e-04, 9.99981279e-01, 9.99902660e-01,
       2.74287200e-05, 4.07182882e-05, 5.69585631e-05, 9.99160410e-01,
       9.99974827e-01, 3.49797072e-05, 8.71042309e-05, 9.98771444e-01,
       9.99845027e-01, 9.99981489e-01, 4.09028578e-04, 2.31698407e-04,
       9.99974417e-01, 9.99967923e-01, 9.97720460e-01, 1.30381978e-04,
       9.99979481e-01, 1.45008307e-02, 9.83158756e-01, 9.99909564e-01,
      

In [99]:
roc_auc_score(yc_test, lgbm.predict_proba(xc_test)[:, 1])

0.9887179487179487