<a href="https://colab.research.google.com/github/znddi/springboot2/blob/main/ensemble_stacking_bagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.datasets import fetch_openml
import pandas as pd

In [3]:
titanic = fetch_openml('titanic', version=1, as_frame=True)

In [4]:
titanic

{'data':       pclass                                             name     sex  \
 0          1                    Allen, Miss. Elisabeth Walton  female   
 1          1                   Allison, Master. Hudson Trevor    male   
 2          1                     Allison, Miss. Helen Loraine  female   
 3          1             Allison, Mr. Hudson Joshua Creighton    male   
 4          1  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female   
 ...      ...                                              ...     ...   
 1304       3                             Zabour, Miss. Hileni  female   
 1305       3                            Zabour, Miss. Thamine  female   
 1306       3                        Zakarian, Mr. Mapriededer    male   
 1307       3                              Zakarian, Mr. Ortin    male   
 1308       3                               Zimmerman, Mr. Leo    male   
 
           age  sibsp  parch  ticket      fare    cabin embarked boat   body  \
 0     29.0000      0 

In [5]:
type(titanic)

In [6]:
df = titanic.frame[['pclass', 'sex', 'age', 'fare', 'survived']].dropna()

In [7]:
df

Unnamed: 0,pclass,sex,age,fare,survived
0,1,female,29.0000,211.3375,1
1,1,male,0.9167,151.5500,1
2,1,female,2.0000,151.5500,0
3,1,male,30.0000,151.5500,0
4,1,female,25.0000,151.5500,0
...,...,...,...,...,...
1301,3,male,45.5000,7.2250,0
1304,3,female,14.5000,14.4542,0
1306,3,male,26.5000,7.2250,0
1307,3,male,27.0000,7.2250,0


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1045 entries, 0 to 1308
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   pclass    1045 non-null   int64   
 1   sex       1045 non-null   category
 2   age       1045 non-null   float64 
 3   fare      1045 non-null   float64 
 4   survived  1045 non-null   category
dtypes: category(2), float64(2), int64(1)
memory usage: 34.9 KB


In [9]:
df['sex'] = df['sex'].map({'male':0, 'female':1})

In [10]:
df

Unnamed: 0,pclass,sex,age,fare,survived
0,1,1,29.0000,211.3375,1
1,1,0,0.9167,151.5500,1
2,1,1,2.0000,151.5500,0
3,1,0,30.0000,151.5500,0
4,1,1,25.0000,151.5500,0
...,...,...,...,...,...
1301,3,0,45.5000,7.2250,0
1304,3,1,14.5000,14.4542,0
1306,3,0,26.5000,7.2250,0
1307,3,0,27.0000,7.2250,0


In [12]:
X = df[['pclass', 'sex', 'age', 'fare']]
y = df['survived']

In [13]:
print(X.shape, y.shape)

(1045, 4) (1045,)


In [14]:
from sklearn.ensemble import StackingClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [16]:
base_models = [
    ('lr', LogisticRegression()),
    ('dt', DecisionTreeClassifier()),
    ('svm', SVC())
]

stack = StackingClassifier(
    estimators=base_models,
    final_estimator=GradientBoostingClassifier()
)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [18]:
stack.fit(X_train, y_train)

In [19]:
pred = stack.predict(X_test)

In [20]:
print("Accuracy:", accuracy_score(y_test, pred))

Accuracy: 0.8133971291866029


In [21]:
# ---- stacking end

In [22]:
# ---- bagging start

In [23]:
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [24]:
# 1) 데이터 준비
titanic = fetch_openml('titanic', version=1, as_frame=True)

In [25]:
df = titanic.frame[['pclass', 'sex', 'age', 'fare', 'survived']].dropna()

In [26]:
df['sex'] = df['sex'].map({'male':0, 'female':1})

In [27]:
X = df[['pclass', 'sex', 'age', 'fare']]

In [28]:
y = df['survived']

In [29]:
# 2) 학습/테스트 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [30]:
# 3) 랜덤포레스트 모델 선언
rf = RandomForestClassifier()

In [31]:
rf.fit(X_train, y_train)

In [32]:
# 4) 예측 + 정확도
pred = rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, pred))

Accuracy: 0.8277511961722488
