In [1]:
# 모듈 불러오기
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
import pandas as pd

# Load the Titanic training set from the project-relative data directory and
# preview its first five rows.
titanic_df = pd.read_csv(filepath_or_buffer="../../data/titanic_train.csv")
titanic_df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:

# Preprocessing: impute missing values, label-encode the categorical columns,
# drop the columns that are not used as features, and create a hold-out split.
#
# NOTE(review): the imputation mean and both label encoders are fitted on the
# full frame before the train/test split, so test rows influence them. That is
# acceptable for a training-speed benchmark but should be revisited before any
# score from this data is reported.
# NOTE: this cell rebinds `titanic_df` and drops columns from it, so it only
# works once per load; re-run the loading cell before re-running this one.

# Compute the mean once and reuse it for the imputation.
mean_data = titanic_df["Age"].mean()
titanic_df["Age"] = titanic_df["Age"].fillna(mean_data)
# "N" marks a missing embarkation port so the encoder sees it as its own class.
# ("Cabin" is not imputed because it is dropped below.)
titanic_df["Embarked"] = titanic_df["Embarked"].fillna("N")

# The fitted encoders are kept so the integer codes can be mapped back later.
le_Sex = LabelEncoder()
titanic_df["Sex"] = le_Sex.fit_transform(titanic_df["Sex"])

le_Embarked = LabelEncoder()
titanic_df["Embarked"] = le_Embarked.fit_transform(titanic_df["Embarked"])

# Columns that are excluded from the feature matrix.
drop_feature = ["PassengerId", "Name", "Ticket", "Cabin"]
titanic_df = titanic_df.drop(columns=drop_feature)

# Target vector and feature matrix.
y = titanic_df["Survived"]
X = titanic_df.drop(columns=["Survived"])

# 80/20 split with a fixed seed so the benchmark is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)

In [4]:
# from xgboost import XGBClassifier
import xgboost as xgb  # GPU를 사용하기 위해 xgboost에서 사용하는 본래 기능을 사용해야 함
import time  # CPU 사용시와 GPU 사용시의 속도 차이 확인

# Wrap the splits in xgb.DMatrix, the container that xgb.train() consumes.
# A separate object is built for the CPU run and for the GPU run; both are
# constructed from exactly the same inputs.
dtrain_cpu, dtrain_gpu = (xgb.DMatrix(X_train, label=y_train) for _ in range(2))
dtest_cpu, dtest_gpu = (xgb.DMatrix(X_test, label=y_test) for _ in range(2))

# The recorded output of this comparison is False even though both objects
# were built from the same data — presumably DMatrix compares by identity
# rather than by content (confirm against the XGBoost docs).
dtrain_cpu == dtrain_gpu


False

In [5]:
# Booster parameters. Both runs share the same learning settings so that the
# only difference between them is where training executes.
param_cpu = dict(
    objective="binary:logistic",
    learning_rate=0.1,
    max_depth=5,
    eval_metric="logloss",
)

# The GPU variant is the CPU configuration plus two extra keys.
# Per the XGBoost parameter docs, "device": "cuda" is what selects the GPU;
# "tree_method": "hist" chooses the histogram-based tree construction algorithm.
param_gpu = {
    **param_cpu,
    "device": "cuda",
    "tree_method": "hist",
}

In [6]:
# Train on the CPU and report how long training took.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock intended for measuring short durations, so the
# sub-second timing cannot be skewed by system clock adjustments.
start_time = time.perf_counter()
model_cpu = xgb.train(
    params=param_cpu,
    dtrain=dtrain_cpu,
    num_boost_round=100
)
end_time = time.perf_counter()

print("cpu 학습시간:", end_time - start_time)

cpu 학습시간: 0.08660125732421875


In [7]:
# Train on the GPU and report how long training took.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock intended for measuring short durations.
start_time = time.perf_counter()
model_gpu = xgb.train(
    params=param_gpu,
    dtrain=dtrain_gpu,
    num_boost_round=100
)
end_time = time.perf_counter()

print("gpu 학습시간:", end_time - start_time)

# On a dataset this small the GPU run takes longer than the CPU run.
# NOTE(review): the likely cause is fixed GPU overhead — copying the data to
# the device and, because this is the first GPU call, presumably one-off CUDA
# start-up as well. Time a second run to confirm how much is start-up cost.

gpu 학습시간: 0.5666306018829346


In [8]:
# Repeat the experiment on a larger dataset: load the credit-card data from
# the project-relative data directory and preview its first five rows.
df_credit = pd.read_csv(filepath_or_buffer="../../data/creditcard.csv")
df_credit.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [9]:
from imblearn.over_sampling import SMOTE

# Preprocessing for the credit-card data: separate the target from the
# features, hold out 20% of the rows, then upsample the training portion.
# (The features/target extraction and the split used to be performed twice
# with identical arguments; they are now done once. A mid-cell
# `y_train.value_counts()` whose result was never displayed was removed too.)
X = df_credit.drop(columns="Class")
y = df_credit["Class"]

# Fixed seed so the split, and therefore the benchmark input, is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)

# Upsample with SMOTE using the training split only; the held-out rows are
# left untouched.
smote = SMOTE(random_state=11)
X_usd, y_usd = smote.fit_resample(X_train, y_train)

In [10]:
# Wrap the upsampled training data in xgb.DMatrix, the container that
# xgb.train() consumes. One object is built per run (CPU and GPU), both from
# exactly the same inputs.
dtrain_cpu, dtrain_gpu = (xgb.DMatrix(X_usd, label=y_usd) for _ in range(2))


In [11]:
def _timed_train(params, dtrain, num_boost_round=100):
    """Train an XGBoost booster and measure how long training takes.

    Args:
        params: Parameter dict forwarded to ``xgb.train``.
        dtrain: Training ``xgb.DMatrix`` forwarded to ``xgb.train``.
        num_boost_round: Number of boosting rounds (100 by default).

    Returns:
        A ``(booster, elapsed_seconds)`` tuple, where ``elapsed_seconds`` is
        the duration of the ``xgb.train`` call.
    """
    # perf_counter() is monotonic and high-resolution, unlike time.time(),
    # so it is the appropriate clock for timing a code section.
    started = time.perf_counter()
    booster = xgb.train(params=params, dtrain=dtrain, num_boost_round=num_boost_round)
    return booster, time.perf_counter() - started


# Train on the CPU and report the elapsed time.
model_cpu, cpu_seconds = _timed_train(param_cpu, dtrain_cpu)
print("cpu 학습시간:", cpu_seconds)

# Train on the GPU with otherwise identical settings and report the elapsed time.
model_gpu, gpu_seconds = _timed_train(param_gpu, dtrain_gpu)
print("gpu 학습시간:", gpu_seconds)


cpu 학습시간: 1.1826794147491455
gpu 학습시간: 1.221165418624878
