In [15]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc

from sklearn.tree import export_graphviz
import graphviz

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor

from itertools import product
# Keep Korean (Hangul) text from rendering as broken glyphs in figures.
# The Korean-capable font name depends on the operating system: AppleGothic
# only ships with macOS, so choose a per-platform family instead of
# hard-coding it.
import platform

_KOREAN_FONT_BY_OS = {'Darwin': 'AppleGothic', 'Windows': 'Malgun Gothic'}
rc('font', family=_KOREAN_FONT_BY_OS.get(platform.system(), 'NanumGothic'))
# Draw the minus sign as an ASCII hyphen; the Unicode minus may be missing
# from the Korean font.
plt.rcParams['axes.unicode_minus'] = False

# Hide warning messages in the cell outputs
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the raw survey table; the CSV file is EUC-KR encoded.
df = pd.read_csv("../01.files/02.Bigsata_analysis/체질검사.csv", encoding='euc-kr')

# WEIGHT and HEIGHT are strings holding a number and a unit separated by
# whitespace: split them, park the unit in <NAME>_UNIT and make the numeric
# part a float column.
for measure in ('WEIGHT', 'HEIGHT'):
    df[[measure, measure + '_UNIT']] = df[measure].str.split(expand=True)
    df[measure] = df[measure].astype('float64')

# Inspect dtypes while the unit columns are still present ...
df.info()
# ... then discard them, since they are not used as features.
df = df.drop(columns=['WEIGHT_UNIT', 'HEIGHT_UNIT'])

# One-hot encode the remaining object-typed columns.
df = pd.get_dummies(df)

df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252 entries, 0 to 251
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   FAT          252 non-null    float64
 1   AGE          252 non-null    int64  
 2   WEIGHT       252 non-null    float64
 3   HEIGHT       252 non-null    float64
 4   NECK         252 non-null    float64
 5   CHEST        252 non-null    float64
 6   ABDOMEN      252 non-null    float64
 7   HIP          252 non-null    float64
 8   THIGH        252 non-null    float64
 9   KNEE         252 non-null    float64
 10  ANKLE        252 non-null    float64
 11  BICEPS       252 non-null    float64
 12  FOREARM      252 non-null    float64
 13  WRIST        252 non-null    float64
 14  GENDER       252 non-null    object 
 15  WEIGHT_UNIT  252 non-null    object 
 16  HEIGHT_UNIT  252 non-null    object 
dtypes: float64(13), int64(1), object(3)
memory usage: 33.6+ KB


Unnamed: 0,FAT,AGE,WEIGHT,HEIGHT,NECK,CHEST,ABDOMEN,HIP,THIGH,KNEE,ANKLE,BICEPS,FOREARM,WRIST,GENDER_남성,GENDER_여성
0,35.2,46,363.15,72.25,51.2,136.2,148.1,147.7,87.3,49.1,29.6,45.0,29.0,21.4,1,0
1,11.8,27,168.0,71.25,38.1,93.0,79.1,94.5,57.3,36.2,24.5,29.0,30.0,18.8,1,0
2,22.2,69,177.75,68.5,38.7,102.0,95.0,98.3,55.0,38.3,21.8,30.8,25.7,18.8,1,0
3,10.6,57,147.75,65.75,35.2,99.6,86.4,90.1,53.0,35.0,21.3,31.7,27.3,16.9,0,1
4,47.5,51,219.0,64.0,41.2,119.8,122.1,112.8,62.5,36.9,23.6,34.7,29.1,18.4,0,1


In [4]:
# FAT is the regression target; every other column is a feature.
df_y = df['FAT']
df_x = df.drop(columns=['FAT'])

# Hold out 30% of the rows for testing, with a fixed seed for repeatability.
df_train_x, df_test_x, df_train_y, df_test_y = train_test_split(
    df_x, df_y, test_size=0.3, random_state=777
)

# Report the shape of each partition.
for label, part in (
    ("df_train_x:", df_train_x),
    ("df_test_x:", df_test_x),
    ("df_train_y:", df_train_y),
    ("df_test_y:", df_test_y),
):
    print(label, part.shape)

df_train_x: (176, 15)
df_test_x: (76, 15)
df_train_y: (176,)
df_test_y: (76,)


In [5]:
# Baseline: gradient boosting with library-default hyperparameters (only the
# seed is fixed), used as the reference point for later tuning.
gb_uncustomized = GradientBoostingRegressor(random_state=777).fit(df_train_x, df_train_y)

# Score the baseline on both partitions before reporting.
baseline_train_score = gb_uncustomized.score(df_train_x, df_train_y)
baseline_test_score = gb_uncustomized.score(df_test_x, df_test_y)

print("Score on training set: {:.3f}".format(baseline_train_score))
print("Score on test set: {:.3f}".format(baseline_test_score))

Score on training set: 0.980
Score on test set: 0.628


In [6]:
# Display the full hyperparameter dict of the baseline model, so the values
# it was fitted with (only random_state was set explicitly) are visible.
gb_uncustomized.get_params()

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'squared_error',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': 777,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [28]:
def generate_param_combinations(param_grid):
    """Expand a parameter grid into a list of concrete parameter dicts.

    Parameters
    ----------
    param_grid : dict
        Maps each parameter name to an iterable of candidate values.

    Returns
    -------
    list of dict
        One dict per element of the Cartesian product of the candidate
        values, in ``itertools.product`` order (the last key varies fastest).
        Each dict maps every parameter name to a single chosen value.
    """
    names = list(param_grid)
    return [dict(zip(names, chosen)) for chosen in product(*param_grid.values())]

def run_models(params, train_x, train_y, test_x, test_y, param_name):
    """Fit one GradientBoostingRegressor per grid combination and compare scores.

    Parameters
    ----------
    params : dict
        Parameter grid. Must contain the keys 'n_estimators',
        'min_samples_leaf', 'min_samples_split', 'max_depth' and 'lr'
        (forwarded as ``learning_rate``), each mapped to a list of candidates.
    train_x, train_y
        Data used to fit each model and to compute the training score.
    test_x, test_y
        Data used to compute the test score.
    param_name : str
        Key of ``params`` whose value is reported in the result table and
        used as the x-axis of the plot.

    Returns
    -------
    pandas.DataFrame
        One row per fitted combination with the columns ``param_name``,
        "TrainScore" and "TestScore" (scores rounded to 3 decimals).

    Raises
    ------
    KeyError
        If ``param_name`` or one of the required keys is missing from
        ``params``.
    """
    # Fail before any model is fitted rather than after the whole sweep.
    if param_name not in params:
        raise KeyError(param_name)

    train_score = []
    test_score = []
    # Value of `param_name` for each fitted model. Taken from the combination
    # itself so its length always equals the number of scores, even when
    # other parameters in the grid have several candidates.
    param_values = []

    for combo in generate_param_combinations(params):
        model = GradientBoostingRegressor(random_state=777,
                                          n_estimators=combo['n_estimators'],
                                          min_samples_leaf=combo['min_samples_leaf'],
                                          min_samples_split=combo['min_samples_split'],
                                          max_depth=combo['max_depth'],
                                          learning_rate=combo['lr'])
        model.fit(train_x, train_y)
        train_score.append(model.score(train_x, train_y))
        test_score.append(model.score(test_x, test_y))
        param_values.append(combo[param_name])

    df_score = pd.DataFrame({param_name: param_values})
    # Plain lists have no .round(); go through a Series to round the scores.
    df_score["TrainScore"] = pd.Series(train_score, dtype="float64").round(3)
    df_score["TestScore"] = pd.Series(test_score, dtype="float64").round(3)

    # Plot the unrounded scores against the swept parameter.
    plt.plot(param_values, train_score, label="Train score")
    plt.plot(param_values, test_score, label="Test score")
    plt.xlabel(param_name)
    plt.ylabel("Score")
    plt.legend()
    plt.show()

    return df_score

In [26]:
# Default parameter settings: a single candidate per parameter, equal to the
# values the un-tuned model reports via get_params(). 'lr' stands for
# learning_rate.
hyperparameters = dict(
    n_estimators=[100],
    min_samples_leaf=[1],
    min_samples_split=[2],
    max_depth=[3],
    lr=[0.1],
)