In [None]:
import pandas as pd

# Source SAS files, in load order.
sas_paths = (
    'hn16_all.sas7bdat',   # examination, health interview and nutrition surveys
    'hn16_ffq.sas7bdat',   # food frequency survey
    'hn16_24rc.sas7bdat',  # food intake survey
)
data1, data2, data3 = map(pd.read_sas, sas_paths)

# Show the (rows, columns) of each loaded table.
tuple(frame.shape for frame in (data1, data2, data3))

In [None]:
# Place the two tables side by side (column-wise); pandas aligns rows on the index.
# NOTE(review): rows are paired by index label, not by a respondent key - confirm
# that data1 and data2 list the same respondents in the same order.
data = pd.concat(objs=[data1, data2], axis='columns')
data.shape

In [None]:
# Keep only the first occurrence of every column label.
is_repeated_label = data.columns.duplicated()
data = data.loc[:, ~is_repeated_label]
data.shape

In [None]:
# Columns carried forward into the analysis, grouped by role.
food_frequency_cols = [
    'FF_PIZZA', 'FF_HAMBER', 'FF_F_CHIC', 'FF_INSTNO', 'FF_ICECM', 'FF_SNACK',
    'FF_CHOCO', 'FF_MILK', 'FF_SOJU', 'FF_BEER', 'FF_RWINE', 'FF_SPROU',
    'FF_VSALAD', 'FF_F_EGG', 'FF_MACKER', 'FF_J_SOYP', 'FF_J_KIMC',
]
# Strength-exercise days, walking days, and the two inputs of Sittime.
activity_cols = ['BE5_1', 'BE3_31', 'BE8_1', 'BE8_2']
# Age filter column and the two inputs of BMI.
body_cols = ['age', 'HE_ht', 'HE_wt']

colname_list = food_frequency_cols + activity_cols + body_cols

# Narrow the frame to the selected columns, in the order given by colname_list.
data = data[colname_list]
data.shape

In [None]:
# Keep rows with 19 <= age <= 39. The explicit .copy() makes the result an
# independent frame, so the column assignments in later cells (BMI, Sittime)
# write to `data` itself and do not raise SettingWithCopyWarning.
age_in_range = (data['age'] >= 19) & (data['age'] <= 39)
data = data.loc[age_in_range, :].copy()
data.shape

In [None]:
# BMI = HE_wt / (HE_ht / 100)^2
# (presumably weight in kg and height in cm - confirm units against the codebook)
height_scaled = data['HE_ht'] / 100
data['BMI'] = data['HE_wt'] / height_scaled ** 2
data.shape

In [None]:
# Disabled step kept as a bare string literal (it is not executed): it would
# overwrite BMI with a 0/1 flag split at 30 and show the class counts.
'''data.loc[data['BMI'] < 30, 'BMI'] = 0
data.loc[data['BMI'] >= 30, 'BMI'] = 1
data['BMI'].value_counts()'''

In [None]:
import numpy as np

# Recode "no answer" sentinel codes to NaN, column by column.
#
# The codes cannot be applied blindly to every column:
#   * BE3_31 uses 8 as a real answer (a later cell remaps 7 and 8 to 6), and
#     BE8_1 / BE8_2 feed Sittime, where values of 8 and 9 are ordinary
#     durations. For these columns only 88 and 99 are treated as missing.
#   * age, HE_ht, HE_wt and BMI are measured / computed quantities, not coded
#     answers, so they are left untouched.
# NOTE(review): the FF_* items keep the original 8/9/88/99 treatment here;
# confirm against the codebook that 8 and 9 are not valid frequency categories.
full_sentinels = (8, 9, 88, 99)
two_digit_sentinels = (88, 99)
keeps_8_and_9 = {'BE3_31', 'BE8_1', 'BE8_2'}
not_coded = {'age', 'HE_ht', 'HE_wt', 'BMI'}

sentinel_map = {}
for name in list(data.columns):
    if name in not_coded:
        continue
    codes = two_digit_sentinels if name in keeps_8_and_9 else full_sentinels
    sentinel_map[name] = {code: np.nan for code in codes}

# A nested dict means: in column <key>, replace each listed code with NaN.
data = data.replace(sentinel_map)

In [None]:
# Row-wise NaN removal is switched off here, so rows containing NaN stay in `data`.
#data.dropna(inplace=True)
data.shape

In [None]:
#data.to_csv('data.csv')

In [None]:
# Sittime = BE8_1 * 60 + BE8_2
# (presumably hours and minutes combined into total minutes - confirm)
sit_major_part = data['BE8_1'].mul(60)
data['Sittime'] = sit_major_part.add(data['BE8_2'])
data.shape

In [None]:
# These columns were only needed for the age filter and to derive BMI and
# Sittime; remove them from the frame in place.
helper_columns = ['age', 'HE_ht', 'HE_wt', 'BE8_1', 'BE8_2']
data.drop(columns=helper_columns, inplace=True)
data.shape

In [None]:
# Days of strength exercise [BE5_1] - no preprocessing needed
# Days walked in the past week [BE3_31]: fold codes 7 and 8 into code 6.
# The result is assigned back to the column; calling replace(inplace=True) on
# the selected column can act on a temporary object and leave `data` unchanged.
data['BE3_31'] = data['BE3_31'].replace({7: 6, 8: 6})
data['BE3_31'].value_counts()

In [None]:
# Time spent sitting [Sittime]: recode into levels 1-6.
# Levels 1-5 are 180-wide bands starting at 0 (lower bound inclusive, upper
# bound exclusive); anything at or above 900 becomes level 6.
sit_raw = data['Sittime'].copy()
band_starts = (0, 180, 360, 540, 720)
for level, lower in enumerate(band_starts, start=1):
    in_band = (sit_raw >= lower) & (sit_raw < lower + 180)
    data.loc[in_band, 'Sittime'] = level
data.loc[sit_raw >= 900, 'Sittime'] = 6

data['Sittime'].value_counts()

In [None]:
# Work on an independent deep copy so modelling steps cannot alter `data`.
df = data.copy(deep=True)
df.shape

# Train / Test

In [None]:
# Separate the predictors (X) from the target (y = BMI).
# Rows with a missing BMI are removed first: NaN removal is disabled upstream,
# and a NaN target makes RandomForestRegressor.fit fail and turns the test MSE
# into NaN. When BMI has no NaN this is a no-op.
# y stays a one-column DataFrame because later cells index it with .iloc[i, :].
df_labeled = df.dropna(subset=['BMI'])
X = df_labeled.drop('BMI', axis=1)
y = df_labeled[['BMI']]
print(X.shape, y.shape)

In [None]:
# Split into train and test sets: 30% of the rows go to the test set, and the
# fixed random_state makes the split repeatable.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, y, test_size=0.3, random_state=0)
train_x, test_x, train_y, test_y = split_parts
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

In [None]:
# Re-attach the target column to the training predictors (column-wise).
train = pd.concat(objs=[train_x, train_y], axis='columns')
train.shape

In [None]:
# Write the training table (index included) to rf.csv in the working directory.
train.to_csv(path_or_buf='rf.csv')

# RF

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, GridSearchCV

# Hyper-parameter grid for the search: two sub-grids, the second one with
# bootstrapping switched off.
params = [
    dict(n_estimators=[3, 10, 30], max_features=[2, 4, 6, 8]),
    dict(bootstrap=[False], n_estimators=[3, 10], max_features=[2, 3, 4]),
]

# Grid-search a random-forest regressor with 3-fold CV, scored by negative MSE.
rf_clf = RandomForestRegressor(random_state = 0, n_jobs = -1)
grid_cv = GridSearchCV(rf_clf, param_grid = params, cv = 3, scoring='neg_mean_squared_error', n_jobs = -1)
# train_y is a one-column DataFrame; flatten it to 1-D so each fit does not
# emit a DataConversionWarning about a column-vector target.
grid_cv.fit(train_x, train_y.values.ravel())

print('최적 하이퍼 파라미터: ', grid_cv.best_params_)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Final model with fixed hyper-parameters.
# NOTE(review): these values are hard-coded rather than read from
# grid_cv.best_params_ - confirm they still match the search result.
regr = RandomForestRegressor(bootstrap = True, max_features = 2, n_estimators = 30, random_state=0)
# Flatten the one-column target to 1-D to avoid the column-vector
# DataConversionWarning; the fitted model is the same.
regr.fit(train_x, train_y.values.ravel())

In [None]:
# Test-set mean squared error.
# Predictions come from one batched predict() call on the test frame instead of
# one call per row; this keeps the feature names and avoids float() on
# one-element Series/arrays, which newer pandas/NumPy deprecate.
test_pred = regr.predict(test_x)
test_true = test_y.iloc[:, 0].values
# Per-row squared errors (name kept from the original cell).
mse_list = ((test_true - test_pred) ** 2).tolist()

sum(mse_list) / test_x.shape[0]

In [None]:
# Actual vs. predicted target for every test row, indexed 0..n-1.
# Built in one constructor call: DataFrame.append was removed in pandas 2.0,
# and growing a frame row by row is quadratic.
perfo_df = pd.DataFrame({
    'y': test_y.iloc[:, 0].values.astype(float),
    'pred_y': regr.predict(test_x),
})
#df.set_index('idx', inplace=True)
perfo_df

In [None]:
# The 30 rows with the highest predicted value, in ascending order of pred_y.
perfo_df.sort_values('pred_y', ascending=True).iloc[-30:]

# pickle

In [None]:
import pickle

# Serialize the fitted model to regr.pkl in the working directory.
with open('./regr.pkl', mode='wb') as model_file:
    pickle.dump(regr, model_file)

In [None]:
# Read the pickled object back from regr.pkl.
# NOTE(review): this rebinds `data` to the unpickled object, replacing whatever
# `data` referred to before this cell ran.
with open('regr.pkl', mode='rb') as model_file:
    data = pickle.load(model_file)