In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, minmax_scale
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor



import warnings
warnings.filterwarnings(action="ignore")

from xgboost import XGBRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error


In [4]:
from sklearn import datasets

boston = datasets.load_boston()
df = pd.DataFrame(
    boston['data'], columns=boston['feature_names']
)
df['target'] = boston['target']
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


In [5]:
y = df['target']
X = df.drop(["target"], axis=1)

In [6]:
y.shape

(506,)

In [7]:
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  target   506 non-null    float64
dtypes: float64(14)
memory usage: 55.5 KB


### 결측처리

In [9]:
cols_obj = df.select_dtypes(include="object").columns.values.tolist()
cols_num = df.select_dtypes(exclude="object").columns.values.tolist()

df[cols_obj] = df[cols_obj].fillna("NAN")
df[cols_num] = df[cols_num].fillna(0)


In [10]:
df.isna().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
target     0
dtype: int64

### EDA

In [11]:
for col in cols_obj:
    unique_val_list = df[col].unique().tolist()
    dict = {val : i for i, val in enumerate(unique_val_list)}
    print(col, dict)
    df[col] = df[col].map(dict)

In [12]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [14]:
y_log = np.log1p(df["target"])
X = df.drop("target", axis=1)

In [15]:
X_train, X_val, y_train, y_val = train_test_split(X, y_log, test_size=0.2, random_state=11)

In [17]:
X_train

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
188,0.12579,45.0,3.44,0.0,0.437,6.556,29.1,4.5667,5.0,398.0,15.2,382.84,4.56
319,0.47547,0.0,9.90,0.0,0.544,6.113,58.8,4.0019,4.0,304.0,18.4,396.23,12.73
21,0.85204,0.0,8.14,0.0,0.538,5.965,89.2,4.0123,4.0,307.0,21.0,392.53,13.83
14,0.63796,0.0,8.14,0.0,0.538,6.096,84.5,4.4619,4.0,307.0,21.0,380.02,10.26
369,5.66998,0.0,18.10,1.0,0.631,6.683,96.8,1.3567,24.0,666.0,20.2,375.33,3.73
...,...,...,...,...,...,...,...,...,...,...,...,...,...
337,0.03041,0.0,5.19,0.0,0.515,5.895,59.6,5.6150,5.0,224.0,20.2,394.81,10.56
91,0.03932,0.0,3.41,0.0,0.489,6.405,73.9,3.0921,2.0,270.0,17.8,393.55,8.20
80,0.04113,25.0,4.86,0.0,0.426,6.727,33.5,5.4007,4.0,281.0,19.0,396.90,5.29
191,0.06911,45.0,3.44,0.0,0.437,6.739,30.8,6.4798,5.0,398.0,15.2,389.71,4.69


In [18]:
X_val

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
53,0.04981,21.0,5.64,0.0,0.439,5.998,21.4,6.8147,4.0,243.0,16.8,396.90,8.43
490,0.20746,0.0,27.74,0.0,0.609,5.093,98.0,1.8226,4.0,711.0,20.1,318.43,29.68
240,0.11329,30.0,4.93,0.0,0.428,6.897,54.3,6.3361,6.0,300.0,16.6,391.25,11.38
375,19.60910,0.0,18.10,0.0,0.671,7.313,97.9,1.3163,24.0,666.0,20.2,396.90,13.44
417,25.94060,0.0,18.10,0.0,0.679,5.304,89.1,1.6475,24.0,666.0,20.2,127.36,26.64
...,...,...,...,...,...,...,...,...,...,...,...,...,...
285,0.01096,55.0,2.25,0.0,0.389,6.453,31.9,7.3073,1.0,300.0,15.3,394.72,8.23
7,0.14455,12.5,7.87,0.0,0.524,6.172,96.1,5.9505,5.0,311.0,15.2,396.90,19.15
247,0.19657,22.0,5.86,0.0,0.431,6.226,79.2,8.0555,7.0,330.0,19.1,376.14,10.15
357,3.84970,0.0,18.10,1.0,0.770,6.395,91.0,2.5052,24.0,666.0,20.2,391.34,13.27


In [20]:
scaler = MinMaxScaler()

In [21]:
scaler.fit(np.array(y_train).reshape(-1,1))
y_train_matrix = scaler.transform(np.array(y_train).reshape(-1,1))
y_val_matrix = scaler.transform(np.array(y_val).reshape(-1,1))

y_train_scaler = pd.Series(y_train_matrix.reshape(-1))
y_val_scaler = pd.Series(y_val_matrix.reshape(-1))

# print(y_train_matrix.shape)
print(y_train_scaler.shape)
print(y_val_scaler.shape)

(404,)
(102,)


In [22]:
scaler.fit(X_train)
X_train_matrix = scaler.transform(X_train)
X_val_matrix = scaler.transform(X_val)

X_train_scaler = pd.DataFrame(X_train_matrix, columns=X_train.columns, index = X_train.index)
X_val_scaler = pd.DataFrame(X_val_matrix, columns=X_val.columns, index = X_val.index)

print(X_train_scaler.shape)
print(X_val_scaler.shape)

(404, 13)
(102, 13)


In [23]:
X_train.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
188,0.12579,45.0,3.44,0.0,0.437,6.556,29.1,4.5667,5.0,398.0,15.2,382.84,4.56
319,0.47547,0.0,9.9,0.0,0.544,6.113,58.8,4.0019,4.0,304.0,18.4,396.23,12.73
21,0.85204,0.0,8.14,0.0,0.538,5.965,89.2,4.0123,4.0,307.0,21.0,392.53,13.83
14,0.63796,0.0,8.14,0.0,0.538,6.096,84.5,4.4619,4.0,307.0,21.0,380.02,10.26
369,5.66998,0.0,18.1,1.0,0.631,6.683,96.8,1.3567,24.0,666.0,20.2,375.33,3.73


In [14]:
# scale_cols = X_train.columns.values.tolist()
# print(type(scale_cols))
# scaler = StandardScaler()
# scaler.fit(X_train[scale_cols])

# X_train_scaler = scaler.transform(X_train[scale_cols])
# X_val_scaler = scaler.transform(X_val[scale_cols])

In [21]:
# X_train.drop(scale_cols, axis=1, inplace=True)
# X_val.drop(scale_cols, axis=1, inplace=True)

In [24]:
X_train.shape, X_val.shape

((404, 13), (102, 13))

In [None]:
# X_train_scaler_df = pd.DataFrame(X_train_scaler, columns=scale_cols)
# X_val_scaler_df = pd.DataFrame(X_val_scaler, columns=scale_cols)

# X_train = pd.concat([X_train, X_train_scaler_df], axis=1)
# X_val = pd.concat([X_val, X_val_scaler_df], axis=1)

In [25]:
models = [Ridge(), Lasso(), XGBRegressor(), RandomForestRegressor()]
for model in models:
    model.fit(X_train_scaler, y_train_scaler)
    pred = model.predict(X_val_scaler)
    y_val_exp = np.expm1(y_val_scaler)
    pred_exp = np.expm1(pred)
    RMSE = mean_squared_error(y_val_exp, pred_exp, squared=False)
    
    print(f"{model.__class__.__name__}\t RMSE:{RMSE:.4f}")

Ridge	 RMSE:0.1687
Lasso	 RMSE:0.3415
XGBRegressor	 RMSE:0.1458
RandomForestRegressor	 RMSE:0.1552


In [26]:
X_train_scaler.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
188,0.001343,0.473684,0.109238,0.0,0.106996,0.553887,0.269825,0.312552,0.173913,0.402672,0.276596,0.964547,0.078091
319,0.005273,0.0,0.346041,0.0,0.32716,0.462773,0.575695,0.261192,0.130435,0.223282,0.617021,0.998311,0.303532
21,0.009506,0.0,0.281525,0.0,0.314815,0.432332,0.888774,0.262138,0.130435,0.229008,0.893617,0.988981,0.333885
14,0.007099,0.0,0.281525,0.0,0.314815,0.459276,0.840371,0.303022,0.130435,0.229008,0.893617,0.957436,0.235375
369,0.063658,0.0,0.646628,1.0,0.506173,0.580008,0.967044,0.020651,1.0,0.914122,0.808511,0.94561,0.055188


### 학습량을 늘려라 - KFold

In [44]:
XX_scaler = pd.concat([X_train_scaler, X_val_scaler], axis=0)
XX_scaler.reset_index(drop=True, inplace=True)
XX_scaler

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.001343,0.473684,0.109238,0.0,0.106996,0.553887,0.269825,0.312552,0.173913,0.402672,0.276596,0.964547,0.078091
1,0.005273,0.000000,0.346041,0.0,0.327160,0.462773,0.575695,0.261192,0.130435,0.223282,0.617021,0.998311,0.303532
2,0.009506,0.000000,0.281525,0.0,0.314815,0.432332,0.888774,0.262138,0.130435,0.229008,0.893617,0.988981,0.333885
3,0.007099,0.000000,0.281525,0.0,0.314815,0.459276,0.840371,0.303022,0.130435,0.229008,0.893617,0.957436,0.235375
4,0.063658,0.000000,0.646628,1.0,0.506173,0.580008,0.967044,0.020651,1.000000,0.914122,0.808511,0.945610,0.055188
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.000052,0.578947,0.065616,0.0,0.008230,0.532703,0.298661,0.561767,0.000000,0.215649,0.287234,0.994503,0.179360
502,0.001554,0.131579,0.271628,0.0,0.286008,0.474907,0.959835,0.438387,0.173913,0.236641,0.276596,1.000000,0.480684
503,0.002138,0.231579,0.197947,0.0,0.094650,0.486014,0.785788,0.629805,0.260870,0.272901,0.691489,0.947652,0.232340
504,0.043199,0.000000,0.646628,1.0,0.792181,0.520773,0.907312,0.125090,1.000000,0.914122,0.808511,0.985980,0.318433


In [38]:
yy_scaler = pd.concat([y_train_scaler, y_val_scaler], axis=0)
yy_scaler

0      0.764348
1      0.607123
2      0.576399
3      0.543512
4      1.000000
         ...   
97     0.627894
98     0.721478
99     0.596380
100    0.621759
101    0.545939
Length: 506, dtype: float64

In [30]:
y_train_scaler.shape, y_val_scaler.shape

((404,), (102,))

In [27]:
XX_scaler = pd.concat([X_train_scaler, X_val_scaler], axis=0)
yy_scaler = pd.concat([y_train_scaler, y_val_scaler], axis=0)

XX_scaler

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
188,0.001343,0.473684,0.109238,0.0,0.106996,0.553887,0.269825,0.312552,0.173913,0.402672,0.276596,0.964547,0.078091
319,0.005273,0.000000,0.346041,0.0,0.327160,0.462773,0.575695,0.261192,0.130435,0.223282,0.617021,0.998311,0.303532
21,0.009506,0.000000,0.281525,0.0,0.314815,0.432332,0.888774,0.262138,0.130435,0.229008,0.893617,0.988981,0.333885
14,0.007099,0.000000,0.281525,0.0,0.314815,0.459276,0.840371,0.303022,0.130435,0.229008,0.893617,0.957436,0.235375
369,0.063658,0.000000,0.646628,1.0,0.506173,0.580008,0.967044,0.020651,1.000000,0.914122,0.808511,0.945610,0.055188
...,...,...,...,...,...,...,...,...,...,...,...,...,...
285,0.000052,0.578947,0.065616,0.0,0.008230,0.532703,0.298661,0.561767,0.000000,0.215649,0.287234,0.994503,0.179360
7,0.001554,0.131579,0.271628,0.0,0.286008,0.474907,0.959835,0.438387,0.173913,0.236641,0.276596,1.000000,0.480684
247,0.002138,0.231579,0.197947,0.0,0.094650,0.486014,0.785788,0.629805,0.260870,0.272901,0.691489,0.947652,0.232340
357,0.043199,0.000000,0.646628,1.0,0.792181,0.520773,0.907312,0.125090,1.000000,0.914122,0.808511,0.985980,0.318433


In [28]:
yy_scaler.reset_index

0      0.764348
1      0.607123
2      0.576399
3      0.543512
4      1.000000
         ...   
97     0.627894
98     0.721478
99     0.596380
100    0.621759
101    0.545939
Length: 506, dtype: float64

In [38]:
XX_scaler = pd.concat([X_train_scaler, X_val_scaler], axis=0)
yy_scaler = pd.concat([y_train_scaler, y_val_scaler], axis=0)

# XX_scaler.reset_index(drop=True, inplace=True)
# yy_scaler.reset_index(drop=True, inplace=True)

kf = KFold(n_splits=5, shuffle=True, random_state=0)

tot_rmse_list = []
for train_idx, val_idx in kf.split(XX_scaler, yy_scaler):
    print(train_idx[:5])
    X_train, X_val = XX_scaler.iloc[train_idx], XX_scaler.iloc[val_idx]
    y_train, y_val = yy_scaler.iloc[train_idx], yy_scaler.iloc[val_idx]
    
    model = XGBRegressor()
    model.fit(X_train, y_train)
    pred = model.predict(X_val)
    RMSE = mean_squared_error(y_val, pred, squared=False)
    print(f"RMSE:{RMSE:.4f}")
    tot_rmse_list.append(RMSE)
    
print(np.mean(tot_rmse_list))

    
    

[0 2 3 4 5]
RMSE:0.0828
[0 1 2 3 9]
RMSE:0.0743
[0 1 3 4 5]
RMSE:0.0667
[1 2 4 5 6]
RMSE:0.0551
[0 1 2 3 4]
RMSE:0.0559
0.06695049472674512


In [None]:
df = pd.get_dummies(df)

### 타겟피쳐 로그변환

In [None]:
y_log = np.log1p(df["target"])
X = df.drop("target", axis=1)

### 모델선정 & 학습 & 평가

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y_log, test_size=0.2, random_state=1414)
models = [Ridge(), Lasso(), RandomForestRegressor(), XGBRegressor()]
for model in models:
    model.fit(X_train, y_train)
    pred = model.predict(X_val)
    
    y_val_exp = np.expm1(y_val)
    pred_exp = np.expm1(pred)
    
    rmse = mean_squared_error(y_val_exp, pred_exp, squared="True")
    print(f"{model.__class__.__name__}\t RMSE: {rmse:.4f}")
    

### 스케일링

## log --> 스케일링  

In [61]:
list = [1,5,10,20,30]
arr = np.array(list)
log_arr = np.log(arr)
print(log_arr)

[0.         1.60943791 2.30258509 2.99573227 3.40119738]


In [42]:
X_train_log = np.log1p(X_train)
X_test_log  = np.log1p(X_test)
y_train_log = np.log1p(y_train)
y_test_log  = np.log1p(y_test)

In [43]:
scalers = [StandardScaler()] # , RobustScaler(), MinMaxScaler()]
for mm in scalers:
    #----- Series를 array 세로줄로 바꾸는 방법 ------------------
    mm.fit(np.array(y_train_log).reshape(-1,1))                          # fit       : array, matrix만 가능
    #----------------------------------------------------------
    y_train_matrix = mm.transform(np.array(y_train_log).reshape(-1,1))   # transform : array, matrix만 가능
    y_test_matrix  = mm.transform(np.array(y_test_log).reshape(-1,1))    
    
    #----- matrix 2차를 array 1차로 바꾸는 방법 ------------------
    y_train_scaler = pd.Series(y_train_matrix.reshape(-1))
    #----------------------------------------------------------
    y_test_scaler  = pd.Series(y_test_matrix.reshape(-1))

print(y_train_scaler.shape)
print(y_test_scaler.shape)

(374,)
(94,)


In [44]:
scalers = [StandardScaler()] # , RobustScaler(), MinMaxScaler()]
for mm in scalers:
    mm.fit(X_train_log)                            # fit       : array, matrix만 가능
    X_train_matrix = mm.transform(X_train_log)     # transform : array, matrix만 가능
    X_test_matrix  = mm.transform(X_test_log)    
    
    #print( X_train.index.values)              # index객체 --> array
                                                                           # Index객체  or array
    X_train_scaler = pd.DataFrame(X_train_matrix, columns=X_train.columns, index=X_train.index)
    X_test_scaler  = pd.DataFrame(X_test_matrix , columns=X_test.columns,  index=X_test.index)    
    
print(X_train_scaler.shape)
print(X_test_scaler.shape)

(374, 13)
(94, 13)


In [45]:
models = [LinearRegression(), Ridge(), Lasso(), ElasticNet(), DecisionTreeRegressor(),XGBRegressor(), LGBMRegressor()]
for model in models:
    model.fit(X_train_scaler,  y_train_scaler)      
    pred = model.predict(X_test_scaler)                                   # ---------test scale     
    mse  = mean_squared_error(y_test_scaler, pred)                 # ---------test scale
    rmse = mean_squared_error(y_test_scaler, pred, squared=False)  # ---------test  scale
    print(f"{model.__class__.__name__}\t MSE:{mse:.4f} RMSE:{rmse:.4f}")
 

LinearRegression	 MSE:0.2726 RMSE:0.5221
Ridge	 MSE:0.2733 RMSE:0.5228
Lasso	 MSE:1.0044 RMSE:1.0022
ElasticNet	 MSE:0.6935 RMSE:0.8328
DecisionTreeRegressor	 MSE:0.4428 RMSE:0.6655
XGBRegressor	 MSE:0.2130 RMSE:0.4616
LGBMRegressor	 MSE:0.1895 RMSE:0.4354
