# **Exercise**
---    

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the training and test data, using the "Id" column as the row index.
X = pd.read_csv("../datafile/train.csv", index_col="Id")
X_test_full = pd.read_csv("../datafile/test.csv", index_col="Id")

# Discard rows that have no target value, then separate the target from the
# predictors. Each step rebinds X to a new frame rather than mutating it in
# place, so no earlier reference to the frame is silently changed.
X = X.dropna(axis=0, subset=["SalePrice"])
y = X["SalePrice"]
X = X.drop(columns=["SalePrice"])

# Hold out 20% of the rows for validation; the seed keeps the split repeatable.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# Text (object-dtype) columns with fewer than 10 distinct values in the
# training split; only these are one-hot encoded below.
low_cardinality_cols = [
    cname
    for cname in X_train_full.columns
    if X_train_full[cname].nunique() < 10 and X_train_full[cname].dtype == "object"
]

# Columns stored as int64 or float64.
numeric_cols = [
    cname
    for cname in X_train_full.columns
    if X_train_full[cname].dtype in ["int64", "float64"]
]

# Keep only the selected columns; .copy() detaches the subsets from the
# full frames.
my_cols = low_cardinality_cols + numeric_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

# One-hot encode each split separately, then force the validation and test
# frames onto the training frame's columns (join="left" keeps X_train's columns).
# NOTE(review): a dummy column that exists only in X_train is added to the other
# frame filled with NaN (align's default fill value), not 0 -- confirm this is
# intended before relying on it.
X_train = pd.get_dummies(X_train)
X_valid = pd.get_dummies(X_valid)
X_test = pd.get_dummies(X_test)
X_train, X_valid = X_train.align(X_valid, join="left", axis=1)
X_train, X_test = X_train.align(X_test, join="left", axis=1)


* get_dummies : one-hot encoding처럼 만들어 준다. / 범주형 변수를 0과 1로만 이루어진 가변수들로 바꾼다
* obj1.align(obj2, join="",axis="")  
  * obj1과 obj2를 정렬한다. 
  * axis=1일경우 열에 대해서 / 0일경우 행에 대해서 / default는 두 축 모두
  * join : left이면 obj1에 있는 레이블 기준, right이면 obj2에 있는 레이블 기준, inner이면 교집합, outer이면 합집합 / default는 outer (한쪽에만 있는 레이블은 다른 쪽에서 NaN으로 채워진다)

# **Step 1**
---
### **Part A**
gradient boosting 모델을 만든다. (XGBRegressor클래스를 사용하고, random_state를 0으로, 나머지는 default)


### **답**

In [4]:
from xgboost import XGBRegressor

# Baseline gradient-boosting regressor: only the seed is fixed, every other
# hyperparameter is left at the library default.
my_model_1 = XGBRegressor(random_state=0)
# Kept as the cell's final expression so the fitted estimator is displayed.
my_model_1.fit(X=X_train, y=y_train)


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=4,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

### **Part B**
앞선 모델로 X_valid에 대한 예측값 구하기

In [6]:
from sklearn.metrics import mean_absolute_error

# Predict on the validation features with the baseline model; the result is
# scored against y_valid in the next cell.
predictions_1 = my_model_1.predict(X_valid)


### **Part C**
예측값(predictions_1)과 실제값(y_valid)으로 MAE(평균 절대 오차) 구하기

In [7]:
# Mean absolute error of the baseline model on the validation split.
# scikit-learn's signature is mean_absolute_error(y_true, y_pred), so the
# ground truth goes first; MAE is symmetric, so the value itself is unchanged.
mae_1 = mean_absolute_error(y_valid, predictions_1)
mae_1

17662.736729452055

# **Step 2**
---

Step 1의 모델보다 MAE가 더 낮은 모델 만들기



In [12]:
# More boosting rounds (500 vs. the default 100) with a smaller learning rate
# (0.05 vs. the default 0.3). random_state is pinned exactly as for my_model_1
# so the models are configured consistently and the run is reproducible.
my_model_2 = XGBRegressor(n_estimators=500, learning_rate=0.05, random_state=0)
my_model_2.fit(X_train, y_train)
predictions_2 = my_model_2.predict(X_valid)
# scikit-learn's signature is mean_absolute_error(y_true, y_pred).
mae_2 = mean_absolute_error(y_valid, predictions_2)
mae_2

16728.27523009418

# **Step 3**
---
Step 1의 모델보다 MAE가 더 높은 모델 만들기

In [14]:
# Deliberately weaker configuration: fewer boosting rounds (50 vs. the default
# 100) with a larger learning rate (0.5 vs. the default 0.3). random_state is
# pinned exactly as for my_model_1 so the run is reproducible.
my_model_3 = XGBRegressor(n_estimators=50, learning_rate=0.5, random_state=0)
my_model_3.fit(X_train, y_train)
predictions_3 = my_model_3.predict(X_valid)
# scikit-learn's signature is mean_absolute_error(y_true, y_pred).
mae_3 = mean_absolute_error(y_valid, predictions_3)
mae_3


20948.60493364726