In [1]:
import pandas as pd
import numpy as np

In [2]:
raw_train = pd.read_csv('./train.csv')
raw_test = pd.read_csv('./test.csv')
y = raw_train['SalePrice']

In [3]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [4]:
train = raw_train.copy()
train.drop('SalePrice', axis = 1, inplace = True)

In [5]:
num_col = train.select_dtypes(include = ['float', 'int'])
cat_col = train.select_dtypes(include = 'object')

In [6]:
num_col = num_col.columns
cat_col = cat_col.columns

Setup các *Preprocessing Method*  
Pipeline : impute và onehot cho các cột categorical  
  Pipeline sẽ đi từng bước trong steps -> imputer -> onehot

CT : nếu Pipeline là gom các preprocessing method thành bundle, thì CT sẽ gom các cột cần preprocessed thành bundle

In [7]:
numerical_pp = SimpleImputer(strategy='mean')
categorical_pp = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
preprocessor = ColumnTransformer(
    transformers = [
    ('num', numerical_pp, num_col),
    ('cate', categorical_pp, cat_col)
], remainder='passthrough')

In [8]:
from sklearn.ensemble import RandomForestRegressor
RFR = RandomForestRegressor(random_state=1, max_leaf_nodes=250, max_depth=7)

In [9]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,12,2008,WD,Normal


Output của pipeline categorical_PP là transformed data. Đây còn gọi là sub-pipeline

Output của PLP là prediction của model : preprocessing -> model. Đây là end-to-end pipeline

In [10]:
PLP = Pipeline(steps = [
    ('preprocessing', preprocessor),
    ('model', RFR) 
])
PLP.fit(train, y)
anticipation = PLP.predict(raw_test)

***CROSS VALIDATION*** : Chia data ra nhiều subsets và test để cho ra kết quả train thực tế nhất  
Thường dùng để kiểm tra dataset lớn, nếu mỗi subset measurements đều cho ra kết quả tương tự thì model tốt

***IMPORT CROSS VALIDATION***

In [12]:
from sklearn.model_selection import cross_val_score

# Multiply by -1 since sklearn calculates *negative* MAE
scores = cross_val_score(
    estimator=PLP,
    X=train,          # dữ liệu đầu vào
    y=y,              # nhãn
    cv=5,             # số fold, mặc định = 5
    scoring='neg_mean_absolute_error'  # metric
)
cvs_score = -1*scores
print("MAE scores:\n", scores.mean())

MAE scores:
 -18834.65531396273


In [14]:
result = np.arange(0,100,50)
result

array([ 0, 50])