In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

### Load the training and test datasets

In [2]:
# Read the labelled training data and the unlabelled test data with identical
# options, using the 'Id' column as the row index of each frame.
X_full, X_test_full = (
    pd.read_csv(csv_path, index_col='Id')
    for csv_path in (
        'home-data-for-ml-course/train.csv',
        'home-data-for-ml-course/test.csv',
    )
)

### Check for missing values in the target column (`SalePrice`)

In [3]:
# (rows, columns) of the training frame; the column count still includes the SalePrice target.
X_full.shape

(1460, 80)

In [4]:
# Show every row whose target is missing; an empty frame means no row has to be dropped.
X_full.loc[X_full['SalePrice'].isna()]

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [5]:
# Discard rows without a target value. The result is rebound to the same name
# rather than mutating the frame in place, so later cells see the same data.
X_full = X_full.dropna(subset=['SalePrice'])

In [25]:
# Peek at the first two rows to sanity-check the loaded columns and values.
X_full.head(n=2)

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500


In [7]:
# Separate the target (y) from the predictors (X); X_full itself is left untouched.
y = X_full['SalePrice']
X = X_full.drop(columns=['SalePrice'])

### Split the dataset into training and validation sets

In [8]:
# Hold out part of the labelled data for validation.
# NOTE: despite their names, X_pred / y_pred are the held-out features and their
# TRUE targets (not model predictions).
X_train, X_pred, y_train, y_pred = train_test_split(
    X,
    y,
    train_size=0.8,   # 80% of the rows go to the training split
    random_state=42,  # fixed seed so the split is reproducible
)

- `Divide the columns into categorical and numerical sets`

- `Select the categorical columns with a relatively low number of unique values (fewer than 10)`

In [9]:
# Object-typed columns with fewer than 10 distinct values in the training split.
# The cardinality cap keeps the one-hot encoding from producing too many columns.
categorical_cols = [
    col
    for col in X_train.columns
    if X_train[col].dtype == 'object' and X_train[col].nunique() < 10
]

In [10]:
# Columns stored as int64 or float64 in the training split, in their original order.
numerical_cols = [
    col
    for col, col_dtype in X_train.dtypes.items()
    if col_dtype in ['int64', 'float64']
]

- `Handle missing values and convert categorical data to numerical data`

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [12]:
# Categorical preprocessing, applied in order:
#   1. fill missing entries with the column's most frequent value;
#   2. one-hot encode, ignoring categories that were not seen during fit
#      instead of raising on them at predict time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehotencode', OneHotEncoder(handle_unknown='ignore')),
])

- Process **numerical** missing values

In [13]:
# Numerical preprocessing: replace missing entries with a constant.
# No fill_value is passed, so SimpleImputer uses its documented default
# for the 'constant' strategy (0 when imputing numerical data).
numerical_transformer = SimpleImputer(strategy='constant')

- `Bundle the preprocessing for both numerical and categorical columns`

In [14]:
from sklearn.compose import ColumnTransformer

In [15]:
# Route each group of columns to its own transformer. Columns that appear in
# neither list (e.g. object columns with 10 or more distinct values) are not
# passed to the model, because no `remainder` is configured.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

### Define Model

In [17]:
from sklearn.ensemble import RandomForestRegressor

In [18]:
# Random forest regressor used as the final estimator.
model = RandomForestRegressor(
    random_state=42,   # fixed seed so repeated runs build the same forest
    n_estimators=100,  # number of trees in the forest
)

- Bundle __preprocessing__ and __model__ together

In [19]:
# Chain preprocessing and the estimator so that a single fit/predict call
# applies the same transformations to whatever data is passed in.
my_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

### Fit model

In [21]:
# Fit the preprocessing steps and the random forest on the training split only;
# the held-out rows in X_pred are not seen here.
my_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='constant',
                                                                verbose=0),
                                                  ['MSSubClass', 'LotFrontage',
                                                   'LotArea', 'OverallQual',
                                                   'OverallCo

- `Predict on the held-out validation set`

In [22]:
# Predict sale prices for the held-out split. The trailing underscore separates
# the model's predictions (y_pred_) from the true held-out targets (y_pred).
y_pred_ = my_pipeline.predict(X_pred)

In [23]:
from sklearn.metrics import mean_absolute_error

In [24]:
# Mean absolute error on the held-out split.
# mean_absolute_error is documented as (y_true, y_pred): pass the true targets
# (y_pred, from the split) first and the model's predictions (y_pred_) second.
# The MAE value itself is unchanged because the metric is symmetric, but the
# correct order matters if this is ever swapped for an asymmetric metric.
print('MAE:', mean_absolute_error(y_pred, y_pred_))

MAE: 17678.294143835617
