In [1]:
from pathlib import Path

import pandas as pd
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

In [2]:
# Load the competition data; the 'Id' column becomes the index of both frames.
# Paths are built with pathlib so the notebook also runs outside Windows
# (a hard-coded '\\' separator is only valid there).
DATA_DIR = Path('home-data-for-ml-course')
X = pd.read_csv(DATA_DIR / 'train.csv', index_col='Id')
X_test_full = pd.read_csv(DATA_DIR / 'test.csv', index_col='Id')

# Rows without a target cannot be used for supervised training, so drop them.
# Assigning the result (instead of inplace=True) keeps the step explicit.
X = X.dropna(axis=0, subset=['SalePrice'])

# List the columns that still contain missing values and count the gaps in each.
missing_col = [col for col in X.columns if X[col].isnull().any()]
X[missing_col].isnull().sum()


LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

According to the data description, NaN in some columns means the house does not have that feature (e.g. pool, fence, garage, and more).

Check the columns whose missing values are not explained by the data description:
> MasVnrType, MasVnrArea, Electrical

In [3]:
# Drop the rows with a missing value in MasVnrType, MasVnrArea or Electrical.
# All three columns are listed so the code matches the stated intent; the
# original subset left MasVnrArea out.
X = X.dropna(axis=0, subset=['MasVnrType', 'MasVnrArea', 'Electrical'])

# Separate the target from the features
y = X['SalePrice']
X = X.drop(columns=['SalePrice'])

# Re-count the missing values in the columns flagged earlier
X[missing_col].isnull().sum()


LotFrontage      257
Alley           1361
MasVnrType         0
MasVnrArea         0
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         0
FireplaceQu      685
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1444
Fence           1170
MiscFeature     1397
dtype: int64

In [4]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [5]:
# Per-column dtype and distinct-value count of the training split
col_dtypes = X_train_full.dtypes
distinct_counts = X_train_full.nunique()

# Categorical features: 'object' columns with fewer than 10 distinct values
catcol = [
    name
    for name in X_train_full.columns
    if distinct_counts[name] < 10 and col_dtypes[name] == 'object'
]

# Numerical features: int64 / float64 columns
numcol = [name for name, kind in col_dtypes.items() if kind in ('int64', 'float64')]

# Restrict every split to the selected columns, working on independent copies
my_cols = catcol + numcol
X_train, X_valid, X_test = (
    frame[my_cols].copy()
    for frame in (X_train_full, X_valid_full, X_test_full)
)

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder

# Numeric features: fill gaps with a constant. No fill_value is given, so
# scikit-learn's documented default for numeric data (0) applies.
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical features: replace gaps with the literal label 'None', then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(fill_value='None', strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numcol),
    ('cat', categorical_transformer, catcol),
])

# Random forest baseline with a fixed seed
model1 = RandomForestRegressor(random_state=0, n_estimators=100)

# Chain preprocessing and model so both are fitted on the training split only
clf = Pipeline([('preprocessor', preprocessor), ('model', model1)])
clf.fit(X_train, y_train)

# Score the fitted pipeline on the validation split
preds = clf.predict(X_valid)
rf_mae = mean_absolute_error(y_valid, preds)
print('MAE:', rf_mae)

MAE: 14850.333848797249


In [7]:
from xgboost import XGBRegressor

# Gradient-boosting model. The seed is stated explicitly, as it is for the
# random forest, so the result does not depend on a library default.
model2 = XGBRegressor(n_estimators=745, learning_rate=0.048, random_state=0)

# Use an unfitted copy of the preprocessor: Pipeline does not clone its steps,
# so passing the same object would refit the transformer already inside `clf`.
clf2 = Pipeline(steps=[('preprocessor', clone(preprocessor)), ('model', model2)])

# Fit preprocessing + model on the training split
clf2.fit(X_train, y_train)

# Predict on the validation split and report the error, so this cell actually
# produces the MAE line shown in its recorded output
preds2 = clf2.predict(X_valid)
print('MAE:', mean_absolute_error(y_valid, preds2))



MAE: 14695.327937070446


In [13]:
# Evaluate the gradient-boosting predictions on the validation split
from sklearn import metrics

xgb_mae = mean_absolute_error(y_valid, preds2)
xgb_r2 = metrics.r2_score(y_valid, preds2)
print('MAE = ', xgb_mae)
print('r sqaure = ', xgb_r2)

MAE =  14695.327937070446
r sqaure =  0.9064912470936164
