In [1]:
# Data Manipulation, Import, Export
import numpy as np 
import pandas as pd 

# Data Missing Value Treatment
from sklearn.impute import SimpleImputer

# Data Preprocessing
# Data Transformation - Categorical
from sklearn.preprocessing import OneHotEncoder

# Data Transformation - Numerical
from sklearn.preprocessing import StandardScaler

# Train and Test Split
from sklearn.model_selection import train_test_split

# Build Model
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Evaluate Model
from sklearn.metrics import mean_squared_error  # RMSE = sqrt(MSE); the squared = False shortcut is deprecated since scikit-learn 1.4

# Locations of the two input files. The file named test.csv is loaded into
# `predict` so it is not mistaken for the hold-out split that
# train_test_split produces further down.
train_csv = '/content/drive/MyDrive/train.csv'
predict_csv = '/content/drive/MyDrive/test.csv'

train = pd.read_csv(train_csv)
predict = pd.read_csv(predict_csv)

# Peek at the first rows of the training frame.
train.head()

# Drop Categorical Features
# The column list is defined once and applied to both frames so that train
# and predict are guaranteed to lose exactly the same columns.
dropped_categorical_columns = [
    'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
    'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
    'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
    'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
    'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
    'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
    'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish',
    'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence',
    'MiscFeature', 'SaleType', 'SaleCondition',
]

# drop() returns a new frame; a missing column raises KeyError, exactly as
# the original axis = 1 calls did.
train = train.drop(columns=dropped_categorical_columns)
predict = predict.drop(columns=dropped_categorical_columns)

# Load the sample submission file and look at its first rows.
sample_submission = pd.read_csv('/content/drive/MyDrive/sample_submission.csv')
sample_submission.head()

# Print the column / dtype / non-null summary: train first, then predict.
for frame in (train, predict):
    frame.info()

# Target: the SalePrice column of the training frame.
y = train['SalePrice']

# Training features: everything except the row identifier and the target.
X = train.drop(columns=['Id', 'SalePrice'])

# Features of the frame to be predicted: everything except the row identifier.
X_predict = predict.drop(columns=['Id'])

# Dtype names that count as numeric when partitioning the columns of X.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

# Names of the columns whose dtype is listed in `numerics`.
numeric_columns = X.select_dtypes(include=numerics).columns
numerical_features = numeric_columns.values
numerical_features

# Names of all remaining columns (dtype not listed in `numerics`).
non_numeric_columns = X.select_dtypes(exclude=numerics).columns
categorical_features = non_numeric_columns.values
categorical_features

# Columns of X that contain at least one missing value.
X.columns[X.isna().any()]

# Number of missing values per column.
X.isna().sum()

# Median imputation for the numeric columns.
# NOTE(review): the imputer is fitted on every row of X, i.e. before the
# train_test_split further down, so the hold-out rows contribute to the
# medians.
num_impute = SimpleImputer(strategy='median')

# Learn the per-column medians from X and fill X in one step.
X[numerical_features] = num_impute.fit_transform(X[numerical_features])

# Fill the predict frame with the medians learned from X.
X_predict[numerical_features] = num_impute.transform(X_predict[numerical_features])

# Declare Simple Imputer (fills missing categorical values with the constant 'missing')
#cat_impute = SimpleImputer(strategy = 'constant', fill_value = 'missing')

# Fit the imputer on the categorical features
#cat_impute.fit(X[categorical_features])

# Transform Train
#X[categorical_features] = cat_impute.transform(X[categorical_features])

# Transform Predict
#X_predict[categorical_features] = cat_impute.transform(X_predict[categorical_features])

# Re-check which columns of X still contain missing values.
X.columns[X.isna().any()]

# Standardise the numeric columns.
# NOTE(review): the scaler is fitted on every row of X, i.e. before the
# train_test_split further down.
num_scale = StandardScaler()

# Learn the scaling statistics from X and scale X in one step.
X[numerical_features] = num_scale.fit_transform(X[numerical_features])

# Scale the predict frame with the statistics learned from X.
X_predict[numerical_features] = num_scale.transform(X_predict[numerical_features])

# Peek at the scaled training features.
X.head()

# Declare Categorical Encoder
# cat_encoder = OneHotEncoder(handle_unknown = 'ignore')

# Fit the encoder on the categorical features of X
# cat_encoder.fit_transform(X[categorical_features])

# Transform Train
#X[categorical_features] = cat_encoder.transform(X[categorical_features])

# Transform Predict
# X_predict[categorical_features] = cat_encoder.transform(X_predict[categorical_features])

# Hold out 33% of the rows for evaluation; the seed is fixed so the split
# is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.33,
)

# Random forest with 10 trees of depth at most 6; fit() returns the fitted
# estimator itself, so the result can be bound directly.
rf = RandomForestRegressor(
    max_depth=6,
    n_estimators=10,
    random_state=0,
).fit(X_train, y_train)

# Predictions on the rows the model was fitted on.
train_predict = rf.predict(X_train)
train_predict

# Predictions on the held-out rows.
test_predict = rf.predict(X_test)
test_predict

# RMSE is computed as the square root of the MSE rather than with
# squared = False: that keyword is deprecated since scikit-learn 1.4 and
# removed in 1.6, where it raises TypeError. Arguments are passed in the
# documented (y_true, y_pred) order.

# Train RMSE
train_rmse = np.sqrt(mean_squared_error(y_train, train_predict))
train_rmse

# Test RMSE
test_rmse = np.sqrt(mean_squared_error(y_test, test_predict))
test_rmse

# Predict SalePrice for every row of the predict frame.
predictions = rf.predict(X_predict)
predictions

# Assemble the submission: the Id column of `predict` next to the
# predicted prices.
submission_columns = {
    'Id': predict['Id'],
    'SalePrice': predictions,
}
rf_sub = pd.DataFrame(submission_columns)
rf_sub.head()

# Write the submission to disk without the DataFrame index column.
rf_sub.to_csv('/content/drive/MyDrive/Book1.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 38 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   OverallQual    1460 non-null   int64  
 5   OverallCond    1460 non-null   int64  
 6   YearBuilt      1460 non-null   int64  
 7   YearRemodAdd   1460 non-null   int64  
 8   MasVnrArea     1452 non-null   float64
 9   BsmtFinSF1     1460 non-null   int64  
 10  BsmtFinSF2     1460 non-null   int64  
 11  BsmtUnfSF      1460 non-null   int64  
 12  TotalBsmtSF    1460 non-null   int64  
 13  1stFlrSF       1460 non-null   int64  
 14  2ndFlrSF       1460 non-null   int64  
 15  LowQualFinSF   1460 non-null   int64  
 16  GrLivArea      1460 non-null   int64  
 17  BsmtFullBath   1460 non-null   int64  
 18  BsmtHalf