In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score,   GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from catboost import CatBoostRegressor
from xgboost import XGBRegressor


In [2]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

In [3]:
cols_to_drop=['PoolQC', 'MiscFeature','Alley', 'Fence','MasVnrType', 'FireplaceQu', 'WoodDeckSF', '2ndFlrSF', 'OpenPorchSF', 'HalfBath','LotArea', 'BsmtFullBath','BsmtUnfSF','BedroomAbvGr', 'ScreenPorch', 'PoolArea', 'MoSold', '3SsnPorch', 'BsmtFinSF2', 'BsmtHalfBath', 'MiscVal', 'Id', 'LowQualFinSF', 'YrSold', 'OverallCond', 'MSSubClass', 'EnclosedPorch', 'KitchenAbvGr', '1stFlrSF', 'GarageArea', 'GarageYrBlt']

In [4]:
train.drop(cols_to_drop, axis=1, inplace=True)

In [5]:
test.drop(cols_to_drop, axis=1, inplace=True)

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 50 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSZoning       1460 non-null   object 
 1   LotFrontage    1201 non-null   float64
 2   Street         1460 non-null   object 
 3   LotShape       1460 non-null   object 
 4   LandContour    1460 non-null   object 
 5   Utilities      1460 non-null   object 
 6   LotConfig      1460 non-null   object 
 7   LandSlope      1460 non-null   object 
 8   Neighborhood   1460 non-null   object 
 9   Condition1     1460 non-null   object 
 10  Condition2     1460 non-null   object 
 11  BldgType       1460 non-null   object 
 12  HouseStyle     1460 non-null   object 
 13  OverallQual    1460 non-null   int64  
 14  YearBuilt      1460 non-null   int64  
 15  YearRemodAdd   1460 non-null   int64  
 16  RoofStyle      1460 non-null   object 
 17  RoofMatl       1460 non-null   object 
 18  Exterior

In [7]:
train['YearBuilt']=pd.to_datetime(train['YearBuilt'], format='%Y')
train['YearRemodAdd']=pd.to_datetime(train['YearRemodAdd'], format='%Y')
test['YearBuilt']=pd.to_datetime(test['YearBuilt'], format='%Y')
test['YearRemodAdd']=pd.to_datetime(test['YearRemodAdd'], format='%Y')

In [8]:
categ_cols=train.select_dtypes(include=[object, 'datetime']).columns.tolist()
num_cols=train.select_dtypes(include='number').columns.tolist()
num_cols.remove('SalePrice')

In [9]:
for col in categ_cols:
    if col=='Electrical':
        train[col]=train[col].fillna(train[col].mode()[0])
    else:
        train[col]=train[col].fillna('None')

In [10]:
for col in num_cols:
    train[col]=train[col].fillna(0)

In [11]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 50 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   MSZoning       1460 non-null   object        
 1   LotFrontage    1460 non-null   float64       
 2   Street         1460 non-null   object        
 3   LotShape       1460 non-null   object        
 4   LandContour    1460 non-null   object        
 5   Utilities      1460 non-null   object        
 6   LotConfig      1460 non-null   object        
 7   LandSlope      1460 non-null   object        
 8   Neighborhood   1460 non-null   object        
 9   Condition1     1460 non-null   object        
 10  Condition2     1460 non-null   object        
 11  BldgType       1460 non-null   object        
 12  HouseStyle     1460 non-null   object        
 13  OverallQual    1460 non-null   int64         
 14  YearBuilt      1460 non-null   datetime64[ns]
 15  YearRemodAdd   1460 n

In [12]:
test['Utilities']=test['Utilities'].fillna(test['Utilities'].mode()[0])
test['Exterior1st']=test['Exterior1st'].fillna(test['Exterior1st'].mode()[0])
test['Exterior2nd']=test['Exterior2nd'].fillna(test['Exterior2nd'].mode()[0])
test['BsmtFinSF1']=test['BsmtFinSF1'].fillna(0)
test['TotalBsmtSF']=test['TotalBsmtSF'].fillna(0)
test['KitchenQual']=test['KitchenQual'].fillna(test['KitchenQual'].mode()[0])
test['Functional']=test['Functional'].fillna(test['Functional'].mode()[0])
test['GarageCars']=test['GarageCars'].fillna(0)
test['SaleType']=test['SaleType'].fillna(test['SaleType'].mode()[0])

In [13]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 49 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   MSZoning       1455 non-null   object        
 1   LotFrontage    1232 non-null   float64       
 2   Street         1459 non-null   object        
 3   LotShape       1459 non-null   object        
 4   LandContour    1459 non-null   object        
 5   Utilities      1459 non-null   object        
 6   LotConfig      1459 non-null   object        
 7   LandSlope      1459 non-null   object        
 8   Neighborhood   1459 non-null   object        
 9   Condition1     1459 non-null   object        
 10  Condition2     1459 non-null   object        
 11  BldgType       1459 non-null   object        
 12  HouseStyle     1459 non-null   object        
 13  OverallQual    1459 non-null   int64         
 14  YearBuilt      1459 non-null   datetime64[ns]
 15  YearRemodAdd   1459 n

In [14]:
for col in num_cols:
    test[col]=test[col].fillna(0)

for col in categ_cols:
    test[col]=test[col].fillna('None')

In [15]:
categ_pipeline=Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
num_pipeline=Pipeline([
    ('scaler', StandardScaler())
])
processor=ColumnTransformer([
    ('categ', categ_pipeline, categ_cols),
    ('num', num_pipeline, num_cols)
])

In [16]:
y_train=train['SalePrice']
X_train=train.drop('SalePrice', axis=1)

In [17]:
lr_model=Pipeline([
    ('processing', processor),
    ('model', LinearRegression())
])
lr_scores=cross_val_score(lr_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

In [18]:
rg_pipeline=Pipeline([
    ('processing', processor),
    ('model', Ridge())
])
rg_param={
    'model__alpha':[0.01, 0.1, 1, 10, 100]
}
rg_grid=GridSearchCV(rg_pipeline, rg_param, cv=5,scoring='neg_mean_squared_error')
rg_grid.fit(X_train, y_train)
rg_model=rg_grid.best_estimator_
rg_scores=cross_val_score(rg_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

In [19]:
lasso_pipeline=Pipeline([
    ('processing', processor),
    ('model', Lasso())
])
lasso_param={
    'model__alpha':[0.01, 0.1, 1, 10, 100]
}
lasso_grid=GridSearchCV(lasso_pipeline, lasso_param,cv=5, scoring='neg_mean_squared_error')
lasso_grid.fit(X_train, y_train)
lasso_model=lasso_grid.best_estimator_
lasso_scores=cross_val_score(lasso_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


In [20]:
rf_pipeline=Pipeline([
    ('processing', processor),
    ('model', RandomForestRegressor())
])
rf_param={
    'model__n_estimators':[10, 20, 50, 100, 150, 200],
    'model__max_depth':[None, 5, 10, 15, 20, 25],
    'model__min_samples_split':[2, 5, 10]
}
rf_grid=GridSearchCV(rf_pipeline, param_grid=rf_param, n_jobs=-1, cv=5, scoring='neg_mean_squared_error')
rf_grid.fit(X_train, y_train)
rf_model=rf_grid.best_estimator_
rf_scores=cross_val_score(rf_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

In [21]:
en_pipeline=Pipeline([
    ('processing', processor),
    ('model', ElasticNet())
])
en_param={
    'model__alpha':[0.001, 0.01, 0.1, 1, 10, 100],
    'model__l1_ratio':[0.1, 0.3, 0.5, 0.7, 0.9, 1.0]
}
en_grid=GridSearchCV(en_pipeline, en_param, n_jobs=-1, cv=5, scoring='neg_mean_squared_error')
en_grid.fit(X_train, y_train)
en_model=en_grid.best_estimator_
en_scores=cross_val_score(en_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

In [22]:
gbr_pipeline=Pipeline([
    ('processing', processor),
    ('model', GradientBoostingRegressor())
])
gbr_param={
    'model__learning_rate': [0.001, 0.01, 0.01, 0.5, 1],
    'model__n_estimators': [10, 50, 100, 200, 500],
    'model__max_depth': [3, 5, 7, 9, 15],
    'model__min_samples_split': [2, 5, 10, 15, 20],
    'model__min_samples_leaf': [1, 5, 10, 15, 20]
}
gbr_grid=GridSearchCV(gbr_pipeline, gbr_param, n_jobs=-1, cv=5, scoring='neg_mean_squared_error')
gbr_grid.fit(X_train, y_train)
gbr_model=gbr_grid.best_estimator_
gbr_scores=cross_val_score(gbr_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

In [23]:
svr_pipeline=Pipeline([
    ('processing', processor),
    ('model', SVR())
])
svr_param={
    'model__C':[0.1, 1, 2, 10, 20, 30, 50, 100],
    'model__kernel':['linear', 'poly', 'rbf', 'sigmoid']
}
svr_grid=GridSearchCV(svr_pipeline, svr_param, n_jobs=-1, cv=5, scoring='neg_mean_squared_error')
svr_grid.fit(X_train, y_train)
svr_model=svr_grid.best_estimator_
svr_scores=cross_val_score(svr_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

In [30]:
cb_model=Pipeline([
    ('processing', processor),
    ('model', CatBoostRegressor())
])
cb_scores=cross_val_score(cb_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

Learning rate set to 0.04196
0:	learn: 78602.3425559	total: 2.32ms	remaining: 2.32s
1:	learn: 76591.4606740	total: 4.49ms	remaining: 2.24s
2:	learn: 74790.3981857	total: 6.65ms	remaining: 2.21s
3:	learn: 72902.9650699	total: 8.75ms	remaining: 2.18s
4:	learn: 70864.0451143	total: 10.6ms	remaining: 2.11s
5:	learn: 69042.0683195	total: 12.6ms	remaining: 2.09s
6:	learn: 67434.8297951	total: 14.9ms	remaining: 2.11s
7:	learn: 65977.1877893	total: 16.7ms	remaining: 2.07s
8:	learn: 64392.2750087	total: 18.6ms	remaining: 2.05s
9:	learn: 62896.6324771	total: 20.6ms	remaining: 2.04s
10:	learn: 61709.2554494	total: 22.3ms	remaining: 2.01s
11:	learn: 60219.9012358	total: 24.1ms	remaining: 1.98s
12:	learn: 58911.1085437	total: 25.8ms	remaining: 1.96s
13:	learn: 57528.9005926	total: 27.7ms	remaining: 1.95s
14:	learn: 56486.3412999	total: 29.3ms	remaining: 1.92s
15:	learn: 55266.6489844	total: 31.3ms	remaining: 1.92s
16:	learn: 54138.0530295	total: 33.3ms	remaining: 1.93s
17:	learn: 52847.5874989	tota

In [25]:
xgbr_model=Pipeline([
    ('processing', processor),
    ('model', XGBRegressor())
])
xgbr_scores=cross_val_score(xgbr_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

In [31]:
print('MSE score for lr_model:', -lr_scores.mean())
print('MSE score for rg_model:', -rg_scores.mean())
print('MSE score for lasso_model:', -lasso_scores.mean())
print('MSE score for rf_model:', -rf_scores.mean())
print('MSE score for en_model:', -en_scores.mean())
print('MSE score for gbr_model:', -gbr_scores.mean())
print('MSE score for xgbr_model:', -xgbr_scores.mean())
print('MSE score for cb_model:', -cb_scores.mean())
print('MSE score for svr_model:', -svr_scores.mean())

MSE score for lr_model: 1428792334.2025971
MSE score for rg_model: 1079165080.553862
MSE score for lasso_model: 1005552591.88554
MSE score for rf_model: 965247352.9393551
MSE score for en_model: 1005552591.88554
MSE score for gbr_model: 806284390.7643422
MSE score for xgbr_model: 965433280.0
MSE score for cb_model: 763662083.4212883
MSE score for svr_model: 1265580099.7633796


In [33]:
cb_model.fit(X_train, y_train)

Learning rate set to 0.043466
0:	learn: 77151.6129081	total: 2.31ms	remaining: 2.31s
1:	learn: 75256.1830671	total: 4.18ms	remaining: 2.09s
2:	learn: 73235.5496185	total: 6.15ms	remaining: 2.04s
3:	learn: 71376.4724481	total: 8.16ms	remaining: 2.03s
4:	learn: 69479.9415571	total: 9.97ms	remaining: 1.98s
5:	learn: 67618.7425003	total: 11.7ms	remaining: 1.94s
6:	learn: 65875.0327269	total: 13.5ms	remaining: 1.91s
7:	learn: 64096.4945930	total: 15.3ms	remaining: 1.9s
8:	learn: 62471.5757460	total: 17.1ms	remaining: 1.88s
9:	learn: 60978.9361511	total: 18.7ms	remaining: 1.85s
10:	learn: 59564.8448303	total: 20.5ms	remaining: 1.85s
11:	learn: 58062.7148051	total: 22.2ms	remaining: 1.83s
12:	learn: 56696.0433700	total: 23.9ms	remaining: 1.82s
13:	learn: 55502.3279811	total: 25.7ms	remaining: 1.81s
14:	learn: 54264.9325373	total: 27.4ms	remaining: 1.8s
15:	learn: 53020.2161527	total: 29.2ms	remaining: 1.8s
16:	learn: 51783.8455127	total: 30.9ms	remaining: 1.79s
17:	learn: 50760.4277760	total:

In [34]:
y_pred=cb_model.predict(test)
y_pred=np.round(y_pred, 1)
predictions=pd.DataFrame({'Id':range(1461, 1461+len(y_pred)), 'SalePrice':y_pred})
predictions.to_csv('submission-5.csv', index=False)