In [46]:
import warnings
warnings.filterwarnings('ignore')
warnings.warn('ignore')

In [47]:
import numpy as np
import pandas as pd

In [48]:
import pickle

with open('preprocessed.pkl', 'rb') as file:
    pre = pickle.load(file)

In [49]:
train = pd.read_csv('train_set.csv')
test = pd.read_csv('test_set.csv')

In [50]:
train

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price,availability_mapped
0,Built-up Area,Kengeri Satellite Town,2.0,560.0,2.0,1.0,16.60,1
1,Built-up Area,Attibele,3.0,2400.0,3.0,3.0,120.00,1
2,Built-up Area,Sarvobhogam Nagar,3.0,1490.0,3.0,0.0,140.00,1
3,Super built-up Area,Yelahanka,3.0,1610.0,4.0,2.0,85.00,1
4,Super built-up Area,Hegde Nagar,2.0,1341.0,2.0,1.0,97.00,1
...,...,...,...,...,...,...,...,...
9527,Super built-up Area,Hebbal,2.0,1294.0,2.0,1.0,115.00,1
9528,Super built-up Area,Raja Rajeshwari Nagar,2.0,1185.0,2.0,1.0,40.17,1
9529,Super built-up Area,Akshayanagara East,3.0,1519.0,3.0,2.0,67.23,1
9530,Super built-up Area,Kengeri Satellite Town,1.0,930.0,1.0,1.0,30.00,1


In [51]:
X_train = train.drop(columns='price',axis=1)
y_train = train['price']

In [52]:
test

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price,availability_mapped
0,Super built-up Area,8th Phase JP Nagar,1.0,500.0,1.0,1.0,31.00,0
1,Super built-up Area,Ardendale,2.0,1224.0,2.0,2.0,67.00,1
2,Super built-up Area,Thigalarapalya,2.0,1418.0,2.0,2.0,105.00,1
3,Super built-up Area,Kanakpura Road,3.0,1100.0,3.0,1.0,58.00,0
4,Super built-up Area,Mysore Road,2.0,1070.0,2.0,1.0,49.65,0
...,...,...,...,...,...,...,...,...
3173,Super built-up Area,Jakkur,4.0,5150.0,4.0,3.0,559.00,1
3174,Super built-up Area,Avalahalli,4.0,2996.0,4.0,3.0,200.00,1
3175,Built-up Area,Chandapura,2.0,650.0,1.0,1.0,17.00,1
3176,Plot Area,HBR Layout,4.0,1200.0,4.0,2.0,115.00,1


In [53]:
X_test = test.drop(columns='price',axis=1)
y_test = test['price']

### Transformer

In [54]:
X_train_tf = pre.transform(X_train)
X_test_tf = pre.transform(X_test)

### Model fitting

### XGBOOST

In [55]:
import xgboost

In [56]:
xgb = xgboost.XGBRegressor()

In [57]:
params = {'eta':[0.01,0.1,0.3,0.5],
         'subsample':[0.5,0.6,0.7,0.8]}

In [58]:
from sklearn.model_selection import GridSearchCV

In [59]:
grid = GridSearchCV(xgb, param_grid=params,n_jobs=-1)
grid.fit(X_train_tf,y_train)

In [60]:
xgb_best = grid.best_estimator_
print(f'best hyperparameters: {grid.best_params_}')

best hyperparameters: {'eta': 0.3, 'subsample': 0.7}


In [61]:
xg = xgb_best.fit(X_train_tf, y_train)
xg_train_pred = xg.predict(X_train_tf)
xg_test_pred = xg.predict(X_test_tf)

In [62]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

In [63]:
xg_train_rmse = np.sqrt(mean_squared_error(xg_train_pred, y_train))
xg_test_rmse = np.sqrt(mean_squared_error(xg_test_pred, y_test))

In [64]:
xg_train_mape = mean_absolute_percentage_error(xg_train_pred, y_train)
xg_test_mape = mean_absolute_percentage_error(xg_test_pred, y_test)

In [65]:
xg_train_mae = mean_absolute_error(xg_train_pred, y_train)
xg_test_mae = mean_absolute_error(xg_test_pred, y_test)

In [66]:
pd.DataFrame({'Metric':['RMSE','MAPE','MAE'],
             'Train':[xg_train_rmse,xg_train_mape,xg_train_mae],
             'Test':[xg_test_rmse,xg_test_mape,xg_test_mae]})

Unnamed: 0,Metric,Train,Test
0,RMSE,39.697809,83.919058
1,MAPE,0.223125,0.260885
2,MAE,23.283511,31.446137


### Linear Regression

In [67]:
from sklearn.linear_model import LinearRegression

In [68]:
lr = LinearRegression()

In [69]:
model_lr = lr.fit(X_train_tf, y_train)

In [70]:
lr_train_pred = model_lr.predict(X_train_tf)
lr_test_pred = model_lr.predict(X_test_tf)

In [71]:
lr_train_rmse = np.sqrt(mean_squared_error(lr_train_pred, y_train))
lr_test_rmse = np.sqrt(mean_squared_error(lr_test_pred, y_test))

In [72]:
lr_train_mape = mean_absolute_percentage_error(lr_train_pred, y_train)
lr_test_mape = mean_absolute_percentage_error(lr_test_pred, y_test)

In [73]:
lr_train_mae = mean_absolute_error(lr_train_pred, y_train)
lr_test_mae = mean_absolute_error(lr_test_pred, y_test)

In [74]:
pd.DataFrame({'Metric':['RMSE','MAPE','MAE'],
             'Train':[lr_train_rmse, lr_train_mape, lr_train_mae],
             'Test':[lr_test_rmse, lr_test_mape, lr_test_mae]})

Unnamed: 0,Metric,Train,Test
0,RMSE,81.852989,92.13656
1,MAPE,0.541614,0.604266
2,MAE,31.886021,37.804316


### Random Forest

In [75]:
from sklearn.ensemble import RandomForestRegressor

In [76]:
params_rf = {'n_estimators': [100, 150, 200, 250, 300],
    'max_depth': [1,2,3,4]}

In [77]:
rf = RandomForestRegressor()

In [78]:
grid_rf = GridSearchCV(rf, param_grid=params_rf, n_jobs=-1)
grid_rf.fit(X_train_tf, y_train)

In [80]:
rf_best = grid_rf.best_estimator_
print(f'best hyperparamters: {grid_rf.best_params_}')

best hyperparamters: {'max_depth': 4, 'n_estimators': 200}


In [81]:
rfr = RandomForestRegressor(max_depth=4,n_estimators=200,bootstrap=True)

In [82]:
model_rfr = rf_best.fit(X_train_tf,y_train)

In [83]:
rfr_train_pred = model_rfr.predict(X_train_tf)
rfr_test_pred = model_rfr.predict(X_test_tf)

In [84]:
rf_train_rmse = np.sqrt(mean_squared_error(rfr_train_pred,y_train))
rf_test_rmse = np.sqrt(mean_squared_error(rfr_test_pred,y_test))

In [85]:
rf_train_mape = mean_absolute_percentage_error(rfr_train_pred,y_train)
rf_test_mape = mean_absolute_percentage_error(rfr_test_pred,y_test)

In [86]:
rf_train_mae = mean_absolute_error(rfr_train_pred,y_train)
rf_test_mae = mean_absolute_error(rfr_test_pred,y_test)

In [87]:
pd.DataFrame({'Metric':['RMSE','MAPE','MAE'],
             'Train':[rf_train_rmse,rf_train_mape,rf_train_mae],
             'Test':[rf_test_rmse,rf_test_mape,rf_test_mae]})

Unnamed: 0,Metric,Train,Test
0,RMSE,74.602542,86.321731
1,MAPE,0.294592,0.306214
2,MAE,34.947039,37.206264


### Gradient Boost Regressor

In [88]:
from sklearn.ensemble import GradientBoostingRegressor

In [92]:
params_gbr = {'n_estimators':np.arange(100,251,50),
             'max_depth':np.arange(1,11,2),
             'learning_rate':[0.01,0.03,0.05,0.1]}

In [93]:
gbr = GradientBoostingRegressor()

In [94]:
grid_gb = GridSearchCV(gbr, param_grid=params_gbr,n_jobs=-1)
grid_gb.fit(X_train_tf,y_train)

In [96]:
gbr_best=grid_gb.best_estimator_
print(f'best hyperparameters: {grid_gb.best_params_}')

best hyperparameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 250}


In [97]:
gb = GradientBoostingRegressor(learning_rate=0.1,max_depth=5,n_estimators=250)

In [98]:
model_gb = gbr_best.fit(X_train_tf,y_train)

In [99]:
gb_train_pred = model_gb.predict(X_train_tf)
gb_test_pred = model_gb.predict(X_test_tf)

In [100]:
gb_train_rmse = np.sqrt(mean_squared_error(gb_train_pred,y_train))
gb_test_rmse = np.sqrt(mean_squared_error(gb_test_pred,y_test))

In [101]:
gb_train_mape = mean_absolute_percentage_error(gb_train_pred,y_train)
gb_test_mape = mean_absolute_percentage_error(gb_test_pred,y_test)

In [102]:
gb_train_mae = mean_absolute_error(gb_train_pred,y_train)
gb_test_mae = mean_absolute_error(gb_test_pred,y_test)

In [103]:
pd.DataFrame({'metric':['RMSE','MAPE','MAE'],
             'Train':[gb_train_rmse,gb_train_mape,gb_train_mae],
             'Test':[gb_test_rmse,gb_test_mape,gb_test_mae]})

Unnamed: 0,metric,Train,Test
0,RMSE,39.80012,78.035971
1,MAPE,0.226803,0.264079
2,MAE,23.793935,30.989451


### After fitting above 4 regression models XGboost and GB regressor performed somewhat similar, out of which XGboost performed slightly better than GB regressor, hence we will deploy the same

### Pickle

In [104]:
xg=xgboost.XGBRegressor(eta=0.3,subsample=0.7)

In [105]:
xg_model = xg.fit(X_train_tf,y_train)

In [106]:
with open('xg.pkl','wb') as file:
    pickle.dump(xg_model,file)