## Krasnoyarsk real estate price prediction 

In [1]:
# Import Libraries
import numpy as np
import pandas as pd
from time import time
from tqdm.notebook import tqdm

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import clone

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics

import warnings 
import pickle

In [2]:
# Load the flat listings from the CSV file into a DataFrame
df = pd.read_csv("./flats_info_eda.csv")

In [3]:
df.head()

Unnamed: 0,flat_type,city_area,premises_area,floor,floor_tot,layout,status,price
0,1_room_ap,Sovietskiy,29.7,1,9,new,finished,38837.209302
1,3_room_ap,Sovietskiy,64.1,2,9,new,finished,75581.395349
2,3_room_ap,Sverdlovsky,58.4,1,5,khrushchevka,finished,58023.255814
3,3_room_ap,Sverdlovsky,92.3,5,6,new,finished,144186.046512
4,2_room_ap,Oktyabrskiy,48.0,10,17,new,finished,50000.0


In [4]:
df.shape

(2151, 8)

In [5]:
# Drop the rows describing five-room apartments
df = df.loc[df["flat_type"] != "5_room_ap"]

In [6]:
# Separate the target (price) from the feature columns
y = df["price"]
X = df.drop(columns="price")

In [7]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=42
)

In [8]:
# Categorical features: object-dtype columns with fewer than 20 distinct values
categorical_cols = [
    col for col, col_dtype in X_train.dtypes.items()
    if col_dtype == "object" and X_train[col].nunique() < 20
]
# Numerical features: columns stored as int64 or float64
numerical_cols = [
    col for col, col_dtype in X_train.dtypes.items()
    if col_dtype in ('int64', 'float64')
]

# Keep only the selected columns (categorical first, then numerical);
# any column matching neither rule is left out of both splits
my_cols = categorical_cols + numerical_cols
X_train = X_train[my_cols].copy()
X_test = X_test[my_cols].copy()

In [9]:
def preprocessing(model):
    """Wrap ``model`` in a pipeline that scales and encodes the features first.

    Columns listed in the global ``numerical_cols`` are passed through a
    ``MinMaxScaler``; columns listed in the global ``categorical_cols`` are
    one-hot encoded with ``handle_unknown='ignore'``.

    Parameters
    ----------
    model : estimator
        Estimator placed at the end of the pipeline. A clone is used, so the
        object passed in is not the one that gets fitted.

    Returns
    -------
    Pipeline
        Unfitted pipeline with the steps ``'preprocessor'`` and ``'model'``.
    """
    # One single-step pipeline per column family
    scale_numeric = Pipeline(steps=[('MinMaxScaler', MinMaxScaler())])
    encode_categorical = Pipeline(
        steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
    )

    # Route each column family to its transformer
    column_steps = [
        ('num', scale_numeric, numerical_cols),
        ('cat', encode_categorical, categorical_cols),
    ]
    preprocessor = ColumnTransformer(transformers=column_steps)

    # Preprocessing followed by the estimator, both as fresh clones
    return Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('model', clone(model)),
    ])

In [10]:
# Candidate regressors to compare, keyed by the label used in the results table
model_dict = dict([
    ('Linear Regression', LinearRegression()),
    ('Lasso', Lasso(max_iter=10000)),
    ('Ridge', Ridge()),
    ('Decision Tree', DecisionTreeRegressor()),
    ('Random Forest', RandomForestRegressor()),
    ('Extra Trees', ExtraTreesRegressor()),
    ('KNeighbors', KNeighborsRegressor()),
    ('Gradient Boosting', GradientBoostingRegressor()),
])

In [11]:
def model_tester(model_dict, X, y, cv=5):
    """Cross-validate every model in ``model_dict`` and tabulate the results.

    Each model is wrapped with ``preprocessing`` and evaluated with
    ``cross_validate`` on MAE, RMSE and R2.

    Parameters
    ----------
    model_dict : dict
        Maps a display name to an estimator.
    X, y
        Features and target handed to ``cross_validate``.
    cv : int, default 5
        Passed through as the ``cv`` argument of ``cross_validate``.

    Returns
    -------
    pandas.DataFrame
        One row per model (indexed by its name) with the columns ``cv_time``
        (wall-clock seconds for the whole cross-validation), ``MAE``, ``RMSE``
        and ``R2_score`` (means of the per-fold test scores). An empty
        ``model_dict`` yields an empty frame with those columns.
    """
    metric_names = ('neg_mean_absolute_error', 'neg_root_mean_squared_error', 'r2')
    rows = []
    for name, model in tqdm(model_dict.items()):
        pipe = preprocessing(model)
        start = time()
        # Reset the global NumPy seed before each model - presumably so that
        # estimators created without random_state give repeatable scores.
        np.random.seed(42)
        # Only test-fold scores are reported, so train-fold scoring is not
        # requested; it would only add to the measured time.
        score = cross_validate(pipe, X, y, cv=cv, scoring=metric_names)
        train_time = time() - start
        rows.append([
            train_time,
            # The error scorers are negated by scikit-learn; flip the sign back
            -np.mean(score['test_neg_mean_absolute_error']),
            -np.mean(score['test_neg_root_mean_squared_error']),
            np.mean(score['test_r2']),
        ])

    return pd.DataFrame(
        rows,
        columns=['cv_time', 'MAE', 'RMSE', 'R2_score'],
        index=list(model_dict.keys()),
    )

In [12]:
model_tester(model_dict, X_train, y_train, cv=5)

  0%|          | 0/8 [00:00<?, ?it/s]

Unnamed: 0,cv_time,MAE,RMSE,R2_score
Linear Regression,0.229705,7863.315917,10878.332076,0.820269
Lasso,3.934423,7861.305536,10877.611143,0.820297
Ridge,0.155491,7872.369683,10951.005646,0.818064
Decision Tree,0.217395,7643.876,12484.29426,0.763175
Random Forest,7.369419,6121.67585,9373.200483,0.866812
Extra Trees,7.55332,5674.019587,9176.155247,0.872127
KNeighbors,0.627494,8385.328874,12293.208513,0.771536
Gradient Boosting,0.776577,6656.799514,9495.487859,0.862847


For further consideration, I will select three models (Random Forest, Extra Trees, Gradient Boosting) that have shown the best results in cross-validation.

## Hyperparameter tuning

### Random Forest

In [13]:
# Random Forest: search over forest size, depth limit and split threshold
pipe_RandomForest = preprocessing(RandomForestRegressor())
param_grid = dict(
    model__n_estimators=[100, 600, 800],
    model__max_depth=[None, 8],
    model__min_samples_split=[2, 4, 6],
)

grid_search_RF = GridSearchCV(
    estimator=pipe_RandomForest,
    param_grid=param_grid,
    cv=5,
    return_train_score=True,
)

In [14]:
grid_search_RF.fit(X_train, y_train);

In [15]:
print("Tuned hyperparameters for RandomForestRegressor: \n", grid_search_RF.best_params_)

Tuned hyperparameters for RandomForestRegressor: 
 {'model__max_depth': None, 'model__min_samples_split': 2, 'model__n_estimators': 800}


In [32]:
# Re-score RandomForestRegressor with the hyperparameters chosen by the grid search.
# They are read from grid_search_RF.best_params_ (stripping the 'model__' pipeline
# prefix) instead of being typed in by hand, so this cell stays in sync if the
# search is re-run or its grid is edited.
best_RF_params = {key.split('__', 1)[1]: value
                  for key, value in grid_search_RF.best_params_.items()}
tuned_RF_model_dict = {'Random Forest' : RandomForestRegressor(**best_RF_params)}

model_tester(tuned_RF_model_dict, X_train, y_train, cv=5)

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,cv_time,MAE,RMSE,R2_score
Random Forest,61.825087,6102.989055,9379.226946,0.866636


### Extra Trees

In [17]:
# Extra Trees: search over ensemble size, depth limit and split threshold
pipe_ExtraTrees = preprocessing(ExtraTreesRegressor())
param_grid = dict(
    model__n_estimators=[100, 400, 800],
    model__max_depth=[None, 10],
    model__min_samples_split=[2, 4, 6, 8],
)

grid_search_ET = GridSearchCV(
    estimator=pipe_ExtraTrees,
    param_grid=param_grid,
    cv=5,
    return_train_score=True,
)

In [18]:
grid_search_ET.fit(X_train, y_train);

In [19]:
print("Tuned hyperparameters for ExtraTreesRegressor: \n", grid_search_ET.best_params_)

Tuned hyperparameters for ExtraTreesRegressor: 
 {'model__max_depth': None, 'model__min_samples_split': 6, 'model__n_estimators': 800}


In [20]:
# Re-score ExtraTreesRegressor with the hyperparameters chosen by the grid search.
# They are read from grid_search_ET.best_params_ (stripping the 'model__' pipeline
# prefix) instead of being typed in by hand, so this cell stays in sync if the
# search is re-run or its grid is edited.
best_ET_params = {key.split('__', 1)[1]: value
                  for key, value in grid_search_ET.best_params_.items()}
tuned_ET_model_dict = {'Extra Trees' : ExtraTreesRegressor(**best_ET_params)}

model_tester(tuned_ET_model_dict, X_train, y_train, cv=5)

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,cv_time,MAE,RMSE,R2_score
Extra Trees,47.575261,5822.513986,9168.622642,0.872288


### Gradient Boosting

In [21]:
# Gradient Boosting: search over number of stages, depth limit and split threshold
pipe_GradientBoosting = preprocessing(GradientBoostingRegressor())
param_grid = dict(
    model__n_estimators=[90, 150, 300],
    model__max_depth=[None, 5, 10],
    model__min_samples_split=[4, 6, 8],
)

grid_search_GB = GridSearchCV(
    estimator=pipe_GradientBoosting,
    param_grid=param_grid,
    cv=5,
    return_train_score=True,
)

In [22]:
grid_search_GB.fit(X_train, y_train);

In [23]:
print("Tuned hyperparameters for GradientBoostingRegressor: \n", grid_search_GB.best_params_)

Tuned hyperparameters for GradientBoostingRegressor: 
 {'model__max_depth': 5, 'model__min_samples_split': 6, 'model__n_estimators': 150}


In [24]:
# Re-score GradientBoostingRegressor with the hyperparameters chosen by the grid search.
# They are read from grid_search_GB.best_params_ (stripping the 'model__' pipeline
# prefix) instead of being typed in by hand, so this cell stays in sync if the
# search is re-run or its grid is edited.
best_GB_params = {key.split('__', 1)[1]: value
                  for key, value in grid_search_GB.best_params_.items()}
tuned_GB_model_dict = {'Gradient Boosting' : GradientBoostingRegressor(**best_GB_params)}

model_tester(tuned_GB_model_dict, X_train, y_train, cv=5)

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,cv_time,MAE,RMSE,R2_score
Gradient Boosting,2.003,6077.787961,9258.82021,0.86965


ExtraTreesRegressor showed the best performance after hyperparameter tuning.

## Evaluate models on test data

In [25]:
def best_model_test(grid_search_input, X_test, y_test):
    """Score the best estimator of a fitted grid search on held-out data.

    Parameters
    ----------
    grid_search_input
        Fitted search object exposing ``best_estimator_``; that estimator must
        be a pipeline with a step named ``'model'``.
    X_test, y_test
        Held-out features and the matching true target values.

    Returns
    -------
    pandas.DataFrame
        A single row, indexed by the class name of the ``'model'`` step, with
        the columns ``MAE``, ``RMSE`` and ``R2_score``.
    """
    fitted_pipeline = grid_search_input.best_estimator_
    predictions = fitted_pipeline.predict(X_test)

    # Column order of the result follows the insertion order of this dict
    results = {
        'MAE': metrics.mean_absolute_error(y_test, predictions),
        'RMSE': np.sqrt(metrics.mean_squared_error(y_test, predictions)),
        'R2_score': metrics.r2_score(y_test, predictions),
    }

    # Label the row with the estimator class sitting at the end of the pipeline
    label = type(fitted_pipeline.named_steps['model']).__name__
    return pd.DataFrame(results, index=[label])

In [26]:
best_model_test(grid_search_RF, X_test, y_test)

Unnamed: 0,MAE,RMSE,R2_score
RandomForestRegressor,6241.359178,10772.981109,0.837661


In [27]:
best_model_test(grid_search_ET, X_test, y_test)

Unnamed: 0,MAE,RMSE,R2_score
ExtraTreesRegressor,6071.568913,10533.244063,0.844806


In [28]:
best_model_test(grid_search_GB, X_test, y_test)

Unnamed: 0,MAE,RMSE,R2_score
GradientBoostingRegressor,6248.750126,10191.067212,0.854725


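Although ExtraTreesRegressor showed better cross-validation results than the other models, both before and after hyperparameter tuning, I choose GradientBoostingRegressor for production because it showed better results on two of the three test-set metrics (RMSE and R2_score). Note that choosing the model by its test-set scores makes those scores an optimistic estimate of its performance on new data.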

## Pickle the machine learning model

Save the model for future production.

In [29]:
# Persist the fitted grid search to disk, then load it back as a round-trip check.
# The with-blocks close each file handle as soon as the dump/load finishes
# instead of leaving the bare open() handles for the garbage collector.
with open('model.p', 'wb') as model_file:
    pickle.dump(grid_search_GB, model_file)

# NOTE: only unpickle files you created yourself - pickle.load can run arbitrary code.
with open('model.p', 'rb') as model_file:
    model = pickle.load(model_file)

In [30]:
y_test[:5]

1160    56976.744186
2020    77906.976744
978     45348.837209
759     27862.325581
874     47790.697674
Name: price, dtype: float64

In [31]:
model.predict(X_test[:5])

array([62086.88300826, 81723.02855686, 48771.3470024 , 27142.28855964,
       45463.78074384])