In [40]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
import category_encoders as ce
from sklearn.model_selection import GridSearchCV, KFold



In [41]:
df = pd.read_csv(R'E:\Mumbai Flat Real Estate Intelligence\datasets\mumbai_properties_post_feature_selection_v2.csv')

In [42]:
df.head()

Unnamed: 0,price,bedrooms,bathrooms,balcony,property_age,major_location,built_up_area,furnishing_type
0,0.17,1,1,2,New_property,Nalasopara West,550.0,Furnished
1,0.18,1,1,0,New_property,others,665.0,Furnished
2,0.18,1,1,2,New_property,Palghar,630.0,Furnished
3,0.18,1,1,2,New_property,Palghar,630.0,Unfurnished
4,0.18,1,2,1,New_property,Palghar,630.0,Furnished


In [43]:
df['furnishing_type'].value_counts()

furnishing_type
Furnished      5453
Unfurnished    4161
Name: count, dtype: int64

In [44]:
df.shape

(9614, 8)

In [45]:
df = df[df['major_location'] != 'Vakola']

In [46]:
df.shape

(9611, 8)

In [47]:
# df = df[df['price']<18]

In [48]:
# df.shape

In [49]:
X = df.drop(columns=['price'])
y = df['price']

In [50]:
# Applying the log1p transformation to the target variable
y_transformed = np.log1p(y)

### Ordinal Encoding


In [51]:
columns_to_encode = ['major_location',  'property_age', 'balcony','furnishing_type']


In [52]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedrooms', 'bathrooms', 'built_up_area']),
        ('cat', OrdinalEncoder(), columns_to_encode)
    ], 
    remainder='passthrough'
)

In [53]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [54]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [55]:

scores.mean(),scores.std()

(np.float64(0.7384415010872983), np.float64(0.013974063061669746))

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)


In [57]:
pipeline.fit(X_train,y_train)


0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [58]:
y_pred = pipeline.predict(X_test)

In [59]:
y_pred = np.expm1(y_pred)

In [60]:
mean_absolute_error(np.expm1(y_test),y_pred)

1.0760850631621757

In [22]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [23]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [24]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))


In [25]:
model_output

[['linear_reg', np.float64(0.7384415010872983), 1.0760850631621757],
 ['svr', np.float64(0.7446173229027798), 1.0255749457664811],
 ['ridge', np.float64(0.7384417618897658), 1.076046094685717],
 ['LASSO', np.float64(0.018147290612791356), 1.907486662374778],
 ['decision tree', np.float64(0.8156533083759104), 0.7928211215402012],
 ['random forest', np.float64(0.8959831748026291), 0.6496216036774917],
 ['extra trees', np.float64(0.8561140191523847), 0.7354684508751689],
 ['gradient boosting', np.float64(0.8799952284032789), 0.7268501827430575],
 ['adaboost', np.float64(0.7348709578557362), 1.044176565823486],
 ['mlp', np.float64(0.7656549913069265), 0.9399345727973021],
 ['xgboost', np.float64(0.9236356664535151), 0.5670053339024663]]

In [26]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])


In [27]:
model_df.sort_values(['mae'])


Unnamed: 0,name,r2,mae
10,xgboost,0.923636,0.567005
5,random forest,0.895983,0.649622
7,gradient boosting,0.879995,0.72685
6,extra trees,0.856114,0.735468
4,decision tree,0.815653,0.792821
9,mlp,0.765655,0.939935
1,svr,0.744617,1.025575
8,adaboost,0.734871,1.044177
2,ridge,0.738442,1.076046
0,linear_reg,0.738442,1.076085


### OHE

In [28]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedrooms', 'bathrooms', 'built_up_area']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first'),['major_location','property_age','furnishing_type'])
    ], 
    remainder='passthrough'
)

In [29]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [30]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [31]:
scores.mean()

np.float64(0.9126758226051189)

In [32]:
scores.std()

np.float64(0.00556185924412332)

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)


In [34]:
pipeline.fit(X_train,y_train)


0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [35]:
y_pred = pipeline.predict(X_test)



In [36]:
y_pred = np.expm1(y_pred)

In [37]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.6368726471167551

In [38]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [39]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [40]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [41]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [42]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.91575,0.568447
5,random forest,0.914449,0.573147
10,xgboost,0.924092,0.574965
9,mlp,0.912115,0.579068
2,ridge,0.91217,0.636194
0,linear_reg,0.912676,0.636873
4,decision tree,0.860973,0.698063
7,gradient boosting,0.880758,0.7131
1,svr,0.785682,0.963674
8,adaboost,0.742116,1.036663


### OneHotEncoding With PCA

In [43]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedrooms', 'bathrooms', 'built_up_area']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['major_location','property_age'])
    ], 
    remainder='passthrough'
)

In [44]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('regressor', LinearRegression())
])

In [45]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [46]:
scores.mean()

np.float64(0.020503307020318308)

In [47]:

scores.std()

np.float64(0.008235909844130708)

In [48]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.95)),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [49]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [50]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [51]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [52]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.687133,1.074703
6,extra trees,0.65203,1.121458
4,decision tree,0.589933,1.228788
10,xgboost,0.613427,1.260955
7,gradient boosting,0.593137,1.307499
8,adaboost,0.230354,1.794835
1,svr,0.071194,1.824727
9,mlp,0.084408,1.83359
2,ridge,0.020503,1.905793
0,linear_reg,0.020503,1.905793


### Target Encoder

In [53]:
import category_encoders as ce

In [54]:


columns_to_encode = ['major_location', 'balcony', 'property_age', 'furnishing_type']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedrooms', 'bathrooms', 'built_up_area']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['property_age']),
        ('target_enc', ce.TargetEncoder(), ['major_location'])
    ], 
    remainder='passthrough'
)

In [55]:
!pip install category_encoders



In [56]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [57]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [58]:

scores.mean(),scores.std()

(np.float64(0.8907256650300992), np.float64(0.008092702807362219))

In [59]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [60]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [61]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [62]:

model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])


In [63]:
model_df.sort_values(['mae'])


Unnamed: 0,name,r2,mae
10,xgboost,0.930499,0.5394
6,extra trees,0.92545,0.544345
5,random forest,0.928594,0.546205
7,gradient boosting,0.917973,0.591805
9,mlp,0.888853,0.657406
4,decision tree,0.878851,0.677799
0,linear_reg,0.890726,0.70072
2,ridge,0.890727,0.700744
8,adaboost,0.858174,0.789973
1,svr,0.86214,0.793409


### Hyperparameter Tuning

In [64]:
from sklearn.model_selection import GridSearchCV


In [65]:
df.head()

Unnamed: 0,price,bedrooms,bathrooms,balcony,property_age,major_location,built_up_area,furnishing_type
0,0.17,1,1,2,New_property,Nalasopara West,550.0,Furnished
1,0.18,1,1,0,New_property,others,665.0,Furnished
2,0.18,1,1,2,New_property,Palghar,630.0,Furnished
3,0.18,1,1,2,New_property,Palghar,630.0,Unfurnished
4,0.18,1,2,1,New_property,Palghar,630.0,Furnished


In [66]:
param_grid = {
    'regressor__n_estimators': [300, 400, 500],
    'regressor__max_depth': [None, 10, 20, 25, 30],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4],
    'regressor__max_features': ['auto', 'sqrt'],
    'regressor__max_samples': [0.5, 0.75, 1.0]
}


In [67]:
columns_to_encode = [ 'balcony', 'furnishing_type']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedrooms', 'bathrooms', 'built_up_area']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['property_age']),
        ('target_enc', ce.TargetEncoder(), ['major_location'])
    ], 
    remainder='passthrough'
)

In [68]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [69]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)


In [70]:
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [None]:
search.fit(X, y_transformed)

Fitting 10 folds for each of 810 candidates, totalling 8100 fits


In [None]:
final_pipe = search.best_estimator_

In [None]:
search.best_params_


{'regressor__max_depth': None,
 'regressor__max_features': 'sqrt',
 'regressor__max_samples': 1.0,
 'regressor__min_samples_leaf': 1,
 'regressor__min_samples_split': 5,
 'regressor__n_estimators': 400}

In [None]:
search.best_score_


np.float64(0.9257780424828891)

In [None]:
final_pipe.fit(X,y_transformed)


0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,

0,1,2
,n_estimators,400
,criterion,'squared_error'
,max_depth,
,min_samples_split,5
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
y_pred = final_pipe.predict(X_test)


In [None]:
y_pred = np.expm1(y_pred)


In [None]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.33188451599879715

best selected model

In [32]:
numeric_cols = ['bedrooms', 'bathrooms', 'built_up_area']
onehot_cols = ['balcony', 'property_age', 'furnishing_type']
target_cols = ['major_location']

In [33]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat_onehot', OneHotEncoder(drop='first', sparse_output=False), onehot_cols),
        ('target', ce.TargetEncoder(), target_cols)
    ],
    remainder='drop'
)

In [34]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [35]:
param_grid = {
    'regressor__n_estimators': [300],
    'regressor__max_depth': [None, 10, 20, 25, 30],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4],
    'regressor__max_features': ['auto', 'sqrt'],
    'regressor__max_samples': [0.5, 0.75, 1.0]
}

In [36]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

search = GridSearchCV(
    pipeline,
    param_grid,
    cv=kfold,
    scoring='r2',
    n_jobs=-1,
    verbose=4
)

In [37]:
search.fit(X, y_transformed)

Fitting 10 folds for each of 270 candidates, totalling 2700 fits


1350 fits failed out of a total of 2700.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
727 fits failed with the following error:
Traceback (most recent call last):
  File "e:\Mumbai Flat Real Estate Intelligence\mumbai_flat_env\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "e:\Mumbai Flat Real Estate Intelligence\mumbai_flat_env\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "e:\Mumbai Flat Real Estate Intelligence\mumbai_flat_env\Lib\site-packages\sklearn\pipeline.py", line 663, in fit
    self._final_estimator.fit(Xt, y, 

0,1,2
,estimator,Pipeline(step...Regressor())])
,param_grid,"{'regressor__max_depth': [None, 10, ...], 'regressor__max_features': ['auto', 'sqrt'], 'regressor__max_samples': [0.5, 0.75, ...], 'regressor__min_samples_leaf': [1, 2, ...], ...}"
,scoring,'r2'
,n_jobs,-1
,refit,True
,cv,KFold(n_split... shuffle=True)
,verbose,4
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat_onehot', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_missing,'value'
,handle_unknown,'value'
,min_samples_leaf,20
,smoothing,10
,hierarchy,

0,1,2
,n_estimators,300
,criterion,'squared_error'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [38]:
final_pipe = search.best_estimator_

In [61]:
y_pred = final_pipe.predict(X_test)

In [62]:
y_pred = np.expm1(y_pred)

In [63]:
mae = mean_absolute_error(np.expm1(y_test), y_pred)

In [64]:
print("Best Params:", search.best_params_)
print("Best R2 Score:", search.best_score_)
print("MAE:", mean_absolute_error(np.expm1(y_test), y_pred))

Best Params: {'regressor__max_depth': 20, 'regressor__max_features': 'sqrt', 'regressor__max_samples': 1.0, 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 300}
Best R2 Score: 0.9227966512910777
MAE: 0.243559218444499


### exporting pkl 

In [65]:
import pickle

with open("pipeline.pkl", "wb") as file:
    pickle.dump(final_pipe, file)

In [66]:
with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)

In [67]:

X

Unnamed: 0,bedrooms,bathrooms,balcony,property_age,major_location,built_up_area,furnishing_type
0,1,1,2,New_property,Nalasopara West,550.0,Furnished
1,1,1,0,New_property,others,665.0,Furnished
2,1,1,2,New_property,Palghar,630.0,Furnished
3,1,1,2,New_property,Palghar,630.0,Unfurnished
4,1,2,1,New_property,Palghar,630.0,Furnished
...,...,...,...,...,...,...,...
9609,5,5,0,Old,Malabar Hill,3699.0,Unfurnished
9610,5,5,0,Mid Age,Malabar Hill,3181.0,Furnished
9611,5,6,1,Old,Malabar Hill,2850.0,Furnished
9612,6,4,3,Mid Age,Lower Parel,5042.0,Furnished


In [68]:
X.shape

(9611, 7)

In [69]:
df.to_csv('mumbai_properties_final.csv', index=False)

### Trying out the predictions

In [70]:
X.columns

Index(['bedrooms', 'bathrooms', 'balcony', 'property_age', 'major_location',
       'built_up_area', 'furnishing_type'],
      dtype='object')

In [71]:
X.iloc[0].values

array([np.int64(1), np.int64(1), '2', 'New_property', 'Nalasopara West',
       np.float64(550.0), 'Furnished'], dtype=object)

In [72]:
data = [[ 2, 2, '0', 'Mid Age', 'Mulund West', 650,  'Unfurnished']]
columns = ['bedrooms', 'bathrooms', 'balcony', 'property_age', 'major_location',
       'built_up_area',  'furnishing_type']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df

Unnamed: 0,bedrooms,bathrooms,balcony,property_age,major_location,built_up_area,furnishing_type
0,2,2,0,Mid Age,Mulund West,650,Unfurnished


In [73]:
np.expm1(final_pipe.predict(one_df))

array([1.76952293])

In [74]:
X.dtypes

bedrooms             int64
bathrooms            int64
balcony             object
property_age        object
major_location      object
built_up_area      float64
furnishing_type     object
dtype: object

In [75]:
sorted(X['major_location'].unique().tolist())


['Agripada',
 'Andheri East',
 'Andheri West',
 'Bandra East',
 'Bandra West',
 'Bhandup East',
 'Bhandup West',
 'Bhayandar East',
 'Bhayandar West',
 'Bhoiwada',
 'Boisar',
 'Bolinj',
 'Borivali East',
 'Borivali West',
 'Byculla',
 'Chakala',
 'Chembur',
 'Chikhal Dongari',
 'Churchgate',
 'Colaba',
 'Cuffe Parade',
 'Cumballa Hill',
 'Dadar East',
 'Dadar West',
 'Dahisar',
 'Dahisar East',
 'Dahisar West',
 'Dattapada',
 'Deonar',
 'Evershine Nagar',
 'Gandhi Nagar',
 'Ghatkopar East',
 'Ghatkopar West',
 'Girgaon',
 'Goregaon',
 'Goregaon East',
 'Goregaon West',
 'Govandi',
 'Hindu Colony',
 'Hmpl Surya Nagar',
 'Jogeshwari East',
 'Jogeshwari West',
 'Juhu',
 'Juhu Scheme',
 'Jvpd Scheme',
 'Kala Nagar',
 'Kamathipura',
 'Kanchpada',
 'Kandivali East',
 'Kandivali West',
 'Kanjurmarg East',
 'Kanjurmarg West',
 'Kashimira',
 'Khar West',
 'Kherwadi',
 'Kurla East',
 'Kurla West',
 'Lower Parel',
 'Lower Parel East',
 'Lower Parel West',
 'Madanpura',
 'Madh',
 'Mahalakshmi',
 '