In [30]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    AdaBoostRegressor
)
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,r2_score
import category_encoders as ce

from sklearn.decomposition import PCA

In [31]:
df = pd.read_csv('../../../data/processed/advance/gurgaon_properties_post_feature_selection_v2.csv')

In [32]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,0.0,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,0.0,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,0.0,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,1.0,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,0.0,High,Mid Floor


In [33]:
df['furnishing_type'].value_counts()

furnishing_type
0.0    2349
1.0    1018
2.0     187
Name: count, dtype: int64

In [34]:
# 0 -> unfurnished
# 1 -> semifurnished
# 2 -> furnished
df['furnishing_type'] = df['furnishing_type'].replace({0.0:'unfurnished',1.0:'semifurnished',2.0:'furnished'})

In [35]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,unfurnished,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,unfurnished,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,unfurnished,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,semifurnished,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,unfurnished,High,Mid Floor


In [36]:
X = df.drop(columns=['price'])
y = df['price']

In [37]:
# Applying the log1p transformation to the target variable
y_transformed = np.log1p(y)

### Ordinal Encoding

In [38]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

In [22]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode)
    ], 
    remainder='passthrough'
)

In [15]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [16]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [17]:
scores.mean(),scores.std()

(np.float64(0.7363096633436828), np.float64(0.03238005754429938))

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [25]:
pipeline.fit(X_train,y_train)

NameError: name 'pipeline' is not defined

In [20]:
y_pred = pipeline.predict(X_test)

In [21]:
y_pred = np.expm1(y_pred)

In [22]:
mean_absolute_error(np.expm1(y_test),y_pred)

np.float64(0.9463822160089355)

In [23]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [24]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [25]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [26]:
model_output

[['linear_reg',
  np.float64(0.7363096633436828),
  np.float64(0.9463822160089355)],
 ['svr', np.float64(0.7642012011196353), np.float64(0.8472636473483927)],
 ['ridge', np.float64(0.7363125343993554), np.float64(0.9463387741853388)],
 ['LASSO', np.float64(0.05943378064493572), np.float64(1.528905986892753)],
 ['decision tree',
  np.float64(0.7673085026151244),
  np.float64(0.7442615607081917)],
 ['random forest',
  np.float64(0.8821227799147175),
  np.float64(0.5328741406130693)],
 ['extra trees',
  np.float64(0.8669776195842113),
  np.float64(0.5484517439895076)],
 ['gradient boosting',
  np.float64(0.8726504776846961),
  np.float64(0.5759827420192102)],
 ['adaboost', np.float64(0.7538773267354032), np.float64(0.8319593806608996)],
 ['mlp', np.float64(0.8090576243465157), np.float64(0.7177366996634695)],
 ['xgboost', np.float64(0.8894876835260124), np.float64(0.5040475141482346)]]

In [27]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [28]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.889488,0.504048
5,random forest,0.882123,0.532874
6,extra trees,0.866978,0.548452
7,gradient boosting,0.87265,0.575983
9,mlp,0.809058,0.717737
4,decision tree,0.767309,0.744262
8,adaboost,0.753877,0.831959
1,svr,0.764201,0.847264
2,ridge,0.736313,0.946339
0,linear_reg,0.73631,0.946382


### OneHotEncoding

In [29]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first'),['sector','agePossession','furnishing_type'])
    ], 
    remainder='passthrough'
)

In [30]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [31]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [32]:
scores.mean()

np.float64(0.8546054073648314)

In [33]:
scores.std()

np.float64(0.01599847663314007)

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [35]:
pipeline.fit(X_train,y_train)

In [36]:
y_pred = pipeline.predict(X_test)

In [37]:
y_pred = np.expm1(y_pred)

In [38]:
mean_absolute_error(np.expm1(y_test),y_pred)

np.float64(0.6497382874070646)

In [39]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [40]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [41]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [42]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [43]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.894422,0.466914
10,xgboost,0.89585,0.493456
5,random forest,0.890771,0.500038
9,mlp,0.874392,0.543562
7,gradient boosting,0.876294,0.570925
0,linear_reg,0.854605,0.649738
2,ridge,0.854678,0.652914
4,decision tree,0.807073,0.698884
1,svr,0.769741,0.834124
8,adaboost,0.751862,0.846813


### OneHotEncoding With PCA

In [47]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [48]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=1)),
    ('regressor', LinearRegression())
])

In [49]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [19]:
scores.mean()

np.float64(0.06225201431451136)

In [20]:
scores.std()

np.float64(0.01986059407164015)

In [21]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.95)),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [22]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [23]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [24]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [25]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.762022,0.656978
6,extra trees,0.74042,0.695043
4,decision tree,0.696442,0.761509
10,xgboost,0.622205,0.967581
7,gradient boosting,0.610623,0.987906
1,svr,0.218073,1.361163
8,adaboost,0.305643,1.403254
9,mlp,0.20823,1.450977
2,ridge,0.062252,1.526707
0,linear_reg,0.062252,1.526707


In [66]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA()),
    ('regressor', LinearRegression())
])

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
pipeline.fit(X_train,y_train)

In [68]:
y_pred = pipeline.predict(X_test)

In [69]:
y_pred = np.expm1(y_pred)

In [70]:
mean_absolute_error(np.expm1(y_test),y_pred)

np.float64(0.6491718597838264)

In [72]:
pipeline.named_steps['pca'].explained_variance_ratio_

array([9.89749744e-01, 2.90220572e-03, 1.68816536e-03, 9.31298980e-04,
       8.44026292e-04, 6.36740619e-04, 5.92022782e-04, 4.54962884e-04,
       4.17327781e-04, 2.82788570e-04, 1.87814840e-04, 1.64976967e-04,
       1.24921517e-04, 8.39921059e-05, 6.66087886e-05, 3.85820012e-05,
       2.76359720e-05, 2.58337502e-05, 2.43649092e-05, 2.42017556e-05,
       2.31068295e-05, 2.25533114e-05, 2.10023668e-05, 2.01997122e-05,
       1.87820810e-05, 1.74002386e-05, 1.69050844e-05, 1.63566237e-05,
       1.60336100e-05, 1.57538036e-05, 1.51270828e-05, 1.50071309e-05,
       1.49639547e-05, 1.42120409e-05, 1.41009031e-05, 1.38577795e-05,
       1.37957491e-05, 1.37128078e-05, 1.32341498e-05, 1.29527938e-05,
       1.28342844e-05, 1.22982822e-05, 1.21536402e-05, 1.16408767e-05,
       1.14468537e-05, 1.13839928e-05, 1.10374731e-05, 1.04931985e-05,
       1.04822944e-05, 1.01475504e-05, 9.93834155e-06, 9.76002650e-06,
       9.70271556e-06, 9.38192944e-06, 8.71443785e-06, 8.10680149e-06,
      

### Target Encoder

In [None]:
!pip install category_encoders




[notice] A new release of pip available: 22.3 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [39]:
import category_encoders as ce

In [40]:


columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [56]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [57]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [58]:
scores.mean(),scores.std()

(np.float64(0.829521918225536), np.float64(0.01838446337912282))

In [59]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [60]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [61]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [62]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [63]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.904798,0.447518
5,random forest,0.901154,0.452289
6,extra trees,0.901255,0.457758
7,gradient boosting,0.888952,0.50787
4,decision tree,0.827177,0.558826
9,mlp,0.849907,0.595438
8,adaboost,0.817599,0.707958
0,linear_reg,0.829522,0.713011
2,ridge,0.829536,0.713523
1,svr,0.782917,0.818851


### Hyperparameter Tuning

In [64]:
from sklearn.model_selection import GridSearchCV

In [65]:
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__max_samples':[0.1, 0.25, 0.5, 1.0],
    'regressor__max_features': ['auto', 'sqrt']
}

In [66]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [67]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [68]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [69]:
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [71]:
search.fit(X, y_transformed)

Fitting 10 folds for each of 128 candidates, totalling 1280 fits


640 fits failed out of a total of 1280.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
302 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\vijit_singh\Desktop\Personal projects repo\Estate-Radar\env\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\vijit_singh\Desktop\Personal projects repo\Estate-Radar\env\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vijit_singh\Desktop\Personal projects repo\Estate-Radar\env\Lib\site-packages\sklearn\pipeline.py", line 473, in fit

In [72]:
final_pipe = search.best_estimator_

In [73]:
search.best_params_

{'regressor__max_depth': 30,
 'regressor__max_features': 'sqrt',
 'regressor__max_samples': 1.0,
 'regressor__n_estimators': 300}

In [74]:
search.best_score_

np.float64(0.9026121231171291)

In [75]:
final_pipe.fit(X,y_transformed)

### Exporting the model

In [76]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [77]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=500))
])

In [78]:
pipeline.fit(X,y_transformed)

In [80]:
import pickle

with open('../../../models/pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [89]:
with open('../../../models/df.pkl', 'wb') as file:
    pickle.dump(X, file)

In [84]:
X

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,3.0,2.0,2,New Property,850.0,0.0,0.0,unfurnished,Low,Low Floor
1,flat,sector 89,2.0,2.0,2,New Property,1226.0,1.0,0.0,unfurnished,Low,Mid Floor
2,flat,sohna road,2.0,2.0,1,New Property,1000.0,0.0,0.0,unfurnished,Low,High Floor
3,flat,sector 92,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,semifurnished,High,Mid Floor
4,flat,sector 102,2.0,2.0,1,Relatively New,582.0,0.0,1.0,unfurnished,High,Mid Floor
...,...,...,...,...,...,...,...,...,...,...,...,...
3549,flat,sector 84,2.0,2.0,1,Relatively New,532.0,0.0,0.0,unfurnished,Medium,Mid Floor
3550,house,sector 109,5.0,5.0,3+,Relatively New,6228.0,1.0,1.0,unfurnished,High,Low Floor
3551,flat,sector 2,1.0,1.0,1,Moderately Old,665.0,0.0,0.0,semifurnished,Medium,Mid Floor
3552,house,sector 43,5.0,6.0,3,Moderately Old,5490.0,1.0,1.0,unfurnished,Medium,Mid Floor


### Trying out the predictions

In [85]:
X.columns

Index(['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category'],
      dtype='object')

In [86]:
X.iloc[0].values

array(['flat', 'sector 36', np.float64(3.0), np.float64(2.0), '2',
       'New Property', np.float64(850.0), np.float64(0.0),
       np.float64(0.0), 'unfurnished', 'Low', 'Low Floor'], dtype=object)

In [87]:
data = [['house', 'sector 102', 4, 3, '3+', 'New Property', 2750, 0, 0, 'unfurnished', 'Low', 'Low Floor']]
columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df


Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,house,sector 102,4,3,3+,New Property,2750,0,0,unfurnished,Low,Low Floor


In [88]:
np.expm1(pipeline.predict(one_df))

array([3.19052593])

In [307]:
X.dtypes

property_type       object
sector              object
bedRoom            float64
bathroom           float64
balcony             object
agePossession       object
built_up_area      float64
servant room       float64
store room         float64
furnishing_type     object
luxury_category     object
floor_category      object
dtype: object

In [306]:
sorted(X['sector'].unique().tolist())

['dwarka expressway',
 'gwal pahari',
 'manesar',
 'sector 1',
 'sector 10',
 'sector 102',
 'sector 103',
 'sector 104',
 'sector 105',
 'sector 106',
 'sector 107',
 'sector 108',
 'sector 109',
 'sector 11',
 'sector 110',
 'sector 111',
 'sector 112',
 'sector 113',
 'sector 12',
 'sector 13',
 'sector 14',
 'sector 15',
 'sector 17',
 'sector 2',
 'sector 21',
 'sector 22',
 'sector 23',
 'sector 24',
 'sector 25',
 'sector 26',
 'sector 27',
 'sector 28',
 'sector 3',
 'sector 30',
 'sector 31',
 'sector 33',
 'sector 36',
 'sector 37',
 'sector 37d',
 'sector 38',
 'sector 39',
 'sector 4',
 'sector 40',
 'sector 41',
 'sector 43',
 'sector 45',
 'sector 46',
 'sector 47',
 'sector 48',
 'sector 49',
 'sector 5',
 'sector 50',
 'sector 51',
 'sector 52',
 'sector 53',
 'sector 54',
 'sector 55',
 'sector 56',
 'sector 57',
 'sector 58',
 'sector 59',
 'sector 6',
 'sector 60',
 'sector 61',
 'sector 62',
 'sector 63',
 'sector 63a',
 'sector 65',
 'sector 66',
 'sector 67',
 'se

### Optuna

In [28]:
# Importing the required libraries
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
import optuna
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

In [44]:
! pip install optuna-integration[xgboost]




[notice] A new release of pip available: 22.3 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [73]:
X.isna().sum()

property_type      0
sector             0
bedRoom            0
bathroom           0
balcony            0
agePossession      0
built_up_area      0
servant room       0
store room         0
furnishing_type    0
luxury_category    0
floor_category     0
dtype: int64

In [78]:
import optuna
from optuna.pruners import MedianPruner
import numpy as np
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

# Data validation
def validate_data(X, y):
    # Remove inf values
    mask = ~np.isinf(y)
    X = X[mask]
    y = y[mask]
    
    # Remove nan values
    mask = ~np.isnan(y)
    X = X[mask]
    y = y[mask]
    
    return X, y

# Clean data
X_train_clean, y_train_clean = validate_data(X_train, y_train)
X_test_clean, y_test_clean = validate_data(X_test, y_test)

def objective(trial):
    try:
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 95, 105),
            'max_depth': trial.suggest_int('max_depth', 5, 7),
            'learning_rate': trial.suggest_float('learning_rate', 0.28, 0.32),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 2),
            'subsample': trial.suggest_float('subsample', 0.95, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.95, 1.0),
            'gamma': trial.suggest_float('gamma', 0, 0.05),
            'reg_alpha': trial.suggest_float('reg_alpha', 0, 0.05),
            'reg_lambda': trial.suggest_float('reg_lambda', 0.95, 1.05),
            'scale_pos_weight': 1.0,
            'tree_method': 'hist'
        }
        
        model = Pipeline([
            ('preprocessor', preprocessor),
            ('regressor', XGBRegressor(**params, random_state=42))
        ])
        
        model.fit(X_train_clean, y_train_clean)
        y_pred = model.predict(X_test_clean)
        mae = mean_absolute_error(y_test_clean, y_pred)
        
        if np.isnan(mae) or np.isinf(mae):
            return float('inf')
        
        return mae
        
    except Exception as e:
        print(f"Trial failed: {str(e)}")
        return float('inf')

# Create and run study
study = optuna.create_study(
    direction='minimize',
    pruner=MedianPruner(n_startup_trials=5)
)

study.optimize(
    objective,
    n_trials=25,
    show_progress_bar=True,
    catch=(Exception,)
)

# Train final model with best params
if study.best_trial is not None:
    best_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', XGBRegressor(**study.best_params, random_state=42))
    ])
    best_pipeline.fit(X_train_clean, y_train_clean)
    y_pred = best_pipeline.predict(X_test_clean)
    final_mae = mean_absolute_error(y_test_clean, y_pred)
    print(f"\nBest parameters: {study.best_params}")
    print(f"Final MAE: {final_mae:.4f}")

[I 2024-12-23 16:22:49,723] A new study created in memory with name: no-name-ecf6ad54-7374-4236-824e-520f39ff15cf
                                                                                   

[I 2024-12-23 16:22:49,853] Trial 0 finished with value: 0.10712532871993755 and parameters: {'n_estimators': 95, 'max_depth': 7, 'learning_rate': 0.3041698013195291, 'min_child_weight': 2, 'subsample': 0.9630241245408929, 'colsample_bytree': 0.9711562771973523, 'gamma': 0.0029443509197990495, 'reg_alpha': 0.0497964737480418, 'reg_lambda': 0.9658763180090582}. Best is trial 0 with value: 0.10712532871993755.
[I 2024-12-23 16:22:49,942] Trial 1 finished with value: 0.1177676504961047 and parameters: {'n_estimators': 105, 'max_depth': 7, 'learning_rate': 0.2854649572543941, 'min_child_weight': 2, 'subsample': 0.9668718464259274, 'colsample_bytree': 0.9841689450494537, 'gamma': 0.03849351790015229, 'reg_alpha': 0.040170656808540664, 'reg_lambda': 0.9515145060003617}. Best is trial 0 with value: 0.10712532871993755.


Best trial: 0. Best value: 0.107125:  16%|█▌        | 4/25 [00:00<00:02, 10.01it/s]

[I 2024-12-23 16:22:50,018] Trial 2 finished with value: 0.11584420667207934 and parameters: {'n_estimators': 101, 'max_depth': 7, 'learning_rate': 0.30296310287641676, 'min_child_weight': 2, 'subsample': 0.9842215663978864, 'colsample_bytree': 0.971390744189321, 'gamma': 0.02549821547476093, 'reg_alpha': 0.029693592304126267, 'reg_lambda': 1.0119596675623042}. Best is trial 0 with value: 0.10712532871993755.
[I 2024-12-23 16:22:50,129] Trial 3 finished with value: 0.11256852740492822 and parameters: {'n_estimators': 95, 'max_depth': 6, 'learning_rate': 0.3150447359062405, 'min_child_weight': 1, 'subsample': 0.9540477974793814, 'colsample_bytree': 0.9661947342127128, 'gamma': 0.00588994695552807, 'reg_alpha': 0.04440380349329048, 'reg_lambda': 1.0050904291190095}. Best is trial 0 with value: 0.10712532871993755.
[I 2024-12-23 16:22:50,202] Trial 4 finished with value: 0.1123400377305549 and parameters: {'n_estimators': 104, 'max_depth': 6, 'learning_rate': 0.3172577432682784, 'min_chil

Best trial: 0. Best value: 0.107125:  32%|███▏      | 8/25 [00:00<00:01, 11.92it/s]

[I 2024-12-23 16:22:50,274] Trial 5 finished with value: 0.11019406180940795 and parameters: {'n_estimators': 100, 'max_depth': 7, 'learning_rate': 0.2919865483551672, 'min_child_weight': 1, 'subsample': 0.9982232988125589, 'colsample_bytree': 0.9836716889626969, 'gamma': 0.03972905552352302, 'reg_alpha': 0.03586812226455215, 'reg_lambda': 1.0046383691219487}. Best is trial 0 with value: 0.10712532871993755.
[I 2024-12-23 16:22:50,355] Trial 6 finished with value: 0.11235230861644453 and parameters: {'n_estimators': 102, 'max_depth': 7, 'learning_rate': 0.3160540520554836, 'min_child_weight': 1, 'subsample': 0.9735226931033056, 'colsample_bytree': 0.9887672782008861, 'gamma': 0.005288155660947702, 'reg_alpha': 0.010571484378145713, 'reg_lambda': 1.0211262271269832}. Best is trial 0 with value: 0.10712532871993755.
[I 2024-12-23 16:22:50,435] Trial 7 finished with value: 0.11297255483981854 and parameters: {'n_estimators': 95, 'max_depth': 5, 'learning_rate': 0.30359040598382075, 'min_c

Best trial: 0. Best value: 0.107125:  40%|████      | 10/25 [00:00<00:01, 12.91it/s]

[I 2024-12-23 16:22:50,503] Trial 8 finished with value: 0.11678745018634584 and parameters: {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.28153405821396493, 'min_child_weight': 2, 'subsample': 0.9770033800678106, 'colsample_bytree': 0.9553536292084944, 'gamma': 0.043128565600665913, 'reg_alpha': 0.048468619759915316, 'reg_lambda': 1.0170213346866603}. Best is trial 0 with value: 0.10712532871993755.
[I 2024-12-23 16:22:50,571] Trial 9 finished with value: 0.11658167008813874 and parameters: {'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.31407033306795656, 'min_child_weight': 1, 'subsample': 0.9755199235304827, 'colsample_bytree': 0.9863195250903597, 'gamma': 0.02431535758643752, 'reg_alpha': 0.023434166222267155, 'reg_lambda': 0.9764666284482253}. Best is trial 0 with value: 0.10712532871993755.


Best trial: 0. Best value: 0.107125:  48%|████▊     | 12/25 [00:01<00:01, 11.02it/s]

[I 2024-12-23 16:22:50,686] Trial 10 finished with value: 0.11057438552186677 and parameters: {'n_estimators': 98, 'max_depth': 6, 'learning_rate': 0.2958300102767186, 'min_child_weight': 2, 'subsample': 0.9618674104430696, 'colsample_bytree': 0.9581756887974006, 'gamma': 0.013823578603503972, 'reg_alpha': 0.014406873439607271, 'reg_lambda': 1.0484397415241844}. Best is trial 0 with value: 0.10712532871993755.
[I 2024-12-23 16:22:50,786] Trial 11 finished with value: 0.11356097114673856 and parameters: {'n_estimators': 97, 'max_depth': 7, 'learning_rate': 0.2936398698507498, 'min_child_weight': 2, 'subsample': 0.9960685433563237, 'colsample_bytree': 0.9749917818852333, 'gamma': 0.0498836007543155, 'reg_alpha': 0.03508711447166113, 'reg_lambda': 0.9845756735409332}. Best is trial 0 with value: 0.10712532871993755.


Best trial: 0. Best value: 0.107125:  48%|████▊     | 12/25 [00:01<00:01, 11.02it/s]

[I 2024-12-23 16:22:50,902] Trial 12 finished with value: 0.11212379527589633 and parameters: {'n_estimators': 98, 'max_depth': 7, 'learning_rate': 0.30832145096461405, 'min_child_weight': 2, 'subsample': 0.9979010129379244, 'colsample_bytree': 0.9683066540897234, 'gamma': 0.03203735522450943, 'reg_alpha': 0.03663387731764129, 'reg_lambda': 0.9805916493408088}. Best is trial 0 with value: 0.10712532871993755.


Best trial: 13. Best value: 0.106122:  60%|██████    | 15/25 [00:01<00:01,  8.34it/s]

[I 2024-12-23 16:22:51,133] Trial 13 finished with value: 0.10612199335590736 and parameters: {'n_estimators': 103, 'max_depth': 7, 'learning_rate': 0.289936753149192, 'min_child_weight': 1, 'subsample': 0.9885453102857462, 'colsample_bytree': 0.9784775455226922, 'gamma': 0.00017129249638651216, 'reg_alpha': 0.048244687155197856, 'reg_lambda': 0.9909364362932322}. Best is trial 13 with value: 0.10612199335590736.
[I 2024-12-23 16:22:51,271] Trial 14 finished with value: 0.1093133260301584 and parameters: {'n_estimators': 103, 'max_depth': 6, 'learning_rate': 0.2883638667599162, 'min_child_weight': 1, 'subsample': 0.9866131509827759, 'colsample_bytree': 0.9631772250375594, 'gamma': 0.001297767051772327, 'reg_alpha': 0.04936529753060311, 'reg_lambda': 0.9682495710180355}. Best is trial 13 with value: 0.10612199335590736.


Best trial: 13. Best value: 0.106122:  68%|██████▊   | 17/25 [00:01<00:01,  7.73it/s]

[I 2024-12-23 16:22:51,387] Trial 15 finished with value: 0.10967022548360349 and parameters: {'n_estimators': 103, 'max_depth': 7, 'learning_rate': 0.29866870285449937, 'min_child_weight': 2, 'subsample': 0.9632229427912232, 'colsample_bytree': 0.9777765751778074, 'gamma': 0.013635323192703797, 'reg_alpha': 0.043588212102617735, 'reg_lambda': 0.9909213810303868}. Best is trial 13 with value: 0.10612199335590736.
[I 2024-12-23 16:22:51,551] Trial 16 finished with value: 0.10855773637762466 and parameters: {'n_estimators': 97, 'max_depth': 7, 'learning_rate': 0.30839810968041204, 'min_child_weight': 2, 'subsample': 0.989009607898151, 'colsample_bytree': 0.9507905593319552, 'gamma': 0.00013003339727407854, 'reg_alpha': 0.02831019527888866, 'reg_lambda': 0.9647451345607788}. Best is trial 13 with value: 0.10612199335590736.


Best trial: 13. Best value: 0.106122:  76%|███████▌  | 19/25 [00:02<00:00,  8.13it/s]

[I 2024-12-23 16:22:51,671] Trial 17 finished with value: 0.11239149617283231 and parameters: {'n_estimators': 105, 'max_depth': 6, 'learning_rate': 0.2889408763349834, 'min_child_weight': 1, 'subsample': 0.9706750616143107, 'colsample_bytree': 0.9992503087351793, 'gamma': 0.008487421665809241, 'reg_alpha': 0.04388354134080516, 'reg_lambda': 0.993809778268999}. Best is trial 13 with value: 0.10612199335590736.
[I 2024-12-23 16:22:51,783] Trial 18 finished with value: 0.1140624038927987 and parameters: {'n_estimators': 102, 'max_depth': 5, 'learning_rate': 0.2991993027902329, 'min_child_weight': 1, 'subsample': 0.957337509590527, 'colsample_bytree': 0.9759513704326374, 'gamma': 0.015678500036897396, 'reg_alpha': 0.04926606687779328, 'reg_lambda': 0.9671387633782228}. Best is trial 13 with value: 0.10612199335590736.


Best trial: 13. Best value: 0.106122:  84%|████████▍ | 21/25 [00:02<00:00,  8.51it/s]

[I 2024-12-23 16:22:51,886] Trial 19 finished with value: 0.11176528303658399 and parameters: {'n_estimators': 99, 'max_depth': 7, 'learning_rate': 0.3087460658769928, 'min_child_weight': 2, 'subsample': 0.9929583972012167, 'colsample_bytree': 0.9914836889387248, 'gamma': 0.01853370171374629, 'reg_alpha': 0.03184958069112513, 'reg_lambda': 0.9739775817435826}. Best is trial 13 with value: 0.10612199335590736.
[I 2024-12-23 16:22:52,004] Trial 20 finished with value: 0.10943232582204244 and parameters: {'n_estimators': 96, 'max_depth': 6, 'learning_rate': 0.2829522575961035, 'min_child_weight': 1, 'subsample': 0.9696878609331853, 'colsample_bytree': 0.9609025844360377, 'gamma': 0.008865800193542602, 'reg_alpha': 0.039884091037142105, 'reg_lambda': 0.9933439132839018}. Best is trial 13 with value: 0.10612199335590736.


Best trial: 13. Best value: 0.106122:  92%|█████████▏| 23/25 [00:02<00:00,  7.60it/s]

[I 2024-12-23 16:22:52,173] Trial 21 finished with value: 0.10848875563904091 and parameters: {'n_estimators': 96, 'max_depth': 7, 'learning_rate': 0.3096635674192726, 'min_child_weight': 2, 'subsample': 0.988798675527867, 'colsample_bytree': 0.9515112474069074, 'gamma': 0.0002567096843684828, 'reg_alpha': 0.028486027251149154, 'reg_lambda': 0.9596123040740208}. Best is trial 13 with value: 0.10612199335590736.
[I 2024-12-23 16:22:52,304] Trial 22 finished with value: 0.10984974584008816 and parameters: {'n_estimators': 96, 'max_depth': 7, 'learning_rate': 0.303962124255491, 'min_child_weight': 2, 'subsample': 0.9823226431024137, 'colsample_bytree': 0.9516516036543544, 'gamma': 0.0022423098369757206, 'reg_alpha': 0.018931640859658677, 'reg_lambda': 0.9590973802954148}. Best is trial 13 with value: 0.10612199335590736.


Best trial: 13. Best value: 0.106122: 100%|██████████| 25/25 [00:02<00:00,  8.74it/s]


[I 2024-12-23 16:22:52,425] Trial 23 finished with value: 0.11024736713125144 and parameters: {'n_estimators': 95, 'max_depth': 7, 'learning_rate': 0.30972627544634007, 'min_child_weight': 2, 'subsample': 0.9908890293056101, 'colsample_bytree': 0.9700905023025244, 'gamma': 0.005285481483092144, 'reg_alpha': 0.002402586883013265, 'reg_lambda': 0.9508199823297011}. Best is trial 13 with value: 0.10612199335590736.
[I 2024-12-23 16:22:52,572] Trial 24 finished with value: 0.10826908507971365 and parameters: {'n_estimators': 96, 'max_depth': 7, 'learning_rate': 0.31211437275461645, 'min_child_weight': 2, 'subsample': 0.9847297482685256, 'colsample_bytree': 0.9796976267621498, 'gamma': 0.00012281058714373074, 'reg_alpha': 0.04131164771884857, 'reg_lambda': 0.984218059112258}. Best is trial 13 with value: 0.10612199335590736.

Best parameters: {'n_estimators': 103, 'max_depth': 7, 'learning_rate': 0.289936753149192, 'min_child_weight': 1, 'subsample': 0.9885453102857462, 'colsample_bytree': 

In [79]:
# Train final model with best params
if study.best_trial is not None:
    best_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', XGBRegressor(**study.best_params, random_state=42))
    ])
    best_pipeline.fit(X_train_clean, y_train_clean)
    y_pred = best_pipeline.predict(X_test_clean)
    final_mae = mean_absolute_error(y_test_clean, y_pred)
    print(f"\nBest parameters: {study.best_params}")
    print(f"Final MAE: {final_mae:.4f}")


Best parameters: {'n_estimators': 103, 'max_depth': 7, 'learning_rate': 0.289936753149192, 'min_child_weight': 1, 'subsample': 0.9885453102857462, 'colsample_bytree': 0.9784775455226922, 'gamma': 0.00017129249638651216, 'reg_alpha': 0.048244687155197856, 'reg_lambda': 0.9909364362932322}
Final MAE: 0.1061


In [55]:

# Train final model with best parameters
best_params = study.best_params
final_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(**best_params))
])


kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(final_model, X, y_transformed, cv=kfold, scoring='r2')

In [80]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
best_pipeline.fit(X_train,y_train)

y_pred = best_pipeline.predict(X_test)

y_pred = np.expm1(y_pred)

mean_absolute_error(np.expm1(y_test),y_pred)
    

np.float64(0.4683384417716293)

### outlier teatment