In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('final_gurgaon_properties_for_modelbuilding.csv')
df.head()

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category,price
0,flat,sector 36,3,2,2,new property,850.0,0,0,unfurnished,low,low floor,0.82
1,flat,sector 89,2,2,2,new property,1226.0,1,0,unfurnished,low,mid floor,0.95
2,flat,sohna road,2,2,1,new property,1000.0,0,0,unfurnished,low,high floor,0.32
3,flat,sector 92,3,4,3+,relatively new,1615.0,1,0,semifurnished,high,mid floor,1.6
4,flat,sector 102,2,2,1,relatively new,582.0,0,1,unfurnished,high,mid floor,0.48


In [3]:
from sklearn.model_selection import KFold , cross_val_score,train_test_split
from sklearn.preprocessing import OneHotEncoder , OrdinalEncoder ,StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error , r2_score
from sklearn.linear_model import LinearRegression, Ridge , Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor , ExtraTreesRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor


In [4]:
X = df.drop('price' , axis = 1)
y = df.price

In [5]:
X.head()

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,3,2,2,new property,850.0,0,0,unfurnished,low,low floor
1,flat,sector 89,2,2,2,new property,1226.0,1,0,unfurnished,low,mid floor
2,flat,sohna road,2,2,1,new property,1000.0,0,0,unfurnished,low,high floor
3,flat,sector 92,3,4,3+,relatively new,1615.0,1,0,semifurnished,high,mid floor
4,flat,sector 102,2,2,1,relatively new,582.0,0,1,unfurnished,high,mid floor


In [6]:
y.head()

0    0.82
1    0.95
2    0.32
3    1.60
4    0.48
Name: price, dtype: float64

In [7]:
X.shape , y.shape

((3573, 12), (3573,))

In [8]:
y_transformed = np.log1p(y)

In [9]:
y_transformed.shape

(3573,)

## Ordinal Encoding

### **part 1**

In [11]:
X.columns

Index(['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category'],
      dtype='object')

In [11]:
columns_to_encode = df.select_dtypes('O').columns.values

In [13]:
len(list(set(X.columns) - set(columns_to_encode)))

5

In [13]:
len(columns_to_encode)

7

In [10]:
columns_to_encode

NameError: name 'columns_to_encode' is not defined

In [15]:
preprocessor = ColumnTransformer(
    [
        ('num' , StandardScaler() ,['store room', 'bathroom', 'servant room', 'built_up_area', 'bedRoom']),
        ('cat' , OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1) , columns_to_encode)
    ],
    remainder = 'passthrough'
)

In [16]:
pipeline = Pipeline([
    ('preprocessor' , preprocessor),
    ('model' , LinearRegression())
])

In [17]:
X_transformed = preprocessor.fit_transform(X)

In [18]:
kfolds = KFold(n_splits= 10 , shuffle = True , random_state= 42)
scores = cross_val_score(LinearRegression(), X_transformed, y_transformed, cv = kfolds, scoring = 'r2')
print(scores)
print()
print(scores.mean() , scores.std())

[0.77188187 0.72953958 0.70587393 0.76817345 0.69575026 0.73985801
 0.7498318  0.72856769 0.73302414 0.69672854]

0.7319229283000923 0.02557013680119018


In [19]:
kfolds = KFold(n_splits= 10 , shuffle = True , random_state= 42)
scores = cross_val_score(pipeline, X, y_transformed, cv = kfolds, scoring = 'r2')
print(scores)
print()
print(scores.mean() , scores.std())

[0.77188187 0.72953958 0.70587393 0.76817345 0.69575026 0.73985801
 0.7498318  0.72856769 0.73304693 0.6969825 ]

0.7319506038970769 0.025535368530713906


In [20]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state = 42)

In [21]:
pipeline.fit(X_train , y_train)
y_pred = pipeline.predict(X_test)

In [22]:
mean_absolute_error(np.expm1(y_test) , np.expm1(y_pred))

0.8358982059552437

In [23]:
r2_score(np.expm1(y_test) , np.expm1(y_pred))

0.5938077615443642

In [71]:
def scorer(model_name , model):
    output = []
    output.append(model_name)

    pipeline = Pipeline(
        [
            ('preprocessor' , preprocessor),
            ('model',model )
        ]
    )

    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    output.append(r2_score(np.expm1(y_test) , y_pred))
    
    return output


In [72]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}


In [73]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [74]:
model_output

[['linear_reg', 0.7319506038970769, 0.8358982059552437, 0.5938077615443642],
 ['svr', 0.7507898555262138, 0.8511838154887685, 0.5955017008412864],
 ['ridge', 0.7319562660971997, 0.8359476780904892, 0.593859421804295],
 ['LASSO', 0.05204880366020047, 1.5224188022112894, -0.014709818201247504],
 ['decision tree', 0.7871714774683511, 0.7278075941900202, 0.5217850593934193],
 ['random forest', 0.8853647757597433, 0.5047133587320437, 0.7957715276004269],
 ['extra trees', 0.8723609047811559, 0.574972582200478, 0.7360504993248504],
 ['gradient boosting',
  0.8763076904928042,
  0.5570753091450611,
  0.8161729783514529],
 ['adaboost', 0.75298519574041, 0.8779604508821117, 0.6752491849610446],
 ['mlp', 0.7867744752554244, 0.7712546440520782, 0.7089628180429198],
 ['xgboost', 0.8936498017733132, 0.5042154918917409, 0.8061627511711208]]

In [81]:
model_df = pd.DataFrame(model_output, columns=['name','r2_using_crossval','mae','r2_on_test'])

In [82]:
model_df.sort_values(by = 'mae')

Unnamed: 0,name,r2_using_crossval,mae,r2_on_test
10,xgboost,0.89365,0.504215,0.806163
5,random forest,0.885365,0.504713,0.795772
7,gradient boosting,0.876308,0.557075,0.816173
6,extra trees,0.872361,0.574973,0.73605
4,decision tree,0.787171,0.727808,0.521785
9,mlp,0.786774,0.771255,0.708963
0,linear_reg,0.731951,0.835898,0.593808
2,ridge,0.731956,0.835948,0.593859
1,svr,0.75079,0.851184,0.595502
8,adaboost,0.752985,0.87796,0.675249


### **PART 2** : scaling even the encoded values 

In [14]:
list(columns_to_encode)

['property_type',
 'sector',
 'balcony',
 'agePossession',
 'furnishing_type',
 'luxury_category',
 'floor_category']

In [12]:
numerical_cols = ['store room', 'bathroom', 'servant room', 'built_up_area', 'bedRoom']
categorical_cols = list(columns_to_encode)

In [13]:
numerical_cols

['store room', 'bathroom', 'servant room', 'built_up_area', 'bedRoom']

In [59]:
categorical_cols

['property_type',
 'sector',
 'balcony',
 'agePossession',
 'furnishing_type',
 'luxury_category',
 'floor_category']

In [49]:
numerical_pipeline = Pipeline(
    [
        ('scaler' , StandardScaler() )
    ]
)

In [51]:
categorical_pipeline = Pipeline(
    [
        ('encoder' , OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
        ('scaler' , StandardScaler())
    ]
)

In [61]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num' , numerical_pipeline  ,numerical_cols ),
        ('cat' , categorical_pipeline , categorical_cols)
    ],
    remainder = 'passthrough'
)

In [62]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

In [66]:
X

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,3,2,2,new property,850.0,0,0,unfurnished,low,low floor
1,flat,sector 89,2,2,2,new property,1226.0,1,0,unfurnished,low,mid floor
2,flat,sohna road,2,2,1,new property,1000.0,0,0,unfurnished,low,high floor
3,flat,sector 92,3,4,3+,relatively new,1615.0,1,0,semifurnished,high,mid floor
4,flat,sector 102,2,2,1,relatively new,582.0,0,1,unfurnished,high,mid floor
...,...,...,...,...,...,...,...,...,...,...,...,...
3568,flat,sector 84,2,2,1,relatively new,532.0,0,0,unfurnished,medium,mid floor
3569,house,sector 109,5,5,3+,relatively new,6228.0,1,1,unfurnished,high,low floor
3570,flat,sector 2,1,1,1,moderately old,665.0,0,0,semifurnished,medium,mid floor
3571,house,sector 43,5,6,3,moderately old,5490.0,1,1,unfurnished,medium,mid floor


In [65]:
y_transformed

0       0.598837
1       0.667829
2       0.277632
3       0.955511
4       0.392042
          ...   
3568    0.314811
3569    1.945910
3570    0.470004
3571    2.803360
3572    1.022451
Name: price, Length: 3573, dtype: float64

In [80]:
kfolds = KFold(n_splits= 10 , shuffle = True , random_state= 42)
scores = cross_val_score(pipeline, X, y_transformed, cv = kfolds, scoring = 'r2')
print(scores)
print()
print(scores.mean() , scores.std())

[0.77188187 0.72953958 0.70587393 0.76817345 0.69575026 0.73985801
 0.7498318  0.72856769 0.73304693 0.6969825 ]

0.731950603897076 0.025535368530714694


In [81]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state = 42)

In [82]:
pipeline.fit(X_train , y_train)
y_pred = pipeline.predict(X_test)

In [83]:
mean_absolute_error(np.expm1(y_test) , np.expm1(y_pred))

0.8358982059552436

In [84]:
r2_score(np.expm1(y_test) , np.expm1(y_pred))

0.5938077615443638

In [85]:
r2_score(np.expm1(y_pred),np.expm1(y_test))

0.5216282491921103

**Observation** :  **no benefit of scaling the oridinal encoded values , it was same as the above**

## OneHotEncoding

In [16]:
numerical_cols

['store room', 'bathroom', 'servant room', 'built_up_area', 'bedRoom']

In [17]:
categorical_cols

['property_type',
 'sector',
 'balcony',
 'agePossession',
 'furnishing_type',
 'luxury_category',
 'floor_category']

In [95]:
df[categorical_cols].sample(5)

Unnamed: 0,property_type,sector,balcony,agePossession,furnishing_type,luxury_category,floor_category
893,house,sector 4,2,old property,unfurnished,low,low floor
998,house,sector 24,3,old property,unfurnished,low,mid floor
1438,house,sector 6,1,old property,unfurnished,low,low floor
55,flat,sector 108,3,moderately old,semifurnished,medium,mid floor
2695,flat,sector 102,3+,relatively new,unfurnished,medium,high floor


In [99]:
preprocessor = ColumnTransformer(
    [
        ('num', StandardScaler() , numerical_cols),
        ('cat1' , OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_cols),
        ('cat2' , OneHotEncoder(drop = 'first' , handle_unknown = 'ignore') , ['sector','furnishing_type'])
    ]
    ,remainder = 'passthrough'
)

In [100]:
pipeline = Pipeline(
    [
        ('preprocessor' , preprocessor),
        ('model' , LinearRegression())
    ]
)

In [101]:
kfolds = KFold(n_splits= 10 , shuffle = True , random_state= 42)
scores = cross_val_score(pipeline, X, y_transformed, cv = kfolds, scoring = 'r2')
print(scores)
print()
print(scores.mean() , scores.std())

[0.88507604 0.82635244 0.83035561 0.87763772 0.85175617 0.83083817
 0.85278751 0.82845749 0.85773846 0.85512662]

0.8496126233451309 0.01966229050281529




In [102]:
 X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)

In [105]:
pipeline.fit(X_train , y_train)
y_pred = pipeline.predict(X_test)
y_pred = np.expm1(y_pred)

print(mean_absolute_error(np.expm1(y_test) , y_pred))
print(r2_score(np.expm1(y_test) , y_pred))

0.6341582775908337
0.7562461277099701


In [106]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [107]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [108]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [109]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [110]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.894798,0.484534
5,random forest,0.894089,0.490653
10,xgboost,0.899241,0.535551
7,gradient boosting,0.879698,0.562993
9,mlp,0.873419,0.56399
4,decision tree,0.808926,0.625879
0,linear_reg,0.849613,0.634158
2,ridge,0.84963,0.636715
8,adaboost,0.752561,0.838985
1,svr,0.753979,0.844924


## Target Encoder

In [16]:
import category_encoders as ce

columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

preprocessor = ColumnTransformer(
    [
        ('num', StandardScaler() , ['bedRoom' , 'bathroom', 'built_up_area','store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat2' , OneHotEncoder(handle_unknown='ignore' , sparse_output = False ,drop = 'first'),['furnishing_type']),
        ('target_ce',ce.TargetEncoder(),['sector'])
    ],
    remainder = 'passthrough'
)

In [17]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [15]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [16]:
scores.mean(),scores.std()

(0.8179614973663772, 0.0232857477020442)

In [25]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [26]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [27]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [28]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [29]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.903589,0.450665
6,extra trees,0.902717,0.48301
10,xgboost,0.904238,0.48359
7,gradient boosting,0.890765,0.512533
4,decision tree,0.82525,0.58949
9,mlp,0.842345,0.664792
2,ridge,0.817984,0.700742
0,linear_reg,0.817961,0.700776
8,adaboost,0.812711,0.730438
1,svr,0.764227,0.833866


In [24]:
from sklearn.model_selection import GridSearchCV

## Hyperparameter Tuning

In [15]:
from sklearn.compose import ColumnTransformer

In [16]:
import category_encoders as ce


In [17]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

preprocessor = ColumnTransformer(
    [
        ('num', StandardScaler() , ['bedRoom' , 'bathroom', 'built_up_area','store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat2' , OneHotEncoder(handle_unknown='ignore' , sparse_output = False ,drop = 'first'),['furnishing_type']),
        ('target_ce',ce.TargetEncoder(),['sector'])
    ],
    remainder = 'passthrough'
)

In [18]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [21]:
from sklearn.model_selection import RandomizedSearchCV

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.33, random_state=42)

## Randomized Search CV

In [35]:
param_grid = {
    'regressor__n_estimators':[100 , 200 , 300],
    'regressor__max_depth': [None, 5, 10, 20],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4],
    'regressor__max_features':['sqrt', 'log2'],
    'regressor__bootstrap': [True, False]  
}

In [36]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [71]:
random_search = RandomizedSearchCV(
    estimator= pipeline,
    param_distributions= param_grid,
    cv = kfold,
    verbose= 2 ,
    n_jobs= -1,
    scoring = 'r2'
)

In [72]:
random_search.fit(X_train , y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [73]:
best_params = random_search.best_params_
print("Best parameters:", best_params)


Best parameters: {'regressor__n_estimators': 100, 'regressor__min_samples_split': 10, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 'log2', 'regressor__max_depth': None, 'regressor__bootstrap': True}


In [74]:
best_model = random_search.best_estimator_

**with hyperparameter tuning**

TEST DATA

In [75]:
y_pred = best_model.predict(X_test)

In [76]:
mean_absolute_error(np.expm1(y_test) , np.expm1(y_pred))

0.5485376477461876

In [77]:
r2_score(np.expm1(y_test) , np.expm1(y_pred))

0.7603997608138064

TRAIN DATA

In [78]:
y_pred_train = best_model.predict(X_train)
print(mean_absolute_error(np.expm1(y_train) , np.expm1(y_pred_train)))
print(r2_score(np.expm1(y_train) , np.expm1(y_pred_train)))

0.41120801819057107
0.8910148261066263


**without hyperparameter tuning , the score on test and train data**

TEST DATA

In [67]:
pipeline.fit(X_train , y_train)
y_pred = pipeline.predict(X_test)
mean_absolute_error(np.expm1(y_test) , np.expm1(y_pred))

0.44938781338668615

In [48]:
r2_score(np.expm1(y_test) , np.expm1(y_pred))

0.8270103349994644

TRAINING DATA

In [49]:
y_pred_train = pipeline.predict(X_train)
mean_absolute_error(np.expm1(y_train) , np.expm1(y_pred_train))

0.1885823233380924

In [51]:
r2_score(np.expm1(y_train) , np.expm1(y_pred_train))

0.9715693486609327

**Using GridsearchCV**

In [79]:
from sklearn.model_selection import GridSearchCV

In [80]:
grid_search = GridSearchCV(
    estimator= pipeline,
    param_grid= param_grid,
    cv = kfold,
    verbose= 2 ,
    n_jobs= -1,
    scoring = 'r2'
)

In [81]:
grid_search.fit(X_train , y_train)

Fitting 10 folds for each of 432 candidates, totalling 4320 fits


In [83]:
best_model = grid_search.best_estimator_

In [84]:
grid_search.best_params_

{'regressor__bootstrap': False,
 'regressor__max_depth': 20,
 'regressor__max_features': 'log2',
 'regressor__min_samples_leaf': 1,
 'regressor__min_samples_split': 2,
 'regressor__n_estimators': 300}

In [85]:
grid_search.best_score_

0.8916119407802536

In [264]:
final_model = RandomForestRegressor(n_estimators= 100 , max_depth=10 , bootstrap= True , min_samples_leaf=1 , min_samples_split=2 , max_features='log2')

In [265]:
pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('regressor' , final_model)
]
)

In [266]:
pipeline.fit(X_train , y_train)

In [267]:
y_pred_train = pipeline.predict(X_train)

print(mean_absolute_error(np.expm1(y_train),np.expm1(y_pred_train)))

print(r2_score(np.expm1(y_train),np.expm1(y_pred_train)))

0.3530839284087731
0.9343051446599097


In [268]:
y_pred_test = pipeline.predict(X_test)
print(mean_absolute_error(np.expm1(y_test),np.expm1(y_pred_test)))
print(r2_score(np.expm1(y_test),np.expm1(y_pred_test)))


0.5398039189026499
0.7655358043035915


## XGBOOST HyperparameterTuning

In [84]:
from xgboost import XGBRegressor

In [None]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

preprocessor = ColumnTransformer(
    [
        ('num', StandardScaler() , ['bedRoom' , 'bathroom', 'built_up_area','store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat2' , OneHotEncoder(handle_unknown='ignore' , sparse_output = False ,drop = 'first'),['furnishing_type']),
        ('target_ce',ce.TargetEncoder(),['sector'])
    ],
    remainder = 'passthrough'
)

In [85]:
pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('regressor' , XGBRegressor())
    ]
)

In [91]:
pipeline.fit(X_train , y_train)
y_pred_train = np.expm1(pipeline.predict(X_train))
y_pred = np.expm1(pipeline.predict(X_test))
print(mean_absolute_error(np.expm1(y_test),y_pred))
print(r2_score(np.expm1(y_test) , y_pred))

print(mean_absolute_error(np.expm1(y_train) , y_pred_train))
print(r2_score(np.expm1(y_train) , y_pred_train))

0.4758541800627264
0.8159528466528783
0.10881453177667888
0.9951547884412657


In [97]:
param_grid = {
    'regressor__n_estimators': np.arange(50, 200, 10),
    'regressor__max_depth': np.arange(3, 10, 1),
    'regressor__learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
    'regressor__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'regressor__colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'regressor__gamma': [0, 0.1, 0.2, 0.3, 0.4],
    'regressor__reg_alpha': [0, 0.01, 0.1, 1, 10],
    'regressor__reg_lambda': [0, 0.01, 0.1, 1, 10]
}

In [21]:
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV

In [98]:
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    n_iter=100,  # Number of parameter settings that are sampled
    scoring='neg_mean_squared_error',
    cv=5,  # 5-fold cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1  # Use all available cores
)

In [99]:
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [100]:
best_model = random_search.best_estimator_


In [102]:
random_search.best_score_

-0.03411242645959372

In [103]:
random_search.best_params_

{'regressor__subsample': 0.8,
 'regressor__reg_lambda': 10,
 'regressor__reg_alpha': 0.01,
 'regressor__n_estimators': 90,
 'regressor__max_depth': 8,
 'regressor__learning_rate': 0.3,
 'regressor__gamma': 0,
 'regressor__colsample_bytree': 0.9}

In [105]:
y_pred = best_model.predict(X_test)
y_pred = np.expm1(y_pred)

In [107]:
mean_absolute_error(np.expm1(y_test) , y_pred)

0.48167988333106043

In [108]:
r2_score(np.expm1(y_test) , y_pred)

0.8368349065475185

**First RandomSearch then GridSearchCV**

In [None]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

preprocessor = ColumnTransformer(
    [
        ('num', StandardScaler() , ['bedRoom' , 'bathroom', 'built_up_area','store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat2' , OneHotEncoder(handle_unknown='ignore' , sparse_output = False ,drop = 'first'),['furnishing_type']),
        ('target_ce',ce.TargetEncoder(),['sector'])
    ],
    remainder = 'passthrough'
)

In [19]:
pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('regressor' , XGBRegressor())
    ]
)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.33, random_state=42)

In [24]:
param_dist = {
    'regressor__n_estimators': [100, 200, 300, 400],
    'regressor__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'regressor__max_depth': [3, 4, 5, 6, 7],
    'regressor__min_child_weight': [1, 3, 5],
    'regressor__subsample': [0.6, 0.8, 1.0],
    'regressor__colsample_bytree': [0.6, 0.8, 1.0],
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=pipeline,  # Your pre-defined pipeline
    param_distributions=param_dist,
    n_iter=50,  # Number of random parameter combinations to try
    scoring='neg_mean_squared_error',  # Use negative MSE for regression
    cv=5,  # 5-fold cross-validation
    verbose=2,
    random_state=42,  # To ensure reproducibility
    n_jobs=-1  # Use all available cores for faster processing
)

# Fit the RandomizedSearchCV
random_search.fit(X_train, y_train)


Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [25]:
random_search.best_params_

{'regressor__subsample': 0.6,
 'regressor__n_estimators': 400,
 'regressor__min_child_weight': 1,
 'regressor__max_depth': 6,
 'regressor__learning_rate': 0.05,
 'regressor__colsample_bytree': 0.6}

In [26]:
random_search.best_score_

-0.03105045861103183

In [29]:
best_model = random_search.best_estimator_

In [30]:
y_pred = np.expm1(best_model.predict(X_test))
print(mean_absolute_error(np.expm1(y_test) , y_pred))
print(r2_score(np.expm1(y_test) , y_pred))

print()
y_pred_train = np.expm1(best_model.predict(X_train))
print(mean_absolute_error(np.expm1(y_train) , y_pred_train))
print(r2_score(np.expm1(y_train) , y_pred_train))


0.4596009657408221
0.8315744749657903

0.17558710927077528
0.9880903860409881


In [31]:
param_grid = {
    'regressor__n_estimators': [random_search.best_params_['regressor__n_estimators'] - 50, 
                                random_search.best_params_['regressor__n_estimators'], 
                                random_search.best_params_['regressor__n_estimators'] + 50],
    'regressor__learning_rate': [random_search.best_params_['regressor__learning_rate'] - 0.01, 
                                 random_search.best_params_['regressor__learning_rate'], 
                                 random_search.best_params_['regressor__learning_rate'] + 0.01],
    'regressor__max_depth': [random_search.best_params_['regressor__max_depth'] - 1, 
                            random_search.best_params_['regressor__max_depth'], 
                            random_search.best_params_['regressor__max_depth'] + 1],
    'regressor__min_child_weight': [random_search.best_params_['regressor__min_child_weight'] - 1, 
                                   random_search.best_params_['regressor__min_child_weight'], 
                                   random_search.best_params_['regressor__min_child_weight'] + 1],
    'regressor__subsample': [random_search.best_params_['regressor__subsample'] - 0.05, 
                            random_search.best_params_['regressor__subsample'], 
                            random_search.best_params_['regressor__subsample'] + 0.05],
    'regressor__colsample_bytree': [random_search.best_params_['regressor__colsample_bytree'] - 0.05, 
                                    random_search.best_params_['regressor__colsample_bytree'], 
                                    random_search.best_params_['regressor__colsample_bytree'] + 0.05],
}


In [32]:
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=2,
    n_jobs=-1  # Parallelize the grid search
)

# Fit the GridSearchCV
grid_search.fit(X_train, y_train)

# Print the best parameters found by Grid Search
print(f"Best parameters found by Grid Search: {grid_search.best_params_}")

# Print the best score (negative MSE) from Grid Search
print(f"Best score from Grid Search: {grid_search.best_score_}")

Fitting 5 folds for each of 729 candidates, totalling 3645 fits
Best parameters found by Grid Search: {'regressor__colsample_bytree': 0.6, 'regressor__learning_rate': 0.04, 'regressor__max_depth': 6, 'regressor__min_child_weight': 0, 'regressor__n_estimators': 400, 'regressor__subsample': 0.6}
Best score from Grid Search: -0.030720170270182794


In [33]:
best_model = grid_search.best_estimator_

In [34]:
y_pred_train = np.expm1(best_model.predict(X_train))
print(mean_absolute_error(np.expm1(y_train) , y_pred_train))
print(r2_score(np.expm1(y_train) , y_pred_train))

0.20112722268674443
0.9843750315920253


In [36]:
y_pred = np.expm1(best_model.predict(X_test))
print(mean_absolute_error(np.expm1(y_test) , y_pred))
print(r2_score(np.expm1(y_test) , y_pred))

0.45969001938629955
0.8319132226327033


In [38]:
grid_search.best_params_

{'regressor__colsample_bytree': 0.6,
 'regressor__learning_rate': 0.04,
 'regressor__max_depth': 6,
 'regressor__min_child_weight': 0,
 'regressor__n_estimators': 400,
 'regressor__subsample': 0.6}

In [62]:
pipeline = Pipeline(
    [
        ('preprocessor' , preprocessor),
        ('regressor' , XGBRegressor(max_depth = 5 ,learning_rate = 0.04 ,colsample_bytree= 0.6,subsample= 0.8 ,n_estimators = 400,min_child_weight = 0))
    ]
)

In [63]:
pipeline.fit(X_train ,y_train)
y_pred_train = np.expm1(pipeline.predict(X_train))
print(mean_absolute_error(np.expm1(y_train) , y_pred_train))
print(r2_score(np.expm1(y_train) , y_pred_train))

y_pred = np.expm1(pipeline.predict(X_test))
print(mean_absolute_error(np.expm1(y_test) , y_pred))
print(r2_score(np.expm1(y_test) , y_pred))

0.26617661963085026
0.9702202423122084
0.46678872315004716
0.8428983764056626


In [82]:
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

In [83]:
cross_val_score(pipeline , X_train , y_train , cv = kfold , scoring = 'r2')

array([0.89958402, 0.91188556, 0.91533527, 0.88835819, 0.88575468])

In [66]:
np.mean([0.9103197 , 0.89624055, 0.90927593, 0.92395692, 0.91087869,
       0.9239641 , 0.88104889, 0.89087824, 0.86828299, 0.91363086])

0.9028476870000001

## Pickling 

In [69]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

preprocessor = ColumnTransformer(
    [
        ('num', StandardScaler() , ['bedRoom' , 'bathroom', 'built_up_area','store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat2' , OneHotEncoder(handle_unknown='ignore' , sparse_output = False ,drop = 'first'),['furnishing_type']),
        ('target_ce',ce.TargetEncoder(),['sector'])
    ],
    remainder = 'passthrough'
)

In [70]:
pipeline = Pipeline(
    [
        ('preprocessor' , preprocessor),
        ('regressor' , XGBRegressor(max_depth = 5 ,learning_rate = 0.04 ,colsample_bytree= 0.6,subsample= 0.8 ,n_estimators = 400,min_child_weight = 0))
    ]
)

In [71]:
pipeline.fit(X , y_transformed)

In [72]:
import pickle 

with open('pipeline.pkl','wb') as file:
    pickle.dump(pipeline , file)

In [19]:
X

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,3,2,2,new property,850.0,0,0,unfurnished,low,low floor
1,flat,sector 89,2,2,2,new property,1226.0,1,0,unfurnished,low,mid floor
2,flat,sohna road,2,2,1,new property,1000.0,0,0,unfurnished,low,high floor
3,flat,sector 92,3,4,3+,relatively new,1615.0,1,0,semifurnished,high,mid floor
4,flat,sector 102,2,2,1,relatively new,582.0,0,1,unfurnished,high,mid floor
...,...,...,...,...,...,...,...,...,...,...,...,...
3568,flat,sector 84,2,2,1,relatively new,532.0,0,0,unfurnished,medium,mid floor
3569,house,sector 109,5,5,3+,relatively new,6228.0,1,1,unfurnished,high,low floor
3570,flat,sector 2,1,1,1,moderately old,665.0,0,0,semifurnished,medium,mid floor
3571,house,sector 43,5,6,3,moderately old,5490.0,1,1,unfurnished,medium,mid floor


In [73]:
with open('df.pkl', 'wb') as file:
    pickle.dump(X , file)

In [74]:
X.columns

Index(['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category'],
      dtype='object')

In [28]:
X.iloc[0].values

array(['flat', 'sector 36', 3, 2, '2', 'new property', 850.0, 0, 0,
       'unfurnished', 'low', 'low floor'], dtype=object)

In [99]:
data = [['house', 'sector 102', 4, 4, '3+', 'New Property', 2750, 0, 0, 'unfurnished', 'Low', 'Low Floor']]
columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df


Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,house,sector 102,4,4,3+,New Property,2750,0,0,unfurnished,Low,Low Floor


In [100]:
np.expm1(pipeline.predict(one_df))

array([2.7832735], dtype=float32)