In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.model_selection import KFold,cross_val_score, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor,GradientBoostingRegressor,AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.decomposition import PCA
import category_encoders as ce
from sklearn.neighbors import KNeighborsRegressor
from lightgbm import LGBMRegressor
from catboost import Pool, CatBoostRegressor

In [2]:
df = pd.read_csv('ml_data.csv')

In [3]:
df.drop(columns=['SOCIETY_NAME'], inplace= True)

In [4]:
df

Unnamed: 0,PROP_ID,PROPERTY_TYPE,location,BEDROOM_NUM,BALCONY_NUM,AREA,Price_per_sqft,PRICE,AGE,FURNISH,amenity_luxury,FLOOR_NUM,TOTAL_FLOOR
0,J71214794,flat,Madhyamgram,2.0,1.0,920,4239.0,0.39,Old Property,Unfurnished,246.0,1,11.0
1,F70835394,flat,Garia,1.0,1.0,535,5100.0,0.27,Old Property,Unfurnished,246.0,1,7.0
2,P69854924,flat,Rajarhat,4.0,2.0,1940,4742.0,0.92,Old Property,Unfurnished,246.0,1,20.0
3,E69854912,flat,Rajarhat,2.0,1.0,910,4615.0,0.42,Old Property,Unfurnished,246.0,1,20.0
4,R69167152,flat,New Town,3.0,1.0,1163,4700.0,0.55,Old Property,Unfurnished,246.0,1,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5143,U71217472,flat,Behala,2.0,1.0,730,4520.0,0.33,Relatively New Property,Fully furnished,307.0,2,2.0
5144,E64737128,flat,New Alipore,3.0,1.0,2000,9000.0,1.80,Moderately Old,Luxury furnished,192.0,2,5.0
5145,J66826540,flat,Chak Garia,3.0,2.0,1843,8410.0,1.55,New Property,Luxury furnished,135.0,14,19.0
5146,E66826562,flat,Chak Garia,3.0,2.0,2079,8417.0,1.75,New Property,Luxury furnished,135.0,10,19.0


In [5]:
df.drop(columns=['PROP_ID','PROPERTY_TYPE','Price_per_sqft','TOTAL_FLOOR'], inplace=True)

In [6]:
df

Unnamed: 0,location,BEDROOM_NUM,BALCONY_NUM,AREA,PRICE,AGE,FURNISH,amenity_luxury,FLOOR_NUM
0,Madhyamgram,2.0,1.0,920,0.39,Old Property,Unfurnished,246.0,1
1,Garia,1.0,1.0,535,0.27,Old Property,Unfurnished,246.0,1
2,Rajarhat,4.0,2.0,1940,0.92,Old Property,Unfurnished,246.0,1
3,Rajarhat,2.0,1.0,910,0.42,Old Property,Unfurnished,246.0,1
4,New Town,3.0,1.0,1163,0.55,Old Property,Unfurnished,246.0,1
...,...,...,...,...,...,...,...,...,...
5143,Behala,2.0,1.0,730,0.33,Relatively New Property,Fully furnished,307.0,2
5144,New Alipore,3.0,1.0,2000,1.80,Moderately Old,Luxury furnished,192.0,2
5145,Chak Garia,3.0,2.0,1843,1.55,New Property,Luxury furnished,135.0,14
5146,Chak Garia,3.0,2.0,2079,1.75,New Property,Luxury furnished,135.0,10


# Plan of attack:
### Encoding:
    - Ordinal Encoding
    - One hot encoding
    - Target encoding
 

*  will try all of the encoding for all models then will choose the best encoading technique based on R2 score

# Notes:
**Summary of Encoding Choice**
- **Ordinal Encoding**: Use for features with a natural (e.g., AGE, FURNISH).
- **One-Hot Encoding**: Use for features without any order(e.g. location).

In [7]:
df['PRICE'].describe()

count    5148.000000
mean        0.867953
std         0.960927
min         0.060000
25%         0.350000
50%         0.580000
75%         0.950000
max        10.000000
Name: PRICE, dtype: float64

In [8]:
df['AGE'].value_counts()

AGE
Relatively New Property    2182
Old Property               1926
New Property                649
Moderately Old              391
Name: count, dtype: int64

In [9]:
df['amenity_luxury'] = df['amenity_luxury'].apply(lambda x: 'Low' if x <=200 else 'Medium' if x <= 500 else 'High')

In [10]:
df['FLOOR_NUM'] = df['FLOOR_NUM'].apply(lambda x : 'Low Floor' if x <= 2 else 'Mid Floor' if x <= 10 else 'High Floor')

### Ordinal Encoding

In [11]:
df.head()

Unnamed: 0,location,BEDROOM_NUM,BALCONY_NUM,AREA,PRICE,AGE,FURNISH,amenity_luxury,FLOOR_NUM
0,Madhyamgram,2.0,1.0,920,0.39,Old Property,Unfurnished,Medium,Low Floor
1,Garia,1.0,1.0,535,0.27,Old Property,Unfurnished,Medium,Low Floor
2,Rajarhat,4.0,2.0,1940,0.92,Old Property,Unfurnished,Medium,Low Floor
3,Rajarhat,2.0,1.0,910,0.42,Old Property,Unfurnished,Medium,Low Floor
4,New Town,3.0,1.0,1163,0.55,Old Property,Unfurnished,Medium,Low Floor


In [12]:
x = df.drop(columns=['PRICE'])

In [13]:
y = df['PRICE']

In [14]:
y_transform = np.log1p(y)

In [15]:
x

Unnamed: 0,location,BEDROOM_NUM,BALCONY_NUM,AREA,AGE,FURNISH,amenity_luxury,FLOOR_NUM
0,Madhyamgram,2.0,1.0,920,Old Property,Unfurnished,Medium,Low Floor
1,Garia,1.0,1.0,535,Old Property,Unfurnished,Medium,Low Floor
2,Rajarhat,4.0,2.0,1940,Old Property,Unfurnished,Medium,Low Floor
3,Rajarhat,2.0,1.0,910,Old Property,Unfurnished,Medium,Low Floor
4,New Town,3.0,1.0,1163,Old Property,Unfurnished,Medium,Low Floor
...,...,...,...,...,...,...,...,...
5143,Behala,2.0,1.0,730,Relatively New Property,Fully furnished,Medium,Low Floor
5144,New Alipore,3.0,1.0,2000,Moderately Old,Luxury furnished,Low,Low Floor
5145,Chak Garia,3.0,2.0,1843,New Property,Luxury furnished,Low,High Floor
5146,Chak Garia,3.0,2.0,2079,New Property,Luxury furnished,Low,Mid Floor


In [16]:
y

0       0.39
1       0.27
2       0.92
3       0.42
4       0.55
        ... 
5143    0.33
5144    1.80
5145    1.55
5146    1.75
5147    0.74
Name: PRICE, Length: 5148, dtype: float64

In [17]:
columns_ordinal_encode = ['location','AGE','FURNISH','amenity_luxury','FLOOR_NUM']

In [18]:
encoder = OrdinalEncoder()
encoder.fit(x[columns_ordinal_encode])

In [19]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num',StandardScaler(),['BEDROOM_NUM','BALCONY_NUM','AREA']),
        ('cat', OrdinalEncoder(categories=encoder.categories_),columns_ordinal_encode )
    ],
    remainder= 'passthrough'
)

In [20]:
pipeline = Pipeline([
('preprocessor',preprocessor),
('regressor',LinearRegression())
])

In [21]:
kfold = KFold(n_splits=10,shuffle=True,random_state=42)
scores = cross_val_score(pipeline,x,y_transform, scoring='r2',cv=kfold)

In [22]:
scores.mean()

0.7722905010228691

In [23]:
scores.std()

0.03172485774277793

In [24]:
x_train, x_test, y_train, y_test = train_test_split(x, y_transform, random_state= 42,test_size= 0.2)

In [25]:
pipeline.fit(x_train, y_train)

In [26]:
y_pred = pipeline.predict(x_test)

In [27]:
y_pred = np.expm1(y_pred)

In [28]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.2967759251253067

In [29]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, x, y_transform, cv=kfold, scoring='r2')
    
    # appending score mean
    output.append(scores.mean())
    
    x_train, x_test, y_train, y_test = train_test_split(x,y_transform,test_size=0.2,random_state=42)
    
    pipeline.fit(x_train,y_train)
    
    y_pred = pipeline.predict(x_test)
    
    # appending MAE
    output.append(mean_absolute_error(np.expm1(y_test),np.expm1(y_pred)))
    
    return output
    

In [30]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'xgboost':XGBRegressor(),
    'Catboost' : CatBoostRegressor(verbose=0),
    'LightGBM' : LGBMRegressor(verbose=-1)
}

In [31]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [32]:
model_output

[['linear_reg', 0.7722905010228691, 0.2967759251253067],
 ['svr', 0.34419854045661913, 0.4512679173869664],
 ['ridge', 0.7722919207164489, 0.2967463615801444],
 ['LASSO', -0.0011580725866966457, 0.5609726880527713],
 ['decision tree', 0.747133580695342, 0.25785049138273647],
 ['random forest', 0.8676336503883135, 0.19397206254929572],
 ['extra trees', 0.8555824985759305, 0.1958980043612107],
 ['gradient boosting', 0.8534067512785078, 0.21675798304576904],
 ['adaboost', 0.7297207098865763, 0.31456223481755996],
 ['xgboost', 0.8785601179205573, 0.18652314099929868],
 ['Catboost', 0.8857386343234774, 0.1865868128883847],
 ['LightGBM', 0.8801560293139115, 0.19008836229754575]]

In [33]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [34]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
9,xgboost,0.87856,0.186523
10,Catboost,0.885739,0.186587
11,LightGBM,0.880156,0.190088
5,random forest,0.867634,0.193972
6,extra trees,0.855582,0.195898
7,gradient boosting,0.853407,0.216758
4,decision tree,0.747134,0.25785
2,ridge,0.772292,0.296746
0,linear_reg,0.772291,0.296776
8,adaboost,0.729721,0.314562


# One Hot Encoading with PCA

In [35]:
df.head(1)

Unnamed: 0,location,BEDROOM_NUM,BALCONY_NUM,AREA,PRICE,AGE,FURNISH,amenity_luxury,FLOOR_NUM
0,Madhyamgram,2.0,1.0,920,0.39,Old Property,Unfurnished,Medium,Low Floor


In [36]:
import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='sklearn.preprocessing._encoders')

In [37]:
columns_ordinal_encode = ['AGE','FURNISH','amenity_luxury','FLOOR_NUM']
encoder = OrdinalEncoder()
encoder.fit(x[columns_ordinal_encode])


preprocessor = ColumnTransformer(
    transformers=[
        ('num' ,  StandardScaler(),['BEDROOM_NUM','BALCONY_NUM','AREA']),
        ('cat1',OrdinalEncoder(categories=encoder.categories_), columns_ordinal_encode),
        ('cat2', OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore'),['location'])
    ], remainder= 'passthrough'
)


pipeline = Pipeline([
    ('preprocessor' , preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('regressor', LinearRegression())
])


kfold = KFold(n_splits=10,shuffle=True,random_state=42)
scores = cross_val_score(pipeline,x,y_transform,scoring='r2',cv=kfold)

In [38]:
scores.mean()

0.8077116415504089

In [39]:
scores.std()

0.028672031136341147

In [40]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.95)),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, x, y_transform, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(x,y_transform,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [41]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.879928,0.178662
10,Catboost,0.889474,0.180652
11,LightGBM,0.879032,0.185818
5,random forest,0.871433,0.190605
9,xgboost,0.869177,0.191151
7,gradient boosting,0.860898,0.203953
1,svr,0.847978,0.217567
0,linear_reg,0.807712,0.264402
2,ridge,0.807725,0.264611
4,decision tree,0.744434,0.271043


## One Hot Encoading without PCA

In [42]:
columns_ordinal_encode = ['AGE','FURNISH','amenity_luxury','FLOOR_NUM']
encoder = OrdinalEncoder()
encoder.fit(x[columns_ordinal_encode])


preprocessor = ColumnTransformer(
    transformers=[
        ('num' ,  StandardScaler(),['BEDROOM_NUM','BALCONY_NUM','AREA']),
        ('cat1',OrdinalEncoder(categories=encoder.categories_), columns_ordinal_encode),
        ('cat2', OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore'),['location'])
    ], remainder= 'passthrough'
)


pipeline = Pipeline([
    ('preprocessor' , preprocessor),
    ('regressor', LinearRegression())
])


kfold = KFold(n_splits=10,shuffle=True,random_state=42)
scores = cross_val_score(pipeline,x,y_transform,scoring='r2',cv=kfold)

In [43]:
scores.mean()

0.8631685150200339

In [44]:
scores.std()

0.013185328343754073

In [45]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, x, y_transform, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(x,y_transform,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [46]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.888141,0.174883
5,random forest,0.881393,0.185446
9,xgboost,0.890383,0.185552
10,Catboost,0.89372,0.187759
11,LightGBM,0.873508,0.194112
1,svr,0.885309,0.195567
2,ridge,0.871766,0.214337
7,gradient boosting,0.859516,0.215724
0,linear_reg,0.863169,0.216185
4,decision tree,0.806385,0.236566


## Target Encoading with one hot encoading

In [47]:
columns_ordinal_encode = ['AGE','FURNISH','amenity_luxury']
encoder = OrdinalEncoder()
encoder.fit(x[columns_ordinal_encode])


preprocessor = ColumnTransformer(
    transformers=[
        ('num' ,  StandardScaler(),['BEDROOM_NUM','BALCONY_NUM','AREA']),
        ('cat1',OrdinalEncoder(categories=encoder.categories_), columns_ordinal_encode),
        ('cat2', OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore'),['FLOOR_NUM']),
        ('target_enc', ce.TargetEncoder(), ['location'])
    ], remainder= 'passthrough'
)


pipeline = Pipeline([
    ('preprocessor' , preprocessor),
    ('regressor', LinearRegression())
])


kfold = KFold(n_splits=10,shuffle=True,random_state=42)
scores = cross_val_score(pipeline,x,y_transform,scoring='r2',cv=kfold)

In [48]:
scores.mean()

0.8437029945455787

In [49]:
scores.std()

0.021556778849302172

In [50]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, x, y_transform, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(x,y_transform,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [51]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.896667,0.171063
9,xgboost,0.896404,0.171857
10,Catboost,0.903884,0.172359
5,random forest,0.895026,0.177373
11,LightGBM,0.894301,0.180644
7,gradient boosting,0.884284,0.195906
1,svr,0.865847,0.20896
4,decision tree,0.821032,0.224759
0,linear_reg,0.843703,0.237676
2,ridge,0.843741,0.237682


## Target encoding with one hot encoading, without ordinal encoding

In [52]:
x.sample()

Unnamed: 0,location,BEDROOM_NUM,BALCONY_NUM,AREA,AGE,FURNISH,amenity_luxury,FLOOR_NUM
2720,Rajarhat,3.0,1.0,1005,New Property,Luxury furnished,High,Low Floor


In [53]:
columns_ordinal_encode = ['AGE','FURNISH','amenity_luxury','FLOOR_NUM']
encoder = OrdinalEncoder()
encoder.fit(x[columns_ordinal_encode])


preprocessor = ColumnTransformer(
    transformers=[
        ('num' ,  StandardScaler(),['BEDROOM_NUM','BALCONY_NUM','AREA']),
        ('cat2', OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore'),columns_ordinal_encode),
        ('target_enc', ce.TargetEncoder(), ['location'])
    ], remainder= 'passthrough'
)


pipeline = Pipeline([
    ('preprocessor' , preprocessor),
    ('regressor', LinearRegression())
])


kfold = KFold(n_splits=10,shuffle=True,random_state=42)
scores = cross_val_score(pipeline,x,y_transform,scoring='r2',cv=kfold)

In [54]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, x, y_transform, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(x,y_transform,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [55]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,Catboost,0.901612,0.172149
6,extra trees,0.895896,0.172573
9,xgboost,0.894853,0.172981
5,random forest,0.896535,0.177739
11,LightGBM,0.895116,0.180912
7,gradient boosting,0.88585,0.198166
1,svr,0.866175,0.21357
4,decision tree,0.814589,0.224937
0,linear_reg,0.84645,0.236616
2,ridge,0.84649,0.236648


## Only One hot encoading

In [56]:
columns_ordinal_encode = ['AGE','FURNISH','amenity_luxury','FLOOR_NUM','location']
encoder = OrdinalEncoder()
encoder.fit(x[columns_ordinal_encode])


preprocessor = ColumnTransformer(
    transformers=[
        ('num' ,  StandardScaler(),['BEDROOM_NUM','BALCONY_NUM','AREA']),
        ('cat2', OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore'),columns_ordinal_encode),
    ], remainder= 'passthrough'
)


pipeline = Pipeline([
    ('preprocessor' , preprocessor),
    ('regressor', LinearRegression())
])


kfold = KFold(n_splits=10,shuffle=True,random_state=42)
scores = cross_val_score(pipeline,x,y_transform,scoring='r2',cv=kfold)

In [57]:
scores.mean()

0.8730249892142338

In [58]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, x, y_transform, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(x,y_transform,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [59]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.887335,0.176471
9,xgboost,0.893896,0.182587
5,random forest,0.881944,0.186867
10,Catboost,0.894294,0.188651
11,LightGBM,0.872045,0.194966
1,svr,0.884533,0.199541
2,ridge,0.880178,0.206409
0,linear_reg,0.873025,0.207659
7,gradient boosting,0.858887,0.218063
4,decision tree,0.817118,0.232306


## Final Operation (amenity_luxary --> OHE):

In [60]:
columns_ordinal_encode = ['AGE','FURNISH']
encoder = OrdinalEncoder()
encoder.fit(x[columns_ordinal_encode])


preprocessor = ColumnTransformer(
    transformers=[
        ('num' ,  StandardScaler(),['BEDROOM_NUM','BALCONY_NUM','AREA']),
        ('cat1',OrdinalEncoder(categories=encoder.categories_), columns_ordinal_encode),
        ('cat2', OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore'),['FLOOR_NUM','amenity_luxury']),
        ('target_enc', ce.TargetEncoder(), ['location'])
    ], remainder= 'passthrough'
)


pipeline = Pipeline([
    ('preprocessor' , preprocessor),
    ('regressor', LinearRegression())
])


kfold = KFold(n_splits=10,shuffle=True,random_state=42)
scores = cross_val_score(pipeline,x,y_transform,scoring='r2',cv=kfold)

In [61]:
scores.mean()

0.8436710671517247

In [62]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, x, y_transform, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(x,y_transform,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [63]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,Catboost,0.901948,0.172903
6,extra trees,0.895478,0.173322
9,xgboost,0.895258,0.174704
5,random forest,0.89536,0.177663
11,LightGBM,0.893989,0.183535
7,gradient boosting,0.884721,0.195389
1,svr,0.86627,0.211798
4,decision tree,0.819769,0.231898
0,linear_reg,0.843671,0.237583
2,ridge,0.843709,0.237588


## Checking for KNeighborsRegressor :

In [64]:
columns_ordinal_encode = ['AGE','FURNISH']
encoder = OrdinalEncoder()
encoder.fit(x[columns_ordinal_encode])


preprocessor = ColumnTransformer(
    transformers=[
        ('num' ,  StandardScaler(),['BEDROOM_NUM','BALCONY_NUM','AREA']),
        ('cat1',OrdinalEncoder(categories=encoder.categories_), columns_ordinal_encode),
        ('cat2', OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore'),['FLOOR_NUM','amenity_luxury']),
        ('target_enc', ce.TargetEncoder(), ['location'])
    ], remainder= 'passthrough'
)


pipeline = Pipeline([
    ('preprocessor' , preprocessor),
    ('regressor', KNeighborsRegressor())
])


kfold = KFold(n_splits=10,shuffle=True,random_state=42)
scores = cross_val_score(pipeline,x,y_transform,scoring='r2',cv=kfold)
scores.mean(), scores.std()

(0.8184972367598815, 0.025323051688963077)

In [65]:
XGBRegressor()

In [None]:
# best model I choose Target Encoading with one hot encoading