In [49]:
#importing library
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR,SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings(action='ignore')

In [50]:
#loading the dataset
df=pd.read_csv('../input/delhi-house-price-prediction/MagicBricks.csv')

df

Unnamed: 0,Area,BHK,Bathroom,Furnishing,Locality,Parking,Price,Status,Transaction,Type,Per_Sqft
0,800.0,3,2.0,Semi-Furnished,Rohini Sector 25,1.0,6500000,Ready_to_move,New_Property,Builder_Floor,
1,750.0,2,2.0,Semi-Furnished,"J R Designers Floors, Rohini Sector 24",1.0,5000000,Ready_to_move,New_Property,Apartment,6667.0
2,950.0,2,2.0,Furnished,"Citizen Apartment, Rohini Sector 13",1.0,15500000,Ready_to_move,Resale,Apartment,6667.0
3,600.0,2,2.0,Semi-Furnished,Rohini Sector 24,1.0,4200000,Ready_to_move,Resale,Builder_Floor,6667.0
4,650.0,2,2.0,Semi-Furnished,Rohini Sector 24 carpet area 650 sqft status R...,1.0,6200000,Ready_to_move,New_Property,Builder_Floor,6667.0
...,...,...,...,...,...,...,...,...,...,...,...
1254,4118.0,4,5.0,Unfurnished,Chittaranjan Park,3.0,55000000,Ready_to_move,New_Property,Builder_Floor,12916.0
1255,1050.0,3,2.0,Semi-Furnished,Chittaranjan Park,3.0,12500000,Ready_to_move,Resale,Builder_Floor,12916.0
1256,875.0,3,3.0,Semi-Furnished,Chittaranjan Park,3.0,17500000,Ready_to_move,New_Property,Builder_Floor,12916.0
1257,990.0,2,2.0,Unfurnished,Chittaranjan Park Block A,1.0,11500000,Ready_to_move,Resale,Builder_Floor,12916.0


In [51]:
#getting information about the datase
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Area         1259 non-null   float64
 1   BHK          1259 non-null   int64  
 2   Bathroom     1257 non-null   float64
 3   Furnishing   1254 non-null   object 
 4   Locality     1259 non-null   object 
 5   Parking      1226 non-null   float64
 6   Price        1259 non-null   int64  
 7   Status       1259 non-null   object 
 8   Transaction  1259 non-null   object 
 9   Type         1254 non-null   object 
 10  Per_Sqft     1018 non-null   float64
dtypes: float64(4), int64(2), object(5)
memory usage: 108.3+ KB


In [52]:
#one_hot encoding
def onehot_encode(df,column,rename=False):
    df=df.copy()
    if rename==True:
        df[column]=df[column].replace({x:i  for i,x in enumerate(df[column].unique())})
    dummies=pd.get_dummies(df[column],prefix=column)
    df=pd.concat([df,dummies],axis=1)
    df=df.drop(column,axis=1)
    return df

In [65]:
#preprocess inputs
def preprocess_inputs(df):
    #creating the copy of the dataset
    df=df.copy()
    #missing rows
    missing_rows=df.loc[df['Per_Sqft'].isna(),:].index
    #dropping missing rows
    df=df.drop(missing_rows,axis=0)
    
    for column in ['Parking','Type']:
        df[column]=df[column].fillna(df[column].mode()[0])
        
    #binary encoding
    df['Status']=df['Status'].replace({'Ready_to_move':0, 'Almost_ready':1})
    df['Transaction']=df['Transaction'].replace({
        'New_Property':0,
        'Resale':1
    })
    df['Type']=df['Type'].replace({
        'Builder_Floor':0,
        'Apartment':1
    })
    df=onehot_encode(df,column='Furnishing',rename=False)
    df=onehot_encode(df,column='Locality'  ,rename=True)
    #Splitting the data into x and y
    y=df['Per_Sqft']
    x=df.drop('Per_Sqft',axis=1)
    #train_test_split
    x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7,shuffle=True,random_state=1)
    
    
    #scaling
    scaler=StandardScaler()
    x_train=pd.DataFrame(scaler.fit_transform(x_train),columns=x.columns,index=x_train.index)
    x_test=pd.DataFrame(scaler.fit_transform(x_test),columns=x.columns,index=x_test.index)
    
    
    
    return x_train,x_test,y_train,y_test

In [26]:
#object columns in the dataset
{column:len(x[column].unique()) for column in x.select_dtypes('object').columns.drop('Locality')}

{'Furnishing': 4, 'Status': 2, 'Transaction': 2, 'Type': 3}

In [33]:
df['Furnishing'].mode()[0]

'Semi-Furnished'

In [28]:
#getting list of unique value in each column
{column:list(x[column].unique()) for column in x.select_dtypes('object').columns.drop('Locality')}

{'Furnishing': ['Semi-Furnished', 'Furnished', 'Unfurnished', nan],
 'Status': ['Ready_to_move', 'Almost_ready'],
 'Transaction': ['New_Property', 'Resale'],
 'Type': ['Apartment', 'Builder_Floor', nan]}

In [16]:
#index number with Per_Sqft missing value
df.loc[df['Per_Sqft'].isna(),:].index

Int64Index([   0,   30,   31,   32,   33,   34,   35,   36,   37,   38,
            ...
            1180, 1181, 1182, 1183, 1184, 1185, 1199, 1200, 1229, 1230],
           dtype='int64', length=241)

In [24]:
#checking for missing value in each column
x.isna().sum()

Area            0
BHK             0
Bathroom        0
Furnishing      4
Locality        0
Parking        13
Price           0
Status          0
Transaction     0
Type            4
Per_Sqft        0
dtype: int64

In [13]:
x['Per_Sqft'].isna()

0        True
1       False
2       False
3       False
4       False
        ...  
1254    False
1255    False
1256    False
1257    False
1258    False
Name: Per_Sqft, Length: 1259, dtype: bool

In [66]:
x_train,x_test,y_train,y_test=preprocess_inputs(df)
x_train

Unnamed: 0,Area,BHK,Bathroom,Parking,Price,Status,Transaction,Type,Furnishing_Furnished,Furnishing_Semi-Furnished,...,Locality_301,Locality_302,Locality_303,Locality_304,Locality_305,Locality_306,Locality_307,Locality_308,Locality_309,Locality_310
800,-0.352501,-0.824828,-0.528610,-0.221450,-0.662232,-0.250373,-1.220455,-0.898391,-0.420547,0.918999,...,-0.075165,-0.037503,-0.037503,-0.037503,0.0,-0.130931,-0.037503,0.0,-0.053074,-0.037503
1009,-0.525543,-0.824828,-0.528610,0.120573,-0.721011,-0.250373,-1.220455,1.113101,-0.420547,-1.088140,...,-0.075165,-0.037503,-0.037503,-0.037503,0.0,-0.130931,-0.037503,0.0,-0.053074,-0.037503
1122,-0.047458,0.203679,0.369648,0.120573,0.340462,-0.250373,0.819367,-0.898391,-0.420547,0.918999,...,-0.075165,-0.037503,-0.037503,-0.037503,0.0,-0.130931,-0.037503,0.0,-0.053074,-0.037503
1204,0.257584,0.203679,0.369648,0.120573,0.513340,-0.250373,0.819367,1.113101,-0.420547,-1.088140,...,-0.075165,-0.037503,-0.037503,-0.037503,0.0,-0.130931,-0.037503,0.0,-0.053074,-0.037503
356,-0.477291,1.232187,0.369648,0.120573,-0.230036,-0.250373,0.819367,-0.898391,2.377857,-1.088140,...,-0.075165,-0.037503,-0.037503,-0.037503,0.0,-0.130931,-0.037503,0.0,-0.053074,-0.037503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
940,-0.407963,0.203679,-0.528610,-0.221450,-0.471029,-0.250373,0.819367,1.113101,-0.420547,0.918999,...,-0.075165,-0.037503,-0.037503,-0.037503,0.0,-0.130931,-0.037503,0.0,-0.053074,-0.037503
103,0.745652,1.232187,2.166164,-0.221450,0.807233,-0.250373,-1.220455,-0.898391,-0.420547,-1.088140,...,-0.075165,-0.037503,-0.037503,-0.037503,0.0,-0.130931,-0.037503,0.0,-0.053074,-0.037503
1117,0.534895,1.232187,1.267906,0.120573,1.982805,-0.250373,-1.220455,-0.898391,-0.420547,0.918999,...,-0.075165,-0.037503,-0.037503,-0.037503,0.0,-0.130931,-0.037503,0.0,-0.053074,-0.037503
276,-0.053005,0.203679,0.369648,0.462596,0.115720,-0.250373,-1.220455,-0.898391,-0.420547,0.918999,...,-0.075165,-0.037503,-0.037503,-0.037503,0.0,-0.130931,-0.037503,0.0,-0.053074,-0.037503


In [67]:
x_train.mean()

Area           -5.488743e-17
BHK            -1.596725e-16
Bathroom        1.278628e-16
Parking        -1.746418e-17
Price           6.860929e-17
                    ...     
Locality_306   -5.488743e-17
Locality_307    2.494883e-18
Locality_308    0.000000e+00
Locality_309   -7.484650e-18
Locality_310   -7.484650e-18
Length: 322, dtype: float64

In [68]:
x_train.var()

Area            1.001406
BHK             1.001406
Bathroom        1.001406
Parking         1.001406
Price           1.001406
                  ...   
Locality_306    1.001406
Locality_307    1.001406
Locality_308    0.000000
Locality_309    1.001406
Locality_310    1.001406
Length: 322, dtype: float64

In [62]:
x_train.isna().sum().sum()

0

In [69]:
y_train

800      4138.0
1009    10320.0
1122    11905.0
1204    30000.0
356     22857.0
         ...   
940      9200.0
103     15972.0
1117    11905.0
276     20000.0
55      22000.0
Name: Per_Sqft, Length: 712, dtype: float64

In [71]:
#training the dataset
models={
        'Linear_Regression':LinearRegression(),
        'Linear Regression (L2 Regularization)':Ridge(),
        'Linear Regression (L1 Regularization)':Lasso(),
        'K-Nearest Neighbors':KNeighborsRegressor(),
        'Neural Network': MLPRegressor(),
        'Support Vector Machine (Linear Kernel)':LinearSVR(),
        'Support Vector Machine (RBF Kernel)': SVR(),
        'Decision Tree':DecisionTreeRegressor(),
        'Random Forest': RandomForestRegressor(),
        'Gradient Boosting':GradientBoostingRegressor(),
        'XGBoost': XGBRegressor(),
        'LighGBM': LGBMRegressor(),
        'CatBoost': CatBoostRegressor(verbose=0)
}

In [72]:
models

{'Linear_Regression': LinearRegression(),
 'Linear Regression (L2 Regularization)': Ridge(),
 'Linear Regression (L1 Regularization)': Lasso(),
 'K-Nearest Neighbors': KNeighborsRegressor(),
 'Neural Network': MLPRegressor(),
 'Support Vector Machine (Linear Kernel)': LinearSVR(),
 'Support Vector Machine (RBF Kernel)': SVR(),
 'Decision Tree': DecisionTreeRegressor(),
 'Random Forest': RandomForestRegressor(),
 'Gradient Boosting': GradientBoostingRegressor(),
 'XGBoost': XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, gamma=None,
              gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weigh

In [76]:
#training the model
for name,model in models.items():
    model.fit(x_train,y_train)
    print(name+'trained')

Linear_Regressiontrained
Linear Regression (L2 Regularization)trained
Linear Regression (L1 Regularization)trained
K-Nearest Neighborstrained
Neural Networktrained
Support Vector Machine (Linear Kernel)trained
Support Vector Machine (RBF Kernel)trained
Decision Treetrained
Random Foresttrained
Gradient Boostingtrained
XGBoosttrained
LighGBMtrained
CatBoosttrained


# Result

In [75]:
for name,model in models.items():
    print(name+'R^2 Score:{:.5f}'.format(model.score(x_test,y_test)))

Linear_RegressionR^2 Score:-33455927657158654284605161472.00000
Linear Regression (L2 Regularization)R^2 Score:0.13508
Linear Regression (L1 Regularization)R^2 Score:0.14627
K-Nearest NeighborsR^2 Score:0.16168
Neural NetworkR^2 Score:-0.48146
Support Vector Machine (Linear Kernel)R^2 Score:-0.48167
Support Vector Machine (RBF Kernel)R^2 Score:-0.05058
Decision TreeR^2 Score:-0.12860
Random ForestR^2 Score:-0.00759
Gradient BoostingR^2 Score:0.01915
XGBoostR^2 Score:-0.03764
LighGBMR^2 Score:0.07338
CatBoostR^2 Score:-0.03391
