In [47]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

In [33]:
df= pd.read_csv("housing_cleaned.csv")

In [34]:
df.head()

Unnamed: 0,Title,Bedrooms,Bathrooms,Floor_no,Occupancy_status,Floor_area,City,Price_in_taka,Location,Area,Section,property_type
0,Flat For Sale In Mirpur Section 6 Near Mosque,2.0,2.0,1.0,vacant,650.0,dhaka,3100000.0,"Section 6, Mirpur",mirpur,section 6,apartment
1,Comfortable Flat Is Up For Sale In Pallabi Nea...,3.0,3.0,5.0,vacant,1200.0,dhaka,7500000.0,"Pallabi, Mirpur",mirpur,pallabi,apartment
2,A Flat With All Facilities Is Up For Sale At W...,3.0,2.0,2.0,vacant,1220.0,dhaka,5000000.0,"West Shewrapara, Mirpur",mirpur,west shewrapara,apartment
3,1240 Square Feet Flat For Sale In The Location...,3.0,3.0,5.0,vacant,1240.0,dhaka,7320000.0,"Section 1, Mirpur",mirpur,section 1,apartment
4,Residential Apartment Is For Sale In Mirpur Ne...,3.0,3.0,1.0,vacant,1050.0,dhaka,4700000.0,"Section 6, Mirpur",mirpur,section 6,bungalow/townhouse


In [35]:
df=df.sort_values(by='Section',ascending=True)

separating target variable and testing variable

In [36]:
X = df[["Bedrooms","Bathrooms","Floor_no","Floor_area","Section","property_type"]]
Y=df['Price_in_taka']

In [37]:
print("Categories in 'Section' variable:     ",end=" " )
print(X['Section'].unique())

print("Categories in'property_type' variable:",end=" " )
print(X['property_type'].unique())



Categories in 'Section' variable:      ['1st colony' '2nd colony' 'ahmed nagar' 'arambag residential area'
 'dakkhin paikpara' 'darussalam' 'east kazipara' 'east monipur'
 'east shewrapara' 'kachukhet road' 'kallyanpur' 'lalkuthi'
 'middle monipur' 'middle paikpara' 'mirpur dohs' 'paikpara' 'pallabi'
 'pirerbag' 'rupnagar r/a' 'section 1' 'section 10' 'section 11'
 'section 12' 'section 13' 'section 15' 'section 2' 'section 6'
 'section 7' 'south monipur' 'west kazipara' 'west monipur'
 'west shewrapara']
Categories in'property_type' variable: ['apartment' 'bungalow/townhouse' 'commercial' 'building']


In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 408 entries, 368 to 144
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Title             408 non-null    object 
 1   Bedrooms          408 non-null    float64
 2   Bathrooms         408 non-null    float64
 3   Floor_no          408 non-null    float64
 4   Occupancy_status  408 non-null    object 
 5   Floor_area        408 non-null    float64
 6   City              408 non-null    object 
 7   Price_in_taka     408 non-null    float64
 8   Location          408 non-null    object 
 9   Area              408 non-null    object 
 10  Section           408 non-null    object 
 11  property_type     408 non-null    object 
dtypes: float64(5), object(7)
memory usage: 41.4+ KB


In [39]:

num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OrdinalEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
         ("StandardScaler", numeric_transformer, num_features),        
    ]
)

In [40]:
X = preprocessor.fit_transform(X)

In [42]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((326, 6), (82, 6))

In [49]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [50]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(), 
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}
model_list = []
r2_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 3078776.2273
- Mean Absolute Error: 1640300.8790
- R2 Score: 0.9484
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1904291.3129
- Mean Absolute Error: 1460592.8384
- R2 Score: 0.6921


Lasso
Model performance for Training set
- Root Mean Squared Error: 3078776.2273
- Mean Absolute Error: 1640300.8792
- R2 Score: 0.9484
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1904291.2147
- Mean Absolute Error: 1460592.4067
- R2 Score: 0.6921


Ridge
Model performance for Training set
- Root Mean Squared Error: 3079387.0519
- Mean Absolute Error: 1643673.4094
- R2 Score: 0.9484
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1908345.9587
- Mean Absolute Error: 1460516.4603
- R2 Score: 0.6908


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 8114367

In [51]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)

Unnamed: 0,Model Name,R2_Score
5,Random Forest Regressor,0.797987
6,XGBRegressor,0.753342
1,Lasso,0.692142
0,Linear Regression,0.692142
2,Ridge,0.690829
4,Decision Tree,0.635309
7,CatBoosting Regressor,0.619743
8,AdaBoost Regressor,0.61786
3,K-Neighbors Regressor,0.229181


best performing model for us is Random Forest Regressor

In [52]:
rfg_model=RandomForestRegressor()
rfg_model.fit(X_train, y_train)
y_pred = rfg_model.predict(X_test)
score = r2_score(y_test, y_pred)*100
print(" Accuracy of the model is %.2f" %score)

 Accuracy of the model is 77.06


Since the accuracy is 77.06% it is a workable model