## Model Development

Now that we have our cleaned data with the required features, let's proceed with model development.

Importing Libraries

In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import pickle

from xgboost import XGBRegressor
#from lightgbm import LGBMRegressor
#import lightgbm as ltb

from math import sqrt

%matplotlib inline

Loading our data into dataframe

In [None]:
# Load the cleaned listings, discard the columns that are not fed to the
# models, and one-hot encode the `quality` column.
df_properties = (
    pd.read_csv('cleaned_property_data_buy.csv')
    .drop(columns=['amenities', 'price_per_sqft', 'neighborhood'])
    .pipe(pd.get_dummies, columns=['quality'])
)
df_properties.head()

In [None]:
#df_properties.price = df_properties.price * 0.27

In [None]:

# Summary statistics of the `price` column, which is the prediction target.
df_properties.price.describe()

Preview of the full dataset (before the train/test split)

In [None]:
# Show the first rows of the encoded frame.
df_properties.head()

Shape of the full dataset (rows, columns)

In [None]:
# (rows, columns) of the full frame; no train/test split has happened yet.
df_properties.shape

Target feature for prediction

In [None]:
# Full target column. NOTE: despite its name this is NOT yet a training subset;
# `y_train` is rebound to the training portion when train_test_split runs in a
# later cell.
y_train = df_properties['price']#.values
#type(y_train)

Excluding target feature from the training dataset

In [None]:
# Peek at the first target values.
y_train.head()

In [None]:
# Feature matrix: every column except the target (`price`) and the `id` column.
X = df_properties.drop(columns=['price', 'id'])
# Map boolean values to 0/1 so all flag columns are numeric.
X = X.replace({False: 0, True: 1})
# Take the target straight from the source frame rather than from `y_train`:
# `y_train` is rebound to the training subset by train_test_split in a later
# cell, so re-running this cell after the split would otherwise leave X and y
# with different lengths.
y = df_properties['price']

In [None]:
# Preview the feature matrix.
X.head()

Splitting the dataset into training (80%) and test (20%) sets for model training.

In [None]:
# Hold out 20% of the rows as a test set. A fixed random_state makes the split,
# and therefore every metric and saved model derived from it, reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# First rows of the training features produced by the split.
X_train.head()

In [None]:
# (rows, columns) of the full feature matrix, for comparison with X_train.
X.shape

In [None]:
# (rows, columns) of the training features only.
X_train.shape

For prediction modelling we will use the following models:

- GradientBoostingRegressor
- RandomForestRegressor
- XGBRegressor
- LGBMRegressor (currently disabled: its import is commented out)

We will train our models on the training dataset of 1619 properties and then test them against the test dataset of the remaining 405 (2024 − 1619) properties.

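We will check the R2 score of each model and select the predictions from the model whose score is closest to 1. An R2 of 1 is a perfect fit, 0 is no better than always predicting the mean price, and the score can be negative for a model that does worse than that.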

Based on best R2 score we will match actual values with predicted values and see the percentage difference.

### GradientBoostingRegressor

#### Training

In [None]:
from sklearn.metrics import mean_absolute_error,mean_absolute_percentage_error

In [None]:

# Sweep learning rate x number of boosting stages for GradientBoostingRegressor
# and print R2 / MAE / MAPE on the held-out split for every combination.
gbr_learning_rates = [0.001, 0.003, 0.1, 0.3, 1, 2, 4, 6]
gbr_estimator_counts = [100, 250, 500, 700, 750, 800, 850, 900, 950, 1000, 1050]

for learning_rate in gbr_learning_rates:
    for n_estimators in gbr_estimator_counts:
        gradient = GradientBoostingRegressor(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
        )
        print(f'learning rate is: {learning_rate} and n_estimators is: {n_estimators}')
        gradient.fit(X_train, y_train)

        gradient_predictions = gradient.predict(X_test)
        gradient_r2_score = r2_score(y_test, gradient_predictions)
        print('R2 Score for GradientBoostingRegressor', gradient_r2_score)

        print("MAE is", mean_absolute_error(y_test, gradient_predictions))
        print("MAPE is", mean_absolute_percentage_error(y_test, gradient_predictions))
        print('\n')
 

#### Testing

In [None]:
# Fit the final GradientBoostingRegressor (750 stages, learning rate 0.3) and
# score it on the held-out split. random_state pins the estimator's internal
# randomness so the saved model can be reproduced.
gradient = GradientBoostingRegressor(n_estimators=750, learning_rate=0.3, random_state=42)
gradient.fit(X_train, y_train)

gradient_predictions = gradient.predict(X_test)
gradient_r2_score = r2_score(y_test, gradient_predictions)
print('R2 Score for GradientBoostingRegressor', gradient_r2_score)

# mean_absolute_error / mean_absolute_percentage_error come from the import
# cell at the start of this section; no re-import is needed here.
print("MAE", mean_absolute_error(y_test, gradient_predictions))
print("MAPE is", mean_absolute_percentage_error(y_test, gradient_predictions))

# Persist the fitted model. The context manager guarantees the file handle is
# flushed and closed even if pickling raises.
filename = 'GradientBoostingRegressor2_buy.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(gradient, model_file)

In [None]:
"""
param_test2 = {'max_depth':range(5,16,2), 'min_samples_split':range(200,1001,200)}
gradient = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=60, max_features='sqrt', subsample=0.8, random_state=10), 
param_grid = param_test2, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gradient.fit(train[predictors],train[target])
gradient.grid_scores_, gradient.best_params_, gradient.best_score_
"""

### RandomForestRegressor

#### Training

In [None]:

# Sweep RandomForestRegressor over even tree depths 2..26 and print
# R2 / MAE / MAPE on the held-out split for each depth.
for max_depth in range(2, 27, 2):
    random_forest = RandomForestRegressor(max_depth=max_depth)
    print(f' and max_depth is: {max_depth}')
    random_forest.fit(X_train, y_train)

    forest_prediction = random_forest.predict(X_test)
    forest_r2_score = r2_score(y_test, forest_prediction)
    print('R2 Score for RandomForestRegressor', forest_r2_score)

    print("MAE", mean_absolute_error(y_test, forest_prediction))
    print("MAPE is", mean_absolute_percentage_error(y_test, forest_prediction))
    print('\n')
 


In [None]:
# Final random forest (max_depth=16). random_state fixes the estimator's
# randomness so the model that is later pickled can be reproduced on a fresh
# kernel.
random_forest = RandomForestRegressor(max_depth=16, random_state=42)
random_forest.fit(X_train, y_train)

#### Testing

In [None]:
# Score the fitted random forest on the held-out split.
forest_prediction = random_forest.predict(X_test)
forest_r2_score = r2_score(y_test, forest_prediction)
print('R2 Score for RandomForestRegressor', forest_r2_score)

# mean_absolute_error / mean_absolute_percentage_error come from the import
# cell at the start of the GradientBoostingRegressor section.
print("MAE", mean_absolute_error(y_test, forest_prediction))
print("MAPE is", mean_absolute_percentage_error(y_test, forest_prediction))

# Persist the fitted model. The context manager guarantees the file handle is
# flushed and closed even if pickling raises.
filename = 'random_forest2_buy.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(random_forest, model_file)

### XGBRegressor

#### Training

In [None]:
# Sweep learning rate x number of boosting rounds for XGBRegressor and print
# R2 / MAE / MAPE on the held-out split for every combination.
for i in [0.001, 0.003, 0.1, 0.3, 1, 2, 4, 6]:
    for j in [50, 100, 250, 500, 700, 750, 800, 850, 900, 950, 1000, 1050]:
        xgbr_regressor = XGBRegressor(learning_rate=i, n_estimators=j, n_jobs=-1)
        xgbr_regressor.fit(X_train, y_train)

        # Predict with the XGBoost model that was just fitted. The previous
        # version called `random_forest.predict` here, so every iteration
        # reported the same random-forest metrics and overwrote the model
        # variable with a prediction array.
        xgbr_sweep_prediction = xgbr_regressor.predict(X_test)
        print('learning rate is '+str(i)+' nestimators is '+str(j))

        # Learning rates above 1 can make boosting diverge and yield NaN/inf
        # predictions, which the sklearn metrics reject with a ValueError.
        # Report that and keep sweeping instead of aborting the whole cell.
        if not np.all(np.isfinite(xgbr_sweep_prediction)):
            print('Predictions are not finite (model diverged); skipping metrics')
            print('\n')
            continue

        xgbreg_r2_score = r2_score(y_test, xgbr_sweep_prediction)
        print('R2 Score for XGBRegressor', xgbreg_r2_score)

        print("MAE", mean_absolute_error(y_test, xgbr_sweep_prediction))
        print("MAPE is", mean_absolute_percentage_error(y_test, xgbr_sweep_prediction))
        print('\n')

In [None]:
# Final XGBoost model: 100 boosting rounds at learning rate 0.1, with n_jobs=-1.
xgbr_regressor = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    n_jobs=-1,
)
xgbr_regressor.fit(X_train, y_train)


In [None]:
"""
xgbr_regressor.feature_importances_
xgbr_regressor.feature_names_in_

fig = plt.figure(figsize=(20, 25))
plt.barh(xgbr_regressor.feature_names_in_, xgbr_regressor.feature_importances_)
plt.xlabel("Courses offered")
plt.ylabel("No. of students enrolled")
plt.title("Students enrolled in different courses")
plt.show()
"""

#### Testing

In [None]:
# Score the fitted XGBoost model on the held-out split.
xgbr_regressor_prediction = xgbr_regressor.predict(X_test)
xgbr_regresso_r2_score = r2_score(y_test, xgbr_regressor_prediction)
print('R2 Score for XGBRegressor', xgbr_regresso_r2_score)

# mean_absolute_error / mean_absolute_percentage_error come from the import
# cell at the start of the GradientBoostingRegressor section.
print("MAE", mean_absolute_error(y_test, xgbr_regressor_prediction))
print("MAPE is", mean_absolute_percentage_error(y_test, xgbr_regressor_prediction))

# Persist the fitted model. The context manager guarantees the file handle is
# flushed and closed even if pickling raises.
filename = 'xgbr_regressor2_buy.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(xgbr_regressor, model_file)

In [None]:
"""

from xgboost import plot_importance


fig = plt.figure(figsize=(115, 95))
plot_importance(xgbr_regressor,height=25.2)
plt.show()
"""
# Rank features by the booster's "weight" importance score and draw a
# horizontal bar chart of the 40 highest-scoring ones.
importance_by_feature = xgbr_regressor.get_booster().get_score(importance_type='weight')

data = pd.DataFrame(
    {"score": list(importance_by_feature.values())},
    index=list(importance_by_feature.keys()),
).sort_values(by="score", ascending=False)

data.nlargest(40, columns="score").plot(kind='barh', figsize=(30, 20))


In [None]:
"""
data = pd.DataFrame(columns=[X.columns.values,'bonjour'])
data.loc['latitude']=1.2
"""

In [None]:
# First rows of `data`; these only exist if the feature-importance cell above has run.
data.head()

In [None]:
#data.

In [None]:
"""
import gradio as gr

def greet(latitude=0,longitude=0,size_in_sqft=0,no_of_bedrooms=1,no_of_bathrooms=1, view_of_water=True, unfurnished=False,covered_parking=False,pets_allowed=False, kitchen_appliances=False, concierge=False):
    print('bonjour')
    tab =['latitude','longitude','size_in_sqft','no_of_bedrooms','no_of_bathrooms','covered_parking','unfurnished','concierge','kitchen_appliances','pets_allowed','view_of_water']
    print(latitude)
    print(longitude)
    print(unfurnished)
    dt = pd.DataFrame({'latitude': [latitude],
                        'longitude': [longitude],
                        'size_in_sqft':  [size_in_sqft],
                        'no_of_bedrooms':  [no_of_bedrooms],
                        'no_of_bathrooms':  [no_of_bathrooms],
                        'view_of_water':  [1] if view_of_water==True else [0],
                        'unfurnished':  [1] if unfurnished==True else [0],
                        'covered_parking':  [1] if covered_parking==True else [0],
                        'pets_allowed': [1] if pets_allowed==True else [0],
                        'kitchen_appliances': [1] if kitchen_appliances==True else [0],
                        'concierge': [1] if concierge==True else [0]
                        } )
    
         #print('X.columns',X.columns)
    
    dt.head        
    print(dt['latitude'])
    print(X.shape)
    
    
    filename = 'GradientBoostingRegressor2.sav'
    loaded_model = pickle.load(open(filename, 'rb'))
    result = loaded_model.predict(dt)
    result = result[0] *0.27
         
    print('result',result) 
    #dt.head()
    return result   
    #gradient_predictions = gradient.predict(tab)
    #return gradient_predictions

demo = gr.Interface(
    fn=greet,
    inputs=["number", "number", "number", "number","number","checkbox","checkbox","checkbox","checkbox","checkbox","checkbox"],
    outputs=["number"],
)
demo.launch()
"""

In [None]:
#keras
from tensorflow import keras
from PIL import ImageFont
#from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.preprocessing.text import one_hot
from keras.layers import Dense, Activation, Flatten

import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error 
from matplotlib import pyplot as plt
import seaborn as sb
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings 
# Suppress all warnings from here on. The second call is subsumed by the
# first, which already ignores every warning category.
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [None]:
"""
NN_model = Sequential()

# The Input Layer :
NN_model.add(Dense(128, kernel_initializer='normal',input_dim = X.shape[1], activation='relu'))

# The Hidden Layers :
NN_model.add(Dense(256, kernel_initializer='normal',activation='relu'))
#NN_model.add(Dense(256, kernel_initializer='normal',activation='relu'))
#NN_model.add(Dense(256, kernel_initializer='normal',activation='relu'))


# The Output Layer :
NN_model.add(Dense(1, kernel_initializer='normal',activation='linear'))

# Compile the network :
NN_model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])
NN_model.summary()
"""

In [None]:
"""
checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5' 
checkpoint = tf.keras.callbacks.ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose = 1, save_best_only = True, mode ='auto')
callbacks_list = [checkpoint]
"""

In [None]:
"""history = NN_model.fit(X, y, epochs=200, batch_size=32, validation_split = 0.2, callbacks=callbacks_list)"""

In [None]:
"""import seaborn as sns
import matplotlib.pyplot as plt

history_df = pd.DataFrame.from_dict(history.history)
sns.lineplot(data=history_df[['mean_absolute_error', 'val_mean_absolute_error']])
plt.xlabel("epochs")
plt.ylabel("MAE")"""