<a href="https://colab.research.google.com/github/tylerphonglam/Project4-Homepriceprediction/blob/main/project4_ML_model_KNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

# Loading Data

In [2]:
# Raw GitHub URL of the Ontario listings CSV that the next cell loads with pandas
url = "https://raw.githubusercontent.com/tylerphonglam/Project4-Homepriceprediction/main/ontario.csv"

In [3]:
# Read the Ontario dataset straight from the GitHub URL into a dataframe
# (this cell needs network access to run)
ontario = pd.read_csv(url)

In [4]:
# Work on a copy so the raw `ontario` frame is not modified when the
# engineered feature columns are added to `df` further down
df = ontario.copy()

In [5]:
df.columns

Index(['City', 'Price', 'Address', 'Number_Beds', 'Number_Baths', 'Province',
       'Population', 'Latitude', 'Longitude', 'Median_Family_Income'],
      dtype='object')

In [6]:
df

Unnamed: 0,City,Price,Address,Number_Beds,Number_Baths,Province,Population,Latitude,Longitude,Median_Family_Income
0,Toronto,779900.0,#318 -20 SOUTHPORT ST,3,2,Ontario,5647656,43.7417,-79.3733,97000.0
1,Toronto,799999.0,#818 -60 SOUTHPORT ST,3,1,Ontario,5647656,43.7417,-79.3733,97000.0
2,Toronto,799900.0,#714 -859 THE QUEENSWAY,2,2,Ontario,5647656,43.7417,-79.3733,97000.0
3,Toronto,1200000.0,275 MORTIMER AVE,4,2,Ontario,5647656,43.7417,-79.3733,97000.0
4,Toronto,668800.0,#420 -388 RICHMOND ST,1,1,Ontario,5647656,43.7417,-79.3733,97000.0
...,...,...,...,...,...,...,...,...,...,...
14010,Regina,399900.0,1777 FORGET ST,4,1,Ontario,226404,50.4547,-104.6067,90000.0
14011,Saskatoon,1099900.0,2556 MAYFAIR,3,3,Ontario,266141,52.1333,-106.6833,89000.0
14012,Saskatoon,1149900.0,2552 MAYFAIR,4,3,Ontario,266141,52.1333,-106.6833,89000.0
14013,Saskatoon,1099900.0,2548 MAYFAIR,4,3,Ontario,266141,52.1333,-106.6833,89000.0


# Making New Features

In [7]:
# Yearly housing budgets: 9%, 18% and 27% of the median family income.
# One column per share, named Income_<pct>_percent.
income_shares = {9: 0.09, 18: 0.18, 27: 0.27}
for pct, share in income_shares.items():
    df[f'Income_{pct}_percent'] = df['Median_Family_Income'] * share

In [8]:
# Listing price expressed as a multiple of each yearly income share
# (Price / Income_<pct>_percent). Note that these columns are derived
# directly from `Price`.
for pct in (9, 18, 27):
    df[f'Installments_{pct}_percent'] = df['Price'].div(df[f'Income_{pct}_percent'])

In [9]:
df

Unnamed: 0,City,Price,Address,Number_Beds,Number_Baths,Province,Population,Latitude,Longitude,Median_Family_Income,Income_9_percent,Income_18_percent,Income_27_percent,Installments_9_percent,Installments_18_percent,Installments_27_percent
0,Toronto,779900.0,#318 -20 SOUTHPORT ST,3,2,Ontario,5647656,43.7417,-79.3733,97000.0,8730.0,17460.0,26190.0,89.335624,44.667812,29.778541
1,Toronto,799999.0,#818 -60 SOUTHPORT ST,3,1,Ontario,5647656,43.7417,-79.3733,97000.0,8730.0,17460.0,26190.0,91.637915,45.818958,30.545972
2,Toronto,799900.0,#714 -859 THE QUEENSWAY,2,2,Ontario,5647656,43.7417,-79.3733,97000.0,8730.0,17460.0,26190.0,91.626575,45.813288,30.542192
3,Toronto,1200000.0,275 MORTIMER AVE,4,2,Ontario,5647656,43.7417,-79.3733,97000.0,8730.0,17460.0,26190.0,137.457045,68.728522,45.819015
4,Toronto,668800.0,#420 -388 RICHMOND ST,1,1,Ontario,5647656,43.7417,-79.3733,97000.0,8730.0,17460.0,26190.0,76.609393,38.304696,25.536464
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14010,Regina,399900.0,1777 FORGET ST,4,1,Ontario,226404,50.4547,-104.6067,90000.0,8100.0,16200.0,24300.0,49.370370,24.685185,16.456790
14011,Saskatoon,1099900.0,2556 MAYFAIR,3,3,Ontario,266141,52.1333,-106.6833,89000.0,8010.0,16020.0,24030.0,137.315855,68.657928,45.771952
14012,Saskatoon,1149900.0,2552 MAYFAIR,4,3,Ontario,266141,52.1333,-106.6833,89000.0,8010.0,16020.0,24030.0,143.558052,71.779026,47.852684
14013,Saskatoon,1099900.0,2548 MAYFAIR,4,3,Ontario,266141,52.1333,-106.6833,89000.0,8010.0,16020.0,24030.0,137.315855,68.657928,45.771952


In [10]:
df.columns

Index(['City', 'Price', 'Address', 'Number_Beds', 'Number_Baths', 'Province',
       'Population', 'Latitude', 'Longitude', 'Median_Family_Income',
       'Income_9_percent', 'Income_18_percent', 'Income_27_percent',
       'Installments_9_percent', 'Installments_18_percent',
       'Installments_27_percent'],
      dtype='object')

# Imports for Models

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

# Split data for test and train

In [12]:
# Model inputs (X) and the target (y).
# NOTE(review): the three Installments_* columns are computed as
# Price / Income_*_percent in the feature-engineering cell, so they carry the
# target into X (target leakage). Scores obtained with these columns are
# optimistic. The column list is kept as-is because the fitted scaler and
# predict_house_price() both expect exactly these eight features in this order.
features = [
    'Number_Beds',
    'Number_Baths',
    'Income_9_percent',
    'Income_18_percent',
    'Income_27_percent',
    'Installments_9_percent',
    'Installments_18_percent',
    'Installments_27_percent',
]
X, y = df[features], df['Price']

In [13]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical from run to run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standard Scaler

In [14]:
# Standardize the features for the SVR and KNN models.
# The scaler learns its statistics from the training split only and is then
# applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Linear Regression

In [15]:
# Ordinary least-squares baseline, trained on the unscaled features
linear_reg = LinearRegression().fit(X_train, y_train)
y_pred_linear = linear_reg.predict(X_test)

# Support Vector Regression (SVR)

In [16]:
# Support Vector Regression with default hyperparameters, trained on the
# standardized features
svr = SVR().fit(X_train_scaled, y_train)
y_pred_svr = svr.predict(X_test_scaled)

# KNN Regression

In [17]:
# K-Neighbors Regression with default hyperparameters, trained on the
# standardized features
knn_reg = KNeighborsRegressor().fit(X_train_scaled, y_train)
y_pred_knn = knn_reg.predict(X_test_scaled)

# Evaluating the Models

In [18]:
# Evaluate models
def evaluate_model(model_name, y_true, y_pred):
    """Print the R2 score of a set of predictions, labelled with the model name.

    Parameters
    ----------
    model_name : str
        Label shown in front of the score so the outputs of different models
        can be told apart.
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    None
        The score is printed, rounded to two decimals.
    """
    r2 = r2_score(y_true, y_pred)
    # The label was previously accepted but never shown, which made the
    # per-model outputs indistinguishable.
    print(f"{model_name} R2 Score: {r2:.2f}")

In [19]:
# Evaluate Linear Regression
evaluate_model("Linear Regression", y_test, y_pred_linear)

R2 Score: 0.99


In [20]:
# Evaluate SVR
evaluate_model("SVR", y_test, y_pred_svr)

R2 Score: -0.04


### R2 Score: 0.77 with KNN (Linear Regression scored 0.99 above)

In [21]:
# Evaluate K-Neighbors Regression
evaluate_model("K-Neighbors Regression", y_test, y_pred_knn)

R2 Score: 0.77


# Optimize the KNN Model

In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

# Candidate hyperparameters: neighbourhood size and neighbour weighting
param_grid = {'n_neighbors': [3, 5, 7, 9, 11], 'weights': ['uniform', 'distance']}

# Unfitted estimator used as the template for the search. It has its own name
# so the fitted baseline `knn_reg` from the KNN Regression cell is not
# overwritten when this cell runs.
knn_search_estimator = KNeighborsRegressor()

# 5-fold cross-validated grid search with R-squared as the scoring metric.
# NOTE(review): X_train_scaled was standardized with statistics from the whole
# training split, so each validation fold has already influenced the scaling.
scorer = make_scorer(r2_score)
grid_search = GridSearchCV(knn_search_estimator, param_grid, cv=5, scoring=scorer)

# Fit the grid search to the training data
grid_search.fit(X_train_scaled, y_train)

# Best hyperparameter combination found by the search
best_params = grid_search.best_params_

# GridSearchCV (refit=True by default) has already retrained the best
# configuration on the full training split, so reuse that estimator instead
# of fitting the same model a second time.
knn_reg_best = grid_search.best_estimator_

# Predict on the test set
y_pred_knn_best = knn_reg_best.predict(X_test_scaled)

## Best Params for KNN

In [23]:
best_params

{'n_neighbors': 3, 'weights': 'distance'}

### R2 Score: 0.78 with the optimized KNN model

In [24]:
# R-squared of the tuned KNN model on the held-out test set
r2 = r2_score(y_true=y_test, y_pred=y_pred_knn_best)

print(f'R-squared: {r2}')

R-squared: 0.7834454743046255


### Function to predict a house price and list Ontario houses in a matching price range with the given number of beds and baths

In [25]:
def predict_house_price(number_of_beds=None, number_of_baths=None,
                        installment_percentage=None, house_price=None,
                        max_percentage_range=20):
    """Predict a house price and list comparable listings from ``df``.

    Any argument left as ``None`` is requested interactively with ``input()``,
    so calling the function without arguments behaves like a prompt-driven
    session. Passing all four values makes the call reproducible without
    user interaction.

    Parameters
    ----------
    number_of_beds : int, optional
        Number of bedrooms to match.
    number_of_baths : int, optional
        Number of bathrooms to match.
    installment_percentage : float, optional
        Desired yearly installment amount. It fills the ``Income_9_percent``
        slot of the feature vector; twice and three times this amount fill
        the 18% and 27% slots. Must be greater than zero.
    house_price : float, optional
        House price used to build the three price/installment ratio features.
    max_percentage_range : int, default 20
        Largest +/- percentage band around the predicted price that is
        searched for comparable listings.

    Returns
    -------
    None
        The predicted price and the matching listings (or a "not found"
        message) are printed.

    Raises
    ------
    ValueError
        If the yearly installment is not greater than zero, or if an
        interactive answer cannot be converted to a number.

    Notes
    -----
    Relies on the notebook globals ``df``, ``features``, ``scaler`` and
    ``knn_reg_best``. The ratio features are computed from ``house_price``,
    so the prediction is strongly tied to the price supplied by the caller.
    """
    # Prompt only for the values the caller did not supply
    if number_of_beds is None:
        number_of_beds = int(input("Enter the number of beds: "))
    if number_of_baths is None:
        number_of_baths = int(input("Enter the number of baths: "))
    if installment_percentage is None:
        installment_percentage = float(input("Enter the desired installment yearly: "))
    if house_price is None:
        house_price = float(input("Enter the House Price: "))

    # The installment is used as a divisor below; reject values that would
    # raise ZeroDivisionError or produce meaningless negative ratios
    if installment_percentage <= 0:
        raise ValueError("The yearly installment must be greater than zero.")

    # Mirror the training features: 1x, 2x and 3x the yearly installment, and
    # the house price divided by each of them
    installment_amounts = [installment_percentage * factor for factor in (1, 2, 3)]
    price_ratios = [house_price / amount for amount in installment_amounts]

    # One-row frame with the training column names, so the scaler (fitted on
    # a DataFrame) receives matching feature names
    input_data = pd.DataFrame(
        [[number_of_beds, number_of_baths, *installment_amounts, *price_ratios]],
        columns=features,
    )
    input_data_scaled = scaler.transform(input_data)

    # Make the prediction and reduce the one-element result to a plain float
    predicted_price = float(knn_reg_best.predict(input_data_scaled)[0])

    # Display the predicted price
    print(f"Predicted Price: {predicted_price:.2f}")

    # Listings with the requested layout; this filter does not depend on the
    # price band, so it is applied once up front
    same_layout = df[(df['Number_Beds'] == number_of_beds) & (df['Number_Baths'] == number_of_baths)]

    # Widen the price band one percentage point at a time until something matches
    for percentage_range in range(1, max_percentage_range + 1):
        tolerance = percentage_range / 100
        lower_bound = predicted_price * (1 - tolerance)
        upper_bound = predicted_price * (1 + tolerance)

        # Series.between is inclusive on both ends
        matching_rows = same_layout[same_layout['Price'].between(lower_bound, upper_bound)]

        if not matching_rows.empty:
            # Display relevant information
            print(f"\nHouse Predicted (within {percentage_range}% range):")
            print(matching_rows[['City', 'Price','Number_Beds','Number_Baths','Province']])
            break  # Stop at the narrowest band that contains a match
    else:
        # Runs only when the loop finished without a match. The limit in the
        # message is the same value that bounds the loop.
        print(f"\nNo house found within the specified range (up to {max_percentage_range}%).")

## Final Output

In [27]:
# Call the function
predict_house_price()

KeyboardInterrupt: ignored