In [21]:
import pandas as pd
from geopy.distance import distance
import requests
from API_keys import geoapify_key

## Import file
---

In [95]:
# Load the properties to be priced. Later cells read the "Address",
# "Built Year" and "Year Sold" columns of this frame by name.
df = pd.read_csv("./prediction_test.csv")
df

Unnamed: 0,Address,Floor Area,Land Area,Built Year,Year Sold
0,"570C, Marmion Street, Booragoon, Western Austr...",250,844,1967,2016
1,"17 Aveley Street Willetton, Western Australia,...",278,809,1961,2020


## Transform data
---

In [96]:
# Geocode every address with the Geoapify forward-geocoding endpoint and store
# the first match's latitude/longitude in the "Lat" and "Lng" columns.
#
# Rows that cannot be geocoded get NaN rather than a text placeholder, so the
# two columns stay numeric and a failed lookup is easy to detect with
# `df["Lat"].isna()`.

# Not used by the geocoding request itself; kept so the name stays defined.
radius = "10000"

# The endpoint is the same for every row, so it is defined once.
base_url = "https://api.geoapify.com/v1/geocode/search?"

params = {
    "apiKey": geoapify_key,
}

# Print a message to follow up the coordinate search
print("Starting House coordinates search")

# Iterate through the properties, one geocoding request per address
for index, row in df.iterrows():
    address = row["Address"]

    # Free-text query for the current address
    params["text"] = f'{address}'

    # Default to "not found"; overwritten below when the lookup succeeds
    house_lat = float("nan")
    house_lng = float("nan")

    try:
        # A timeout stops one stalled request from hanging the whole notebook
        reply = requests.get(base_url, params=params, timeout=30)
        reply.raise_for_status()
        first_match = reply.json()["features"][0]["properties"]
        house_lat = first_match["lat"]
        house_lng = first_match["lon"]
    except (KeyError, IndexError):
        # The service answered but returned no usable match for this address
        pass
    except (requests.RequestException, ValueError) as error:
        # Network/HTTP failure or a body that is not JSON. Only the exception
        # type is printed: the full message can contain the request URL,
        # which includes the API key.
        print(f"Geocoding request failed for {address!r}: {type(error).__name__}")

    df.loc[index, "Lat"] = house_lat
    df.loc[index, "Lng"] = house_lng

    # Log the search results
    print(f"{df.loc[index, 'Address']} - coordinates:[{df.loc[index, 'Lat']}, {df.loc[index, 'Lng']}]")

# Display sample data
df.head()

Starting hotel search
570C, Marmion Street, Booragoon, Western Australia, Australia - coordinates:[-32.037021, 115.831968]
17 Aveley Street Willetton, Western Australia, Australia - coordinates:[-32.0418, 115.87421]


Unnamed: 0,Address,Floor Area,Land Area,Built Year,Year Sold,Lat,Lng
0,"570C, Marmion Street, Booragoon, Western Austr...",250,844,1967,2016,-32.037021,115.831968
1,"17 Aveley Street Willetton, Western Australia,...",278,809,1961,2020,-32.0418,115.87421


In [97]:
# Distance in metres from each property to the Perth CBD, stored in "CBD_DIST".

# Perth CBD coordinate as (latitude, longitude)
CBD_coordinate = (-31.95477452902314, 115.85673098939384)

for index, row in df.iterrows():
    house_coordinate = (row["Lat"], row["Lng"])
    try:
        # The distance call is what can fail (missing or non-numeric
        # coordinates), so it has to be inside the try block.
        dum_dis = distance(CBD_coordinate, house_coordinate).m
    except (TypeError, ValueError):
        # NOTE(review): geopy is expected to raise ValueError/TypeError for
        # NaN, None or text coordinates - confirm against the installed version.
        # NaN keeps the column numeric and marks the row as unknown.
        dum_dis = float("nan")
    df.loc[index, "CBD_DIST"] = dum_dis
df.head()


Unnamed: 0,Address,Floor Area,Land Area,Built Year,Year Sold,Lat,Lng,CBD_DIST
0,"570C, Marmion Street, Booragoon, Western Austr...",250,844,1967,2016,-32.037021,115.831968,9415.463454
1,"17 Aveley Street Willetton, Western Australia,...",278,809,1961,2020,-32.0418,115.87421,9790.303325


In [98]:
# Look up the nearest school and the nearest train station for every property
# with the Geoapify Places API and store the reported distances in
# "school_DIS" and "station_DIS".
#
# The two lookups are handled independently: a property with no station nearby
# still keeps its school distance (and vice versa). Missing results are stored
# as NaN so both columns stay numeric.

# Set base URL
base_url = "https://api.geoapify.com/v2/places"

# One parameter set per place category. Only the closest place is needed, so
# "limit" is fixed at 1 here instead of being reassigned for every row.
params_edu = {
    "apiKey": geoapify_key,
    "categories": "education.school",
    "limit": "1",
}
params_station = {
    "apiKey": geoapify_key,
    "categories": "public_transport.train",
    "limit": "1",
}


def _nearest_place_distance(query_params, lng, lat):
    """Return the `distance` property of the closest place, or NaN.

    Parameters
    ----------
    query_params : dict
        Request parameters for one category; its "bias" entry is overwritten
        with the proximity point for this call.
    lng, lat :
        Longitude and latitude of the property used as the proximity bias.

    Returns
    -------
    The `properties.distance` value of the first returned feature
    (presumably metres - confirm against the Geoapify documentation), or
    NaN when no place is returned or the request fails.
    """
    query_params["bias"] = f'proximity:{lng},{lat}'
    try:
        # A timeout stops one stalled request from hanging the whole notebook
        reply = requests.get(base_url, params=query_params, timeout=30)
        reply.raise_for_status()
        return reply.json()["features"][0]["properties"]["distance"]
    except (KeyError, IndexError):
        # The service answered but found nothing for this category
        return float("nan")
    except (requests.RequestException, ValueError) as error:
        # Only the exception type is printed: the full message can contain
        # the request URL, which includes the API key.
        print(f"Places request failed: {type(error).__name__}")
        return float("nan")


# Print a message to follow up the search
print("Starting station and school search")

# Iterate through the properties, two Places requests per row
for index, row in df.iterrows():
    lng = row["Lng"]
    lat = row["Lat"]

    df.loc[index, "school_DIS"] = _nearest_place_distance(params_edu, lng, lat)
    df.loc[index, "station_DIS"] = _nearest_place_distance(params_station, lng, lat)

    # Log the search results
    print(f"{df.loc[index, 'Address']} - nearest school: {df.loc[index, 'school_DIS']}")
    print(f"{df.loc[index, 'Address']} - nearest statioin: {df.loc[index, 'station_DIS']}")

# Display sample data
df

Starting station and school search
570C, Marmion Street, Booragoon, Western Australia, Australia - nearest school: 444.0
570C, Marmion Street, Booragoon, Western Australia, Australia - nearest statioin: 2371.0
17 Aveley Street Willetton, Western Australia, Australia - nearest school: 412.0
17 Aveley Street Willetton, Western Australia, Australia - nearest statioin: 1987.0


Unnamed: 0,Address,Floor Area,Land Area,Built Year,Year Sold,Lat,Lng,CBD_DIST,school_DIS,station_DIS
0,"570C, Marmion Street, Booragoon, Western Austr...",250,844,1967,2016,-32.037021,115.831968,9415.463454,444.0,2371.0
1,"17 Aveley Street Willetton, Western Australia,...",278,809,1961,2020,-32.0418,115.87421,9790.303325,412.0,1987.0


In [99]:
import json

In [100]:
# Read the coastline GeoJSON and keep only the coordinate list of its first feature
with open("../Dataset/Coastline.geojson", "r") as geojson_handle:
    json_file = json.load(geojson_handle)["features"][0]["geometry"]["coordinates"]

# Each stored position is [lng, lat, ...]; swap to (lat, lng) tuples, the same
# order used for the other coordinates passed to geopy's distance()
coastline = [(position[1], position[0]) for position in json_file]

len(coastline)

224

In [101]:
# Distance in metres from each property to the closest coastline point,
# stored in "Coast_DIST".


def _metres_to_coast(house_coordinate):
    """Return the smallest geodesic distance (m) from a point to `coastline`.

    Returns NaN when `coastline` is empty or the coordinate cannot be used
    (for example a property whose geocoding failed).
    """
    try:
        return min(
            (distance(coastline_coordinate, house_coordinate).m
             for coastline_coordinate in coastline),
            default=float("nan"),
        )
    except (TypeError, ValueError):
        # NOTE(review): geopy is expected to raise ValueError/TypeError for
        # NaN, None or text coordinates - confirm against the installed version.
        return float("nan")


# One value per row, collected in row order. The list is never indexed with
# DataFrame labels, so this also works when df does not have a 0..n-1 index.
coast_distance = [
    _metres_to_coast((row["Lat"], row["Lng"]))
    for _, row in df.iterrows()
]

# adding the list into dataframe
df["Coast_DIST"] = coast_distance
df.head()

Unnamed: 0,Address,Floor Area,Land Area,Built Year,Year Sold,Lat,Lng,CBD_DIST,school_DIS,station_DIS,Coast_DIST
0,"570C, Marmion Street, Booragoon, Western Austr...",250,844,1967,2016,-32.037021,115.831968,9415.463454,444.0,2371.0,7834.698256
1,"17 Aveley Street Willetton, Western Australia,...",278,809,1961,2020,-32.0418,115.87421,9790.303325,412.0,1987.0,11857.776112


In [108]:
# Unscaled copy of the enriched data without the raw coordinates; the
# predicted price is attached to this frame at the end of the notebook.
df_not_for_scaled = df.drop(["Lat", "Lng"], axis="columns")
df_not_for_scaled

Unnamed: 0,Address,Floor Area,Land Area,Built Year,Year Sold,CBD_DIST,school_DIS,station_DIS,Coast_DIST
0,"570C, Marmion Street, Booragoon, Western Austr...",250,844,1967,2016,9415.463454,444.0,2371.0,7834.698256
1,"17 Aveley Street Willetton, Western Australia,...",278,809,1961,2020,9790.303325,412.0,1987.0,11857.776112


In [111]:
# Build the model input: drop identifying columns, replace the build year with
# the property's age at sale, and rescale the school distance by 1/1000.
# NOTE(review): the division presumably converts metres to kilometres to match
# NEAREST_SCH_DIST in the training data - confirm.
df_cleaned = (
    df.drop(columns=["Address", "Lat", "Lng"])
    .assign(**{"Property age": lambda frame: frame["Year Sold"] - frame["Built Year"]})
    .drop(columns="Built Year")
    .assign(school_DIS=lambda frame: frame["school_DIS"] / 1000)
)
df_cleaned


Unnamed: 0,Floor Area,Land Area,Year Sold,CBD_DIST,school_DIS,station_DIS,Coast_DIST,Property age
0,250,844,2016,9415.463454,0.444,2371.0,7834.698256,49
1,278,809,2020,9790.303325,0.412,1987.0,11857.776112,59


In [112]:
# Rename the columns to the names used by the training data.
# The mapping is by name, not by position: a changed column order upstream can
# no longer attach a label to the wrong data, and re-running this cell after
# the columns were already renamed/reordered is a harmless no-op.
df_cleaned = df_cleaned.rename(columns={
    "Floor Area": "FLOOR_AREA",
    "Land Area": "LAND_AREA",
    "Year Sold": "YEAR_SOLD",
    "school_DIS": "NEAREST_SCH_DIST",
    "station_DIS": "NEAREST_STN_DIST",
    "Coast_DIST": "COAST_DIST",
    "Property age": "PROPERTY_AGE",
})
df_cleaned

Unnamed: 0,FLOOR_AREA,LAND_AREA,YEAR_SOLD,CBD_DIST,NEAREST_SCH_DIST,NEAREST_STN_DIST,COAST_DIST,PROPERTY_AGE
0,250,844,2016,9415.463454,0.444,2371.0,7834.698256,49
1,278,809,2020,9790.303325,0.412,1987.0,11857.776112,59


In [113]:
# Put the features in the column order of the training data, which the scaler
# is fitted on further down.
feature_order = [
    "LAND_AREA",
    "FLOOR_AREA",
    "CBD_DIST",
    "NEAREST_STN_DIST",
    "NEAREST_SCH_DIST",
    "YEAR_SOLD",
    "COAST_DIST",
    "PROPERTY_AGE",
]
df_cleaned = df_cleaned.reindex(feature_order, axis="columns")
df_cleaned

Unnamed: 0,LAND_AREA,FLOOR_AREA,CBD_DIST,NEAREST_STN_DIST,NEAREST_SCH_DIST,YEAR_SOLD,COAST_DIST,PROPERTY_AGE
0,844,250,9415.463454,2371.0,0.444,2016,7834.698256,49
1,809,278,9790.303325,1987.0,0.412,2020,11857.776112,59


## Scale data
---

In [130]:
from sklearn.preprocessing import StandardScaler

In [150]:
# Training features: drop incomplete rows first (so a missing SUBURB or PRICE
# also removes the row), then remove the two non-feature columns.
train_df = (
    pd.read_csv("../DataCleaning/output/Perth_housing_ML.csv")
    .dropna()
    .drop(columns=["SUBURB", "PRICE"])
)
train_df

Unnamed: 0,LAND_AREA,FLOOR_AREA,CBD_DIST,NEAREST_STN_DIST,NEAREST_SCH_DIST,YEAR_SOLD,COAST_DIST,PROPERTY_AGE
0,600,160,18300,1800,0.828339,2018.0,7553.802466,15.0
1,351,139,26900,4900,5.524324,2019.0,7994.792382,6.0
2,719,86,22600,1900,1.649178,2015.0,21643.941437,36.0
3,651,59,17900,3600,1.571401,2018.0,26743.430605,65.0
4,466,131,11200,2000,1.514922,2016.0,18311.330257,18.0
...,...,...,...,...,...,...,...,...
31623,292,245,16100,1500,1.430350,2016.0,327.438083,3.0
31624,228,114,9600,4600,1.679644,2017.0,8423.337128,2.0
31625,261,112,9600,4600,1.669159,2017.0,8424.924200,1.0
31626,457,85,12600,4400,0.358494,2016.0,8647.695550,42.0


In [148]:
# Fit a StandardScaler on the training features. fit() returns the scaler
# itself, so `train_scaler` and `scaler` refer to the same fitted object.
scaler = StandardScaler()
scaler.fit(train_df)
train_scaler = scaler

# Inspect the fitted per-feature scale factors (nothing is transformed here)
scaler_array = scaler.scale_
scaler_array

array([1.25276219e+04, 7.06923471e+01, 1.13716492e+04, 4.32068447e+03,
       1.65901781e+00, 2.16833436e+00, 1.00784614e+04, 2.09565698e+01])

In [155]:
# Refit on the training features, then apply the same scaling to the new
# properties; df_cleaned must have the same columns in the same order as train_df.
scaler.fit(train_df)
df_scaler = scaler
df_scaled = scaler.transform(df_cleaned)

In [156]:
# Scaled feature array returned by the scaler's transform() (one row per property)
df_scaled

array([[-0.09723491,  0.95839701, -0.89237176, -0.4548622 , -0.7803774 ,
        -0.56307866, -0.42614675,  1.04514145],
       [-0.10002873,  1.35447948, -0.8594091 , -0.54373701, -0.79966592,
         1.28165529, -0.02697095,  1.52231878]])

## Prediction
---

In [157]:
from joblib import load

In [158]:
# Load the persisted model used for prediction below.
# NOTE(review): joblib files are pickle-based and can execute arbitrary code
# when loaded - only load model files from a trusted source.
model = load("./output/Best_SoFar.joblib") 

In [159]:
# Predict a price for every property from the scaled features
prediction = model.predict(df_scaled)

# Attach the predictions to the unscaled frame; this relies on df_scaled and
# df_not_for_scaled holding the same rows in the same order
df_not_for_scaled["Predict Price"] = prediction

In [160]:
# Final result: the input data plus the derived distances and the "Predict Price" column
df_not_for_scaled

Unnamed: 0,Address,Floor Area,Land Area,Built Year,Year Sold,CBD_DIST,school_DIS,station_DIS,Coast_DIST,Predict Price
0,"570C, Marmion Street, Booragoon, Western Austr...",250,844,1967,2016,9415.463454,444.0,2371.0,7834.698256,1078015.0
1,"17 Aveley Street Willetton, Western Australia,...",278,809,1961,2020,9790.303325,412.0,1987.0,11857.776112,1028432.0
