# Setup

In [1]:
# Install dependencies (-q: quiet output, -U: upgrade to the newest available release).
# NOTE(review): versions are unpinned, so a re-run may pull different releases;
# openpyxl is not referenced by any cell visible in this notebook — confirm it is still needed.
!pip install -qU openpyxl scikit-learn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import time
import pickle
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


# Preparing Data

In [3]:
# Read the resale-price records from the CSV that sits next to the notebook
# and preview the first rows to sanity-check the columns.
RESALE_CSV_PATH = "resale-flat-prices-from-jan2017-onwards.csv"
df = pd.read_csv(RESALE_CSV_PATH)
df.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price
0,2017-01,ANG MO KIO,2 ROOM,406,ANG MO KIO AVE 10,10 TO 12,44.0,Improved,1979,61 years 04 months,232000.0
1,2017-01,ANG MO KIO,3 ROOM,108,ANG MO KIO AVE 4,01 TO 03,67.0,New Generation,1978,60 years 07 months,250000.0
2,2017-01,ANG MO KIO,3 ROOM,602,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980,62 years 05 months,262000.0
3,2017-01,ANG MO KIO,3 ROOM,465,ANG MO KIO AVE 10,04 TO 06,68.0,New Generation,1980,62 years 01 month,265000.0
4,2017-01,ANG MO KIO,3 ROOM,601,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980,62 years 05 months,265000.0


In [4]:
# Categorical encode some columns
def categorical_encode(l):
    """Map every distinct value in ``l`` to a 1-based integer code.

    The distinct values are sorted in ascending order and numbered
    1, 2, 3, ... following that order.

    Args:
        l: Iterable of hashable, mutually comparable values.

    Returns:
        dict: ``{value: code}`` for each distinct value, inserted in
        sorted order.
    """
    distinct_values = sorted(set(l))
    return {value: code for code, value in enumerate(distinct_values, start=1)}

# Give each categorical column an integer-coded "<column>_category" companion
# and print the mapping so the codes can be looked up later.
for col in ("town", "storey_range"):
    categorical_mappings = categorical_encode(df[col])
    df[f"{col}_category"] = df[col].map(categorical_mappings)
    print(categorical_mappings)

{'ANG MO KIO': 1, 'BEDOK': 2, 'BISHAN': 3, 'BUKIT BATOK': 4, 'BUKIT MERAH': 5, 'BUKIT PANJANG': 6, 'BUKIT TIMAH': 7, 'CENTRAL AREA': 8, 'CHOA CHU KANG': 9, 'CLEMENTI': 10, 'GEYLANG': 11, 'HOUGANG': 12, 'JURONG EAST': 13, 'JURONG WEST': 14, 'KALLANG/WHAMPOA': 15, 'MARINE PARADE': 16, 'PASIR RIS': 17, 'PUNGGOL': 18, 'QUEENSTOWN': 19, 'SEMBAWANG': 20, 'SENGKANG': 21, 'SERANGOON': 22, 'TAMPINES': 23, 'TOA PAYOH': 24, 'WOODLANDS': 25, 'YISHUN': 26}
{'01 TO 03': 1, '04 TO 06': 2, '07 TO 09': 3, '10 TO 12': 4, '13 TO 15': 5, '16 TO 18': 6, '19 TO 21': 7, '22 TO 24': 8, '25 TO 27': 9, '28 TO 30': 10, '31 TO 33': 11, '34 TO 36': 12, '37 TO 39': 13, '40 TO 42': 14, '43 TO 45': 15, '46 TO 48': 16, '49 TO 51': 17}


# Build & Persist Model

In [5]:
# Build a model using linear regression
def build_model(df, feature_cols=('town_category', 'floor_area_sqm', 'storey_range_category', 'lease_commence_date'), target_col='resale_price'):
    """Fit a linear regression on ``df`` and print its hold-out score.

    Args:
        df: DataFrame containing every column named in ``feature_cols``
            as well as ``target_col``.
        feature_cols: Names of the predictor columns. The same list is used
            for fitting and scoring, so the two can never drift apart; it is
            also the column order the model expects at predict time.
        target_col: Name of the column to predict.

    Returns:
        The fitted ``LinearRegression`` model.
    """
    # Accept any sequence (the default is an immutable tuple) but index the
    # DataFrame with a list so pandas selects several columns.
    feature_cols = list(feature_cols)

    # Hold out 20% of the rows for evaluation; the fixed random_state keeps
    # the split reproducible between runs.
    data_train, data_test = train_test_split(df, test_size=0.2, random_state=200)

    model = LinearRegression(n_jobs=4)
    model.fit(data_train[feature_cols], data_train[target_col])

    # Calculate the score of the model on the held-out rows
    # (for scikit-learn regressors, score() is the R^2 coefficient).
    score = model.score(data_test[feature_cols], data_test[target_col])
    print("Score: ", score)

    return model

# Time the whole build-and-persist step. perf_counter() is a monotonic,
# high-resolution clock, so the measured duration cannot be skewed by
# system clock adjustments the way time.time() can.
start_time = time.perf_counter()
model = build_model(df)
# Persist the fitted model so it can be reloaded without retraining.
with open(r"model.pkl", "wb") as output_file:
    pickle.dump(model, output_file)
end_time = time.perf_counter()
print("Build Time: " + str(round(end_time - start_time, 2)) + " seconds")

Score:  0.5613124709926463
Build Time: 0.08 seconds


# Predict With Model

In [6]:
# NOTE: pickle.load can execute arbitrary code; only load a model file that
# this notebook (or another trusted source) produced.
with open(r"model.pkl", "rb") as input_file:
    p = pickle.load(input_file)

# The model was fitted on a DataFrame with these named columns, so predict
# with a DataFrame carrying the same names in the same order. Passing bare
# lists triggers scikit-learn's feature-name mismatch warning and silently
# relies on positional order.
feature_columns = ['town_category', 'floor_area_sqm', 'storey_range_category', 'lease_commence_date']

# (label, town_category) pairs; the codes come from the town mapping printed
# by the encoding cell (1 = ANG MO KIO, 3 = BISHAN, 25 = WOODLANDS).
examples = [
    ("Ang Mo Kio unit: ", 1),
    ("Bishan unit: ", 3),
    ("Woodlands unit: ", 25),
]

# Predict the y value for a given x value: a 90 sqm flat, storey range code 4
# ('10 TO 12'), lease commenced in 1980, varying only the town.
for label, town_code in examples:
    x = pd.DataFrame([[town_code, 90, 4, 1980]], columns=feature_columns)
    y = p.predict(x)
    print(label, y)

Ang Mo Kio unit:  [470373.30350888]
Bishan unit:  [464645.33660725]
Woodlands unit:  [401637.70068928]


