### Cleaning steps for the test data (Kaggle submission)

The following code cleans the test dataset so that it can be used for the Kaggle submission.

In [11]:
#Import the required packages for cleaning.
import pandas as pd
import numpy as np
import math

In [12]:
# Lift pandas' display limits so that every column and every row is shown.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [13]:
test_df = pd.read_csv("../datasets/test.csv")

# Yes/No flag columns that are stored as "Y"/"N" in the raw file.
columns_to_replace = [
    "residential",
    "commercial",
    "market_hawker",
    "multistorey_carpark",
    "precinct_pavilion",
]

# Encode the flags numerically: "N" -> 0, "Y" -> 1.
test_df[columns_to_replace] = test_df[columns_to_replace].replace({"N": 0, "Y": 1})

# Count-style columns in which a missing value is filled with 0.
# NOTE: these names use the raw mixed-case headers, so this step has to run
# before the column names are lower-cased below.
columns_to_impute = [
    "Mall_Within_500m",
    "Mall_Within_1km",
    "Mall_Within_2km",
    "Hawker_Within_500m",
    "Hawker_Within_1km",
    "Hawker_Within_2km",
]
test_df[columns_to_impute] = test_df[columns_to_impute].fillna(0, axis=1)

# Use lower-case column names from here on.
test_df = test_df.rename(columns=str.lower)

# Missing mall distances are filled with the column mean.
test_df["mall_nearest_distance"] = test_df["mall_nearest_distance"].fillna(
    test_df["mall_nearest_distance"].mean()
)

# The placeholder postal code "NIL" is replaced with 540538 (described by the
# original author as a nearest-neighbour imputation); postal codes are then
# stored as zero-padded 6-character strings.
test_df["postal"] = (
    test_df["postal"].replace("NIL", "540538").astype("str").str.zfill(6)
)

# Feature engineering: the postal sector is the first two digits of the postal code.
test_df["postal_sector"] = test_df["postal"].str.slice(stop=2)

# Calculate the distance between each resale flat and CBD (Raffles Place mrt)
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance between two points, in metres.

    The distance is computed with the haversine formula on a sphere of
    radius 6371 km and then converted from kilometres to metres.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in decimal degrees.

    Returns
    -------
    float
        Distance between the two points in metres. NaN inputs yield NaN.
    """
    R = 6371  # Radius of the earth in km
    dLat = math.radians(lat2 - lat1)
    dLon = math.radians(lon2 - lon1)
    sin_half_dlat = math.sin(dLat / 2)
    sin_half_dlon = math.sin(dLon / 2)
    a = sin_half_dlat * sin_half_dlat + math.cos(math.radians(lat1)) \
        * math.cos(math.radians(lat2)) * sin_half_dlon * sin_half_dlon
    # For (near-)antipodal points rounding can push `a` a few ulps above 1,
    # which would make sqrt(1 - a) raise a math domain error. The comparison
    # is False for NaN, so NaN inputs still propagate as NaN.
    if a > 1.0:
        a = 1.0
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    # R * c is the distance in km; the factor 1000 converts it to metres.
    return R * c * 1000

# Straight-line distance between each resale flat and Raffles Place MRT
# (latitude 1.283931, longitude 103.851461), used as the CBD reference point.
# haversine() returns metres, so dist_CBD is in metres (not km).
# Iterating over the two coordinate columns avoids building a full row Series
# per record, which DataFrame.apply(axis=1) would do.
test_df['dist_CBD'] = [
    haversine(flat_lat, flat_lon, 1.283931, 103.851461)
    for flat_lat, flat_lon in zip(test_df['latitude'], test_df['longitude'])
]

  test_df = pd.read_csv("../datasets/test.csv")


In [14]:
# Write the cleaned test data to disk, without the index column.
test_df.to_csv(path_or_buf="../datasets/cleaned_test.csv", index=False)

In [15]:
# Print a summary of the cleaned test frame (column names, non-null counts and
# dtypes) as a sanity check that the cleaning steps left no unexpected nulls.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16737 entries, 0 to 16736
Data columns (total 79 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         16737 non-null  int64  
 1   tranc_yearmonth            16737 non-null  object 
 2   town                       16737 non-null  object 
 3   flat_type                  16737 non-null  object 
 4   block                      16737 non-null  object 
 5   street_name                16737 non-null  object 
 6   storey_range               16737 non-null  object 
 7   floor_area_sqm             16737 non-null  float64
 8   flat_model                 16737 non-null  object 
 9   lease_commence_date        16737 non-null  int64  
 10  tranc_year                 16737 non-null  int64  
 11  tranc_month                16737 non-null  int64  
 12  mid_storey                 16737 non-null  int64  
 13  lower                      16737 non-null  int

### Kaggle prediction

In [16]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MinMaxScaler

from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [17]:
# Build the prediction model from the cleaned training data.
# Load the cleaned training set and treat postal_sector as a categorical
# (string) label rather than a number.
df = pd.read_csv("../datasets/cleaned_train.csv").astype({"postal_sector": "str"})
# Rows whose flat_model is "2-room" are excluded from the modelling data.
df = df.loc[df["flat_model"].ne("2-room")]

# Feature matrix: numeric predictors plus one-hot encoded postal_sector and
# full_flat_type (categorical -> numerical).
X = pd.get_dummies(
    data=df[
        [
            "tranc_year",
            "floor_area_sqm",
            "hdb_age",
            "full_flat_type",
            "mrt_nearest_distance",
            "mall_nearest_distance",
            "postal_sector",
            "dist_CBD",
            "mid",
        ]
    ],
    columns=["postal_sector", "full_flat_type"],
)

# Target variable.
y = df["resale_price"]

# Hold out part of the training data (default split size, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Instantiate the linear regression model and fit it on the training split.
LR = LinearRegression()
LR.fit(X_train, y_train)

# We will use the LR linear regression model for the prediction of the test dataset. 

In [19]:
# Build the feature matrix for the test data with the same features and the
# same one-hot encoding as the training matrix X used to fit LR.
X_test_df = test_df[["tranc_year","floor_area_sqm","hdb_age","full_flat_type","mrt_nearest_distance","mall_nearest_distance","postal_sector","dist_CBD", "mid"]].copy()

# test_df holds postal_sector as a zero-padded two-character string (e.g. "05"),
# whereas the training labels were read back from CSV as numbers and then cast
# to str, which drops the padding (e.g. "5" -> column "postal_sector_5").
# Normalise the test labels the same way; otherwise flats in sectors 01-09 get
# dummy columns that do not exist in X and end up with no sector indicator.
X_test_df["postal_sector"] = X_test_df["postal_sector"].astype(int).astype(str)

X_test_df = pd.get_dummies(data = X_test_df, columns =["postal_sector","full_flat_type"])

# Align with the training design matrix: dummy columns that do not occur in
# the test data are created and filled with 0, columns unknown to the model are
# dropped, and the column order is made identical to X.
X_test_df = X_test_df.reindex(columns=X.columns, fill_value=0)

In [20]:
# Predict resale prices for the test data with the fitted LR model.
LR_pred_df = LR.predict(X_test_df)
predictions = pd.DataFrame({"Predicted": LR_pred_df})
# Attach the predictions to the test rows by matching on the index.
output = test_df.merge(predictions, left_index=True, right_index=True)
# Keep only the identifier and the prediction, renaming "id" to "Id" for the
# submission file ("Predicted" already has its final name).
final_output = output.loc[:, ["id", "Predicted"]]
final = final_output.rename(columns={"id": "Id"})
# Save the submission as a csv file to upload on Kaggle.
final.to_csv("../datasets/predicted_resale_price_final.csv", index=False)

### Kaggle submission result



![](../imgs/kaggle_final.png)