In [1]:
import pandas as pd
# Show full cell contents and full sequences instead of pandas' truncated defaults.
# The fully-qualified option names are used because abbreviated names are resolved
# by pattern matching and can become ambiguous in later pandas versions.
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_seq_items", None)

import numpy as np

In [2]:
# Data directory. Defaults to the original local location, but can be overridden
# with the HP_PRED_DATA_DIR environment variable so the notebook runs on other machines.
# os.path.join(..., '') guarantees the trailing separator the f'{path}file' usages rely on.
import os
path = os.path.join(
    os.environ.get('HP_PRED_DATA_DIR', '/Users/tristangarcia/desktop/hp-pred_data/data/wa_data/'),
    ''
)

In [3]:
# Load the training split and the validation split (the latter is referred to as `test`)
train, test = (
    pd.read_csv(csv_file)
    for csv_file in (f'{path}wa_train.csv', f'{path}wa_validation.csv')
)

In [4]:
# (rows, columns) of the training set, then of the validation set
for frame in (train, test):
    print(frame.shape)

(88041, 39)
(18866, 39)


# Helper function

In [5]:
def get_group_stat(row, stat_table, groupby_col, impute_col):
    """Look up the group statistic that applies to a single row.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row) indexable by ``groupby_col``.
    stat_table : DataFrame indexed by group key, with a column ``impute_col``.
    groupby_col : name of the field in ``row`` holding the group key.
    impute_col : name of the statistic column to read from ``stat_table``.

    Returns
    -------
    The statistic for the row's group, or ``None`` when the group key is not
    present in ``stat_table``'s index.
    """
    group_key = row[groupby_col]
    if group_key not in stat_table.index:
        return None
    return stat_table.loc[group_key, impute_col]


def _first_mode(values):
    """Return the first mode of ``values``, or None when there is no non-null value."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else None


def _fill_missing_from_group(frame, stat_table, groupby_col, impute_col):
    """Return ``frame[impute_col]`` with missing entries replaced by their group's statistic."""
    # Rows whose group key is absent from stat_table map to NaN and therefore stay missing.
    group_values = frame[groupby_col].map(stat_table)
    return frame[impute_col].fillna(group_values)


def impute_by_group(train, test, groupby_col, impute_col, method):
    """Impute missing values of ``impute_col`` using a per-group statistic.

    The statistic is computed on ``train`` only and is then used to fill the
    missing values of both ``train`` and ``test``, so no information from
    ``test`` is used to compute it.

    Parameters
    ----------
    train, test : DataFrames containing ``groupby_col`` and ``impute_col``.
        Both are modified in place (the ``impute_col`` column is reassigned).
    groupby_col : column whose values define the groups.
    impute_col : column whose missing values are filled.
    method : one of ``'median'``, ``'mean'`` or ``'mode'``.

    Returns
    -------
    (train, test) : the same two DataFrames, after imputation. Values whose
    group is unknown to ``train`` (or whose group statistic is itself missing)
    remain missing.

    Raises
    ------
    ValueError : if ``method`` is not one of the supported names.
    """
    grouped = train.groupby(groupby_col)[impute_col]
    if method == 'median':
        stat_table = grouped.median()
    elif method == 'mean':
        stat_table = grouped.mean()
    elif method == 'mode':
        stat_table = grouped.agg(_first_mode)
    else:
        # Previously an unknown method fell through and failed later with an
        # UnboundLocalError on stat_table; fail early with a clear message instead.
        raise ValueError(
            f"Unsupported method {method!r}; expected 'median', 'mean' or 'mode'"
        )

    # The statistic table was computed above, before train is modified.
    train[impute_col] = _fill_missing_from_group(train, stat_table, groupby_col, impute_col)
    test[impute_col] = _fill_missing_from_group(test, stat_table, groupby_col, impute_col)

    return train, test

# Reformatting 

### Property type

In [6]:
# Frequency of each raw propertyTypes value in the training set
train['propertyTypes'].value_counts()

propertyTypes
House                                                             62024
Apartment                                                         10589
Land                                                               7909
Townhouse/Villa                                                    4815
Any                                                                1442
Rural                                                               736
House,Any                                                           160
Any,Rural                                                            90
New Developments,Apartment                                           57
Any,House                                                            52
New Developments                                                     22
Townhouse/Villa,Townhouse/Villa                                      16
House,Townhouse/Villa                                                13
Any,Land                                          

### Only possible entries:
- House
- Apartment
- Land
- Townhouse/Villa
- Rural??
- Any??

In [7]:
# Turn the comma-separated propertyTypes strings into lists of individual types
for frame in (train, test):
    frame['propertyTypes'] = frame['propertyTypes'].str.split(',')

def reformat_propertyTypes(row):
    """Collapse a list of property types into a single label.

    Returns the first entry of ``row`` that is one of the accepted types
    ('House', 'Apartment', 'Land', 'Townhouse/Villa'). Returns 'Unknown' when
    ``row`` is not a list (e.g. NaN) or contains no accepted type.
    """
    valid_types = ['House', 'Apartment', 'Land', 'Townhouse/Villa']
    # Anything that is not a list (NaN in particular) has no usable type
    if isinstance(row, list):
        for candidate in row:
            if candidate in valid_types:
                return candidate
    return 'Unknown'
        
# Reduce every propertyTypes list to a single label in both sets
for frame in (train, test):
    frame['propertyTypes'] = frame['propertyTypes'].apply(reformat_propertyTypes)

In [8]:
# Shorten the 'Townhouse/Villa' label to 'Townhouse' and rename the column to the
# singular 'propertyType'. The relabelling is applied to BOTH sets: previously only
# train was relabelled, which left the validation set with a category value
# ('Townhouse/Villa') that never occurs in the training data.
for frame in (train, test):
    frame['propertyTypes'] = frame['propertyTypes'].apply(
        lambda x: 'Townhouse' if x=='Townhouse/Villa' else x
    )
    frame.rename(columns={'propertyTypes':'propertyType'}, inplace=True)

### Features

In [9]:
# Frequency of each raw features value in the training set
train['features'].value_counts()

features
['Ensuite']                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  6461
['Air conditioning']                                                                                                                                                                                                                                                                                                                         

In [10]:
# First element of the features value in the row labelled 0; getting '[' back
# shows the value is a string rather than a list
train['features'][0][0]

'['

We can see that each value of the `features` column looks like a list but is actually a string.

In [11]:
# Parse the string representation of each features list into a real list;
# missing values become an empty list.
# https://stackoverflow.com/questions/1894269/how-to-convert-string-representation-of-list-to-a-list
import ast

for frame in (train, test):
    frame['features'] = frame['features'].apply(
        lambda x: ast.literal_eval(x) if pd.notnull(x) else []
    )

### Lowercasing categorical variables

In [12]:
def _lower_if_str(value):
    """Lowercase string cells; return every other value unchanged."""
    if isinstance(value, str):
        return value.lower()
    return value

train = train.map(_lower_if_str)
test = test.map(_lower_if_str)

In [13]:
# Lowercasing the entries of a features list
def lower_features(feature_list):
    """Return the features lowercased; an empty list is returned as-is."""
    if len(feature_list) == 0:
        return feature_list
    return [feature.lower() for feature in feature_list]

# Lowercase every features list in both sets
for frame in (train, test):
    frame['features'] = frame['features'].apply(lower_features)

# Missing values

In [16]:
# Number of missing values in each column of the training set
train.isnull().sum()

suburb                        17
bathrooms                      0
bedrooms                       0
parking                        0
landArea                   16698
latitude                     710
longitude                    710
features                       0
propertyType                   0
promoLevel                     0
soldMonth                      2
soldYear                       2
daysListed                     0
inspectionsCount               0
isRural                        0
hasDescription                 0
hasFloorplan                   0
hasDisplayPrice                0
hasPhoto                       0
photoCount                     0
suburb_medianPrice            19
suburb_medianRentPrice        19
suburb_entryLevelPrice        19
suburb_luxuryLevelPrice       19
primary                     2989
primaryDistance             2989
primaryType                 2989
secondary                   4394
secondaryDistance           4394
secondaryType               4394
listingUrl

### Sold price

In [17]:
# Rows missing either the suburb or the sold price are removed from both sets
na_cols = ['suburb','soldPrice']
for frame in (train, test):
    frame.dropna(subset=na_cols, inplace=True)

### Suburb insight features

In [18]:
# Training rows with no suburb_medianPrice, shown with their suburb and the other
# three suburb price features to see whether those are missing on the same rows
missing_median_price = train['suburb_medianPrice'].isnull()
train.loc[
    missing_median_price,
    ['suburb','suburb_medianRentPrice','suburb_entryLevelPrice','suburb_luxuryLevelPrice']
]

Unnamed: 0,suburb,suburb_medianRentPrice,suburb_entryLevelPrice,suburb_luxuryLevelPrice
5724,winthrop,,,
5942,wagin,,,
9342,ravensthorpe,,,
14077,port hedland,,,
23590,dowerin,,,
33007,porongurup,,,
34837,goode beach,,,
37202,bremer bay,,,
37569,parryville,,,
49985,bindoon,,,


In [19]:
# Median suburb_medianPrice over the winthrop training rows: the value the first
# missing row listed above should receive after imputation
is_winthrop = train["suburb"] == 'winthrop'
train.loc[is_winthrop, ['suburb_medianPrice']].median()

suburb_medianPrice    1373000.0
dtype: float64

The median house price for Winthrop in 2024 is 1,373,000
We will check train.loc[(5724)] (the Winthrop row listed above) to ensure it has been imputed with the correct value

In [20]:
# Validation rows with no suburb_medianPrice, shown with their suburb and the
# other three suburb price features
missing_median_price_test = test['suburb_medianPrice'].isnull()
test.loc[
    missing_median_price_test,
    ['suburb','suburb_medianRentPrice','suburb_entryLevelPrice','suburb_luxuryLevelPrice']
]

Unnamed: 0,suburb,suburb_medianRentPrice,suburb_entryLevelPrice,suburb_luxuryLevelPrice
2169,bremer bay,,,
2308,balcatta,,,
3031,centennial park,,,
10549,salmon gums,,,
10766,lake king,,,


In [21]:
# Median suburb_medianPrice over the balcatta training rows: the value the
# balcatta validation row listed above should receive after imputation
is_balcatta = train["suburb"] == 'balcatta'
train.loc[is_balcatta, ['suburb_medianPrice']].median()

suburb_medianPrice    681000.0
dtype: float64

The median house price for balcatta in 2024 is 681000
We will check test.loc[(2308)] (the Balcatta row listed above) to ensure it has been imputed with the correct value

In [22]:
# ------------------------------------------------------------------
# Fill the missing suburb price features with the median value of
# the corresponding suburb, for both the training and validation set
# ------------------------------------------------------------------
groupby_col = 'suburb'

# Columns to impute, one call to impute_by_group per column
suburb_features = [
    'suburb_medianPrice',
    'suburb_medianRentPrice',
    'suburb_entryLevelPrice',
    'suburb_luxuryLevelPrice',
]

for impute_col in suburb_features:
    train, test = impute_by_group(
        train, test, groupby_col, impute_col, method='median'
    )

In [23]:
# Checking correct value has been imputed: row 5724 is the winthrop row that was
# missing suburb_medianPrice, so this should equal the winthrop median (1373000.0)
train.loc[(5724)]['suburb_medianPrice']

1373000.0

In [24]:
# Checking correct value has been imputed: row 2308 is the balcatta row that was
# missing suburb_medianPrice, so this should equal the balcatta median (681000.0)
test.loc[(2308)]['suburb_medianPrice']

681000.0

### Spatial features

Using the same group-wise strategy as above (but with the suburb mean rather than the median), we will impute the missing latitude and longitude values in the train and test sets from the per-suburb values of the training set

In [25]:
# Training rows with a missing latitude, shown with their suburb and coordinates
train.loc[train['latitude'].isnull(), ['suburb','latitude','longitude']]

Unnamed: 0,suburb,latitude,longitude
271,lathlain,,
644,warwick,,
847,cunderdin,,
1133,hilbert,,
1837,beechboro,,
...,...,...,...
87384,munglinup,,
87634,midland,,
87683,broomehill,,
87905,harvey,,


In [26]:
# Mean coordinates of the lathlain training rows (expected imputed values)
lathlain_rows = train[train['suburb']=='lathlain']
print(lathlain_rows['latitude'].mean())
print(lathlain_rows['longitude'].mean())

-31.967482471287124
115.90492729801983


In [27]:
# Validation rows with a missing latitude, shown with their suburb and coordinates
test.loc[test['latitude'].isnull(), ['suburb','latitude','longitude']]

Unnamed: 0,suburb,latitude,longitude
121,hyden,,
760,north fremantle,,
914,wyalkatchem,,
969,mundaring,,
1025,bull creek,,
...,...,...,...
18454,port kennedy,,
18543,baldivis,,
18576,muntadgin,,
18603,dalkeith,,


In [28]:
# Mean coordinates of the hyden training rows (expected imputed values)
hyden_rows = train[train['suburb']=='hyden']
print(hyden_rows['latitude'].mean())
print(hyden_rows['longitude'].mean())

-32.409388324999995
118.94274615


In [29]:
########################################################
#                                                      # 
#  Imputing missing values for the spatial features    #
#  using the mean value of the corresponding suburb    #
#                                                      #
########################################################

groupby_col = 'suburb'
# Features to impute
spatial_features = ['latitude','longitude']

# Loop through each feature (note: method='mean' here, not the median used for the suburb price features)
for impute_col in spatial_features:
    train,test = impute_by_group(train, test, groupby_col, impute_col, method='mean')

In [30]:
# Row 271 is the lathlain row that had no coordinates; show what was imputed
imputed_train_row = train.loc[(271)]
print(imputed_train_row['latitude'])
print(imputed_train_row['longitude'])

-31.96748247128713
115.9049272980198


In [31]:
# Row 121 is the hyden validation row that had no coordinates; show what was imputed
imputed_test_row = test.loc[(121)]
print(imputed_test_row['latitude'])
print(imputed_test_row['longitude'])

-32.409388325
118.94274615


# KNN Imputation

In [32]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import NearestNeighbors
from geopy.distance import geodesic

### Land area
We will be using latitude and longitude to impute landArea. We suspect that landArea differs between rural and non-rural areas. There also seem to be fewer recorded neighbours in rural areas. To account for this, the data will be split into rural and non-rural areas before imputation. 

In [33]:
# Latitude and longitude drive the landArea imputation, so rows still lacking
# them are dropped and both frames are re-indexed from 0
coordinate_cols = ['latitude','longitude']
train = train.dropna(subset=coordinate_cols).reset_index(drop=True)
test = test.dropna(subset=coordinate_cols).reset_index(drop=True)

# x: training rows with a known landArea; y / y_test: rows where it is missing
land_cols = ['latitude','longitude','landArea','isRural']
land_area_known = train['landArea'].notnull()
x = train.loc[land_area_known, land_cols]
y = train.loc[~land_area_known, land_cols]
y_test = test.loc[test['landArea'].isnull(), land_cols]

In [34]:
# Sizes of the known-landArea (x) and missing-landArea (y) training subsets
for subset in (x, y):
    print(subset.shape)

(71314, 4)
(16693, 4)


- x has the non-null values
- y has the null values

In [35]:
# Split each subset by the isRural flag. The known rows keep landArea as the
# target; the rows to impute keep only the coordinates used as predictors.
known_cols = ['latitude','longitude','landArea']
predictor_cols = ['latitude','longitude']

# Rural instances
x_rural = x.loc[x['isRural']==True, known_cols]
y_rural = y.loc[y['isRural']==True, predictor_cols]
y_test_rural = y_test.loc[y_test['isRural']==True, predictor_cols]

# Non-rural instances
x_nonrural = x.loc[x['isRural']==False, known_cols]
y_nonrural = y.loc[y['isRural']==False, predictor_cols]
y_test_nonrural = y_test.loc[y_test['isRural']==False, predictor_cols]

In [36]:
# Shapes of the rural subsets, a blank line, then the non-rural subsets
for subset in (x_rural, y_rural, y_test_rural):
    print(subset.shape)
print()
for subset in (x_nonrural, y_nonrural, y_test_nonrural):
    print(subset.shape)

(4306, 3)
(42, 2)
(6, 2)

(67008, 3)
(16651, 2)
(3594, 2)


- We've now separated the rural and nonrural data
- We will use x to predict values for y

In [37]:
# Rural landArea: copy the value of the single nearest training neighbour
# (by latitude/longitude). The model is fitted on training rows only and is
# used to fill both the training and the validation gaps.
rural_predictors = x_rural[['latitude','longitude']]
imputer_rural = KNeighborsRegressor(n_neighbors=1).fit(rural_predictors, x_rural['landArea'])

# Missing rural landArea in the training set
predict_rural = imputer_rural.predict(y_rural)
train.loc[y_rural.index,'landArea'] = predict_rural

# Missing rural landArea in the validation set
predict_test_rural = imputer_rural.predict(y_test_rural)
test.loc[y_test_rural.index,'landArea'] = predict_test_rural

In [38]:
# Non-rural landArea: predict from the 3 nearest training neighbours
# (by latitude/longitude). The model is fitted on training rows only and is
# used to fill both the training and the validation gaps.
nonrural_predictors = x_nonrural[['latitude','longitude']]
imputer_nonrural = KNeighborsRegressor(n_neighbors=3).fit(nonrural_predictors, x_nonrural['landArea'])

# Missing non-rural landArea in the training set
predict_nonrural = imputer_nonrural.predict(y_nonrural)
train.loc[y_nonrural.index, 'landArea'] = predict_nonrural

# Missing non-rural landArea in the validation set
predict_test_nonrural = imputer_nonrural.predict(y_test_nonrural)
test.loc[y_test_nonrural.index,'landArea'] = predict_test_nonrural

### School features
We will be using a dataset found online which includes schools and their geospatial locations. 
https://asl.acara.edu.au/school-search
The only filter I made on the search was selecting primary, secondary and combined on the type of school (leaving special unselected)

In [39]:
# Directory holding the school location file. Defaults to the original local
# location, but can be overridden with the HP_PRED_SCHOOL_DIR environment variable.
# os.path.join(..., '') guarantees the trailing separator the f-string below it relies on.
import os
school_path = os.path.join(
    os.environ.get('HP_PRED_SCHOOL_DIR', '/Users/tristangarcia/desktop/hp-pred_data/school/'),
    ''
)
# Load the school location table from school_location.csv in the school directory
schools = pd.read_csv(f'{school_path}school_location.csv')

In [40]:
# Preview the first rows of the raw school table
schools.head()

Unnamed: 0,School Name,Suburb,State,Postcode,School Sector,School Type,Latitude,Longitude
0,Corpus Christi Catholic School,BELLERIVE,TAS,7018,Catholic,Primary,-42.871256,147.371473
1,Fahan School,SANDY BAY,TAS,7005,Independent,Combined,-42.916158,147.352764
2,Geneva Christian College,LATROBE,TAS,7307,Independent,Combined,-41.226741,146.438726
3,Holy Rosary Catholic School,CLAREMONT,TAS,7011,Catholic,Primary,-42.789375,147.248306
4,Immaculate Heart of Mary Catholic School,LENAH VALLEY,TAS,7008,Catholic,Primary,-42.865543,147.290159


In [41]:
#### Preprocessing school data ####

kept_columns = ['School Name', 'State','School Sector', 'School Type', 'Latitude', 'Longitude']
renamed_columns = {'School Name':'school','State':'state','School Sector':'sector',
                   'School Type':'type','Latitude':'latitude','Longitude':'longitude'}

# Remove rows with any missing value, then keep only the columns used later
schools = schools.dropna()
schools = schools[kept_columns]
# Lowercase every string cell
schools = schools.map(lambda x: x.lower() if isinstance(x,str) else x)
# The 'independent' sector is relabelled 'private'
schools['School Sector'] = schools['School Sector'].map(lambda x: 'private' if x=='independent' else x)
# Switch to short lowercase column names
schools = schools.rename(columns=renamed_columns)

In [42]:
# Distance in kilometres between two (latitude, longitude) points
def degree_to_km(lat1, lon1, lat2, lon2):
    """Return the geodesic distance, in kilometres, from (lat1, lon1) to (lat2, lon2)."""
    start_point = (lat1, lon1)
    end_point = (lat2, lon2)
    return geodesic(start_point, end_point).kilometers

In [43]:
# Function to impute school data for a given dataset (train or test) and school type
def impute_school_data(df, schools, school_type):
    """Fill missing nearest-school information for one school type.

    For every row of ``df`` whose ``school_type`` column is missing, the
    nearest WA school of that type (or of type 'combined') is looked up and
    used to fill three columns: ``<school_type>`` (school name),
    ``<school_type>Distance`` (geodesic distance in km) and
    ``<school_type>Type`` (school sector).

    Parameters
    ----------
    df : DataFrame with 'latitude', 'longitude' and the three school columns.
        Modified in place.
    schools : DataFrame with 'school', 'state', 'type', 'sector', 'latitude'
        and 'longitude' columns.
    school_type : name of the school column to fill; the notebook calls this
        with 'primary' and 'secondary'.

    Returns
    -------
    The same ``df`` object, after imputation.
    """
    coord_cols = ['latitude', 'longitude']

    # Rows that need imputing; nothing to do when there are none
    missing = df[school_type].isna()
    if not missing.any():
        return df

    # Candidate schools: WA schools of the requested type, plus combined schools
    is_candidate = (schools['state'] == 'wa') & schools['type'].isin([school_type, 'combined'])
    school_df = schools.loc[is_candidate, ['school', 'type', 'sector', 'latitude', 'longitude']]

    # Nearest-neighbour search on the sphere. The haversine metric expects
    # [latitude, longitude] in radians. A Euclidean search on raw degrees (the
    # previous behaviour) treats a degree of longitude like a degree of latitude,
    # so it can pick a school that is not actually the closest one.
    nn = NearestNeighbors(n_neighbors=1, algorithm='ball_tree', metric='haversine')
    nn.fit(np.radians(school_df[coord_cols].to_numpy(dtype=float)))

    missing_coords = df.loc[missing, coord_cols].to_numpy(dtype=float)
    nearest_pos = nn.kneighbors(np.radians(missing_coords), return_distance=False).ravel()
    nearest_schools = school_df.iloc[nearest_pos]

    # Distances are still reported as geodesic kilometres, as before
    distances_km = [
        degree_to_km(lat1, lon1, lat2, lon2)
        for (lat1, lon1), (lat2, lon2) in zip(
            missing_coords,
            nearest_schools[coord_cols].to_numpy()
        )
    ]

    df.loc[missing, f'{school_type}'] = nearest_schools['school'].to_numpy()
    df.loc[missing, f'{school_type}Distance'] = distances_km
    df.loc[missing, f'{school_type}Type'] = nearest_schools['sector'].to_numpy()

    return df

In [44]:
# Fill the missing primary and secondary school columns, training set first
for school_kind in ('primary', 'secondary'):
    train = impute_school_data(train, schools, school_kind)
# ...then the validation set
for school_kind in ('primary', 'secondary'):
    test = impute_school_data(test, schools, school_kind)

# Writing to file

In [47]:
# Persist the cleaned training and validation sets as pickles in the data directory
cleaned_outputs = [
    (train, f'{path}wa_train_clean.pkl'),
    (test, f'{path}wa_validation_clean.pkl'),
]
for frame, target in cleaned_outputs:
    frame.to_pickle(target)