In [1]:
import pandas as pd
pd.set_option("max_colwidth", None)
pd.set_option("max_seq_items", None)

import numpy as np

In [2]:
# NOTE(review): absolute, machine-specific directory; the trailing slash matters
# because the file names are appended to it directly in the f-strings that read the CSVs.
in_path = '/Users/tristangarcia/desktop/hp-pred_data/data/wa_data/'

In [3]:
train = pd.read_csv(f'{in_path}wa_train.csv')
test = pd.read_csv(f'{in_path}wa_validation.csv')  # validation set

In [4]:
print(train.shape)
print(test.shape)

(88064, 39)
(18871, 39)


# Reformatting 

### Property type

In [5]:
train['propertyTypes'].value_counts()

propertyTypes
House                                                             62055
Apartment                                                         10604
Land                                                               7870
Townhouse/Villa                                                    4824
Any                                                                1435
Rural                                                               735
House,Any                                                           171
Any,Rural                                                            90
New Developments,Apartment                                           56
Any,House                                                            53
New Developments                                                     23
Townhouse/Villa,Townhouse/Villa                                      16
Any,Land                                                             13
Any,Any                                           

### Only possible entries:
- House
- Apartment
- Land
- Townhouse/Villa
- Rural??
- Any??

In [6]:
train['propertyTypes'] = train['propertyTypes'].str.split(',')
test['propertyTypes'] = test['propertyTypes'].str.split(',')

def reformat_propertyTypes(row):
    """Reduce a list of listed property types to one recognised type.

    Args:
        row: The split 'propertyTypes' entry; a list of strings, or a
            non-list value (e.g. NaN) when the entry was missing.

    Returns:
        The first element of ``row`` that is a recognised property type, or
        'Unknown' when ``row`` is not a list or holds no recognised type.
    """
    recognised = ('House', 'Apartment', 'Land', 'Townhouse/Villa', 'Rural')
    # Missing entries are not lists after the split
    if not isinstance(row, list):
        return 'Unknown'
    # First recognised entry wins; fall back to 'Unknown' when none match
    return next((entry for entry in row if entry in recognised), 'Unknown')
        
# Applying the function
train['propertyTypes'] = train['propertyTypes'].apply(reformat_propertyTypes)
test['propertyTypes'] = test['propertyTypes'].apply(reformat_propertyTypes)

In [7]:
train['propertyTypes'].value_counts()

propertyTypes
House              62313
Apartment          10704
Land                7883
Townhouse/Villa     4858
Unknown             1472
Rural                834
Name: count, dtype: int64

### Features

In [8]:
train['features'].value_counts()

features
['Ensuite']                                                                                                                                                                                            6423
['Air conditioning']                                                                                                                                                                                   1030
['Secure Parking']                                                                                                                                                                                      689
['Study']                                                                                                                                                                                               503
['Air conditioning', 'Built in wardrobes']                                                                                                                                     

In [9]:
train['features'][0][0]

'['

We can see that each entry of the 'features' column looks like a list but is actually a string

In [10]:
# Converting the 'string lists' into lists
# https://stackoverflow.com/questions/1894269/how-to-convert-string-representation-of-list-to-a-list
import ast

def _parse_feature_list(raw):
    """Turn the string form of a feature list into a real list ([] when missing)."""
    if pd.notnull(raw):
        return ast.literal_eval(raw)
    return []

train['features'] = train['features'].apply(_parse_feature_list)
test['features'] = test['features'].apply(_parse_feature_list)

### Lowercasing categorical variables

In [11]:
def _lowercase_if_str(value):
    """Return ``value`` lowercased when it is a string, otherwise unchanged."""
    if isinstance(value, str):
        return value.lower()
    return value

# Applied element-wise to every cell; non-string values (numbers, the list-valued
# 'features' entries) pass through untouched
train = train.map(_lowercase_if_str)
test = test.map(_lowercase_if_str)

# Irrelevant features

In [12]:
train.columns

Index(['listingId', 'unitNumber', 'streetNumber', 'street', 'suburb', 'state',
       'postcode', 'bathrooms', 'bedrooms', 'parking', 'landArea', 'latitude',
       'longitude', 'features', 'agency', 'propertyTypes', 'promoLevel',
       'soldMonth', 'soldYear', 'daysListed', 'inspectionsCount', 'isRural',
       'hasDescription', 'hasFloorplan', 'hasDisplayPrice', 'hasPhoto',
       'photoCount', 'suburb_medianPrice', 'suburb_medianRentPrice',
       'suburb_entryLevelPrice', 'suburb_luxuryLevelPrice', 'primary',
       'primaryDistance', 'primaryType', 'secondary', 'secondaryDistance',
       'secondaryType', 'listingUrl', 'soldPrice'],
      dtype='object')

### Listing of irrelevant columns:
listingId, unitNumber, streetNumber, street, listingUrl

In [13]:
# Columns treated as irrelevant; the same set is removed from both data sets
irrelevant_cols = ['listingId','unitNumber','streetNumber','street','listingUrl']
train = train.drop(columns=irrelevant_cols)
test = test.drop(columns=irrelevant_cols)

# Missing values

In [14]:
train.isnull().sum()

suburb                        15
state                         15
postcode                      15
bathrooms                      0
bedrooms                       0
parking                        0
landArea                   16782
latitude                     708
longitude                    708
features                       0
agency                         0
propertyTypes                  0
promoLevel                     0
soldMonth                      0
soldYear                       0
daysListed                     0
inspectionsCount               0
isRural                        0
hasDescription                 0
hasFloorplan                   0
hasDisplayPrice                0
hasPhoto                       0
photoCount                     0
suburb_medianPrice             7
suburb_medianRentPrice         7
suburb_entryLevelPrice         7
suburb_luxuryLevelPrice        7
primary                     6285
primaryDistance             6285
primaryType                 6285
secondary 

### Sold price

In [15]:
na_cols = ['suburb','state','postcode','soldPrice']
# Drop every row that has a missing value in any of the columns above
train.dropna(subset=na_cols, inplace=True)
test.dropna(subset=na_cols, inplace=True)

### Suburb insight features

In [16]:
# Show the training rows with a missing suburb_medianPrice together with the other
# suburb price columns, to check whether suburb_medianRentPrice, suburb_entryLevelPrice
# and suburb_luxuryLevelPrice are missing on those same rows
train.loc[train['suburb_medianPrice'].isnull(), ['suburb','soldYear','suburb_medianRentPrice','suburb_entryLevelPrice',
                                               'suburb_luxuryLevelPrice']]

Unnamed: 0,suburb,soldYear,suburb_medianRentPrice,suburb_entryLevelPrice,suburb_luxuryLevelPrice
3740,gnangara,2022.0,,,
5587,green head,2024.0,,,
24563,quinns rocks,2024.0,,,
52450,wembley,2024.0,,,
52719,jindalee,2023.0,,,
63920,derby,2023.0,,,
80297,lyalls mill,2022.0,,,


In [17]:
# Input price for the first instance 
train.loc[(train["soldYear"] == 2024) & (train["suburb"] == 'quinns rocks'),['suburb_medianPrice']].median()

suburb_medianPrice    795000.0
dtype: float64

The median house price for Quinns Rocks in 2024 is 795,000
We will check train.loc[(24563)] to ensure it has been imputed with the correct value

In [18]:
test.loc[test['suburb_medianPrice'].isnull(), ['suburb','soldYear','suburb_medianRentPrice','suburb_entryLevelPrice',
                                               'suburb_luxuryLevelPrice']]

Unnamed: 0,suburb,soldYear,suburb_medianRentPrice,suburb_entryLevelPrice,suburb_luxuryLevelPrice
11432,julimar,2024.0,,,
18783,cranbrook,2020.0,,,


In [19]:
# Input price for the first instance 
train.loc[(train["soldYear"] == 2024) & (train["suburb"] == 'julimar'),['suburb_medianPrice']].median()

suburb_medianPrice    0.0
dtype: float64

The median house price for Julimar in 2024 is 0
We will check test.loc[(11432)] to ensure it has been imputed with the correct value

In [20]:
# Function to impute median values
def impute_median(row, median_table, feature):
    """Look up a (suburb, soldYear) median for one row.

    Args:
        row: Mapping/Series providing 'suburb' and 'soldYear'.
        median_table: Frame indexed by (suburb, soldYear) holding the medians.
        feature: Name of the column of ``median_table`` to read.

    Returns:
        The value stored for the row's (suburb, soldYear) pair, or None when
        that pair is not present in the table's index.
    """
    lookup_key = (row['suburb'], row['soldYear'])
    if lookup_key not in median_table.index:
        return None
    return median_table.loc[lookup_key, feature]

# Suburb-level price columns whose gaps are filled
suburb_features = ['suburb_medianPrice', 'suburb_medianRentPrice',
                   'suburb_entryLevelPrice', 'suburb_luxuryLevelPrice']

for feature in suburb_features:
    # Lookup frame: training-set median of this feature per (suburb, soldYear)
    median_table = train.groupby(['suburb', 'soldYear'])[feature].median().to_frame()

    def fill_missing(record):
        """Keep an existing value; otherwise take the median from the lookup frame."""
        current = record[feature]
        if pd.notnull(current):
            return current
        return impute_median(record, median_table, feature)

    # The same training-derived table fills both the training and the validation set
    train[feature] = train.apply(fill_missing, axis=1)
    test[feature] = test.apply(fill_missing, axis=1)

In [21]:
# Checking correct value has been imputed
train.loc[(24563)]['suburb_medianPrice']

795000.0

In [22]:
# Checking correct value has been imputed
test.loc[(11432)]['suburb_medianPrice']

0.0

### Spatial features

Using the same imputation strategy as above, we will be imputing the missing values for latitude and longitude on the train and test sets based on the suburb of the training set

In [23]:
train.loc[train['latitude'].isnull(), ['suburb','latitude','longitude']]

Unnamed: 0,suburb,latitude,longitude
2,fremantle,,
197,bunbury,,
246,narrogin,,
305,esperance,,
420,crawley,,
...,...,...,...
87558,west leederville,,
87643,mosman park,,
87776,nedlands,,
87800,courtenay,,


In [24]:
print(train[train['suburb']=='fremantle']['latitude'].median())
print(train[train['suburb']=='fremantle']['longitude'].median())

-32.0519239
115.75531


In [25]:
test.loc[test['latitude'].isnull(), ['suburb','latitude','longitude']]

Unnamed: 0,suburb,latitude,longitude
139,harvey,,
269,hamilton hill,,
459,dalkeith,,
464,mount lawley,,
633,inglewood,,
...,...,...,...
17800,changerup,,
18122,thornlie,,
18424,langford,,
18778,toodyay,,


In [26]:
print(train[train['suburb']=='harvey']['latitude'].median())
print(train[train['suburb']=='harvey']['longitude'].median())

-33.0811946
115.89696


In [27]:
# Function to impute median values
def impute_median(row, median_table, feature):
    """Look up a per-suburb median for one row.

    Args:
        row: Mapping/Series providing 'suburb'.
        median_table: Frame indexed by suburb holding the medians.
        feature: Name of the column of ``median_table`` to read.

    Returns:
        The value stored for the row's suburb, or None when the suburb is not
        present in the table's index.
    """
    suburb = row['suburb']
    if suburb not in median_table.index:
        return None
    return median_table.loc[suburb, feature]

# Coordinate columns whose gaps are filled
spatial_features = ['latitude','longitude']

for feature in spatial_features:
    # Lookup frame: training-set median of this coordinate per suburb
    median_table = train.groupby(['suburb'])[feature].median().to_frame()

    def fill_missing(record):
        """Keep an existing coordinate; otherwise take the suburb median."""
        current = record[feature]
        if pd.notnull(current):
            return current
        return impute_median(record, median_table, feature)

    # The same training-derived table fills both the training and the validation set
    train[feature] = train.apply(fill_missing, axis=1)
    test[feature] = test.apply(fill_missing, axis=1)

In [28]:
print(train.loc[(2)]['latitude'])
print(train.loc[(2)]['longitude'])

-32.0519239
115.75531


In [29]:
print(test.loc[(139)]['latitude'])
print(test.loc[(139)]['longitude'])

-33.0811946
115.89696


### School features

In [30]:
def impute_mode(row,mode_table,feature):
    """Look up a per-suburb value (the stored mode) for one row.

    Args:
        row: Mapping/Series providing 'suburb'.
        mode_table: Frame indexed by suburb.
        feature: Name of the column of ``mode_table`` to read.

    Returns:
        The value stored for the row's suburb, or None when the suburb is not
        present in the table's index.
    """
    suburb = row['suburb']
    if suburb not in mode_table.index:
        return None
    return mode_table.loc[suburb, feature]

school_features = ['primary','secondary']

def _first_mode(values):
    """Return the first mode of ``values``, or None when there is no mode."""
    return values.mode()[0] if not values.mode().empty else None

for feature in school_features:
    # Lookup frame: most frequent school recorded for each suburb in the training set
    mode_table = train.groupby(['suburb'])[feature].agg(_first_mode).to_frame()

    def fill_missing(record):
        """Keep an existing school name; otherwise take the suburb's mode."""
        current = record[feature]
        if pd.notnull(current):
            return current
        return impute_mode(record, mode_table, feature)

    # The same training-derived table fills both the training and the validation set
    train[feature] = train.apply(fill_missing, axis=1)
    test[feature] = test.apply(fill_missing, axis=1)

In [31]:
train['primary'].value_counts()

primary
highgate primary school                    1344
st joseph's school                          843
subiaco primary school                      836
st brigid's school                          601
maylands peninsula primary school           585
                                           ... 
warlawurru catholic school                    1
east kalgoorlie primary school                1
meadow springs education support centre       1
kalannie primary school                       1
riverton education support centre             1
Name: count, Length: 688, dtype: int64

### Imputing schoolType
Grouping by schools, the mode of the schoolType for each school will be imputed to missing values

In [32]:
def impute_mode(row,mode_table):
    """Look up the 'primaryType' stored for the row's primary school.

    Args:
        row: Mapping/Series providing 'primary'.
        mode_table: Frame indexed by primary-school name with a
            'primaryType' column.

    Returns:
        The stored 'primaryType', or None when the school is not present in
        the table's index.
    """
    school = row['primary']
    if school not in mode_table.index:
        return None
    return mode_table.loc[school, 'primaryType']

# Lookup frame: most frequent primaryType per primary school in the training set
mode_table = (
    train.groupby(['primary'])['primaryType']
    .agg(lambda x: x.mode()[0] if not x.mode().empty else None)
    .to_frame()
)

def _fill_primary_type(record):
    """Keep a recorded primaryType; otherwise take the school's mode."""
    recorded = record['primaryType']
    return recorded if pd.notnull(recorded) else impute_mode(record, mode_table)

# The same training-derived table fills both the training and the validation set
train['primaryType'] = train.apply(_fill_primary_type, axis=1)
test['primaryType'] = test.apply(_fill_primary_type, axis=1)

In [33]:
def impute_mode(row,mode_table):
    """Look up the 'secondaryType' stored for the row's secondary school.

    Args:
        row: Mapping/Series providing 'secondary'.
        mode_table: Frame indexed by secondary-school name with a
            'secondaryType' column.

    Returns:
        The stored 'secondaryType', or None when the school is not present in
        the table's index.
    """
    school = row['secondary']
    if school not in mode_table.index:
        return None
    return mode_table.loc[school, 'secondaryType']

# Lookup frame: most frequent secondaryType per secondary school in the training set
mode_table = (
    train.groupby(['secondary'])['secondaryType']
    .agg(lambda x: x.mode()[0] if not x.mode().empty else None)
    .to_frame()
)

def _fill_secondary_type(record):
    """Keep a recorded secondaryType; otherwise take the school's mode."""
    recorded = record['secondaryType']
    return recorded if pd.notnull(recorded) else impute_mode(record, mode_table)

# The same training-derived table fills both the training and the validation set
train['secondaryType'] = train.apply(_fill_secondary_type, axis=1)
test['secondaryType'] = test.apply(_fill_secondary_type, axis=1)

### Imputing schoolDistance
Grouping by school, the median schoolDistance for each school will be imputed into the missing values

In [34]:
def impute_median(row,mode_table):
    """Look up the median 'primaryDistance' for the row's primary school.

    Args:
        row: Mapping/Series providing 'primary'.
        mode_table: Frame indexed by primary-school name with a
            'primaryDistance' column of medians. The parameter keeps its
            original name so existing positional and keyword calls still work.

    Returns:
        The stored median distance, or None when the school is not present in
        the table's index.
    """
    # Read from the table that was passed in. The previous body ignored this
    # argument and read a notebook-level `median_table` global instead, so it
    # silently used whatever table the last-run cell had left behind.
    school = row['primary']
    if school in mode_table.index:
        return mode_table.loc[school, 'primaryDistance']
    return None

# Lookup frame: training-set median primaryDistance per primary school
median_table = train.groupby(['primary'])['primaryDistance'].median().to_frame()

def _fill_primary_distance(record):
    """Keep a recorded primaryDistance; otherwise take the school's median."""
    recorded = record['primaryDistance']
    return recorded if pd.notnull(recorded) else impute_median(record, median_table)

# The same training-derived table fills both the training and the validation set
train['primaryDistance'] = train.apply(_fill_primary_distance, axis=1)
test['primaryDistance'] = test.apply(_fill_primary_distance, axis=1)

In [35]:
def impute_median(row,mode_table):
    """Look up the median 'secondaryDistance' for the row's secondary school.

    Args:
        row: Mapping/Series providing 'secondary'.
        mode_table: Frame indexed by secondary-school name with a
            'secondaryDistance' column of medians. The parameter keeps its
            original name so existing positional and keyword calls still work.

    Returns:
        The stored median distance, or None when the school is not present in
        the table's index.
    """
    # Read from the table that was passed in. The previous body ignored this
    # argument and read a notebook-level `median_table` global instead, so it
    # silently used whatever table the last-run cell had left behind.
    school = row['secondary']
    if school in mode_table.index:
        return mode_table.loc[school, 'secondaryDistance']
    return None

# Lookup frame: training-set median secondaryDistance per secondary school
median_table = train.groupby(['secondary'])['secondaryDistance'].median().to_frame()

def _fill_secondary_distance(record):
    """Keep a recorded secondaryDistance; otherwise take the school's median."""
    recorded = record['secondaryDistance']
    return recorded if pd.notnull(recorded) else impute_median(record, median_table)

# The same training-derived table fills both the training and the validation set
train['secondaryDistance'] = train.apply(_fill_secondary_distance, axis=1)
test['secondaryDistance'] = test.apply(_fill_secondary_distance, axis=1)

### kNN Imputation for landArea
We will be using latitude and longitude to impute landArea. We suspect that landArea differs between rural and non-rural areas. There also seem to be fewer recorded neighbours in rural areas. To account for this, the data will be split into rural and non-rural areas before imputation. 

In [36]:
# Latitude and longitude are the inputs for the landArea imputation, so rows
# still missing either one are dropped and the index is renumbered
train = train.dropna(subset=['latitude','longitude']).reset_index(drop=True)
test = test.dropna(subset=['latitude','longitude']).reset_index(drop=True)
# Separating rows with a known landArea (x) from rows where it is missing (y, y_test)
knn_columns = ['latitude','longitude','landArea','isRural']
train_area_missing = train['landArea'].isnull()
x = train.loc[~train_area_missing, knn_columns]
y = train.loc[train_area_missing, knn_columns]
y_test = test.loc[test['landArea'].isnull(), knn_columns]

In [37]:
print(x.shape)
print(y.shape)

(71258, 4)
(16778, 4)


- x has the non-null values
- y has the null values

In [38]:
def _split_by_rural(frame, columns):
    """Return the (rural, non-rural) rows of ``frame`` restricted to ``columns``."""
    rural_rows = frame.loc[frame['isRural']==True, columns]
    nonrural_rows = frame.loc[frame['isRural']==False, columns]
    return rural_rows, nonrural_rows

# Known-landArea rows keep the target column; rows to impute keep only the coordinates
x_rural, x_nonrural = _split_by_rural(x, ['latitude','longitude','landArea'])
y_rural, y_nonrural = _split_by_rural(y, ['latitude','longitude'])
y_test_rural, y_test_nonrural = _split_by_rural(y_test, ['latitude','longitude'])

In [39]:
print(x_rural.shape)
print(y_rural.shape)
print(y_test_rural.shape)
print()
print(x_nonrural.shape)
print(y_nonrural.shape)
print(y_test_nonrural.shape)

(4308, 3)
(40, 2)
(9, 2)

(66950, 3)
(16738, 2)
(3506, 2)


- We've now separated the rural and nonrural data
- We will use x to predict values for y

In [40]:
from sklearn.neighbors import KNeighborsRegressor

In [41]:
# Nearest-neighbour regressor for rural rows: with n_neighbors=1 each prediction
# is based on the single closest training row.
# NOTE(review): distances are computed on raw latitude/longitude values with the
# estimator's default metric — confirm that is acceptable for this data.
imputer_rural = KNeighborsRegressor(n_neighbors=1)
# Fit on rural training rows with a known landArea (coordinates -> landArea)
imputer_rural.fit(x_rural[['latitude','longitude']], x_rural['landArea'])
# Predict the missing rural landArea in the training set and write it back
# using the original row labels carried by y_rural.index
predict_rural = imputer_rural.predict(y_rural)
train.loc[y_rural.index,'landArea'] = predict_rural
# Same for the validation set, using the model fitted on training data only
predict_test_rural = imputer_rural.predict(y_test_rural)
test.loc[y_test_rural.index,'landArea'] = predict_test_rural

In [42]:
# Nearest-neighbour regressor for non-rural rows: each prediction combines the
# 3 closest training rows.
# NOTE(review): distances are computed on raw latitude/longitude values with the
# estimator's default metric and neighbour weighting — confirm that is acceptable.
imputer_nonrural = KNeighborsRegressor(n_neighbors=3)
# Fit on non-rural training rows with a known landArea (coordinates -> landArea)
imputer_nonrural.fit(x_nonrural[['latitude','longitude']], x_nonrural['landArea'])
# Predict the missing non-rural landArea in the training set and write it back
# using the original row labels carried by y_nonrural.index
predict_nonrural = imputer_nonrural.predict(y_nonrural)
train.loc[y_nonrural.index, 'landArea'] = predict_nonrural
# Same for the validation set, using the model fitted on training data only
predict_test_nonrural = imputer_nonrural.predict(y_test_nonrural)
test.loc[y_test_nonrural.index,'landArea'] = predict_test_nonrural

### Removing the rest of the missing values
Some null values still remain even after imputation as there isn't enough information to impute on other values

In [43]:
# Drop every row that still has a missing value in any column, then renumber the index
train = train.dropna().reset_index(drop=True)
test = test.dropna().reset_index(drop=True)

for remaining in (train, test):
    print(remaining.shape)

(79455, 34)
(17005, 34)


# Duplicated values

In [44]:
# Duplicates are detected on every column except 'features'
# (presumably because its list values cannot be hashed for comparison)
train_is_duplicate = train.drop(columns=['features']).duplicated()
# Number of duplicates in training set
print(int(train_is_duplicate.sum()))
train = train.loc[~train_is_duplicate]

test_is_duplicate = test.drop(columns=['features']).duplicated()
# Number of duplicates in validation set
print(int(test_is_duplicate.sum()))
test = test.loc[~test_is_duplicate]

46
3
