In [1]:
import pandas as pd
# Notebook display settings: show every column, never truncate cell text or
# long sequences, and render floats with two decimal places.
display_options = {
    'display.max_columns': None,
    "max_colwidth": None,
    "max_seq_items": None,
    'display.float_format': '{:.2f}'.format,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

import numpy as np

In [2]:
# Root folder of the WA datasets. The trailing slash matters: file names are
# appended directly to this string in the f-strings used for reading/writing.
path = '/Users/tristangarcia/desktop/hp-pred_data/data/wa_data/'

In [3]:
# Load the training split and the validation split from CSV.
# NOTE: despite its name, `test` holds the validation set.
train = pd.read_csv(f'{path}wa_train.csv')
test = pd.read_csv(f'{path}wa_validation.csv')  # validation set

In [4]:
# (rows, columns) of each split straight after loading
print(train.shape)
print(test.shape)

(81106, 11)
(17380, 11)


# Filter

In [5]:
# Number of training rows per sale year
train['soldYear'].value_counts()

soldYear
2024.00    21088
2023.00    19325
2022.00    14597
2021.00    10301
2020.00     5136
2019.00     2984
2018.00     2321
2017.00     1897
2016.00     1422
2015.00     1108
2014.00      586
2013.00      197
2010.00       44
2012.00       43
2009.00       31
2011.00       20
2008.00        5
Name: count, dtype: int64

In [6]:
# Keep only rows whose soldYear is greater than 2017; earlier years have too
# few records. Rows with a missing soldYear fail the comparison and are
# therefore dropped as well.
recent_train_mask = train['soldYear'].gt(2017)
recent_test_mask = test['soldYear'].gt(2017)
train = train.loc[recent_train_mask]
test = test.loc[recent_test_mask]

### Sold price

In [7]:
# Discard implausible sale prices: keep rows strictly between 100,000 and
# 10,000,000 (both bounds excluded). Missing prices fail the test and are
# removed too.
train = train[train['soldPrice'].between(100000, 10000000, inclusive='neither')]

test = test[test['soldPrice'].between(100000, 10000000, inclusive='neither')]

In [8]:
# Columns that must be populated for a row to be kept
na_cols = ['suburb','bathrooms','bedrooms','parking',
           'soldYear','soldPrice']
# Drop every row with a missing value in ANY of the required columns above
# (not just soldPrice). Reassigning instead of using inplace=True avoids
# mutating a frame that was produced by boolean filtering.
train = train.dropna(subset=na_cols)
test = test.dropna(subset=na_cols)

In [9]:
# Shapes after the year, price and missing-value filters
print(train.shape)
print(test.shape)

(75191, 11)
(16103, 11)


# Reformatting 

In [10]:
# Training shape before any reformatting is applied
train.shape

(75191, 11)

### Property type

In [11]:
# Raw propertyType values; some rows hold several comma-separated types
train['propertyType'].value_counts()

propertyType
House                                                       60693
Apartment                                                   10598
Townhouse/Villa                                              2884
Any                                                           775
House,Any                                                     167
House,House                                                    14
Apartment,House,Townhouse/Villa,Townhouse/Villa                10
Apartment,House,Townhouse/Villa                                 8
House,Townhouse/Villa                                           8
Apartment,House                                                 6
Apartment,Townhouse/Villa                                       5
House,Land                                                      4
Townhouse/Villa,House                                           3
Apartment,New Developments                                      3
Townhouse/Villa,New Developments                               

### Only possible entries:
- House
- Apartment
- Land
- Townhouse/Villa
- Rural??
- Any??

In [12]:
# Turn comma-separated type strings into lists, e.g. 'House,Land' -> ['House', 'Land'].
# Missing values are not strings, so they stay NaN instead of becoming lists.
train['propertyType'] = train['propertyType'].str.split(',')
test['propertyType'] = test['propertyType'].str.split(',')

def reformat_propertyTypes(row):
    """Collapse a list of property types into a single label.

    Parameters
    ----------
    row : list of str, or any non-list value
        The split `propertyType` entry for one listing. Anything that is not
        a list (for example NaN) is treated as missing.

    Returns
    -------
    str
        The first element of `row` that is 'House' or 'Apartment', or
        'Unknown' when `row` is not a list or contains neither.
    """
    accepted = ('House', 'Apartment')
    if isinstance(row, list):
        # next() stops at the first accepted type, preserving list order
        return next((entry for entry in row if entry in accepted), 'Unknown')
    return 'Unknown'
        
# Replace each list of types with a single label ('House', 'Apartment' or 'Unknown')
for split in (train, test):
    split['propertyType'] = split['propertyType'].apply(reformat_propertyTypes)

In [13]:
# Relabel apartments as 'unit' (case-insensitive match on 'apartment');
# every other label is left unchanged.
train['propertyType'] = train['propertyType'].mask(
    train['propertyType'].str.lower().eq('apartment'), 'unit')
test['propertyType'] = test['propertyType'].mask(
    test['propertyType'].str.lower().eq('apartment'), 'unit')

# Removing unknown propertyTypes
train = train[train['propertyType'] != 'Unknown']
test = test[test['propertyType'] != 'Unknown']

In [14]:
# Label counts after relabelling and removing 'Unknown' rows
train['propertyType'].value_counts()

propertyType
House    60892
unit     10636
Name: count, dtype: int64

In [15]:
# Training shape after the propertyType filter
train.shape

(71528, 11)

### Features

In [16]:
# Frequency of each raw `features` value
train['features'].value_counts()

features
['Ensuite']                                                                                                                                                    6333
['Air conditioning']                                                                                                                                            932
['Secure Parking']                                                                                                                                              509
['Study']                                                                                                                                                       420
['Air conditioning', 'Built in wardrobes']                                                                                                                      271
                                                                                                                                                               ... 
['Firep

In [17]:
# First character of the `features` entry at index label 4 (a label lookup,
# not a position: the index has not been reset after the earlier filters)
train['features'][4][0]

'['

The output shows that each value in the `features` column looks like a list but is actually a string (its first character is `[`).

In [18]:
# Converting the 'string lists' into lists
# https://stackoverflow.com/questions/1894269/how-to-convert-string-representation-of-list-to-a-list
import ast

def parse_feature_string(raw):
    """Return the list encoded in `raw`, or an empty list when `raw` is null."""
    if pd.notnull(raw):
        return ast.literal_eval(raw)
    return []

train['features'] = train['features'].apply(parse_feature_string)
test['features'] = test['features'].apply(parse_feature_string)

### Lowercasing categorical variables

In [19]:
def lower_if_string(value):
    """Lowercase `value` when it is a str; return anything else unchanged."""
    return value.lower() if isinstance(value, str) else value

# Applied element-wise to every column. List-valued cells (the `features`
# column) are not str instances, so they pass through untouched here.
train = train.map(lower_if_string)
test = test.map(lower_if_string)

In [20]:
# Lowercasing features variable
def lower_features(feature_list):
    """Lowercase every feature name in a listing's feature list.

    An empty list is handed back as the same object; otherwise a new list
    with each element lowercased (in the original order) is returned.
    """
    if not len(feature_list):
        return feature_list
    return [feature.lower() for feature in feature_list]

# Lowercase the entries inside each listing's feature list
train['features'] = train['features'].apply(lower_features)
test['features'] = test['features'].apply(lower_features)

# Missing values

In [21]:
# Remaining missing values per column
train.isnull().sum()

suburb              0
bathrooms           0
bedrooms            0
parking             0
landArea        12499
latitude          420
longitude         420
features            0
propertyType        0
soldYear            0
soldPrice           0
dtype: int64

### Latitude and Longitude

In [22]:
# Suburb-level reference table; among other columns it holds a latitude and
# longitude per suburb row. File names are appended directly to
# `suburbs_path`, so the trailing slash is required.
suburbs_path = '/Users/tristangarcia/desktop/hp-pred_data/suburbs/'
suburb_statistics = pd.read_csv(f'{suburbs_path}statistics/wa_statistics.csv')

suburb_statistics.head()

Unnamed: 0,suburb,state,postcode,latitude,longitude,marriedPercentage,ownerOccupierPercentage,population,renterPercentage,singlePercentage,mostCommonAgeBracket
0,gibson desert north,wa,872,-21.95,131.3,,,,,,
1,gibson desert south,wa,872,-24.95,125.98,,,,,,
2,irrunytju,wa,872,-26.06,128.93,,,,,,
3,kanpa,wa,872,-26.53,125.62,,,,,,
4,kiwirrkurra,wa,872,-23.28,126.95,,,,,,


In [23]:
# Keep only the suburb name and its coordinates for the lookup
coord_df = suburb_statistics[['suburb','latitude','longitude']]
print(coord_df.shape)
coord_df.head()

(1815, 3)


Unnamed: 0,suburb,latitude,longitude
0,gibson desert north,-21.95,131.3
1,gibson desert south,-24.95,125.98
2,irrunytju,-26.06,128.93
3,kanpa,-26.53,125.62
4,kiwirrkurra,-23.28,126.95


In [24]:
def fill_missing_coords(frame, lookup):
    """Fill missing latitude/longitude in `frame` from a per-suburb lookup.

    Parameters
    ----------
    frame : pandas.DataFrame
        Listings with 'suburb', 'latitude' and 'longitude' columns.
    lookup : pandas.DataFrame
        Table with 'suburb', 'latitude' and 'longitude' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as `frame`; existing coordinates
        are kept and only missing ones are taken from `lookup`.
    """
    merged = pd.merge(frame, lookup, on=['suburb'], how='left',
                      suffixes=('', '_from_coord'))
    for col in ('latitude', 'longitude'):
        # fillna only touches rows where the listing's own value is missing
        merged[col] = merged[col].fillna(merged[f'{col}_from_coord'])
    # Dropping the extra columns brought in from the lookup
    return merged.drop(['latitude_from_coord', 'longitude_from_coord'], axis=1)

# A left merge repeats a listing once per matching lookup row, so the lookup
# must have a single row per suburb or listings would be silently duplicated.
# NOTE(review): when a suburb name occurs more than once, the first row wins.
suburb_coords = coord_df.drop_duplicates(subset='suburb')

train = fill_missing_coords(train, suburb_coords)
test = fill_missing_coords(test, suburb_coords)

train.isnull().sum()

suburb              0
bathrooms           0
bedrooms            0
parking             0
landArea        12499
latitude            0
longitude           0
features            0
propertyType        0
soldYear            0
soldPrice           0
dtype: int64

# KNN Imputation

In [25]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import NearestNeighbors
from geopy.distance import geodesic

### Land area
We will be using latitude and longitude to impute landArea. We suspect that landArea differs between rural and non-rural areas, and rural areas also seem to have fewer recorded neighbours. To account for this, the data will be split into rural and non-rural areas before imputation: rural areas will use only 1 nearest neighbour, while non-rural areas will use 3 nearest neighbours. (Note: the code below does not yet implement this split; it fits a single 1-nearest-neighbour regressor on all rows.)

In [26]:
# Give both frames a fresh 0..n-1 index so the label-based writes in the
# imputation step line up with row positions.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

# Separating rows with a known landArea from rows still missing it:
#   x      -> training rows with landArea present (used to fit the imputer)
#   y      -> training rows with landArea missing (to be imputed)
#   y_test -> validation rows with landArea missing (to be imputed)
impute_cols = ['latitude','longitude','landArea']
x = train.loc[train['landArea'].notna(), impute_cols]
y = train.loc[train['landArea'].isna(), impute_cols]
y_test = test.loc[test['landArea'].isna(), impute_cols]

In [27]:
# Training rows with known landArea vs. training rows needing imputation
print(x.shape)
print(y.shape)

(59029, 3)
(12499, 3)


In [28]:
coord_cols = ['latitude', 'longitude']

# 1-nearest-neighbour regressor: each missing landArea takes the value of the
# closest training row (by latitude/longitude) that has a known landArea.
imputer = KNeighborsRegressor(n_neighbors=1)
# Fitting on the training rows with a known landArea only
imputer.fit(x[coord_cols], x['landArea'])

# predict() rejects an empty input, so only impute when there are rows to fill.
if not y.empty:
    # Imputing missing landArea in the training set
    predict = imputer.predict(y[coord_cols])
    train.loc[y.index, 'landArea'] = predict
if not y_test.empty:
    # Imputing missing landArea in the validation set using the training fit
    predict_test = imputer.predict(y_test[coord_cols])
    test.loc[y_test.index, 'landArea'] = predict_test

# Writing to file

In [29]:
# Final shapes of the cleaned splits
print(train.shape)
print(test.shape)

(71528, 11)
(15309, 11)


In [30]:
# Persist the cleaned splits as pickles next to the source CSVs.
# NOTE(review): pickle keeps the list objects in `features` as lists —
# presumably the reason it is used here instead of CSV; confirm.
train.to_pickle(f'{path}wa_train_clean.pkl')
test.to_pickle(f'{path}wa_validation_clean.pkl')

In [31]:
# Preview of the cleaned validation split
test.head(20)

Unnamed: 0,suburb,bathrooms,bedrooms,parking,landArea,latitude,longitude,features,propertyType,soldYear,soldPrice
0,carine,2,4,2,708.0,-31.85,115.8,[],house,2023.0,885000.0
1,port kennedy,1,3,1,267.0,-32.35,115.75,"[air conditioning, built in wardrobes, separate dining room, secure parking, outdoor c/bond patio entertaining area and lock up store room]",house,2021.0,335000.0
2,glenfield,2,4,2,580.0,-28.68,114.61,[],house,2021.0,407500.0
3,canning vale,2,3,2,177.0,-32.08,115.92,[secure parking],house,2022.0,500000.0
4,east perth,1,1,1,91.0,-31.96,115.87,[balcony],unit,2023.0,312000.0
5,dianella,2,4,2,387.0,-31.9,115.87,[],house,2024.0,1135000.0
6,burswood,2,3,2,166.0,-31.97,115.9,[],house,2021.0,466000.0
7,edgewater,1,4,2,711.0,-31.76,115.79,"[air conditioning, area views, built-in wardrobes, pool]",house,2023.0,835300.0
8,watermans bay,1,1,1,1141.0,-31.85,115.75,[],unit,2024.0,408000.0
9,ballajura,1,3,2,544.0,-31.84,115.91,[],house,2020.0,350000.0
