In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option("max_colwidth", None)
pd.set_option("max_seq_items", None)
pd.set_option('display.float_format', '{:.2f}'.format)

import geopandas as gpd
from sklearn.pipeline import Pipeline
from pipelines import *
import pickle

In [2]:
def load_data(path='/Users/tristangarcia/desktop/hp-pred/data/wa/'):
    """Load the WA train/test CSVs and build a one-row hand-written sample.

    Parameters
    ----------
    path : str or os.PathLike, optional
        Directory that contains ``wa_train.csv`` and ``wa_test.csv``.
        Defaults to the location the notebook was originally written against.
        A trailing path separator is optional.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test, test2)``. ``train`` and ``test`` are read from disk;
        ``test2`` is a single manually specified listing that has no
        ``soldPrice`` column.
    """
    import os  # function-scope import keeps this cell runnable on its own

    train = pd.read_csv(os.path.join(path, 'wa_train.csv'))
    test = pd.read_csv(os.path.join(path, 'wa_test.csv'))

    # Single ad-hoc listing used to smoke-test the pipeline on unseen input.
    test2 = pd.DataFrame({
        'streetNumber': [9],
        'street': ['Lambrook Way'],
        'suburb': ['Landsdale'],
        'postcode': [6065],
        'bathrooms': [2],
        'bedrooms': [5],
        'parking': [2],
        'propertyType': ['house']
    })
    return train, test, test2

In [3]:
def load_support_data(data_path='/Users/tristangarcia/desktop/hp-pred/data/'):
    """Load the auxiliary tables that are handed to the preprocessing steps.

    Parameters
    ----------
    data_path : str or os.PathLike, optional
        Directory containing the support files. Defaults to the location the
        notebook was originally written against. A trailing path separator is
        optional.

    Returns
    -------
    tuple
        ``(suburb_statistics, yearly_median_prices, suburb_price_summary,
        school_data, coastline_data)``. The first four are read with
        ``pd.read_csv``; ``coastline_data`` is read with ``gpd.read_file``
        from the ``ne_50m_coastline`` shapefile.
    """
    import os  # function-scope import keeps this cell runnable on its own

    suburb_statistics = pd.read_csv(os.path.join(data_path, 'wa_suburb_statistics.csv'))
    yearly_median_prices = pd.read_csv(os.path.join(data_path, 'wa_price_scalers.csv'))
    suburb_price_summary = pd.read_csv(os.path.join(data_path, 'wa_suburb_prices.csv'))
    school_data = pd.read_csv(os.path.join(data_path, 'wa_schools.csv'))
    coastline_data = gpd.read_file(
        os.path.join(data_path, 'ne_50m_coastline', 'ne_50m_coastline.shp')
    )

    return suburb_statistics, yearly_median_prices, suburb_price_summary, school_data, coastline_data

In [4]:
def load_support_dict():
    """Return the static column groups and the feature-synonym lookup.

    Returns
    -------
    tuple of dict
        ``(feature_dict, synonym_mapping)``.

        ``feature_dict`` maps a group name (``'std_cols'``, ``'ohe_cols'``,
        ``'drop_cols'``, ``'log_cols'``) to a list of column names. ``main``
        passes these lists to the standardiser, one-hot encoder, column
        dropper and log-scaling steps respectively.

        ``synonym_mapping`` maps a raw feature label to the canonical label it
        should be replaced with. ``main`` passes it to ``FeaturesFormatter``.
    """
    feature_dict = {
        'std_cols': ['bathrooms', 'bedrooms', 'parking', 'landArea', 'suburb_marriedPercentage',
                     'suburb_population', 'suburb_renterPercentage', 'suburb_medianSoldPrice',
                     'suburb_medianRentPrice', 'suburb_entryLevelPrice', 'suburb_luxuryLevelPrice',
                     'primaryDistance', 'primaryICSEA', 'secondaryDistance',
                     'secondaryICSEA', 'coastDistance', 'cbdDistance', 'latitude', 'longitude'],

        'ohe_cols': ['propertyType', 'suburb_mostCommonAgeBracket', 'primaryType', 'secondaryType'],

        'drop_cols': ['suburb', 'street', 'streetNumber', 'postcode', 'soldYear',
                      'primary_school', 'secondary_school'],

        'log_cols': ['soldPrice', 'landArea', 'suburb_medianSoldPrice', 'suburb_medianRentPrice',
                     'suburb_entryLevelPrice', 'suburb_luxuryLevelPrice', 'primaryDistance',
                     'secondaryDistance', 'coastDistance', 'cbdDistance']
    }

    synonym_mapping = {
        # Air Conditioning
        # 'split system heating' used to be listed here as well as under
        # "Heating and Fireplace". Python keeps only the last value for a
        # repeated key in a dict literal, so the entry here was dead and has
        # been removed; the effective value ('heating') is unchanged.
        'air-conditioning': 'air conditioning',
        'air-conditioner': 'air conditioning',
        'reverse cycle air conditioning': 'air conditioning',
        'evaporative cooling': 'air conditioning',
        'split-system air conditioning': 'air conditioning',
        'split system ac': 'air conditioning',
        'ac': 'air conditioning',
        'reverse cycle ac': 'air conditioning',

        # Built-in Wardrobes
        'built-in wardrobes': 'built in wardrobes',
        'built-in robes': 'built in wardrobes',
        'built in robes': 'built in wardrobes',
        'builtin robes': 'built in wardrobes',

        # Outdoor Features
        'garden / courtyard': 'courtyard',
        'balcony / deck': 'balcony',
        'terrace-balcony': 'balcony',
        'outdoor entertaining': 'outdoor entertainment area',
        'outdoor entertainment': 'outdoor entertainment area',
        'entertainment area': 'outdoor entertainment area',
        'outside entertaining area': 'outdoor entertainment area',
        'alfresco': 'outdoor entertainment area',

        # Pools
        'swimming pool - in ground': 'swimming pool',
        'pool': 'swimming pool',
        'inground pool': 'swimming pool',
        'outdoor spa': 'swimming pool',

        # Heating and Fireplace
        'fireplace(s)': 'fireplace',
        'gas heating': 'heating',
        'split system heating': 'heating',
        'ducted heating': 'heating',
        'wall / ceiling insulation': 'heating',
        'insulation': 'heating',

        # Internet and Broadband
        'broadband internet access': 'broadband',
        'broadband internet available': 'broadband',
        'cable or satellite': 'broadband',

        # Solar and Energy
        'solar panels': 'solar panel',
        'solar hot water': 'solar panel',
        'rainwater storage tank': 'solar panel',
        'electric hot water service': 'solar panel',

        # Security
        'security alarm': 'alarm system',
        'alarm': 'alarm system',
        'security access': 'alarm system',
        'security system': 'alarm system',

        # Parking
        'car parking - surface': 'secure parking',
        'additional parking': 'secure parking',
        'side access': 'secure parking',

        # Rooms and Layout
        # NOTE(review): 'study' maps to itself; kept so the key set is unchanged.
        'study': 'study',
        'formal lounge': 'lounge',
        'rumpus room': 'lounge',
        'family room': 'lounge',
        'separate dining': 'dining room',
        'separate dining room': 'dining room',
        'kitchen/dining': 'dining room',
        'theatre': 'lounge',
        'games room': 'lounge',

        # Miscellaneous
        'window treatments': 'blinds',
        'high ceilings': 'ceiling',
        'inside:': 'interior',
        'outside:': 'exterior',
        'pet friendly': 'pets allowed',
        'liveability': 'comfort',
        'first home buyer': 'starter home',
        'city views': 'area views',
        'water views': 'area views',
    }

    return feature_dict, synonym_mapping

In [5]:
def main():
    """Build the preprocessing pipeline, fit it on the training data and pickle it.

    The pipeline is assembled from six sub-pipelines that run in this order:
    filter, reformatter, imputer, added_features, scaler, final. All step
    classes come from the ``pipelines`` module; their behaviour is defined
    there, not in this notebook.

    Side effects
    ------------
    Writes the fitted pipeline to ``preprocessing_pipeline.pkl`` in the
    hard-coded data directory.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The pipeline after ``fit(train)``.
    """
    # Only the training split is needed for fitting.
    train, _, _ = load_data()
    suburb_statistics, yearly_median_prices, suburb_price_summary, school_data, coastline_data = load_support_data()
    feature_dict, synonym_mapping = load_support_dict()

    # Sub pipes
    # Row filtering on soldPrice / soldYear thresholds plus removal of rows
    # with missing values in the listed columns.
    filter_pipe = Pipeline([
        ('price_greater', ColumnFilter(column='soldPrice', method='greater', threshold=100000)),
        ('price_lesser', ColumnFilter(column='soldPrice', method='less', threshold=10000000)),
        ('soldYear_greater', ColumnFilter(column='soldYear', method='greater', threshold=2017)),
        ('remove_missing', MissingDataRemover(columns=['suburb', 'bathrooms', 'bedrooms', 'parking',
                                                       'soldYear', 'isRural', 'soldPrice']))
    ])

    # Value normalisation; the synonym mapping is consumed by FeaturesFormatter.
    reformatter_pipe = Pipeline([
        ('propertyType', PropertyTypeFormatter()),
        ('remove_missing_property', MissingDataRemover(columns=['propertyType'])),
        ('lowercase', LowercaseFormatter()),
        ('features', FeaturesFormatter(synonym_mapping=synonym_mapping))
    ])

    imputer_pipe = Pipeline([
        ('coordinates', CoordinateFiller(coord_df=suburb_statistics)),
        ('landArea', LandAreaFiller())
    ])

    # Enrichment from the support tables (suburb stats, suburb prices,
    # schools, coastline) plus a CBD distance step.
    addfeatures_pipe = Pipeline([
        ('suburb_statistics', SuburbFeatureAdder(merge_keys=['suburb'],
                                                 add_features=['marriedPercentage', 'population',
                                                               'renterPercentage', 'mostCommonAgeBracket'],
                                                 df=suburb_statistics)),
        ('suburb_price_summary', SuburbFeatureAdder(merge_keys=['suburb', 'bedrooms', 'propertyType'],
                                                    add_features=['medianSoldPrice', 'medianRentPrice',
                                                                  'entryLevelPrice', 'luxuryLevelPrice'],
                                                    df=suburb_price_summary)),
        ('primary', SchoolFeatureAdder(schools_df=school_data, school_type='primary')),
        ('secondary', SchoolFeatureAdder(schools_df=school_data, school_type='secondary')),
        ('coastDistance', CoastDistanceFeature(coastline_df=coastline_data)),
        ('cbdDistance', CBDDistanceCalculator())
    ])

    scaler_pipe = Pipeline([
        ('scale_price', PriceScaler(price_index_df=yearly_median_prices)),
        ('log_features', FeatureScaleTransform(columns=feature_dict['log_cols'], method='log'))
    ])

    final_preprocessor = Pipeline([
        ('drop_cols', ColumnDropper(columns=feature_dict['drop_cols'])),
        ('std_cols', CustomStandardiser(columns=feature_dict['std_cols'])),
        ('ohe_cols', CustomOHE(columns=feature_dict['ohe_cols'])),
        ('mlb_features', CustomMultiLabelBinarizer(column='features'))
    ])

    # Main pipe
    pipeline = Pipeline([
        ('filter', filter_pipe),
        ('reformatter', reformatter_pipe),
        ('imputer', imputer_pipe),
        ('added_features', addfeatures_pipe),
        ('scaler', scaler_pipe),
        ('final', final_preprocessor)
    ])

    # Fitting the pipeline
    pipeline.fit(train)

    # NOTE(review): this path spells "Desktop" with a capital D while the
    # other cells use "desktop"; they only resolve to the same directory on a
    # case-insensitive filesystem — confirm which spelling is intended.
    path = '/Users/tristangarcia/Desktop/hp-pred/data/'
    # Context manager guarantees the file is flushed and closed, even if
    # pickling raises part-way through.
    with open(f'{path}preprocessing_pipeline.pkl', 'wb') as pipeline_file:
        pickle.dump(pipeline, pipeline_file)

    return pipeline
    
    

# Testing

In [6]:
# Build and fit the preprocessing pipeline; main() also pickles it to disk.
pipeline = main()

In [7]:
# Reload the raw frames so this cell always starts from untransformed data.
train, test, test2 = load_data()

# Apply the already-fitted pipeline; each name is rebound to its transformed
# frame, so later cells see the preprocessed data under the same names.
train = pipeline.transform(train)
test = pipeline.transform(test)
test2 = pipeline.transform(test2)

In [8]:
# (rows, columns) of the transformed training frame.
train.shape

(108128, 37)

In [9]:
# Preview the first transformed training rows.
train.head()

Unnamed: 0,bathrooms,bedrooms,parking,landArea,latitude,longitude,soldPrice,suburb_marriedPercentage,suburb_population,suburb_renterPercentage,suburb_medianSoldPrice,suburb_medianRentPrice,suburb_entryLevelPrice,suburb_luxuryLevelPrice,primaryDistance,primaryICSEA,secondaryDistance,secondaryICSEA,coastDistance,cbdDistance,propertyType_house,propertyType_unit,suburb_mostCommonAgeBracket_0,suburb_mostCommonAgeBracket_20 to 39,suburb_mostCommonAgeBracket_40 to 59,suburb_mostCommonAgeBracket_5 to 19,suburb_mostCommonAgeBracket_60+,primaryType_combined,primaryType_primary,secondaryType_combined,secondaryType_secondary,feature_air_conditioning,feature_built_in_wardrobes,feature_secure_parking,feature_ensuite,feature_dishwasher,feature_fully_fenced
0,0.44,0.68,0.74,0.23,-0.26,-0.27,5.87,-0.28,1.22,0.6,0.47,0.52,0.49,0.49,-0.92,0.29,-0.56,0.26,-3.16,0.18,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0,0,0,0,0,0
1,0.44,0.68,0.0,0.31,-0.18,-0.16,5.98,1.09,0.56,-1.08,0.57,0.62,0.59,0.54,0.63,0.58,-0.54,0.56,0.31,-0.5,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0,0,0,0,0
2,0.44,2.83,0.0,0.28,-0.05,-0.26,6.4,1.03,-0.3,-0.85,-2.07,-1.78,-2.07,-2.07,-0.93,0.43,-0.04,-2.32,-1.27,-0.19,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1,1,1,0,0,0
3,-1.2,-1.48,-0.74,-2.12,-0.1,-0.17,5.46,-0.31,0.7,0.61,0.37,0.46,0.36,0.4,-0.68,0.42,-0.02,0.51,0.21,-1.07,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0,0,0,0,0
4,-1.2,-0.4,-0.74,0.19,-0.03,-0.24,5.78,-0.43,0.26,0.93,0.46,0.53,0.45,0.44,-0.34,0.21,0.23,0.66,-0.25,-0.08,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0,0,0,0,0,0


In [10]:
# (rows, columns) of the transformed test frame; the column count should match train.
test.shape

(23144, 37)

In [11]:
# Preview the first transformed test rows.
test.head()

Unnamed: 0,bathrooms,bedrooms,parking,landArea,latitude,longitude,soldPrice,suburb_marriedPercentage,suburb_population,suburb_renterPercentage,suburb_medianSoldPrice,suburb_medianRentPrice,suburb_entryLevelPrice,suburb_luxuryLevelPrice,primaryDistance,primaryICSEA,secondaryDistance,secondaryICSEA,coastDistance,cbdDistance,propertyType_house,propertyType_unit,suburb_mostCommonAgeBracket_0,suburb_mostCommonAgeBracket_20 to 39,suburb_mostCommonAgeBracket_40 to 59,suburb_mostCommonAgeBracket_5 to 19,suburb_mostCommonAgeBracket_60+,primaryType_combined,primaryType_primary,secondaryType_combined,secondaryType_secondary,feature_air_conditioning,feature_built_in_wardrobes,feature_secure_parking,feature_ensuite,feature_dishwasher,feature_fully_fenced
0,-1.2,-0.4,0.74,0.26,-0.27,-0.26,5.71,0.17,0.85,-0.25,0.44,0.47,0.43,0.42,0.15,-0.05,-0.13,0.29,-1.32,0.22,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0,1,0,0,0
1,0.44,0.68,1.48,-0.03,-0.25,-0.17,5.78,0.53,-0.26,-0.06,0.47,0.54,0.49,0.43,0.48,0.25,-0.26,0.45,0.33,0.07,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0,0,0,0,0,0
2,-1.2,-0.4,-0.74,0.26,-0.05,-0.24,5.87,-0.15,-0.66,-0.5,-2.07,-1.78,-2.07,-2.07,-0.45,0.2,-0.47,0.49,-0.49,-0.23,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0,0,0,0,0
3,0.44,-1.48,0.0,3.55,-0.22,-0.19,6.1,-4.16,-1.37,-1.96,-2.07,-1.78,-2.07,-2.07,0.98,-4.3,1.15,0.45,0.12,-0.09,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0,0,0,0,0
4,-1.2,-0.4,-0.74,0.26,-0.59,-0.13,5.61,0.45,-0.77,0.08,0.38,-1.78,0.39,0.37,-1.24,0.2,0.41,0.21,0.79,0.96,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0,0,0,0,0,0


In [12]:
# test2 is built without a soldPrice column, so it has one column fewer than train/test.
test2.shape

(1, 36)

In [13]:
# NOTE(review): in the recorded output, landArea comes after latitude/longitude
# here but before them in train/test — confirm the column order is aligned
# before passing this frame to a model fitted on train's columns.
test2.head()

Unnamed: 0,bathrooms,bedrooms,parking,latitude,longitude,landArea,suburb_marriedPercentage,suburb_population,suburb_renterPercentage,suburb_medianSoldPrice,suburb_medianRentPrice,suburb_entryLevelPrice,suburb_luxuryLevelPrice,primaryDistance,primaryICSEA,secondaryDistance,secondaryICSEA,coastDistance,cbdDistance,propertyType_house,propertyType_unit,suburb_mostCommonAgeBracket_0,suburb_mostCommonAgeBracket_20 to 39,suburb_mostCommonAgeBracket_40 to 59,suburb_mostCommonAgeBracket_5 to 19,suburb_mostCommonAgeBracket_60+,primaryType_combined,primaryType_primary,secondaryType_combined,secondaryType_secondary,feature_air_conditioning,feature_built_in_wardrobes,feature_secure_parking,feature_ensuite,feature_dishwasher,feature_fully_fenced
0,0.44,1.75,0.0,-0.06,-0.15,0.26,1.1,0.82,-1.19,0.55,0.7,0.55,0.68,-0.34,0.25,-0.97,0.45,0.48,-0.39,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0,0,0,0,0,0


# Pickling Models

We retrain the models with the best-performing parameters and pickle them. Fitting each model on the output of the saved preprocessing pipeline ensures that the features are in the same order at fit time and at prediction time.

In [14]:
import pickle
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# `train` here is the frame already transformed by the preprocessing pipeline
# in the earlier cell; split it into predictors and target.
x_train = train.drop(columns=['soldPrice'])
y_train = train['soldPrice']

path = '/Users/tristangarcia/desktop/hp-pred/data/'

# Both estimators sample rows/columns randomly; fixing the seed makes the
# pickled models reproducible across notebook re-runs.
SEED = 42

rf = RandomForestRegressor(n_estimators=2000, max_depth=None, min_samples_split=2,
                           max_features=0.5, random_state=SEED)
rf.fit(x_train, y_train)
# Context manager closes the file handle once the model has been written.
with open(f'{path}rfmodel.pkl', 'wb') as rf_file:
    pickle.dump(rf, rf_file)

xgb = XGBRegressor(max_depth=8, min_child_weight=1, colsample_bytree=0.7, subsample=1,
                   n_estimators=200, random_state=SEED)
xgb.fit(x_train, y_train)
with open(f'{path}xgbmodel.pkl', 'wb') as xgb_file:
    pickle.dump(xgb, xgb_file)