In [1]:
import pandas as pd
# Notebook-wide pandas display settings (affect rendering only, not the data).
# Show every column of wide frames instead of eliding the middle ones.
pd.set_option('display.max_columns', None)
# Do not truncate long cell contents.
pd.set_option("max_colwidth", None)
# Do not truncate long sequences when they are pretty-printed.
pd.set_option("max_seq_items", None)
# Render floats with two decimal places.
pd.set_option('display.float_format', '{:.2f}'.format)

import geopandas as gpd
from sklearn.pipeline import Pipeline
from pipelines import *
import pickle

In [2]:
def load_data(path='/Users/tristangarcia/desktop/hp-pred/data/'):
    """Load the WA train/test CSVs plus a reduced variant of the test set.

    Parameters
    ----------
    path : str or os.PathLike, optional
        Directory containing ``wa_train.csv`` and ``wa_test.csv``. A trailing
        separator is optional. Defaults to the original hard-coded location so
        existing ``load_data()`` calls keep working.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test, test2)`` where ``test2`` is ``test`` without the
        ``landArea``, ``latitude``, ``longitude``, ``features``, ``soldPrice``
        and ``soldYear`` columns.
    """
    import os

    train = pd.read_csv(os.path.join(path, 'wa_train.csv'))
    test = pd.read_csv(os.path.join(path, 'wa_test.csv'))

    # drop() returns a new frame, so `test` itself is left untouched and no
    # explicit copy is needed beforehand.
    test2 = test.drop(columns=['landArea', 'latitude', 'longitude',
                               'features', 'soldPrice', 'soldYear'])

    return train, test, test2

In [3]:
def load_support_data(data_path='/Users/tristangarcia/desktop/hp-pred/data/'):
    """Load the auxiliary tables consumed by the preprocessing transformers.

    Parameters
    ----------
    data_path : str or os.PathLike, optional
        Directory holding the supporting CSV files and the
        ``ne_50m_coastline`` shapefile folder. A trailing separator is
        optional. Defaults to the original hard-coded location so existing
        ``load_support_data()`` calls keep working.

    Returns
    -------
    tuple
        ``(suburb_statistics, yearly_median_prices, suburb_price_summary,
        school_data, coastline_data)``. The first four are read with
        ``pd.read_csv``; ``coastline_data`` is read with ``gpd.read_file``.
    """
    import os

    suburb_statistics = pd.read_csv(os.path.join(data_path, 'suburb_statistics.csv'))
    yearly_median_prices = pd.read_csv(os.path.join(data_path, 'yearly_median_prices.csv'))
    suburb_price_summary = pd.read_csv(os.path.join(data_path, 'suburb_price_summary.csv'))
    school_data = pd.read_csv(os.path.join(data_path, 'school_data.csv'))
    coastline_data = gpd.read_file(
        os.path.join(data_path, 'ne_50m_coastline', 'ne_50m_coastline.shp')
    )

    return (suburb_statistics, yearly_median_prices, suburb_price_summary,
            school_data, coastline_data)

In [4]:
def load_support_dict():
    """Return the static lookup tables used to configure the pipeline.

    Returns
    -------
    tuple of dict
        ``(feature_dict, synonym_mapping)``.

        ``feature_dict`` maps a role name to a list of column names:
        ``'std_cols'``, ``'ohe_cols'``, ``'drop_cols'`` and ``'log_cols'``.

        ``synonym_mapping`` maps a raw property-feature phrase to the
        canonical phrase it should be replaced with.
    """
    feature_dict = {
        'std_cols': ['bathrooms', 'bedrooms', 'parking', 'landArea', 'suburb_marriedPercentage',
                     'suburb_population', 'suburb_renterPercentage', 'suburb_medianSoldPrice',
                     'suburb_medianRentPrice', 'suburb_entryLevelPrice', 'suburb_luxuryLevelPrice',
                     'primaryDistance', 'primaryICSEA', 'secondaryDistance',
                     'secondaryICSEA', 'coastDistance', 'cbdDistance', 'latitude', 'longitude'],

        'ohe_cols': ['propertyType', 'suburb_mostCommonAgeBracket', 'primaryType', 'secondaryType'],

        'drop_cols': ['suburb', 'soldYear', 'primary_school', 'secondary_school'],

        'log_cols': ['soldPrice', 'landArea', 'suburb_medianSoldPrice', 'suburb_medianRentPrice',
                     'suburb_entryLevelPrice', 'suburb_luxuryLevelPrice', 'primaryDistance',
                     'secondaryDistance', 'coastDistance', 'cbdDistance'],
    }

    synonym_mapping = {
        # Air Conditioning
        'air-conditioning': 'air conditioning',
        'air-conditioner': 'air conditioning',
        'reverse cycle air conditioning': 'air conditioning',
        'evaporative cooling': 'air conditioning',
        'split-system air conditioning': 'air conditioning',
        'split system ac': 'air conditioning',
        # NOTE: 'split system heating' used to be listed here as well, mapped
        # to 'air conditioning'. The same key appears under "Heating and
        # Fireplace" below, and in a dict literal the last duplicate wins, so
        # the entry here never took effect. Only the effective mapping
        # ('heating') is kept.
        'ac': 'air conditioning',
        'reverse cycle ac': 'air conditioning',

        # Built-in Wardrobes
        'built-in wardrobes': 'built in wardrobes',
        'built-in robes': 'built in wardrobes',
        'built in robes': 'built in wardrobes',
        'builtin robes': 'built in wardrobes',

        # Outdoor Features
        'garden / courtyard': 'courtyard',
        'balcony / deck': 'balcony',
        'terrace-balcony': 'balcony',
        'outdoor entertaining': 'outdoor entertainment area',
        'outdoor entertainment': 'outdoor entertainment area',
        'entertainment area': 'outdoor entertainment area',
        'outside entertaining area': 'outdoor entertainment area',
        'alfresco': 'outdoor entertainment area',

        # Pools
        'swimming pool - in ground': 'swimming pool',
        'pool': 'swimming pool',
        'inground pool': 'swimming pool',
        'outdoor spa': 'swimming pool',

        # Heating and Fireplace
        'fireplace(s)': 'fireplace',
        'gas heating': 'heating',
        'split system heating': 'heating',
        'ducted heating': 'heating',
        'wall / ceiling insulation': 'heating',
        'insulation': 'heating',

        # Internet and Broadband
        'broadband internet access': 'broadband',
        'broadband internet available': 'broadband',
        'cable or satellite': 'broadband',

        # Solar and Energy
        'solar panels': 'solar panel',
        'solar hot water': 'solar panel',
        'rainwater storage tank': 'solar panel',
        'electric hot water service': 'solar panel',

        # Security
        'security alarm': 'alarm system',
        'alarm': 'alarm system',
        'security access': 'alarm system',
        'security system': 'alarm system',

        # Parking
        'car parking - surface': 'secure parking',
        'additional parking': 'secure parking',
        'side access': 'secure parking',

        # Rooms and Layout
        'study': 'study',
        'formal lounge': 'lounge',
        'rumpus room': 'lounge',
        'family room': 'lounge',
        'separate dining': 'dining room',
        'separate dining room': 'dining room',
        'kitchen/dining': 'dining room',
        'theatre': 'lounge',
        'games room': 'lounge',

        # Miscellaneous
        'window treatments': 'blinds',
        'high ceilings': 'ceiling',
        'inside:': 'interior',
        'outside:': 'exterior',
        'pet friendly': 'pets allowed',
        'liveability': 'comfort',
        'first home buyer': 'starter home',
        'city views': 'area views',
        'water views': 'area views',
    }

    return feature_dict, synonym_mapping

In [5]:
def main(output_dir='/Users/tristangarcia/desktop/hp-pred/data/'):
    """Build the preprocessing pipeline, fit it on the training set and pickle it.

    The pipeline is assembled from six sub-pipelines that run in this order:
    filter -> reformatter -> imputer -> added_features -> scaler -> final.
    After fitting, it is written to ``preprocessing_pipeline.pkl`` inside
    ``output_dir``.

    Parameters
    ----------
    output_dir : str or os.PathLike, optional
        Directory the pickled pipeline is written to. The default now uses the
        same spelling as the directory the loaders read from (previously
        ``.../Desktop/...`` here versus ``.../desktop/...`` in the loaders,
        which are different directories on a case-sensitive filesystem).

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted top-level pipeline.
    """
    import os

    # Only the training split is needed for fitting; the two test frames
    # returned by load_data() are deliberately ignored here.
    train, _test, _test2 = load_data()
    (suburb_statistics, yearly_median_prices, suburb_price_summary,
     school_data, coastline_data) = load_support_data()
    feature_dict, synonym_mapping = load_support_dict()

    # Sub pipes
    filter_pipe = Pipeline([
        ('price_greater', ColumnFilter(column='soldPrice', method='greater', threshold=100000)),
        ('price_lesser', ColumnFilter(column='soldPrice', method='less', threshold=10000000)),
        ('soldYear_greater', ColumnFilter(column='soldYear', method='greater', threshold=2017)),
        ('remove_missing', MissingDataRemover(columns=['suburb', 'bathrooms', 'bedrooms', 'parking',
                                                       'soldYear', 'isRural', 'soldPrice']))
    ])

    reformatter_pipe = Pipeline([
        ('propertyType', PropertyTypeFormatter()),
        ('remove_missing_property', MissingDataRemover(columns=['propertyType'])),
        ('lowercase', LowercaseFormatter()),
        ('features', FeaturesFormatter(synonym_mapping=synonym_mapping))
    ])

    imputer_pipe = Pipeline([
        ('coordinates', CoordinateFiller(coord_df=suburb_statistics)),
        ('landArea', LandAreaFiller())
    ])

    addfeatures_pipe = Pipeline([
        ('suburb_statistics', SuburbFeatureAdder(merge_keys=['suburb'],
                                                 add_features=['marriedPercentage', 'population',
                                                               'renterPercentage', 'mostCommonAgeBracket'],
                                                 df=suburb_statistics)),
        ('suburb_price_summary', SuburbFeatureAdder(merge_keys=['suburb', 'bedrooms', 'propertyType'],
                                                    add_features=['medianSoldPrice', 'medianRentPrice',
                                                                  'entryLevelPrice', 'luxuryLevelPrice'],
                                                    df=suburb_price_summary)),
        ('primary', SchoolFeatureAdder(schools_df=school_data, school_type='primary')),
        ('secondary', SchoolFeatureAdder(schools_df=school_data, school_type='secondary')),
        ('coastDistance', CoastDistanceFeature(coastline_df=coastline_data)),
        ('cbdDistance', CBDDistanceCalculator())
    ])

    scaler_pipe = Pipeline([
        ('scale_price', PriceScaler(price_index_df=yearly_median_prices)),
        ('log_features', FeatureScaleTransform(columns=feature_dict['log_cols'], method='log'))
    ])

    final_preprocessor = Pipeline([
        ('drop_cols', ColumnDropper(columns=feature_dict['drop_cols'])),
        ('std_cols', CustomStandardiser(columns=feature_dict['std_cols'])),
        ('ohe_cols', CustomOHE(columns=feature_dict['ohe_cols'])),
        ('mlb_features', CustomMultiLabelBinarizer(column='features'))
    ])

    # Main pipe
    pipeline = Pipeline([
        ('filter', filter_pipe),
        ('reformatter', reformatter_pipe),
        ('imputer', imputer_pipe),
        ('added_features', addfeatures_pipe),
        ('scaler', scaler_pipe),
        ('final', final_preprocessor)
    ])

    # Fitting the pipeline
    pipeline.fit(train)

    # Persist the fitted pipeline so it can be reloaded without refitting.
    with open(os.path.join(output_dir, 'preprocessing_pipeline.pkl'), 'wb') as f:
        pickle.dump(pipeline, f)

    return pipeline
    
    

# Testing

In [6]:
# Build, fit and pickle the preprocessing pipeline; keep the fitted object
# that main() returns for the checks in the following cells.
pipeline = main()

In [7]:
# Reload the raw splits: main() returns only the pipeline, not the data it read.
train, test, test2 = load_data()

# NOTE(review): fit_transform() fits the pipeline on `train` a second time even
# though main() already fitted it — confirm the refit is intended.
train_transformed = pipeline.fit_transform(train)
test_transformed = pipeline.transform(test)
# test2 is the test set with landArea, latitude, longitude, features, soldPrice
# and soldYear removed (see load_data), so this runs the pipeline on input that
# is missing those columns.
test2_transformed = pipeline.transform(test2)

In [8]:
train_transformed.shape

(71528, 62)

In [9]:
train_transformed.head()

Unnamed: 0,bathrooms,bedrooms,parking,landArea,latitude,longitude,soldPrice,suburb_marriedPercentage,suburb_population,suburb_renterPercentage,suburb_medianSoldPrice,suburb_medianRentPrice,suburb_entryLevelPrice,suburb_luxuryLevelPrice,primaryDistance,primaryICSEA,secondaryDistance,secondaryICSEA,coastDistance,cbdDistance,propertyType_house,propertyType_unit,suburb_mostCommonAgeBracket_0,suburb_mostCommonAgeBracket_20 to 39,suburb_mostCommonAgeBracket_40 to 59,suburb_mostCommonAgeBracket_5 to 19,suburb_mostCommonAgeBracket_60+,primaryType_combined,primaryType_primary,secondaryType_combined,secondaryType_secondary,feature_air_conditioning,feature_heating,feature_built_in_wardrobes,feature_close_to_transport,feature_close_to_shops,feature_intercom,feature_close_to_schools,feature_secure_parking,feature_shed,feature_ensuite,feature_alarm_system,feature_outdoor_entertainment_area,feature_swimming_pool,feature_balcony,feature_fully_fenced,feature_broadband,feature_solar_panel,feature_study,feature_area_views,feature_dishwasher,feature_internal_laundry,feature_bath,feature_fireplace,feature_garden,feature_lounge,feature_courtyard,feature_dining_room,feature_pets_allowed,feature_gas,feature_remote_garage,feature_floorboards
0,-1.15,-0.32,0.66,5.03,-1.11,1.19,5.92,0.74,-1.16,-0.99,-1.85,-1.58,-1.85,-1.85,3.62,-0.17,2.38,0.19,1.41,1.52,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,-1.15,-1.33,-0.67,-1.54,-0.19,-0.17,5.6,-0.97,0.22,2.07,0.43,0.55,0.43,0.42,-0.32,-4.15,-0.92,-2.36,0.58,-0.72,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,-1.15,-0.32,2.66,0.98,-0.48,-0.17,5.52,0.51,-0.73,-0.26,0.45,-1.58,0.44,0.48,-1.23,-0.0,-0.47,0.1,0.88,0.76,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,-1.15,-2.35,-0.67,-0.34,-0.2,-0.3,5.81,-0.7,0.12,0.88,0.42,0.56,0.4,0.43,-4.02,0.71,-1.27,-2.36,-1.83,-0.35,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,-1.15,-1.33,-0.67,-1.29,-0.17,-0.18,5.53,-0.38,0.47,1.48,0.42,0.59,0.43,0.42,-0.19,0.57,0.24,0.59,0.52,-1.13,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:
test_transformed.shape

(15376, 62)

In [11]:
test_transformed.head()

Unnamed: 0,bathrooms,bedrooms,parking,landArea,latitude,longitude,soldPrice,suburb_marriedPercentage,suburb_population,suburb_renterPercentage,suburb_medianSoldPrice,suburb_medianRentPrice,suburb_entryLevelPrice,suburb_luxuryLevelPrice,primaryDistance,primaryICSEA,secondaryDistance,secondaryICSEA,coastDistance,cbdDistance,propertyType_house,propertyType_unit,suburb_mostCommonAgeBracket_0,suburb_mostCommonAgeBracket_20 to 39,suburb_mostCommonAgeBracket_40 to 59,suburb_mostCommonAgeBracket_5 to 19,suburb_mostCommonAgeBracket_60+,primaryType_combined,primaryType_primary,secondaryType_combined,secondaryType_secondary,feature_air_conditioning,feature_heating,feature_built_in_wardrobes,feature_close_to_transport,feature_close_to_shops,feature_intercom,feature_close_to_schools,feature_secure_parking,feature_shed,feature_ensuite,feature_alarm_system,feature_outdoor_entertainment_area,feature_swimming_pool,feature_balcony,feature_fully_fenced,feature_broadband,feature_solar_panel,feature_study,feature_area_views,feature_dishwasher,feature_internal_laundry,feature_bath,feature_fireplace,feature_garden,feature_lounge,feature_courtyard,feature_dining_room,feature_pets_allowed,feature_gas,feature_remote_garage,feature_floorboards
0,-1.15,-2.35,-0.67,-2.21,-0.17,-0.18,5.41,-0.38,0.47,1.48,0.41,0.54,0.39,0.42,0.29,0.57,0.42,0.59,0.52,-1.11,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,-1.15,-0.32,1.33,-0.33,-0.14,-0.17,5.85,0.33,2.29,0.02,0.53,0.61,0.54,0.5,-0.04,0.14,0.18,0.32,0.54,-0.69,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1,1,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,0.45,0.7,0.66,-0.19,-0.16,-0.21,6.14,-0.13,0.21,0.24,0.68,0.74,0.64,0.67,-0.95,0.66,-0.23,-2.36,0.24,-1.51,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2.05,0.7,-0.01,0.22,-0.21,-0.22,6.07,1.02,0.54,-0.9,0.62,0.68,0.64,0.59,0.38,0.59,-0.87,0.55,0.21,-0.43,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,-1.15,-0.32,-0.01,0.49,-0.19,-0.26,6.23,0.79,-0.18,-0.45,0.61,0.66,0.6,0.64,-0.21,0.67,-0.64,0.69,-0.33,-0.63,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [12]:
test2_transformed.shape

(16633, 61)

In [13]:
test2_transformed.head()

Unnamed: 0,bathrooms,bedrooms,parking,latitude,longitude,landArea,suburb_marriedPercentage,suburb_population,suburb_renterPercentage,suburb_medianSoldPrice,suburb_medianRentPrice,suburb_entryLevelPrice,suburb_luxuryLevelPrice,primaryDistance,primaryICSEA,secondaryDistance,secondaryICSEA,coastDistance,cbdDistance,propertyType_house,propertyType_unit,suburb_mostCommonAgeBracket_0,suburb_mostCommonAgeBracket_20 to 39,suburb_mostCommonAgeBracket_40 to 59,suburb_mostCommonAgeBracket_5 to 19,suburb_mostCommonAgeBracket_60+,primaryType_combined,primaryType_primary,secondaryType_combined,secondaryType_secondary,feature_air_conditioning,feature_heating,feature_built_in_wardrobes,feature_close_to_transport,feature_close_to_shops,feature_intercom,feature_close_to_schools,feature_secure_parking,feature_shed,feature_ensuite,feature_alarm_system,feature_outdoor_entertainment_area,feature_swimming_pool,feature_balcony,feature_fully_fenced,feature_broadband,feature_solar_panel,feature_study,feature_area_views,feature_dishwasher,feature_internal_laundry,feature_bath,feature_fireplace,feature_garden,feature_lounge,feature_courtyard,feature_dining_room,feature_pets_allowed,feature_gas,feature_remote_garage,feature_floorboards
0,-1.15,-2.35,-0.67,-0.17,-0.17,-1.63,-0.38,0.47,1.48,0.41,0.54,0.39,0.42,0.02,0.06,0.07,0.39,0.54,-1.06,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,-1.15,-0.32,1.33,-0.14,-0.19,-0.5,0.33,2.29,0.02,0.53,0.61,0.54,0.5,-0.14,0.18,0.2,0.38,0.42,-0.8,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0.45,0.7,0.66,-0.16,-0.22,0.13,-0.13,0.21,0.24,0.68,0.74,0.64,0.67,-0.85,0.66,-0.26,-2.36,0.19,-1.46,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2.05,0.7,-0.01,-0.21,-0.21,0.2,1.02,0.54,-0.9,0.62,0.68,0.64,0.59,-0.29,0.68,-0.16,0.55,0.25,-0.5,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,-1.15,-0.32,-0.01,-0.2,-0.26,-0.2,0.79,-0.18,-0.45,0.61,0.66,0.6,0.64,-0.19,0.58,-0.32,0.52,-0.27,-0.56,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
