In [1]:
from typing import List, Any, Union
import numpy as np
import pandas as pd
from datetime import datetime
import dateutil.relativedelta
import transform_functions
from transform_functions import *
import logging
from scipy import stats
from sklearn.model_selection import train_test_split
pd.set_option('display.max_columns', None)
random_seed = 42
logging.basicConfig(level=logging.INFO, format='%(asctime)s:%(levelname)s:%(message)s')
logging.info(f"Executing transform.py")

2021-03-03 13:02:20,793:INFO:Executing transform.py


In [4]:
!pwd

/Users/chris/github/zanalytics/zana_housing/src/extract_transform


In [13]:
# Read in price paid data
df_price_paid = pd.read_csv("../../data/processed/pp_sample_ext.csv",
                            parse_dates=['date'])
# Read in house price index
df_house_index = pd.read_csv("../../data/raw/house_price_index.csv")

# Read in postcode data
df_postcode = pd.read_csv("../../data/raw/postcodes.csv")

logging.info(f"Datasets Read")

df_price_paid = (df_price_paid.pipe(remove_duplicates)
                 .pipe(price_paid_process,
                 min=10000, max=5000000, number_of_months=4)
                 .drop(columns=['locality', 'town_city', 'district', 'county'])
                 )

df_house_index = (df_house_index.pipe(clean_names)
                  .pipe(drop_columns, string='change|price')
                  .pipe(col_to_dates, cols=['date'])
                  .rename({'date': 'hpi_date'}, axis='columns')
                  )

postcode_columns: List[Union[str, Any]] = ['postcode', 'latitude', 'longitude',
                                           'grid_ref', 'county', 'district',
                                           'ward', 'district_code',
                                           'ward_code', 'county_code',
                                           'constituency', 'region',
                                           'london_zone',
                                           'middle_layer_super_output_area',
                                           'postcode_area',
                                           'postcode_district', 
                                           'index_of_multiple_deprivation', 
                                           'quality', 'user_type', 
                                           'last_updated', 'nearest_station', 
                                           'distance_to_station', 
                                           'postcode_area', 'postcode_district', 
                                           'police_force', 'water_company', 
                                           'plus_code', 'average_income', 
                                           'sewage_company', 
                                           'travel_to_work_area', 'rural_urban',
                                           'altitude']

df_postcode = (df_postcode.pipe(clean_names)
               .loc[:, postcode_columns]
               )

pp_index_columns = {'detached_index': 'pp_detached_index',
                    'semi_detached_index': 'pp_semi_detached_index',
                    'terraced_index': 'pp_terraced_index',
                    'flat_index': 'pp_flat_index',
                    }

pp_avg_columns = ['pp_detached_index', 'pp_semi_detached_index',
                  'pp_terraced_index', 'pp_flat_index']
avg_columns = ['detached_index', 'semi_detached_index',
               'terraced_index', 'flat_index']

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
2021-03-03 13:13:15,984:INFO:Datasets Read


In [6]:
df_price_paid.head(2)

Unnamed: 0,id,price,date,postcode,type,new_build,land,primary_address,secondary_address,street,ppd,record,building_reference_number,current_energy_rating,potential_energy_rating,current_energy_efficiency,potential_energy_efficiency,property_type,built_form,inspection_date,lodgement_date,transaction_type,environment_impact_current,environment_impact_potential,energy_consumption_current,energy_consumption_potential,co2_emissions_current,co2_emiss_curr_per_floor_area,co2_emissions_potential,lighting_cost_current,lighting_cost_potential,heating_cost_current,heating_cost_potential,hot_water_cost_current,hot_water_cost_potential,total_floor_area,energy_tariff,mains_gas_flag,floor_level,main_heating_controls,multi_glaze_proportion,glazed_type,glazed_area,extension_count,number_habitable_rooms,number_heated_rooms,low_energy_lighting,number_open_fireplaces,hotwater_description,hot_water_energy_eff,hot_water_env_eff,floor_description,floor_energy_eff,floor_env_eff,windows_description,windows_energy_eff,windows_env_eff,walls_description,walls_energy_eff,walls_env_eff,secondheat_description,roof_description,roof_energy_eff,roof_env_eff,mainheat_description,mainheat_energy_eff,mainheat_env_eff,mainheatcont_description,mainheatc_energy_eff,mainheatc_env_eff,lighting_description,lighting_energy_eff,lighting_env_eff,main_fuel,wind_turbine_count,heat_loss_corridoor,solar_water_heating_flag,mechanical_ventilation,address,construction_age_band,lodgement_datetime,tenure,month_year,current_month
0,{96369803-9141-440F-B41F-4283BDBE3EA6},32000,2002-05-17,WS11 9SY,F,N,L,11,,HAMILTON LEA,A,A,2553022778,E,C,52,78,Flat,Mid-Terrace,12/09/2020,19/09/2020,none of the above,41,63,593,349.0,4.2,100.0,2.5,41.0,41.0,521.0,286.0,492.0,153.0,42.0,dual,N,Ground,2401,100.0,"double glazing, unknown install date",Normal,0.0,3.0,3.0,100.0,0.0,"Electric immersion, off-peak",Very Poor,Poor,"Solid, no insulation (assumed)",NO DATA!,,Fully double glazed,Average,Average,"Cavity wall, as built, no insulation (assumed)",Poor,Poor,Portable electric heaters (assumed),(another dwelling above),,,Electric storage heaters,Average,Very Poor,Manual charge control,Poor,Poor,Low energy lighting in all fixed outlets,Very Good,Very Good,electricity (not community),0.0,no corridor,N,natural,"11, Hamilton Lea, Brownhills Road",England and Wales: 1967-1975,19/09/2020 20:44,rental (social),2002-05-01,2020-12-01
1,{C7E05E3A-7FDD-4374-8C77-D0A01B2622D5},122000,2002-08-30,CV6 1EZ,S,N,F,3,,SOUTHBANK ROAD,A,A,2407912778,D,B,60,83,House,Semi-Detached,19/09/2020,19/09/2020,marketed sale,52,80,294,113.0,4.8,52.0,1.9,73.0,73.0,849.0,513.0,102.0,72.0,93.0,Single,Y,NODATA!,2106,100.0,double glazing installed during or after 2002,Normal,1.0,5.0,5.0,100.0,0.0,From main system,Good,Good,"Suspended, no insulation (assumed)",NO DATA!,,Fully double glazed,Good,Good,"Solid brick, as built, no insulation (assumed)",Very Poor,Very Poor,"Room heaters, mains gas","Pitched, 100 mm loft insulation",Average,Average,"Boiler and radiators, mains gas",Good,Good,"Programmer, room thermostat and TRVs",Good,Good,Low energy lighting in all fixed outlets,Very Good,Very Good,mains gas (not community),0.0,NO DATA!,N,natural,"3, Southbank Road",England and Wales: 1930-1949,19/09/2020 12:57,owner-occupied,2002-08-01,2020-12-01


In [14]:
df_price_paid_1 = (df_price_paid
                 .merge(df_postcode, on='postcode')
                 .merge(df_house_index, how='left',
                        left_on=['district_code', 'month_year'],
                        right_on=['area_code', 'hpi_date'])
                 .rename(pp_index_columns, axis='columns')
                 .pipe(mean_column, 'pp_average_index', pp_avg_columns)
                 .merge(df_house_index, how='left',
                        left_on=['district_code', 'current_month'],
                        right_on=['area_code', 'hpi_date'])
#                  .pipe(mean_column, 'average_index', avg_columns)
#                  .pipe(adjust_price, 'T', 'terraced_index',
#                        'pp_terraced_index')
#                 .pipe(adjust_price, 'S', 'semi_detached_index',
#                        'pp_semi_detached_index')
#                  .pipe(adjust_price, 'D', 'detached_index',
#                        'pp_detached_index')
#                  .pipe(adjust_price, 'F', 'flat_index', 'pp_flat_index')
#                  .pipe(adjust_price, 'O', 'average_index', 'pp_average_index')
                )

In [15]:
df_price_paid_1.head(2)

Unnamed: 0,id,price,date,postcode,type,new_build,land,primary_address,secondary_address,street,ppd,record,building_reference_number,current_energy_rating,potential_energy_rating,current_energy_efficiency,potential_energy_efficiency,property_type,built_form,inspection_date,lodgement_date,transaction_type,environment_impact_current,environment_impact_potential,energy_consumption_current,energy_consumption_potential,co2_emissions_current,co2_emiss_curr_per_floor_area,co2_emissions_potential,lighting_cost_current,lighting_cost_potential,heating_cost_current,heating_cost_potential,hot_water_cost_current,hot_water_cost_potential,total_floor_area,energy_tariff,mains_gas_flag,floor_level,main_heating_controls,multi_glaze_proportion,glazed_type,glazed_area,extension_count,number_habitable_rooms,number_heated_rooms,low_energy_lighting,number_open_fireplaces,hotwater_description,hot_water_energy_eff,hot_water_env_eff,floor_description,floor_energy_eff,floor_env_eff,windows_description,windows_energy_eff,windows_env_eff,walls_description,walls_energy_eff,walls_env_eff,secondheat_description,roof_description,roof_energy_eff,roof_env_eff,mainheat_description,mainheat_energy_eff,mainheat_env_eff,mainheatcont_description,mainheatc_energy_eff,mainheatc_env_eff,lighting_description,lighting_energy_eff,lighting_env_eff,main_fuel,wind_turbine_count,heat_loss_corridoor,solar_water_heating_flag,mechanical_ventilation,address,construction_age_band,lodgement_datetime,tenure,month_year,current_month,latitude,longitude,grid_ref,county,district,ward,district_code,ward_code,county_code,constituency,region,london_zone,middle_layer_super_output_area,postcode_area,postcode_district,index_of_multiple_deprivation,quality,user_type,last_updated,nearest_station,distance_to_station,postcode_area.1,postcode_district.1,police_force,water_company,plus_code,average_income,sewage_company,travel_to_work_area,rural_urban,altitude,hpi_date_x,region_name_x,area_code_x,pp_detached_index,pp_semi_detached_index,pp_terraced_index,pp_flat_index,pp_average_index,hpi_date_y,region_name_y,area_code_y,detached_index,semi_detached_index,terraced_index,flat_index
0,{96369803-9141-440F-B41F-4283BDBE3EA6},32000,2002-05-17,WS11 9SY,F,N,L,11,,HAMILTON LEA,A,A,2553022778,E,C,52,78,Flat,Mid-Terrace,12/09/2020,19/09/2020,none of the above,41,63,593,349.0,4.2,100.0,2.5,41.0,41.0,521.0,286.0,492.0,153.0,42.0,dual,N,Ground,2401,100.0,"double glazing, unknown install date",Normal,0.0,3.0,3.0,100.0,0.0,"Electric immersion, off-peak",Very Poor,Poor,"Solid, no insulation (assumed)",NO DATA!,,Fully double glazed,Average,Average,"Cavity wall, as built, no insulation (assumed)",Poor,Poor,Portable electric heaters (assumed),(another dwelling above),,,Electric storage heaters,Average,Very Poor,Manual charge control,Poor,Poor,Low energy lighting in all fixed outlets,Very Good,Very Good,electricity (not community),0.0,no corridor,N,natural,"11, Hamilton Lea, Brownhills Road",England and Wales: 1967-1975,19/09/2020 20:44,rental (social),2002-05-01,2020-11-01,52.674246,-1.970024,SK021085,Staffordshire,Cannock Chase,Norton Canes,E07000192,E05006914,E10000028,Cannock Chase,West Midlands,,Norton Canes,WS,WS11,20092.0,1.0,0.0,2020-11-21,Cannock,3.75283,WS,WS11,Staffordshire,South Staffordshire Water,9C4WM2FH+MX,38100.0,Severn Trent,Wolverhampton and Walsall,Rural town and fringe,153.0,2002-05-01,Cannock Chase,E07000192,53.396947,50.39451,47.766052,51.999948,50.889364,2020-11-01,Cannock Chase,E07000192,138.617508,136.666342,134.764376,124.696941
1,{C7E05E3A-7FDD-4374-8C77-D0A01B2622D5},122000,2002-08-30,CV6 1EZ,S,N,F,3,,SOUTHBANK ROAD,A,A,2407912778,D,B,60,83,House,Semi-Detached,19/09/2020,19/09/2020,marketed sale,52,80,294,113.0,4.8,52.0,1.9,73.0,73.0,849.0,513.0,102.0,72.0,93.0,Single,Y,NODATA!,2106,100.0,double glazing installed during or after 2002,Normal,1.0,5.0,5.0,100.0,0.0,From main system,Good,Good,"Suspended, no insulation (assumed)",NO DATA!,,Fully double glazed,Good,Good,"Solid brick, as built, no insulation (assumed)",Very Poor,Very Poor,"Room heaters, mains gas","Pitched, 100 mm loft insulation",Average,Average,"Boiler and radiators, mains gas",Good,Good,"Programmer, room thermostat and TRVs",Good,Good,Low energy lighting in all fixed outlets,Very Good,Very Good,mains gas (not community),0.0,NO DATA!,N,natural,"3, Southbank Road",England and Wales: 1930-1949,19/09/2020 12:57,owner-occupied,2002-08-01,2020-11-01,52.420171,-1.536641,SP316803,West Midlands,Coventry,Sherbourne,E08000026,E05001229,E11000018,Coventry North West,West Midlands,,Coundon,CV,CV6,19604.0,1.0,0.0,2020-11-21,Canley,2.4409,CV,CV6,West Midlands,Severn Trent,9C4WCFC7+38,41500.0,,Coventry,Urban city and town,113.0,2002-08-01,Coventry,E08000026,59.979886,56.089921,53.980761,60.906349,57.739229,2020-11-01,Coventry,E08000026,141.151717,139.737965,137.071913,126.622489
