In [1]:
import sys
# Put the sibling "ingestion" directory on the import path (presumably where
# the transform_functions module imported below lives - confirm).
# Guarded so that re-running this cell does not append the same entry again.
if "./../ingestion" not in sys.path:
    sys.path.append("./../ingestion")

In [2]:
from typing import List, Any, Union
import numpy as np
import pandas as pd
from datetime import datetime
import dateutil.relativedelta
import transform_functions
from transform_functions import *
import logging
from scipy import stats
from sklearn.model_selection import train_test_split
# Notebook-wide settings: display every column of a frame, define the seed
# value, and emit timestamped INFO-level log lines.
pd.set_option("display.max_columns", None)
random_seed = 42
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s:%(levelname)s:%(message)s',
)
logging.info("Executing transform.py")
logging.info("If not working check the number of months in the price_paid_process function")

# Column labels of the price paid data, in file order.
column_names = [
    'id',
    'price',
    'date',
    'postcode',
    'type',
    'new_build',
    'land',
    'primary_address',
    'secondary_address',
    'street',
    'locality',
    'town_city',
    'district',
    'county',
    'ppd',
    'record',
]

2021-07-11 00:52:45,152:INFO:Executing transform.py
2021-07-11 00:52:45,153:INFO:If not working check the number of months in the price_paid_process function


In [None]:
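# One-off step, kept commented out for reference: read in the complete price
# paid data. The following commented cells filter it to Nottinghamshire and
# save the subset as pp-notts.csv, which is the file this notebook loads.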
# df_price_paid = pd.read_csv("./../data/raw/pp-complete.csv",
#                             parse_dates=['date'],
#                             names=column_names)

In [None]:
# df_price_paid.head()

In [None]:
# df_price_paid_notts = df_price_paid[df_price_paid["county"] == "NOTTINGHAMSHIRE"]

In [None]:
# df_price_paid_notts.to_csv("./../data/processed/pp-notts.csv")

In [7]:
# Mapping applied later when renaming price paid columns.
county_rename = {'county_x': 'county'}

# Load the three inputs: the Nottinghamshire price paid extract (its first
# CSV column is the saved index), the house price index, and the postcode
# lookup.
df_price_paid = pd.read_csv(
    "./../data/processed/pp-notts.csv",
    index_col=[0],
)
df_house_index = pd.read_csv("./../data/raw/house_price_index.csv")
df_postcode = pd.read_csv("./../data/raw/postcodes.csv")

logging.info("Datasets Read")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
2021-07-11 00:56:56,223:INFO:Datasets Read


In [8]:
# Display the column labels of the loaded price paid frame.
df_price_paid.columns

Index(['id', 'price', 'date', 'postcode', 'type', 'new_build', 'land',
       'primary_address', 'secondary_address', 'street', 'locality',
       'town_city', 'district', 'county', 'ppd', 'record'],
      dtype='object')

In [9]:
# Clean the price paid records with the helpers from transform_functions
# (their implementations are not shown in this notebook), then drop the
# location columns that are not needed from this table.
# NOTE(review): one of these steps adds the 'month_year' and 'current_month'
# columns seen in the preview - presumably price_paid_process; confirm.
df_price_paid = (
    df_price_paid
    .pipe(drop_columns, string='lmk_key')
    .rename(columns=county_rename)
    .pipe(remove_duplicates)
    .pipe(price_paid_process, min=10000, max=5000000, number_of_months=3)
    .drop(['locality', 'town_city', 'district', 'county'], axis='columns')
)

In [10]:
# Preview the first rows of the cleaned price paid frame.
df_price_paid.head()

Unnamed: 0,id,price,date,postcode,type,new_build,land,primary_address,secondary_address,street,ppd,record,month_year,current_month
0,{470081E3-FA40-41ED-9F50-4BB9886B36D8},48000,1995-11-03,NG25 0DS,S,N,F,54,,NORWOOD GARDENS,A,A,1995-11-01,2021-04-01
1,{E58774C2-3011-484A-B631-5A06F3FB5BC9},102500,1995-09-28,NG2 6RP,D,Y,F,9,,THIRLMERE,A,A,1995-09-01,2021-04-01
2,{4C4372D5-E9AA-4273-BA9B-5A078B374BAB},50000,1995-02-24,NG16 2BL,D,N,F,31,,VIOLET AVENUE,A,A,1995-02-01,2021-04-01
3,{7FF5CA89-7D83-4A7E-A8FC-5A07B81E558C},63500,1995-04-21,NG24 4RX,D,N,F,2,,THE HOLT,A,A,1995-04-01,2021-04-01
4,{112275E1-3F92-45DE-BD89-4F4B5790F5EE},58500,1995-06-16,NG5 8AB,S,N,F,40,,HARWOOD CLOSE,A,A,1995-06-01,2021-04-01


In [11]:
# Prepare the house price index for joining: run it through the
# transform_functions helpers (name clean-up, dropping columns selected by
# 'change|price', date conversion of 'date') and rename 'date' to 'hpi_date'.
df_house_index = (
    df_house_index
    .pipe(clean_names)
    .pipe(drop_columns, string='change|price')
    .pipe(col_to_dates, cols=['date'])
    .rename(columns={'date': 'hpi_date'})
)

# Columns to keep from the postcode lookup. Every label appears exactly once:
# selecting columns with a list that repeats a label returns that column
# twice, which would carry duplicate 'postcode_area' / 'postcode_district'
# columns through the later merge and into the saved CSV.
postcode_columns: List[str] = ['postcode', 'latitude', 'longitude',
                               'grid_ref', 'county', 'district',
                               'ward', 'district_code',
                               'ward_code', 'county_code',
                               'constituency', 'region',
                               'london_zone',
                               'middle_layer_super_output_area',
                               'postcode_area',
                               'postcode_district',
                               'index_of_multiple_deprivation',
                               'quality', 'user_type',
                               'last_updated', 'nearest_station',
                               'distance_to_station',
                               'police_force', 'water_company',
                               'plus_code', 'average_income',
                               'sewage_company',
                               'travel_to_work_area', 'rural_urban',
                               'altitude']

# Clean the postcode lookup's column names, then keep only the columns named
# in postcode_columns.
df_postcode = clean_names(df_postcode).loc[:, postcode_columns]

# Index columns supplied by the house price index, one per property type.
avg_columns = [
    'detached_index',
    'semi_detached_index',
    'terraced_index',
    'flat_index',
]

# Rename map giving each index column a 'pp_' prefix, and the list of the
# prefixed names in the same order.
pp_index_columns = {name: 'pp_' + name for name in avg_columns}
pp_avg_columns = list(pp_index_columns.values())

  df.columns = df.columns.str.replace(".", '_')


In [12]:
# Enrich each sale with postcode attributes, then join the house price index
# twice on the district code:
#   1. keyed on 'month_year'; those index columns are renamed with a 'pp_'
#      prefix and averaged into 'pp_average_index';
#   2. keyed on 'current_month'; those index columns keep their names and are
#      averaged into 'average_index'.
# adjust_price is then applied once per property type code with the matching
# pair of index columns.
# NOTE(review): presumably it rescales the sale price by the ratio of the two
# indices and writes 'adjusted_price' - confirm in transform_functions.
df_price_paid = (
    df_price_paid
    .merge(df_postcode, on='postcode')
    .merge(
        df_house_index,
        how='left',
        left_on=['district_code', 'month_year'],
        right_on=['area_code', 'hpi_date'],
    )
    .rename(columns=pp_index_columns)
    .pipe(mean_column, 'pp_average_index', pp_avg_columns)
    .merge(
        df_house_index,
        how='left',
        left_on=['district_code', 'current_month'],
        right_on=['area_code', 'hpi_date'],
    )
    .pipe(mean_column, 'average_index', avg_columns)
    .pipe(adjust_price, 'T', 'terraced_index', 'pp_terraced_index')
    .pipe(adjust_price, 'S', 'semi_detached_index', 'pp_semi_detached_index')
    .pipe(adjust_price, 'D', 'detached_index', 'pp_detached_index')
    .pipe(adjust_price, 'F', 'flat_index', 'pp_flat_index')
    .pipe(adjust_price, 'O', 'average_index', 'pp_average_index')
)

In [13]:
# Columns that together identify a repeated record when de-duplicating.
# 'id' and 'price' are deliberately absent from this key.
duplicate_list = (
    # price paid columns
    ['date', 'postcode', 'type', 'new_build', 'land',
     'primary_address', 'secondary_address', 'street',
     'ppd', 'record', 'month_year', 'current_month']
    # postcode lookup columns
    + ['latitude', 'longitude', 'grid_ref', 'county',
       'district', 'ward', 'district_code', 'ward_code',
       'county_code', 'region',
       'london_zone', 'middle_layer_super_output_area',
       'postcode_area', 'postcode_district']
    # house price index columns
    + ['hpi_date', 'region_name', 'area_code']
)

# Remove the columns selected by the pattern 'e_y|index', restore the
# un-suffixed names of the remaining '_x' merge columns, then keep the latest
# record (by 'date') for each combination of the duplicate_list columns.
# NOTE(review): if drop_columns matches substrings, 'index' also removes
# 'index_of_multiple_deprivation' - confirm this is intended.
df_price_paid = (df_price_paid
                 .pipe(drop_columns, string='e_y|index')
                 .rename(columns={'hpi_date_x': 'hpi_date',
                                  'region_name_x': 'region_name',
                                  'area_code_x': 'area_code'})
                 .sort_values(by=['date'])
                 .drop_duplicates(subset=duplicate_list, keep="last"))

# Keep only rows that received an adjusted price. The explicit .copy() makes
# the filtered frame independent, so the column assignment below is never a
# chained assignment on a slice of the previous frame.
df_price_paid = df_price_paid[df_price_paid['adjusted_price'].notnull()].copy()
df_price_paid['adjusted_price'] = df_price_paid['adjusted_price'].astype(int)

# Discard outliers: rows whose adjusted price is 3 or more standard
# deviations from the mean.
df_price_paid = df_price_paid[
    np.abs(stats.zscore(df_price_paid["adjusted_price"])) < 3
].copy()

# Rows without a London zone get a value one above the largest zone present.
# NOTE(review): if 'london_zone' is entirely NaN, max() is NaN and this fill
# is a no-op - confirm whether a fixed sentinel is wanted in that case.
df_price_paid["london_zone"] = df_price_paid["london_zone"].fillna(
    df_price_paid["london_zone"].max() + 1.0
)
logging.info("Completed processing")

2021-07-11 01:00:56,727:INFO:Completed processing


In [None]:
# Preview the first rows of the fully processed frame.
df_price_paid.head()

In [14]:
# Save the processed Nottinghamshire dataset. De-duplication has already been
# applied to df_price_paid by this point (keep="last"), so this cell only
# writes the result to disk.
df_price_paid.to_csv("./../data/processed/pp_nottinghamshire.csv", index=False)

# The train/validation/test split below is currently disabled.
# # Split the data - train, validation and test
# train_set, test_set = train_test_split(df_price_paid,
#                                        test_size=0.30,
#                                        random_state=random_seed)
#
# test_set, validation_set = train_test_split(test_set,
#                                             test_size=0.20,
#                                             random_state=random_seed)
#
# logging.info(f"Training shape: {train_set.shape}")
# logging.info(f"Validation shape: {validation_set.shape}")
# logging.info(f"Test shape: {test_set.shape}")
#
# # Save the split files
# train_set.to_csv("./data/processed/train.csv", index=False)
# validation_set.to_csv("./data/processed/validation.csv", index=False)
# test_set.to_csv("./data/processed/test.csv", index=False)

# No split is performed while the block above is commented out, so the
# message reports the transform only.
logging.info("Transforming complete")

2021-07-11 01:01:30,311:INFO:Transforming and splitting complete
