In [111]:
from typing import List, Any, Union
import numpy as np
import pandas as pd
from zanasonic_etl.transform.transform_functions import (
    remove_duplicates,
    price_paid_process,
    drop_columns,
    col_to_dates,
    mean_column,
    adjust_price,
)
from loguru import logger
from scipy import stats

from janitor import clean_names
pd.set_option("display.max_columns", None)
random_seed = 42

# Price Paid Data

In [2]:
price_paid_column_names = [
    "id",
    "price",
    "date",
    "postcode",
    "type",
    "new_build",
    "land",
    "primary_address",
    "secondary_address",
    "street",
    "locality",
    "town_city",
    "district",
    "county",
    "ppd",
    "record",
]

In [3]:
# Read in price paid data
df_price_paid = pd.read_csv(
    "../data/raw/pp_sample.csv", parse_dates=["date"], names=price_paid_column_names, low_memory=False
)

# Read in house price index
df_house_index = pd.read_csv("../data/raw/house_price_index.csv", low_memory=False)

# Read in postcode data
df_postcode = pd.read_csv("../data/raw/postcodes.csv", low_memory=False)

In [119]:
def set_min_max_price(df: pd.DataFrame, min_price: int=10000, max_price: int=5000000) -> pd.DataFrame:
    df = df.loc[(df.price >= min_price) & (df.price <= max_price)]
    return df

def month_year(df: pd.DataFrame, month_year_column: str):
    df[month_year_column] = df.date.dt.to_period("M")
    return df


In [120]:
def price_paid_process(df: pd.DataFrame, min_price: int=10000, max_price: int=5000000) -> pd.DataFrame:
    price_paid_df = (
        df.pipe(clean_names).
        assign(id = df.id.str.strip("{}")).
        set_index("id").
        sort_values(by="date", ascending=False).
        dropna(subset=["postcode"]).
        drop_duplicates(subset=price_paid_column_names[1:], keep="first").
        pipe(set_min_max_price,min_price=min_price, max_price=max_price).
        drop(columns=["locality", "town_city", "district", "county"]).
        assign(current_month_year = df.date.dt.to_period("M").max()).
        pipe(month_year, month_year_column="price_paid_month_year")
     )
    return price_paid_df

In [121]:
pp_df = price_paid_process(df=df_price_paid)

In [122]:
pp_df.head()

Unnamed: 0_level_0,price,date,postcode,type,new_build,land,primary_address,secondary_address,street,ppd,record,current_month_year,price_paid_month_year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
DE2D0CDF-F7C8-51EE-E053-6C04A8C00671,850000,2022-04-20,E5 0ET,T,N,F,19,,RUSHMORE ROAD,A,A,2022-04,2022-04
DE2D0CDF-AF88-51EE-E053-6C04A8C00671,505000,2022-04-19,WD23 2JH,T,N,F,1,,NAPIER DRIVE,A,A,2022-04,2022-04
DE2D0CE0-5C78-51EE-E053-6C04A8C00671,105050,2022-04-19,DY4 9SG,F,N,L,51,,THUNDERBOLT WAY,A,A,2022-04,2022-04
DE2D0CDF-DC0B-51EE-E053-6C04A8C00671,562000,2022-04-14,SE18 3HF,S,N,F,150,,PLUM LANE,A,A,2022-04,2022-04
DE2D0CDF-4399-51EE-E053-6C04A8C00671,550000,2022-04-14,N22 7BT,T,N,F,46,,LEVERTON CLOSE,A,A,2022-04,2022-04


# Postcode Data

In [141]:
pc_df = clean_names(df_postcode)
pc_df["london_zone"] = pc_df["london_zone"].fillna(10)
pc_df.head()


Unnamed: 0,postcode,in_use_,latitude,longitude,easting,northing,grid_ref,county,district,ward,district_code,ward_code,country,county_code,constituency,introduced,terminated,parish,national_park,population,households,built_up_area,built_up_sub_division,lower_layer_super_output_area,rural_urban,region,altitude,london_zone,lsoa_code,local_authority,msoa_code,middle_layer_super_output_area,parish_code,census_output_area,constituency_code,index_of_multiple_deprivation,quality,user_type,last_updated,nearest_station,distance_to_station,postcode_area,postcode_district,police_force,water_company,plus_code,average_income,sewage_company,travel_to_work_area,itl_level_2,itl_level_3,uprns,distance_to_sea
0,AB1 0AA,No,57.101474,-2.242851,385386.0,801193.0,NJ853011,,Aberdeen City,Lower Deeside,S12000033,S13002843,Scotland,S99999999,Aberdeen South,1980-01-01,1996-06-01,,,,,,,"Cults, Bieldside and Milltimber West - 02",Accessible small town,Scotland,46.0,8.0,S01006514,,S02001237,"Cults, Bieldside and Milltimber West",,S00090303,S14000002,6715.0,1.0,0.0,2022-05-25,Portlethen,8.31408,AB,AB1,Scotland,Scottish Water,9C9V4Q24+HV,,,Aberdeen,North Eastern Scotland,Aberdeen City and Aberdeenshire,,9.87661
1,AB1 0AB,No,57.102554,-2.246308,385177.0,801314.0,NJ851013,,Aberdeen City,Lower Deeside,S12000033,S13002843,Scotland,S99999999,Aberdeen South,1980-01-01,1996-06-01,,,,,,,"Cults, Bieldside and Milltimber West - 02",Accessible small town,Scotland,61.0,8.0,S01006514,,S02001237,"Cults, Bieldside and Milltimber West",,S00090303,S14000002,6715.0,1.0,0.0,2022-05-25,Portlethen,8.55457,AB,AB1,Scotland,Scottish Water,9C9V4Q33+2F,,,Aberdeen,North Eastern Scotland,Aberdeen City and Aberdeenshire,,10.1151
2,AB1 0AD,No,57.100556,-2.248342,385053.0,801092.0,NJ850010,,Aberdeen City,Lower Deeside,S12000033,S13002843,Scotland,S99999999,Aberdeen South,1980-01-01,1996-06-01,,,,,,,"Cults, Bieldside and Milltimber West - 02",Accessible small town,Scotland,45.0,8.0,S01006514,,S02001237,"Cults, Bieldside and Milltimber West",,S00090399,S14000002,6715.0,1.0,0.0,2022-05-25,Portlethen,8.54352,AB,AB1,Scotland,Scottish Water,9C9V4Q22+6M,,,Aberdeen,North Eastern Scotland,Aberdeen City and Aberdeenshire,,10.1476
3,AB1 0AE,No,57.084444,-2.255708,384600.0,799300.0,NO845993,,Aberdeenshire,North Kincardine,S12000034,S13002864,Scotland,S99999999,West Aberdeenshire and Kincardine,1994-02-01,1996-06-01,,,,,,,"Dunecht, Durris and Drumoak - 01",Accessible rural area,Scotland,51.0,8.0,S01006853,,S02001296,"Dunecht, Durris and Drumoak",,S00091322,S14000058,5069.0,8.0,0.0,2022-05-25,Portlethen,8.20809,AB,AB1,Scotland,Scottish Water,9C9V3PMV+QP,,,Aberdeen,North Eastern Scotland,Aberdeen City and Aberdeenshire,,9.39683
4,AB1 0AF,No,57.096656,-2.258102,384460.0,800660.0,NJ844006,,Aberdeen City,Lower Deeside,S12000033,S13002843,Scotland,S99999999,Aberdeen South,1990-12-01,1992-07-01,,,,,,,Culter - 06,Accessible small town,Scotland,46.0,8.0,S01006511,,S02001236,Culter,,S00090299,S14000002,6253.0,8.0,1.0,2022-05-25,Portlethen,8.85583,AB,AB1,Scotland,Scottish Water,9C9V3PWR+MQ,,,Aberdeen,North Eastern Scotland,Aberdeen City and Aberdeenshire,,10.5616


In [134]:
pc_df.in_use_.value_counts()

Yes    1776153
No      887071
Name: in_use_, dtype: int64

In [135]:
pc_df.shape

(2663224, 53)

In [136]:
pc_df = pc_df.dropna(axis="columns", how="all", thresh=0.5*pc_df.shape[0])

In [137]:
pc_df.shape

(2663224, 48)

In [138]:
pc_df.head()

Unnamed: 0,postcode,in_use_,latitude,longitude,easting,northing,grid_ref,county,district,ward,district_code,ward_code,country,county_code,constituency,introduced,parish,population,households,built_up_area,built_up_sub_division,lower_layer_super_output_area,rural_urban,region,altitude,lsoa_code,msoa_code,middle_layer_super_output_area,parish_code,census_output_area,constituency_code,index_of_multiple_deprivation,quality,user_type,last_updated,nearest_station,distance_to_station,postcode_area,postcode_district,police_force,water_company,plus_code,average_income,travel_to_work_area,itl_level_2,itl_level_3,uprns,distance_to_sea
0,AB1 0AA,No,57.101474,-2.242851,385386.0,801193.0,NJ853011,,Aberdeen City,Lower Deeside,S12000033,S13002843,Scotland,S99999999,Aberdeen South,1980-01-01,,,,,,"Cults, Bieldside and Milltimber West - 02",Accessible small town,Scotland,46.0,S01006514,S02001237,"Cults, Bieldside and Milltimber West",,S00090303,S14000002,6715.0,1.0,0.0,2022-05-25,Portlethen,8.31408,AB,AB1,Scotland,Scottish Water,9C9V4Q24+HV,,Aberdeen,North Eastern Scotland,Aberdeen City and Aberdeenshire,,9.87661
1,AB1 0AB,No,57.102554,-2.246308,385177.0,801314.0,NJ851013,,Aberdeen City,Lower Deeside,S12000033,S13002843,Scotland,S99999999,Aberdeen South,1980-01-01,,,,,,"Cults, Bieldside and Milltimber West - 02",Accessible small town,Scotland,61.0,S01006514,S02001237,"Cults, Bieldside and Milltimber West",,S00090303,S14000002,6715.0,1.0,0.0,2022-05-25,Portlethen,8.55457,AB,AB1,Scotland,Scottish Water,9C9V4Q33+2F,,Aberdeen,North Eastern Scotland,Aberdeen City and Aberdeenshire,,10.1151
2,AB1 0AD,No,57.100556,-2.248342,385053.0,801092.0,NJ850010,,Aberdeen City,Lower Deeside,S12000033,S13002843,Scotland,S99999999,Aberdeen South,1980-01-01,,,,,,"Cults, Bieldside and Milltimber West - 02",Accessible small town,Scotland,45.0,S01006514,S02001237,"Cults, Bieldside and Milltimber West",,S00090399,S14000002,6715.0,1.0,0.0,2022-05-25,Portlethen,8.54352,AB,AB1,Scotland,Scottish Water,9C9V4Q22+6M,,Aberdeen,North Eastern Scotland,Aberdeen City and Aberdeenshire,,10.1476
3,AB1 0AE,No,57.084444,-2.255708,384600.0,799300.0,NO845993,,Aberdeenshire,North Kincardine,S12000034,S13002864,Scotland,S99999999,West Aberdeenshire and Kincardine,1994-02-01,,,,,,"Dunecht, Durris and Drumoak - 01",Accessible rural area,Scotland,51.0,S01006853,S02001296,"Dunecht, Durris and Drumoak",,S00091322,S14000058,5069.0,8.0,0.0,2022-05-25,Portlethen,8.20809,AB,AB1,Scotland,Scottish Water,9C9V3PMV+QP,,Aberdeen,North Eastern Scotland,Aberdeen City and Aberdeenshire,,9.39683
4,AB1 0AF,No,57.096656,-2.258102,384460.0,800660.0,NJ844006,,Aberdeen City,Lower Deeside,S12000033,S13002843,Scotland,S99999999,Aberdeen South,1990-12-01,,,,,,Culter - 06,Accessible small town,Scotland,46.0,S01006511,S02001236,Culter,,S00090299,S14000002,6253.0,8.0,1.0,2022-05-25,Portlethen,8.85583,AB,AB1,Scotland,Scottish Water,9C9V3PWR+MQ,,Aberdeen,North Eastern Scotland,Aberdeen City and Aberdeenshire,,10.5616


In [139]:
a = np.intersect1d(pp_df.columns, pc_df.columns)
print (a)

['postcode']


In [140]:
b = np.intersect1d(hpi_df.columns, pc_df.columns)
print (b)

[]


In [None]:
hpi_df = hpi_df.dropna(axis="columns", how="all", thresh=0.5*hpi_df.shape[0])

In [None]:
df_house_index = (
    df_house_index.pipe(clean_names)
    .pipe(drop_columns, string="change|price")
    .pipe(col_to_dates, cols=["date"])
    .rename({"date": "hpi_date"}, axis="columns")
)

# HPI Data

In [125]:
hpi_columns = {
    "regionname": "region_name",
    "areacode": "area_code",
    "detachedindex": "detached_index",
    "semidetachedindex": "semi_detached_index",
    "terracedindex": "terraced_index",
    "flatindex": "flat_index",
}

In [127]:
hpi_df = clean_names(df_house_index)
hpi_df = hpi_df[["date", "regionname", "areacode", "index",
       "detachedindex", "semidetachedindex", "terracedindex", "flatindex"]]
hpi_df = hpi_df.rename(hpi_columns, axis="columns")
hpi_df["date"] = pd.to_datetime(hpi_df["date"],format="%d/%m/%Y")
hpi_df = month_year(hpi_df, month_year_column="hpi_month_year")
hpi_df.head()
print(hpi_df.shape)

(136513, 9)


In [130]:
hpi_df.dtypes

date                   datetime64[ns]
region_name                    object
area_code                      object
index                         float64
detached_index                float64
semi_detached_index           float64
terraced_index                float64
flat_index                    float64
hpi_month_year              period[M]
dtype: object

In [30]:
df_house_index.shape[0]

136513

In [32]:
df_house_index.dropna(axis="columns", how="all", thresh=0.5*df_house_index.shape[0], subset=None, inplace=False).head()

Unnamed: 0,Date,RegionName,AreaCode,AveragePrice,Index,1m%Change,12m%Change,SalesVolume,DetachedPrice,DetachedIndex,Detached1m%Change,Detached12m%Change,SemiDetachedPrice,SemiDetachedIndex,SemiDetached1m%Change,SemiDetached12m%Change,TerracedPrice,TerracedIndex,Terraced1m%Change,Terraced12m%Change,FlatPrice,FlatIndex,Flat1m%Change,Flat12m%Change,NewPrice,NewIndex,New1m%Change,New12m%Change,NewSalesVolume,OldPrice,OldIndex,Old1m%Change,Old12m%Change,OldSalesVolume
0,01/01/2004,Aberdeenshire,S12000034,81693.66964,40.864214,,,388.0,122490.0641,43.610982,,,70563.15784,40.821887,,,55319.63336,38.305671,,,48016.07412,42.433549,,,88436.13953,40.26725,,,103.0,81043.95084,40.883367,,,285.0
1,01/02/2004,Aberdeenshire,S12000034,81678.76231,40.856757,-0.018248,,326.0,121280.884,43.180469,-0.987166,,70804.42408,40.961464,0.341915,,55720.07531,38.582954,0.723869,,49030.18133,43.329752,2.112016,,88606.44649,40.344795,0.192576,,107.0,80965.29542,40.843688,-0.097053,,219.0
2,01/03/2004,Aberdeenshire,S12000034,83525.09702,41.780317,2.260483,,453.0,123395.4269,43.933325,1.743509,,72689.07253,42.051762,2.661767,,57362.85925,39.720488,2.94828,,50349.44771,44.495636,2.690723,,90296.91375,41.114508,1.907838,,140.0,82903.23948,41.821302,2.393549,,313.0
3,01/04/2004,Aberdeenshire,S12000034,84333.679,42.18478,0.968071,,571.0,122334.0258,43.555427,-0.860162,,74484.23119,43.090289,2.46964,,59193.39722,40.98803,3.191155,,51736.22329,45.72118,2.754301,,90319.87844,41.124964,0.025432,,180.0,84003.99161,42.376586,1.327755,,391.0
4,01/05/2004,Aberdeenshire,S12000034,86379.95396,43.208353,2.426403,,502.0,124498.8747,44.326193,1.769621,,76637.73414,44.336124,2.89122,,61202.79629,42.379424,3.394634,,53230.23061,47.041488,2.887739,,91989.17763,41.885039,1.848208,,167.0,86222.73484,43.495852,2.641235,,335.0


In [35]:
df_house_index_filter = (
    df_house_index.pipe(clean_names)
    .pipe(drop_columns, string="change|price")
    .rename({"date": "hpi_date"}, axis="columns")
)

  df.columns = df.columns.str.replace(".", "_")


In [36]:
df_house_index_filter["hpi_date"] = pd.to_datetime(df_house_index_filter["hpi_date"],format="%d/%m/%Y")

In [37]:
df_house_index_filter.head()

Unnamed: 0,hpi_date,regionname,areacode,index,indexsa,salesvolume,detachedindex,semidetachedindex,terracedindex,flatindex,cashindex,cashsalesvolume,mortgageindex,mortgagesalesvolume,ftbindex,fooindex,newindex,newsalesvolume,oldindex,oldsalesvolume
0,2004-01-01,Aberdeenshire,S12000034,40.864214,,388.0,43.610982,40.821887,38.305671,42.433549,,,,,,,40.26725,103.0,40.883367,285.0
1,2004-02-01,Aberdeenshire,S12000034,40.856757,,326.0,43.180469,40.961464,38.582954,43.329752,,,,,,,40.344795,107.0,40.843688,219.0
2,2004-03-01,Aberdeenshire,S12000034,41.780317,,453.0,43.933325,42.051762,39.720488,44.495636,,,,,,,41.114508,140.0,41.821302,313.0
3,2004-04-01,Aberdeenshire,S12000034,42.18478,,571.0,43.555427,43.090289,40.98803,45.72118,,,,,,,41.124964,180.0,42.376586,391.0
4,2004-05-01,Aberdeenshire,S12000034,43.208353,,502.0,44.326193,44.336124,42.379424,47.041488,,,,,,,41.885039,167.0,43.495852,335.0


In [15]:
df_house_index.head()

Unnamed: 0,hpi_date,regionname,areacode,index,indexsa,salesvolume,detachedindex,semidetachedindex,terracedindex,flatindex,cashindex,cashsalesvolume,mortgageindex,mortgagesalesvolume,ftbindex,fooindex,newindex,newsalesvolume,oldindex,oldsalesvolume
0,2004-01-01,Aberdeenshire,S12000034,40.864214,,388.0,43.610982,40.821887,38.305671,42.433549,,,,,,,40.26725,103.0,40.883367,285.0
1,2004-01-02,Aberdeenshire,S12000034,40.856757,,326.0,43.180469,40.961464,38.582954,43.329752,,,,,,,40.344795,107.0,40.843688,219.0
2,2004-01-03,Aberdeenshire,S12000034,41.780317,,453.0,43.933325,42.051762,39.720488,44.495636,,,,,,,41.114508,140.0,41.821302,313.0
3,2004-01-04,Aberdeenshire,S12000034,42.18478,,571.0,43.555427,43.090289,40.98803,45.72118,,,,,,,41.124964,180.0,42.376586,391.0
4,2004-01-05,Aberdeenshire,S12000034,43.208353,,502.0,44.326193,44.336124,42.379424,47.041488,,,,,,,41.885039,167.0,43.495852,335.0


In [38]:
df_house_index_filter['hpi_date'].dt.to_period('M').max()

Period('2022-04', 'M')

In [20]:
df_house_index.sort_values(by='hpi_date', inplace=True, ascending=False)

In [21]:
df_house_index

Unnamed: 0,hpi_date,regionname,areacode,index,indexsa,salesvolume,detachedindex,semidetachedindex,terracedindex,flatindex,cashindex,cashsalesvolume,mortgageindex,mortgagesalesvolume,ftbindex,fooindex,newindex,newsalesvolume,oldindex,oldsalesvolume
136512,2022-01-04,Yorkshire and The Humber,E12000003,148.202900,146.132724,,153.047441,151.323145,145.798754,132.673982,145.060267,,149.577295,,147.617145,148.781320,,,,
6531,2022-01-04,Bedford,E06000055,156.687772,,,163.741567,159.959283,154.832405,139.746762,152.098771,,157.919185,,154.593259,158.724781,,,,
114124,2022-01-04,Swindon,E06000030,148.199463,,,156.135498,152.787961,147.158473,132.331885,144.853381,,148.975622,,146.431795,150.142871,,,,
96598,2022-01-04,Scotland,S92000003,139.271363,138.381544,,149.204412,143.247069,140.772546,130.162591,137.200014,,140.245052,,136.806875,141.610098,,,,
95949,2022-01-04,Scarborough,E07000168,146.209532,,,154.154580,150.156718,146.548321,132.425144,144.358189,,148.115065,,144.553199,147.572432,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122318,1968-01-04,Wales,W92000004,2.119327,,,,,,,,,,,,,,,,
128327,1968-01-04,West Midlands Region,E12000005,2.097808,,,,,,,,,,,,,,,,
135864,1968-01-04,Yorkshire and The Humber,E12000003,1.991658,,,,,,,,,,,,,,,,
66323,1968-01-04,London,E12000007,1.096815,,,,,,,,,,,,,,,,


In [None]:
postcode_columns: List[Union[str, Any]] = [
    "postcode",
    "latitude",
    "longitude",
    "grid_ref",
    "county",
    "district",
    "ward",
    "district_code",
    "ward_code",
    "county_code",
    "constituency",
    "region",
    "london_zone",
    "middle_layer_super_output_area",
    "postcode_area",
    "postcode_district",
    "index_of_multiple_deprivation",
    "quality",
    "user_type",
    "last_updated",
    "nearest_station",
    "distance_to_station",
    "postcode_area",
    "postcode_district",
    "police_force",
    "water_company",
    "plus_code",
    "average_income",
    "sewage_company",
    "travel_to_work_area",
    "rural_urban",
    "altitude",
]

In [None]:
df_postcode = df_postcode.pipe(clean_names).loc[:, postcode_columns]


In [None]:
pp_index_columns = {
    "detachedindex": "pp_detached_index",
    "semidetachedindex": "pp_semi_detached_index",
    "terracedindex": "pp_terraced_index",
    "flatindex": "pp_flat_index",
}

pp_avg_columns = [
    "pp_detached_index",
    "pp_semi_detached_index",
    "pp_terraced_index",
    "pp_flat_index",
]

In [None]:
avg_columns = ["detachedindex", "semidetachedindex", "terracedindex", "flatindex"]

logger.info(df_price_paid.columns)
logger.info(df_house_index.columns)

df_price_paid = (
    df_price_paid.merge(df_postcode, on="postcode")
    .merge(
        df_house_index,
        how="left",
        left_on=["district_code", "month_year"],
        right_on=["areacode", "hpi_date"],
    )
    .rename(pp_index_columns, axis="columns")
    .pipe(mean_column, "pp_average_index", pp_avg_columns)
    .merge(
        df_house_index,
        how="left",
        left_on=["district_code", "current_month"],
        right_on=["areacode", "hpi_date"],
    )
    .pipe(mean_column, "averageindex", avg_columns)
    .pipe(adjust_price, "T", "terracedindex", "pp_terraced_index")
    .pipe(adjust_price, "S", "semidetachedindex", "pp_semi_detached_index")
    .pipe(adjust_price, "D", "detachedindex", "pp_detached_index")
    .pipe(adjust_price, "F", "flatindex", "pp_flat_index")
    .pipe(adjust_price, "O", "averageindex", "pp_average_index")
)

logger.info(df_price_paid.head())


In [None]:
duplicate_list = [
    "date",
    "postcode",
    "type",
    "new_build",
    "land",
    "primary_address",
    "secondary_address",
    "street",
    "ppd",
    "record",
    "month_year",
    "current_month",
    "latitude",
    "longitude",
    "grid_ref",
    "county",
    "district",
    "ward",
    "district_code",
    "ward_code",
    "county_code",
    "region",
    "london_zone",
    "middle_layer_super_output_area",
    "postcode_area",
    "postcode_district",
    "hpi_date",
    # "regionname",
    # "areacode",
]


logger.info(df_price_paid.columns)

df_price_paid = (
    df_price_paid.pipe(drop_columns, string="e_y|index")
    .rename(
        {
            "hpi_date_x": "hpi_date",
            "region_name_x": "regionname",
            "areacode_x": "areacode",
        },
        axis="columns",
    )
    .sort_values(by=["date"])
    .drop_duplicates(subset=duplicate_list, keep="last")
)
logger.info(df_price_paid.head())

df_price_paid = df_price_paid[df_price_paid["adjusted_price"].notnull()]
df_price_paid["adjusted_price"] = df_price_paid["adjusted_price"].astype(int)
df_price_paid = df_price_paid[
    (np.abs(stats.zscore(df_price_paid["adjusted_price"])) < 3)
]
df_price_paid["london_zone"] = df_price_paid["london_zone"].fillna(
    df_price_paid.london_zone.max() + 1.0
)

logger.info(df_price_paid.head())
logger.info("Completed processing")

# logger.info(df_price_paid.dtypes)
#
# #  Drop the duplicates, keeping only the first instance.
# df_price_paid.to_parquet("./data/processed/pp_sample_clean.parquet", index=False)
#
# logger.info("Transformed data saved")
