#### Clean Data Notebook

1. Rename the columns
2. Irrelevant columns
3. Duplicates
4. Null Values in relevant columns
5. Outliers
6. Map column values
7. Visualise and investigate

In [2]:
import pandas as pd

In [2]:
column_names = ['id', 'price', 'date', 'postcode', 
        'type', 'new_build', 'land', 'primary_address', 
        'secondary_address', 'street', 'locality', 'town_city', 
        'district', 'county', 'ppd', 'record' ]

df = pd.read_csv("../data/raw/pp-complete.csv", names = column_names, parse_dates = ['date'])

In [None]:
df.head(5)

In [None]:
df.type.value_counts()

In [None]:
print(f"The data contains {df.shape[0]} rows.")

In [None]:
df.dtypes

In [None]:
# Since we know the id's are not duplicated we can search excluding the id to see if any houses are entered twice.
duplicates = df.loc[:, 'price':'record'][df.loc[:, 'price':'record'].duplicated()]
print(f"The number of duplicates excluding id is {duplicates.shape[0]}")

In [None]:
#Lets perfom a quick check to verify there are actually duplicates (The answer should be two observations)
#df[(df['postcode']=='') & (df['primary_address']=='')]

In [3]:
# Lets drop the duplicates, keeping only the first instance. 
df_no_duplicates = df.drop_duplicates(subset = df.columns[1:] , keep = "first")

In [None]:
#Lets check the number of empty rows in each column. 
df_no_duplicates.isnull().sum()

In [4]:
df_empty_postcodes = df_no_duplicates[df_no_duplicates['postcode'].isnull()]

In [None]:
df_empty_postcodes.head()

In [5]:
#Difficult decision - we can obviously try and workout a way to get the postcode using the roads but 
#that can prove to be too much work for us
#For now let us delete the observations with no postcode. 
df_complete = df_no_duplicates[df_no_duplicates['postcode'].notnull()]

In [None]:
df_complete.shape[0]

In [None]:
#Lets view the range of the data (10 largest and 10 smallest)
df_complete.nlargest(n=10, columns='price')

I think we can all admit - we can't afford these kind of houses. I really wanted to make this project for a normal ordinary house. So what I really want to do is to cap the data to just houses that are less than a million. 

In [None]:
# Lets View the cheaperst houses. 
df_complete.nsmallest(n=10, columns='price')

In [6]:
# From this I think we should only observe houses that are between 10,000 and 1,000,000
df_price_cut = df_complete[(df_complete['price']<=1000000) & (df['price']>=10000)].reset_index(drop = True)

  df_price_cut = df_complete[(df_complete['price']<=1000000) & (df['price']>=10000)].reset_index(drop = True)


In [None]:
perc_reduction = ((df_complete.shape[0] - df_price_cut.shape[0])/df_complete.shape[0])*100
print(f"filtered out {perc_reduction} % of data")

In [None]:
df_price_cut.to_csv("../data/processed/house_1_price_cut.csv", index = False)

In [None]:
# Going forward we need to add some columns

In [7]:
new_build = {'Y': 1,'N': 0}
df_price_cut.new_build = [new_build[item] for item in df_price_cut.new_build]
df_price_cut['year'] = df_price_cut['date'].dt.year
df_price_cut['month'] = df_price_cut['date'].dt.year
df_price_cut['month_year'] = df_price_cut['date'].astype('datetime64[M]')

In [None]:
df_price_cut.head()

In [8]:
df_price_cut = df_price_cut.drop(columns=['locality','town_city','district','county'])

In [None]:
df_price_cut.columns

## I GOT IT WRONG 

In [None]:
#Adjust price to inflation
df_multiplier = pd.read_csv("../data/raw/inflation_multiplier.csv", usecols = ['Year', 'Multiplier'], dtype = {'Year': 'int64', 'Multiplier': 'float64'})
df_multiplier.columns =['year', 'multiplier']

In [None]:
df_multiplier.dtypes

In [None]:
df_multiplier

## Read in the house price index data

In [9]:
df_house_index = pd.read_csv("http://publicdata.landregistry.gov.uk/market-trend-data/house-price-index-data/Average-prices-Property-Type-2020-08.csv?utm_medium=GOV.UK&utm_source=datadownload&utm_campaign=average_price_property_price&utm_term=9.30_21_10_20")
df_house_index.columns = map(str.lower, df_house_index.columns)
df_house_index.columns = df_house_index.columns.str.replace(' ', '_')

In [None]:
df_house_index.head()

In [10]:
df_house_index = df_house_index.drop(columns=df_house_index.columns[df_house_index.columns.str.contains(pat = 'change|price')])

In [None]:
df_house_index.head()

In [11]:
df_house_index.dtypes
df_house_index['date'] = pd.to_datetime(df_house_index['date'])

In [None]:
df_house_index.nlargest(n=10, columns='date')

In [None]:
df_house_index[df_house_index['semi_detached_index'].isnull()]

In [None]:
df_house_index.shape

## Read in the postcode data

In [12]:
df_postcode = pd.read_csv("../data/raw/postcodes.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [13]:
df_postcode.columns = map(str.lower, df_postcode.columns)
df_postcode.columns = df_postcode.columns.str.replace(' ', '_')

In [None]:
df_postcode.columns

In [14]:
df_postcode = df_postcode[['postcode', 'latitude', 'longitude',
       'grid_ref', 'county', 'district', 'ward', 'district_code', 'ward_code',
       'county_code', 'constituency', 'region', 'london_zone', 
        'middle_layer_super_output_area', 'postcode_area', 'postcode_district']]

In [None]:
df_postcode.columns

In [15]:
df_area_info = pd.merge(df_price_cut, df_postcode, on='postcode')

In [None]:
df_area_info.head()

In [None]:
df_area_info.isnull().sum()

In [16]:
df_house_complete_district_1 = pd.merge(df_area_info, df_house_index, how='left', left_on=['district_code','month_year'], right_on=['area_code', 'date'])

In [18]:
df_house_complete_district = pd.merge(df_area_info, df_house_index, how='left', left_on=['county_code','month_year'], right_on=['area_code', 'date'])

In [17]:
df_house_complete_district_1.isnull().sum()

id                                       0
price                                    0
date_x                                   0
postcode                                 0
type                                     0
new_build                                0
land                                     0
primary_address                       4181
secondary_address                 22238432
street                              376717
ppd                                      0
record                                   0
year                                     0
month                                    0
month_year                               0
latitude                                 0
longitude                                0
grid_ref                                 0
county                             1436433
district                                 0
ward                                     0
district_code                            0
ward_code                                0
county_code

In [19]:
df_house_complete_district.isnull().sum()

id                                       0
price                                    0
date_x                                   0
postcode                                 0
type                                     0
new_build                                0
land                                     0
primary_address                       4181
secondary_address                 22238432
street                              376717
ppd                                      0
record                                   0
year                                     0
month                                    0
month_year                               0
latitude                                 0
longitude                                0
grid_ref                                 0
county                             1436433
district                                 0
ward                                     0
district_code                            0
ward_code                                0
county_code

In [None]:
df_house_complete_district.isnull().sum()

In [23]:
df_house_complete = df_house_complete_district_1

In [24]:
df_house_complete.head()

Unnamed: 0,id,price,date_x,postcode,type,new_build,land,primary_address,secondary_address,street,...,middle_layer_super_output_area,postcode_area,postcode_district,date_y,region_name,area_code,detached_index,semi_detached_index,terraced_index,flat_index
0,{A42E2F04-2538-4A25-94C5-49E29C6C8FA8},18500,1995-01-31,TQ1 1RY,F,0,L,VILLA PARADISO,FLAT 10,HIGHER WARBERRY ROAD,...,Wellswood,TQ,TQ1,1995-01-01,Torbay,E06000027,29.473651,28.476256,27.784074,31.723504
1,{08A8FE57-2C42-40DE-9C33-E8D7D8A9CEBB},18000,1995-09-07,TQ1 1RY,F,0,L,VILLA PARADISO,FLAT 4,HIGHER WARBERRY ROAD,...,Wellswood,TQ,TQ1,1995-09-01,Torbay,E06000027,29.808396,28.975339,28.191223,31.658162
2,{10BE69F8-64A5-4FD7-A172-B4448FBC44EB},19000,1995-04-28,TQ1 1RY,F,0,L,VILLA PARADISO,FLAT 3,HIGHER WARBERRY ROAD,...,Wellswood,TQ,TQ1,1995-04-01,Torbay,E06000027,29.160153,28.678533,27.951063,31.760541
3,{2A289EA0-040F-CDC8-E050-A8C063054829},300000,1995-01-09,TQ1 1RY,F,0,F,ROSA PINES LTD,,HIGHER WARBERRY ROAD,...,Wellswood,TQ,TQ1,1995-01-01,Torbay,E06000027,29.473651,28.476256,27.784074,31.723504
4,{01178325-E934-4283-A908-06DBE03D1E16},28500,1995-11-30,TQ1 1RY,F,0,L,VILLA PARADISO,FLAT 3,HIGHER WARBERRY ROAD,...,Wellswood,TQ,TQ1,1995-11-01,Torbay,E06000027,29.393848,28.605845,27.536689,31.458432


In [25]:
df_house_complete = df_house_complete.drop(columns=['date_y'])
df_house_complete = df_house_complete.rename({'date_x': 'date'}, axis='columns')

In [50]:
df_house_complete['p_average_index'] = df_house_complete[['p_detached_index','p_semi_detached_index','p_terraced_index','p_flat_index']].mean(axis=1)

In [None]:
df_house_complete.head()

In [41]:
from datetime import datetime
import dateutil.relativedelta

df_house_complete['current_month'] = pd.to_datetime(datetime.today().date().replace(day=1)) - dateutil.relativedelta.relativedelta(months=3) 

In [38]:
df_house_complete = df_house_complete.rename({'date': 'transaction_date',
                                             'detached_index':'p_detached_index',
                                              'semi_detached_index':'p_semi_detached_index',
                                              'terraced_index':'p_terraced_index',
                                              'flat_index':'p_flat_index',
                                              'average_index':'p_average_index'
                                             }, axis='columns')

In [45]:
df_house_complete_joined = pd.merge(df_house_complete, df_house_index, how='left', left_on=['district_code','current_month'], right_on=['area_code', 'date'])

In [None]:
#df_house_complete = pd.merge(df_house_complete, df_house_index, how='left', left_on=['district_code','current_month'], right_on=['area_code', 'date'])

In [None]:
dummy_df_house_complete.isnull().sum()

In [46]:
df_house_complete_joined['current_average_index'] = df_house_complete_joined[['detached_index','semi_detached_index','terraced_index','flat_index']].mean(axis=1)

In [47]:
df_house_complete = df_house_complete_joined

In [51]:
df_house_complete.columns

Index(['id', 'price', 'date_x', 'postcode', 'type', 'new_build', 'land',
       'primary_address', 'secondary_address', 'street', 'ppd', 'record',
       'year', 'month', 'month_year', 'latitude', 'longitude', 'grid_ref',
       'county', 'district', 'ward', 'district_code', 'ward_code',
       'county_code', 'constituency', 'region', 'london_zone',
       'middle_layer_super_output_area', 'postcode_area', 'postcode_district',
       'date_y', 'region_name_x', 'area_code_x', 'p_detached_index',
       'p_semi_detached_index', 'p_terraced_index', 'p_flat_index',
       'current_month', 'date', 'region_name_y', 'area_code_y',
       'detached_index', 'semi_detached_index', 'terraced_index', 'flat_index',
       'current_average_index', 'adjusted_price', 'p_average_index'],
      dtype='object')

In [52]:
df_house_complete.loc[df_house_complete['type'] == 'T', 'adjusted_price'] = (df_house_complete['terraced_index']/df_house_complete['p_terraced_index']) * df_house_complete['price']
df_house_complete.loc[df_house_complete['type'] == 'S', 'adjusted_price'] = (df_house_complete['semi_detached_index']/df_house_complete['p_semi_detached_index']) * df_house_complete['price']
df_house_complete.loc[df_house_complete['type'] == 'D', 'adjusted_price'] = (df_house_complete['detached_index']/df_house_complete['p_detached_index']) * df_house_complete['price']
df_house_complete.loc[df_house_complete['type'] == 'F', 'adjusted_price'] = (df_house_complete['flat_index']/df_house_complete['p_flat_index']) * df_house_complete['price']
df_house_complete.loc[df_house_complete['type'] == 'O', 'adjusted_price'] = (df_house_complete['current_average_index']/df_house_complete['p_average_index']) * df_house_complete['price']

In [53]:
df_house_complete.isnull().sum()

id                                       0
price                                    0
date_x                                   0
postcode                                 0
type                                     0
new_build                                0
land                                     0
primary_address                       4181
secondary_address                 22238432
street                              376717
ppd                                      0
record                                   0
year                                     0
month                                    0
month_year                               0
latitude                                 0
longitude                                0
grid_ref                                 0
county                             1436433
district                                 0
ward                                     0
district_code                            0
ward_code                                0
county_code

In [None]:
df_house_complete[df_house_complete['adjusted_price'].isnull()].head(20)

In [None]:
df_house_complete.shape

In [54]:
df_house_complete_nn = df_house_complete[df_house_complete['adjusted_price'].notnull()]

In [55]:
df_house_complete_nn['adjusted_price'] = df_house_complete_nn['adjusted_price'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_house_complete_nn['adjusted_price'] = df_house_complete_nn['adjusted_price'].astype(int)


In [56]:
df_house_complete_nn.shape

(24899926, 48)

In [57]:
df_house_complete_nn.columns

Index(['id', 'price', 'date_x', 'postcode', 'type', 'new_build', 'land',
       'primary_address', 'secondary_address', 'street', 'ppd', 'record',
       'year', 'month', 'month_year', 'latitude', 'longitude', 'grid_ref',
       'county', 'district', 'ward', 'district_code', 'ward_code',
       'county_code', 'constituency', 'region', 'london_zone',
       'middle_layer_super_output_area', 'postcode_area', 'postcode_district',
       'date_y', 'region_name_x', 'area_code_x', 'p_detached_index',
       'p_semi_detached_index', 'p_terraced_index', 'p_flat_index',
       'current_month', 'date', 'region_name_y', 'area_code_y',
       'detached_index', 'semi_detached_index', 'terraced_index', 'flat_index',
       'current_average_index', 'adjusted_price', 'p_average_index'],
      dtype='object')

In [59]:
df_house_complete_nn = df_house_complete_nn.drop(columns=['p_detached_index','p_semi_detached_index', 'p_terraced_index', 
                                                                                      'p_flat_index','current_month', 'date', 'region_name_y', 'area_code_y',
                                                                                       'detached_index', 'semi_detached_index', 'terraced_index', 'flat_index',
                                                                                       'current_average_index', 'p_average_index'])

In [60]:
df_house_complete_nn.head()

Unnamed: 0,id,price,date_x,postcode,type,new_build,land,primary_address,secondary_address,street,...,constituency,region,london_zone,middle_layer_super_output_area,postcode_area,postcode_district,date_y,region_name_x,area_code_x,adjusted_price
0,{A42E2F04-2538-4A25-94C5-49E29C6C8FA8},18500,1995-01-31,TQ1 1RY,F,0,L,VILLA PARADISO,FLAT 10,HIGHER WARBERRY ROAD,...,Torbay,South West,,Wellswood,TQ,TQ1,1995-01-01,Torbay,E06000027,67590
1,{08A8FE57-2C42-40DE-9C33-E8D7D8A9CEBB},18000,1995-09-07,TQ1 1RY,F,0,L,VILLA PARADISO,FLAT 4,HIGHER WARBERRY ROAD,...,Torbay,South West,,Wellswood,TQ,TQ1,1995-09-01,Torbay,E06000027,65899
2,{10BE69F8-64A5-4FD7-A172-B4448FBC44EB},19000,1995-04-28,TQ1 1RY,F,0,L,VILLA PARADISO,FLAT 3,HIGHER WARBERRY ROAD,...,Torbay,South West,,Wellswood,TQ,TQ1,1995-04-01,Torbay,E06000027,69336
3,{2A289EA0-040F-CDC8-E050-A8C063054829},300000,1995-01-09,TQ1 1RY,F,0,F,ROSA PINES LTD,,HIGHER WARBERRY ROAD,...,Torbay,South West,,Wellswood,TQ,TQ1,1995-01-01,Torbay,E06000027,1096057
4,{01178325-E934-4283-A908-06DBE03D1E16},28500,1995-11-30,TQ1 1RY,F,0,L,VILLA PARADISO,FLAT 3,HIGHER WARBERRY ROAD,...,Torbay,South West,,Wellswood,TQ,TQ1,1995-11-01,Torbay,E06000027,105002


In [61]:
df_house_complete_nn.to_csv("../data/processed/house_4_hpi.csv", index = False)

In [65]:
df_house_complete_nn[df_house_complete_nn['postcode']=='S12 4WX']

Unnamed: 0,id,price,date_x,postcode,type,new_build,land,primary_address,secondary_address,street,...,constituency,region,london_zone,middle_layer_super_output_area,postcode_area,postcode_district,date_y,region_name_x,area_code_x,adjusted_price
11700741,{1C0A151E-4B7B-4A32-BD4D-D8A42AC022E0},44500,1995-06-30,S12 4WX,S,0,F,3,,KILDONAN GROVE,...,Sheffield South East,Yorkshire and The Humber,,Birley,S,S12,1995-06-01,Sheffield,E08000019,189822
11700742,{63E73132-5385-40B3-B0E8-2D7E72B99EAB},37000,1996-07-07,S12 4WX,F,0,L,20,,KILDONAN GROVE,...,Sheffield South East,Yorkshire and The Humber,,Birley,S,S12,1996-07-01,Sheffield,E08000019,130387
11700743,{A75C6563-2ABA-4985-A5E4-89D1E508D2CB},38500,1999-06-25,S12 4WX,F,0,L,18,,KILDONAN GROVE,...,Sheffield South East,Yorkshire and The Humber,,Birley,S,S12,1999-06-01,Sheffield,E08000019,120986
11700744,{AB183E1F-C190-43C6-90F7-DF7AA130677D},38000,1999-09-20,S12 4WX,F,0,L,36,,KILDONAN GROVE,...,Sheffield South East,Yorkshire and The Humber,,Birley,S,S12,1999-09-01,Sheffield,E08000019,116371
11700745,{910B4BFD-DC1A-495C-AB3E-EB8F8150DA66},55000,1999-03-20,S12 4WX,T,0,F,1,,KILDONAN GROVE,...,Sheffield South East,Yorkshire and The Humber,,Birley,S,S12,1999-03-01,Sheffield,E08000019,218662
11700746,{52E9A25A-7524-4E34-A775-2E3EFD11EC77},20500,1999-10-29,S12 4WX,F,0,L,38,,KILDONAN GROVE,...,Sheffield South East,Yorkshire and The Humber,,Birley,S,S12,1999-10-01,Sheffield,E08000019,61770
11700747,{E2514DF3-DDB5-4ED6-8321-0B9B43979F71},49000,1999-05-28,S12 4WX,T,0,F,1,,KILDONAN GROVE,...,Sheffield South East,Yorkshire and The Humber,,Birley,S,S12,1999-05-01,Sheffield,E08000019,191119
11700748,{7BE43338-73CF-4092-90EB-82BD4539856D},69950,2002-04-26,S12 4WX,S,0,F,5,,KILDONAN GROVE,...,Sheffield South East,Yorkshire and The Humber,,Birley,S,S12,2002-04-01,Sheffield,E08000019,198382
11700749,{3BD00721-B103-4617-B019-CF4828BB320D},26950,2002-03-08,S12 4WX,S,0,L,24,,KILDONAN GROVE,...,Sheffield South East,Yorkshire and The Humber,,Birley,S,S12,2002-03-01,Sheffield,E08000019,77401
11700750,{F121B139-2C52-469B-9991-90824B91EB0A},125000,2002-09-13,S12 4WX,D,0,L,40,,KILDONAN GROVE,...,Sheffield South East,Yorkshire and The Humber,,Birley,S,S12,2002-09-01,Sheffield,E08000019,283362


In [None]:
df_house_complete_nn['adjusted_price'] = df_house_complete_nn['adjusted_price'].fillna(0).astype(int)

In [18]:
df.to_csv("../data/processed/house_4_hpi.csv", index = False)

In [None]:
df_house_complete[df_house_complete['adjusted_price']== 0].shape[0]

In [None]:
df_house_complete.head()

In [None]:
pd.merge(frame_1, frame_2, how='left', left_on='postcode', right_on='postcode')
df_postcode.head()

In [5]:
pd.set_option('display.max_columns', None)
#df_postcode[df_postcode['postcode']=='NG8 3PA']

In [None]:
df_multiple_transactions = df_house_data.loc[:, 'postcode':'county'][df_house_data.loc[:, 'postcode':'county'].duplicated()]

In [None]:
df_multiple_transactions.shape

In [None]:
# Lets drop the duplicates, keeping only the first instance. 
df_house_data_ordered = df_house_data.sort_values(by=['date'])
df_house_last_sale = df_house_data_ordered.drop_duplicates(subset = df_house_data_ordered.columns[3:14] ,keep = "last")

In [None]:
df_house_last_sale.shape

In [None]:
df_house_last_sale.to_csv("../data/processed/house_3_last_sale.csv", index = False)

In [3]:
df = pd.read_csv("../data/processed/house_4_hpi.csv", parse_dates = ['date_x','date_y'])

In [6]:
df.head(5)

Unnamed: 0,id,price,date_x,postcode,type,new_build,land,primary_address,secondary_address,street,ppd,record,year,month,month_year,latitude,longitude,grid_ref,county,district,ward,district_code,ward_code,county_code,constituency,region,london_zone,middle_layer_super_output_area,postcode_area,postcode_district,date_y,region_name_x,area_code_x,adjusted_price
0,{A42E2F04-2538-4A25-94C5-49E29C6C8FA8},18500,1995-01-31,TQ1 1RY,F,0,L,VILLA PARADISO,FLAT 10,HIGHER WARBERRY ROAD,A,A,1995,1995,1995-01-01,50.467875,-3.519526,SX922642,Devon,Torbay,Wellswood,E06000027,E05012269,E10000008,Torbay,South West,,Wellswood,TQ,TQ1,1995-01-01,Torbay,E06000027,67590
1,{08A8FE57-2C42-40DE-9C33-E8D7D8A9CEBB},18000,1995-09-07,TQ1 1RY,F,0,L,VILLA PARADISO,FLAT 4,HIGHER WARBERRY ROAD,A,A,1995,1995,1995-09-01,50.467875,-3.519526,SX922642,Devon,Torbay,Wellswood,E06000027,E05012269,E10000008,Torbay,South West,,Wellswood,TQ,TQ1,1995-09-01,Torbay,E06000027,65899
2,{10BE69F8-64A5-4FD7-A172-B4448FBC44EB},19000,1995-04-28,TQ1 1RY,F,0,L,VILLA PARADISO,FLAT 3,HIGHER WARBERRY ROAD,A,A,1995,1995,1995-04-01,50.467875,-3.519526,SX922642,Devon,Torbay,Wellswood,E06000027,E05012269,E10000008,Torbay,South West,,Wellswood,TQ,TQ1,1995-04-01,Torbay,E06000027,69336
3,{2A289EA0-040F-CDC8-E050-A8C063054829},300000,1995-01-09,TQ1 1RY,F,0,F,ROSA PINES LTD,,HIGHER WARBERRY ROAD,A,A,1995,1995,1995-01-01,50.467875,-3.519526,SX922642,Devon,Torbay,Wellswood,E06000027,E05012269,E10000008,Torbay,South West,,Wellswood,TQ,TQ1,1995-01-01,Torbay,E06000027,1096057
4,{01178325-E934-4283-A908-06DBE03D1E16},28500,1995-11-30,TQ1 1RY,F,0,L,VILLA PARADISO,FLAT 3,HIGHER WARBERRY ROAD,A,A,1995,1995,1995-11-01,50.467875,-3.519526,SX922642,Devon,Torbay,Wellswood,E06000027,E05012269,E10000008,Torbay,South West,,Wellswood,TQ,TQ1,1995-11-01,Torbay,E06000027,105002


In [10]:
df.shape

(24899926, 29)

In [16]:
df.shape

(15687273, 29)

In [8]:
df = df.drop(columns=['record', 'year', 'month', 'date_y', 'area_code_x'])

In [15]:
df = df.drop_duplicates(subset = ['postcode', 'type', 'new_build', 'land',
                                           'primary_address', 'secondary_address', 'street','latitude', 'longitude', 'grid_ref', 'county', 'district', 'ward',
                                           'district_code', 'ward_code', 'county_code', 'constituency', 'region',
                                           'london_zone', 'middle_layer_super_output_area', 'postcode_area',
                                           'postcode_district'] ,keep = "last")

In [17]:
df[df['postcode']=='S12 4WX']

Unnamed: 0,id,price,date_x,postcode,type,new_build,land,primary_address,secondary_address,street,ppd,month_year,latitude,longitude,grid_ref,county,district,ward,district_code,ward_code,county_code,constituency,region,london_zone,middle_layer_super_output_area,postcode_area,postcode_district,region_name_x,adjusted_price
11587337,{A75C6563-2ABA-4985-A5E4-89D1E508D2CB},38500,1999-06-25,S12 4WX,F,0,L,18,,KILDONAN GROVE,A,1999-06-01,53.34635,-1.410088,SK393834,South Yorkshire,Sheffield,Birley,E08000019,E05010859,E11000016,Sheffield South East,Yorkshire and The Humber,,Birley,S,S12,Sheffield,120986
11587340,{52E9A25A-7524-4E34-A775-2E3EFD11EC77},20500,1999-10-29,S12 4WX,F,0,L,38,,KILDONAN GROVE,A,1999-10-01,53.34635,-1.410088,SK393834,South Yorkshire,Sheffield,Birley,E08000019,E05010859,E11000016,Sheffield South East,Yorkshire and The Humber,,Birley,S,S12,Sheffield,61770
11587341,{E2514DF3-DDB5-4ED6-8321-0B9B43979F71},49000,1999-05-28,S12 4WX,T,0,F,1,,KILDONAN GROVE,A,1999-05-01,53.34635,-1.410088,SK393834,South Yorkshire,Sheffield,Birley,E08000019,E05010859,E11000016,Sheffield South East,Yorkshire and The Humber,,Birley,S,S12,Sheffield,191119
11587345,{9EBB9811-BC8B-4DAE-9247-1ABCE0B1B43D},128500,2003-02-28,S12 4WX,D,0,F,9,,KILDONAN GROVE,A,2003-02-01,53.34635,-1.410088,SK393834,South Yorkshire,Sheffield,Birley,E08000019,E05010859,E11000016,Sheffield South East,Yorkshire and The Humber,,Birley,S,S12,Sheffield,275557
11587348,{3D47873B-3F2F-45FC-B152-EB77F44B5DB8},52000,2004-12-20,S12 4WX,T,0,L,32,,KILDONAN GROVE,A,2004-12-01,53.34635,-1.410088,SK393834,South Yorkshire,Sheffield,Birley,E08000019,E05010859,E11000016,Sheffield South East,Yorkshire and The Humber,,Birley,S,S12,Sheffield,78748
11587349,{26F89041-F04E-48F7-927C-241E07DA289E},250000,2004-09-24,S12 4WX,D,0,F,48,,KILDONAN GROVE,A,2004-09-01,53.34635,-1.410088,SK393834,South Yorkshire,Sheffield,Birley,E08000019,E05010859,E11000016,Sheffield South East,Yorkshire and The Humber,,Birley,S,S12,Sheffield,368609
11587350,{B21A82DD-B935-4603-B64B-F06F55EEF058},250000,2004-12-06,S12 4WX,D,1,F,52,,KILDONAN GROVE,A,2004-12-01,53.34635,-1.410088,SK393834,South Yorkshire,Sheffield,Birley,E08000019,E05010859,E11000016,Sheffield South East,Yorkshire and The Humber,,Birley,S,S12,Sheffield,377647
11587351,{D91D8BE6-A90D-4129-948D-0AECDF6011CA},43000,2004-04-07,S12 4WX,F,0,L,30,,KILDONAN GROVE,A,2004-04-01,53.34635,-1.410088,SK393834,South Yorkshire,Sheffield,Birley,E08000019,E05010859,E11000016,Sheffield South East,Yorkshire and The Humber,,Birley,S,S12,Sheffield,62620
11587353,{3F5B6B4F-CA49-414F-B464-FBFA6973D723},150000,2004-05-26,S12 4WX,D,1,F,46,,KILDONAN GROVE,A,2004-05-01,53.34635,-1.410088,SK393834,South Yorkshire,Sheffield,Birley,E08000019,E05010859,E11000016,Sheffield South East,Yorkshire and The Humber,,Birley,S,S12,Sheffield,250086
11587358,{D76F4574-D2DF-40B1-827D-68D291A21C56},74950,2007-11-19,S12 4WX,S,0,L,24,,KILDONAN GROVE,A,2007-11-01,53.34635,-1.410088,SK393834,South Yorkshire,Sheffield,Birley,E08000019,E05010859,E11000016,Sheffield South East,Yorkshire and The Humber,,Birley,S,S12,Sheffield,92050


In [None]:
# Lets drop the duplicates, keeping only the first instance. 
df = df.sort_values(by=['date'])
df = df.drop_duplicates(subset = df.columns[3:14] ,keep = "last")