# Prediction of UK house prices

In [36]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from datetime import datetime

# Widen pandas' display limits so wide/long frames are not truncated in cell output.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# force full-width display if viewed in Chrome browser
# `IPython.display` is the public import location; importing `display` from
# `IPython.core.display` is deprecated and fails on recent IPython releases.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

## Download UK Land Registry price-paid data and create a BigQuery dataset

In [None]:
# Download the complete price-paid CSV (pp-complete.csv) into the VM's current working directory.
# Source: https://www.gov.uk/government/statistical-data-sets/price-paid-data-downloads
!wget http://prod.publicdata.landregistry.gov.uk.s3-website-eu-west-1.amazonaws.com/pp-complete.csv

In [None]:
# Move the downloaded CSV into the GCS bucket (`mv`, not `cp`: the local copy is removed after upload).
!gsutil mv pp-complete.csv gs://housingasodhfmq349p78vp57pasvpfphio/pp-complete.csv

In [None]:
# Create the BigQuery dataset `housing` that the price-paid table is loaded into.
!bq mk housing

In [None]:
# Create table `pricepaid` in the `housing` dataset by loading the CSV from the bucket.
# The schema is given inline as name:TYPE pairs; every column is STRING except `price` (INTEGER),
# so `date` arrives as text and has to be parsed downstream.
!bq load --source_format=CSV \
    housing.pricepaid gs://housingasodhfmq349p78vp57pasvpfphio/pp-complete.csv \
    trans_id:STRING,price:INTEGER,date:STRING,postcode:STRING,type:STRING,newbuild:STRING,tenure:STRING,paon:STRING,saon:STRING,street:STRING,locality:STRING,town_city:STRING,district:STRING,county:STRING,ppd_type:STRING,record_status:STRING

## Query data

In [224]:
%%bigquery df

-- Pull a subset of the price-paid table into the pandas DataFrame `df`.
-- NOTE(review): LIMIT without ORDER BY returns an arbitrary (not random) subset, so the rows
-- received can differ between runs. The nunique() output recorded in this notebook shows a
-- single distinct `type` value, so confirm the subset is representative before modelling.
SELECT * FROM housing.pricepaid
LIMIT 1009000

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 559.09query/s] 
Downloading: 100%|██████████| 1009000/1009000 [00:04<00:00, 218780.80rows/s]


## Inspect dataframe

In [225]:
df.head(4)

Unnamed: 0,trans_id,price,date,postcode,type,newbuild,tenure,paon,saon,street,locality,town_city,district,county,ppd_type,record_status
0,{7011B109-0A1C-8ED6-E053-6B04A8C075C1},779950,2018-05-17 00:00,RG4 7XD,D,N,F,12,,MIDSUMMER MEADOW,CAVERSHAM,READING,READING,READING,A,A
1,{4F8D2ABA-1001-4CA7-AC98-47E6F8B01975},167000,2004-03-05 00:00,FY3 0DW,D,N,L,63,,BLEASDALE AVENUE,STAINING,BLACKPOOL,BLACKPOOL,BLACKPOOL,A,A
2,{C8180CA7-A382-49F0-BD45-EF71828C64C6},87000,1995-12-20 00:00,RG7 5EF,D,N,F,2,,MUSWELL CLOSE,THEALE,READING,NEWBURY,BERKSHIRE,A,A
3,{D97555A6-2449-4D1C-A037-FD8D9386EE23},62500,1995-12-01 00:00,BS48 2AT,D,N,F,HILLS VIEW,,POUND LANE,NAILSEA,BRISTOL,WOODSPRING,AVON,A,A


In [226]:
df.nunique()

trans_id         1009000
price              20279
date                8482
postcode          186538
type                   1
newbuild               2
tenure                 3
paon               82766
saon                2302
street             76047
locality            7874
town_city            601
district             176
county                89
ppd_type               2
record_status          1
dtype: int64

In [227]:
df['ppd_type'].unique()

array(['A', 'B'], dtype=object)

In [228]:
# check data types
df.dtypes

trans_id         object
price             int64
date             object
postcode         object
type             object
newbuild         object
tenure           object
paon             object
saon             object
street           object
locality         object
town_city        object
district         object
county           object
ppd_type         object
record_status    object
dtype: object

## Pre-processing

### Postcode

In [230]:
# Some rows have missing postcodes
len(df[df['postcode'] == ''])

1490

In [231]:
# Check for null entries in postcode column
len(df[df['postcode'].isnull()])

0

In [232]:
# Discard empty string postcodes.
# .copy() makes the filtered frame independent of the original, so adding columns
# to it later does not raise SettingWithCopyWarning.
df = df[df['postcode'] != ''].copy()

In [233]:
# Check that all postcodes have space
len(df[df['postcode'].str.contains(' ')]) == len(df)

True

In [245]:
# Break down postcodes into components: area, district, sector, and unit.
def get_postcode_component(postcode, component):

    """
    Returns one component of a UK postcode string: area, district, sector or unit.

    UK postcodes take any of the following formats, where A denotes alphabetical
    and N denotes numeric:

    AN NAA
    ANN NAA
    AAN NAA
    AANN NAA
    ANA NAA
    AANA NAA

    (source: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/611951/Appendix_C_ILR_2017_to_2018_v1_Published_28April17.pdf)

    The part before the first space is searched for the area and district, the
    part after it for the sector and unit:

    - Area is the leading substring before the first numeric character. Format: A or AA.
    - District is the rest of the first part, starting at that numeric character. Format: N, NN, or NA.
    - Sector is the leading numeric substring of the second part. Format: N.
    - Unit is the rest of the second part, starting at its first non-numeric character. Format: AA.

    Each component is returned on its own, without the components that precede it.

    Parameters:
    postcode: postcode string containing a single space, e.g. 'RG40 2WD'
    component: one of 'area', 'district', 'sector' or 'unit'

    Returns the requested component as a string. Returns None when the component
    name is not recognised, when postcode is not a string or contains no space,
    or when the relevant part has no numeric (area/district) or no non-numeric
    (sector/unit) character to split on.

    e.g.:
    >> get_postcode_component('RG40 2WD', component='sector')
    >> 2

    e.g. 2:
    >> get_postcode_component('EC1A 3HX', component='area')
    >> EC

    e.g. 3:
    >> get_postcode_component('EC1A 3HX', component='district')
    >> 1A
    """

    if component not in ('area', 'district', 'sector', 'unit'):
        return None

    # Malformed input yields None, consistent with the other unparseable cases,
    # instead of an IndexError/AttributeError in the middle of a DataFrame.apply.
    if not isinstance(postcode, str):
        return None
    parts = postcode.split(' ')
    if len(parts) < 2:
        return None

    # e.g. 'EC1A 3HX' -> first_part 'EC1A', second_part '3HX'
    first_part, second_part = parts[0], parts[1]

    if component in ('area', 'district'):
        # Split the first part at its first numeric character.
        for idx, char in enumerate(first_part):
            if char.isnumeric():
                if component == 'area':
                    return first_part[:idx]   # EC  /  RG
                return first_part[idx:]       # 1A  /  40
        return None

    # 'sector' or 'unit': split the second part at its first non-numeric character.
    for idx, char in enumerate(second_part):
        if not char.isnumeric():
            if component == 'sector':
                return second_part[:idx]      # 3
            return second_part[idx:]          # HX
    return None

In [246]:
# unit tests: one sample postcode per supported format, printed component by component
test_postcodes = ['A1 2BC', 'A12 3BC', 'AB1 2BC', 'AB12 2CD', 'A1B 2BC', 'AB1C 2DE']

# (printed label, component name) pairs, in output order
labelled_components = (
    ('area: :', 'area'),
    ('district: :', 'district'),
    ('sector: :', 'sector'),
    ('unit: :', 'unit'),
)

for postcode in test_postcodes:
    for label, part in labelled_components:
        print(postcode, label, get_postcode_component(postcode, component=part))

A1 2BC area: : A
A1 2BC district: : 1
A1 2BC sector: : 2
A1 2BC unit: : BC
A12 3BC area: : A
A12 3BC district: : 12
A12 3BC sector: : 3
A12 3BC unit: : BC
AB1 2BC area: : AB
AB1 2BC district: : 1
AB1 2BC sector: : 2
AB1 2BC unit: : BC
AB12 2CD area: : AB
AB12 2CD district: : 12
AB12 2CD sector: : 2
AB12 2CD unit: : CD
A1B 2BC area: : A
A1B 2BC district: : 1B
A1B 2BC sector: : 2
A1B 2BC unit: : BC
AB1C 2DE area: : AB
AB1C 2DE district: : 1C
AB1C 2DE sector: : 2
AB1C 2DE unit: : DE


In [217]:
# add one column per postcode component: pc_area, pc_district, pc_sector, pc_unit
column_for_component = (
    ('pc_area', 'area'),
    ('pc_district', 'district'),
    ('pc_sector', 'sector'),
    ('pc_unit', 'unit'),
)

for pc_column, part in column_for_component:
    # bind `part` as a default so each lambda keeps its own component name
    df[pc_column] = df['postcode'].apply(lambda pc, part=part: get_postcode_component(pc, part))

In [223]:
df

Unnamed: 0,trans_id,price,date,postcode,type,newbuild,tenure,paon,saon,street,locality,town_city,district,county,ppd_type,record_status,pc_area,pc_district,pc_sector,pc_unit
0,{7011B109-06D7-8ED6-E053-6B04A8C075C1},340000,2018-05-18 00:00,LU2 9RB,D,N,F,99,,BUCKINGHAM DRIVE,,LUTON,LUTON,LUTON,A,A,LU,2,9,RB
1,{6B32222B-FEE2-01F1-E053-6C04A8C0D2C8},460000,2018-04-06 00:00,CH66 4JH,D,N,F,2,,HOWGILL CLOSE,LITTLE SUTTON,ELLESMERE PORT,CHESHIRE WEST AND CHESTER,CHESHIRE WEST AND CHESTER,A,A,CH,66,4,JH
2,{6B32222B-FF20-01F1-E053-6C04A8C0D2C8},282250,2018-03-22 00:00,CW7 2LQ,D,N,F,18,,CARNOUSTIE CLOSE,,WINSFORD,CHESHIRE WEST AND CHESTER,CHESHIRE WEST AND CHESTER,A,A,CW,7,2,LQ
3,{7011B109-28C8-8ED6-E053-6B04A8C075C1},115500,2018-02-16 00:00,SA46 0HX,D,N,F,4,,Y CILGANT,FFOSYFFIN,ABERAERON,CEREDIGION,CEREDIGION,A,A,SA,46,0,HX
4,{7C2D0700-3A90-4963-E053-6B04A8C07B97},188000,2018-06-15 00:00,FY2 9EF,D,N,F,13,,LANDSEER AVENUE,,BLACKPOOL,BLACKPOOL,BLACKPOOL,A,A,FY,2,9,EF
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5008995,{AB4880B5-B756-481A-BDAF-9EFC22011F50},124995,2012-11-16 00:00,LL20 8RF,F,N,F,1,,JOHN STREET,,LLANGOLLEN,DENBIGHSHIRE,DENBIGHSHIRE,A,A,LL,20,8,RF
5008996,{DC3EF73E-2C68-47A8-9A51-9D54CDD87C73},174950,2012-08-08 00:00,BH15 1NS,F,N,L,THE OLD BREWERY,FLAT 2,HILL STREET,,POOLE,POOLE,POOLE,A,A,BH,15,1,NS
5008997,{901CCA0F-89A5-4EED-8EDE-F408A7A2AD5C},182398,2006-04-28 00:00,LU2 0FB,F,Y,L,"HATTON PLACE, 118",FLAT 28,MIDLAND ROAD,LUTON,LUTON,LUTON,LUTON,A,A,LU,2,0,FB
5008998,{806E1A05-AC2B-489E-A0EB-F7C1FF61B683},130000,2006-10-27 00:00,TQ3 2BG,F,N,L,THE COACH HOUSE,FLAT 9,STEARTFIELD ROAD,PAIGNTON,PAIGNTON,TORBAY,TORBAY,A,A,TQ,3,2,BG


In [50]:
df.head(4)

Unnamed: 0,trans_id,price,date,postcode,type,newbuild,tenure,paon,saon,street,locality,town_city,district,county,ppd_type,record_status,postcode_area,postcode_district,postcode_sector,postcode_unit,pc_district
0,{7011B109-1728-8ED6-E053-6B04A8C075C1},185000,2018-06-01 00:00,TS17 0QS,D,N,F,1,,HAWKRIDGE CLOSE,INGLEBY BARWICK,STOCKTON-ON-TEES,STOCKTON-ON-TEES,STOCKTON-ON-TEES,A,A,TS,17,0,QS,TS17
1,{7011B109-0AEC-8ED6-E053-6B04A8C075C1},620000,2018-05-31 00:00,RG40 3HX,D,N,F,11,,CROFT ROAD,,WOKINGHAM,WOKINGHAM,WOKINGHAM,A,A,RG,40,3,HX,RG40
2,{6B32222B-FAC3-01F1-E053-6C04A8C0D2C8},265000,2018-04-13 00:00,TS26 0ST,D,N,F,19,,LAPWING ROAD,,HARTLEPOOL,HARTLEPOOL,HARTLEPOOL,A,A,TS,26,0,ST,TS26
3,{6B32222B-FFE1-01F1-E053-6C04A8C0D2C8},335000,2018-03-08 00:00,WA5 8GN,D,N,F,42,,SAVANNAH PLACE,GREAT SANKEY,WARRINGTON,WARRINGTON,WARRINGTON,A,A,WA,5,8,GN,WA5


In [None]:
## Pre-processing

In [None]:
# Load the price-paid CSV (no header row) and derive the columns used for plotting.
# NOTE(review): absolute, machine-specific path - move to a configurable data directory.
df = pd.read_csv('/home/trev/Downloads/ppd_data.csv', sep=",", header=None)

# NOTE(review): the BigQuery table earlier in this notebook calls this column
# 'town_city'; here it is 'town/city'.
df.columns = [
    'trans_id',
    'price',
    'date',
    'postcode',
    'type',
    'newbuild',
    'tenure',
    'paon',
    'saon',
    'street',
    'locality',
    'town/city',
    'district',
    'county',
    'ppd_type',
    'record_status']

df['postcode'] = df['postcode'].astype(str)
# Despite its name this column holds everything before the space (e.g. 'RG40'),
# not just the alphabetic postcode area.
df['postcode_area'] = df['postcode'].apply(lambda x: x.split(' ')[0])

df = df.sort_values('date')

# Lower-case street names so lookups by street are case-insensitive.
df['street'] = df['street'].astype(str).str.lower()

# Age of each sale relative to now, stored as NEGATIVE numbers so that sorting
# ascending runs from oldest to newest.
# pd.to_datetime accepts both 'YYYY-MM-DD' and the 'YYYY-MM-DD HH:MM' form shown in
# this notebook's outputs. `.dt.days` reads the whole-day count directly rather
# than parsing it out of the timedelta's string form, which fails for gaps of
# less than one day. `today` is evaluated once so every row shares a reference.
today = pd.Timestamp.today()
df['days_ago'] = -(today - pd.to_datetime(df['date'])).dt.days

# Convert days to months using an average month length of 365/12 days.
df['months_ago'] = (df['days_ago'] / (365 / 12)).round().astype(int)

In [None]:
def graph(postcode, housetype, streetname):
    """
    Plot monthly median sale prices for one street against its wider postcode group.

    Reads the notebook-level DataFrame `df`, which must provide the columns
    'postcode_area', 'type', 'street', 'months_ago' and 'price'.

    postcode: value matched against df['postcode_area'] (compared as a string)
    housetype: value matched against df['type'] (compared as a string)
    streetname: value matched against df['street'] (compared as a string)

    Draws four series on one figure and shows it; returns nothing:
    - median price per 'months_ago' value for the street (red line)
    - 12-row rolling mean of that median (pink line)
    - median price per 'months_ago' value for the postcode/type group (blue points)
    - 12-row rolling mean of that median (green line)

    The rolling window counts rows, i.e. months that have at least one sale, so
    it spans more than 12 calendar months wherever months are missing.
    """

    # all sales of this house type in this postcode group
    postcode_type = df[(df['postcode_area'] == f'{postcode}') & (df['type'] == f'{housetype}')]

    # sales on the individual street
    street = postcode_type[postcode_type['street'] == f'{streetname}']

    # monthly median price and its moving average for the street
    street_price = street.groupby(by=['months_ago']).agg({'price': 'median'})
    street_price['MA'] = street_price['price'].rolling(window=12).mean()

    # monthly median price and its moving average for the postcode group
    postcode_type_price = postcode_type.groupby(by=['months_ago']).agg({'price': 'median'})
    postcode_type_price['MA'] = postcode_type_price['price'].rolling(window=12).mean()

    # Create the figure at its final size BEFORE plotting: changing
    # rcParams['figure.figsize'] afterwards only affects figures created later.
    plt.figure(figsize=(20, 20))

    plt.plot(street_price.index, street_price['price'].values, c='red', label=f'{streetname}')
    plt.plot(street_price.index, street_price['MA'].values, c='pink', label=f'{streetname} moving average')

    plt.plot(postcode_type_price.index, postcode_type_price['MA'].values, c='green', label=f'{postcode, housetype} moving average')
    plt.scatter(postcode_type_price.index, postcode_type_price['price'].values, c='blue', label=f'{postcode, housetype}')

    plt.xticks(rotation=90)
    plt.legend(loc='best')
    plt.show()

In [None]:
def _pay_down(balance, amount):
    """Pay up to `amount` off `balance`; return (new balance, amount actually paid)."""
    paid = min(amount, balance)
    return balance - paid, paid


def mortgage(principal, total_period, init_period, init_payment, init_rate, subs_rate, base_rate, over_payment=0, max_overpayment_fraction=0.1):

    """
    Mortgage payment calculator. Returns dictionary of per-month lists.

    principal: amount borrowed in arbitrary currency, e.g. 500000
    total_period: total mortgage duration in months
    init_period: initial term duration in months
    init_payment: initial monthly payment amount, e.g. 1205
    init_rate: initial interest rate expressed as a decimal. e.g. 0.0146 for 1.46%
    subs_rate: subsequent interest rate expressed as a decimal. e.g. 0.0376 for 3.76%
    base_rate: Bank of England base rate. e.g. 0.001 for 0.1%
    over_payment: optional fixed monthly overpayment (default 0)
    max_overpayment_fraction: fraction of the outstanding balance that may be
        overpaid per year during the initial period (default 0.1, i.e. 10%)

    Each month interest is added first, then the regular payment is taken,
    then the overpayment. No payment ever exceeds the balance still owed.

    During the initial period the monthly overpayment is capped at
    max_overpayment_fraction of the balance divided by 12; the cap is refreshed
    every 12 months. No cap is applied after the initial period.

    After the initial period the regular payment is recalculated from the
    remaining balance, the remaining number of months and subs_rate + base_rate.
    Note that this payment uses a nominal monthly rate (annual rate / 12) while
    the interest charged each month uses (1 + annual rate) ** (1/12), so the
    scheduled payment is slightly higher than strictly needed and the loan can
    clear a little before the final month.

    Returns a dict with one entry per month under each key:
    'Interest': interest charged that month
    'Payments': regular payment actually made that month
    'Overpayments': overpayment actually made that month
    'Principal': balance remaining at the end of that month
    'Cumulative interest': total interest charged up to and including that month
    """

    # per-month series, returned for plotting
    interest_values = []
    payment_values = []
    over_payment_values = []
    principal_values = []
    interest_cum_values = []

    # cumulative interest
    interest_cum = 0

    # overpayment cap, set in the first month of every year of the initial period
    monthly_cap = 0

    # payments for months of initial period
    for month in range(init_period):

        # update max available monthly overpayment each year
        if month % 12 == 0:
            monthly_cap = principal * max_overpayment_fraction * (1/12)

        # interest for this month, added to the balance before any payment
        interest = principal * (1 + init_rate) ** (1/12) - principal
        interest_cum += interest
        principal += interest

        # Record what was actually paid, so the month that clears the loan shows
        # its final (partial) payment rather than 0.
        principal, paid = _pay_down(principal, init_payment)
        principal, overpaid = _pay_down(principal, min(over_payment, monthly_cap))

        interest_values.append(interest)
        interest_cum_values.append(interest_cum)
        payment_values.append(paid)
        over_payment_values.append(overpaid)
        principal_values.append(principal)

    # calculate payments for remaining balance at new rate
    remaining_months = total_period - init_period
    subs_annual_rate = subs_rate + base_rate
    subs_monthly_rate = subs_annual_rate / 12

    if remaining_months <= 0:
        # no subsequent period: the loop below does not run
        subs_payment = 0
    elif subs_monthly_rate == 0:
        # the annuity formula divides by zero at a 0% rate; repay in equal parts
        subs_payment = principal / remaining_months
    else:
        growth = (1 + subs_monthly_rate) ** remaining_months
        subs_payment = subs_monthly_rate * growth / (growth - 1) * principal

    # payments for subsequent months
    for month in range(init_period, total_period):

        # interest for this month, added to the balance before any payment
        interest = principal * (1 + subs_annual_rate) ** (1/12) - principal
        interest_cum += interest
        principal += interest

        # regular (minimum) payment, then the uncapped overpayment
        principal, paid = _pay_down(principal, subs_payment)
        principal, overpaid = _pay_down(principal, over_payment)

        interest_values.append(interest)
        interest_cum_values.append(interest_cum)
        payment_values.append(paid)
        over_payment_values.append(overpaid)
        principal_values.append(principal)

    results = {
        'Interest': interest_values,
        'Payments': payment_values,
        'Overpayments': over_payment_values,
        'Principal': principal_values,
        'Cumulative interest': interest_cum_values,
    }

    return results