In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from commonregex import CommonRegex
import requests
import re

In [2]:
### Tie new charges to land registry

In [3]:
use_cols = ['Title Number', 'Tenure', 'Property Address', 'District', 'County',
            'Region', 'Postcode', 'Multiple Address Indicator', 'Price Paid',
            'Proprietor Name (1)', 'Company Registration No. (1)',
            'Proprietorship Category (1)', 'Proprietor (1) Address (1)',
            'Proprietor (1) Address (2)', 'Proprietor (1) Address (3)']

rorig = pd.read_csv('CCOD_COU_2019_11.csv', skipfooter=1, engine='python',
                    usecols=use_cols,
                    dtype={'Title Number': str,
                           'Company Registration No. (1)': str})

print(rorig.shape)
rorig.head(2)

(63745, 15)


Unnamed: 0,Title Number,Tenure,Property Address,District,County,Region,Postcode,Multiple Address Indicator,Price Paid,Proprietor Name (1),Company Registration No. (1),Proprietorship Category (1),Proprietor (1) Address (1),Proprietor (1) Address (2),Proprietor (1) Address (3)
0,NT72787,Freehold,"2 Horsham Drive, Top Valley (NG5 9AH)",CITY OF NOTTINGHAM,CITY OF NOTTINGHAM,EAST MIDLANDS,NG5 9AH,N,30000.0,PLACES FOR PEOPLE HOMES LIMITED,IP19447R,Industrial and Provident Society (Company),"80 Cheapside, London EC2V 6EE","4 The Pavilions, Ashton-on-Ribble, Preston PR2...",
1,LT25408,Freehold,"98-100 Melton Road, Leicester (LE4 5EB)",LEICESTER,LEICESTER,EAST MIDLANDS,LE4 5EB,N,,JADEMARK LIMITED,2847145,Limited Company or Public Limited Company,"98 - 100 Melton Road, Leicester",,


In [4]:
corig = pd.read_csv('charges_tempt.csv', dtype=str, header=0, index_col=None)

print(corig.shape)
corig.head(2)

(37480, 15)


Unnamed: 0,CompanyName,CompanyNumber,CompanyStatus,IncorporationDate,SICCode,status,persons_entitled,delivered_on,charge_number,classification.type,classification.description,particulars.type,particulars.description,RegAddress.District,RegAddress.PostCode
0,CHG-MERIDIAN UK LIMITED,1276016,Active,1976-09-06T00:00:00.000Z,"[{""code"":""64910"",""description"":""Financial leas...",outstanding,"[{""name"":""Bal Global Finance (UK) Limited""}]",2019-08-01T00:00:00.000Z,2529,charge-description,A registered charge,brief-description,A security charge dated 31/07/2019 between bal...,RUNNYMEDE,TW20 9AB
1,CHG-MERIDIAN UK LIMITED,1276016,Active,1976-09-06T00:00:00.000Z,"[{""code"":""64910"",""description"":""Financial leas...",outstanding,"[{""name"":""Bal Global Finance (UK) Limited""}]",2019-08-01T00:00:00.000Z,2528,charge-description,A registered charge,brief-description,A security charge dated 31/07/2019 between bal...,RUNNYMEDE,TW20 9AB


### Extract Data

In [5]:
def hasNumbers(inputString):
    return any(char.isdigit() for char in str(inputString))

In [6]:
cedit = corig.copy()

cedit.dropna(subset=['particulars.description'], inplace=True)
cedit['hasNum'] = cedit['particulars.description'].apply(hasNumbers)
cedit = cedit[cedit['hasNum'] == True]

print(cedit.shape)
cedit.tail(2)

(23197, 16)


Unnamed: 0,CompanyName,CompanyNumber,CompanyStatus,IncorporationDate,SICCode,status,persons_entitled,delivered_on,charge_number,classification.type,classification.description,particulars.type,particulars.description,RegAddress.District,RegAddress.PostCode,hasNum
37478,LITTLE WELLIES LIMITED,9014548,Active,2014-04-28T00:00:00.000Z,"[{""code"":""88910"",""description"":""Child day-care...",outstanding,"[{""name"":""National Westminster Bank PLC""}]",2019-12-02T00:00:00.000Z,4,charge-description,A registered charge,brief-description,"110 apperley road, bradford BD10 9SR.",BRADFORD,BD2 1NH,True
37479,LITTONDALE LIMITED,11756098,Active,2019-01-08T00:00:00.000Z,"[{""code"":""98000"",""description"":""Residents prop...",outstanding,"[{""name"":""Gatehouse Bank PLC""}]",2019-11-11T00:00:00.000Z,1,charge-description,A registered charge,brief-description,"4,6 & 8 church street, clowne, chesterfield, S...",OLDHAM,OL2 6NX,True


<br>
<font color=green>Note: Particulars.description plain text is the only clear way to discern land charge</font>
<br>

In [7]:
for i in cedit['particulars.description'].sample(5):
    print(i+ '\n')

63 alumhurst road. Westbourne. Dorset. BH4 8EW.

26 hatch lane harmondsworth t/no MX434838.

The bungalow, leeming lane farm, great north road, sinderby, thirsk, YO7 4LG. The cottage, leeming lane farm, great north road, sinderby, thirsk, YO7 4LG.

14 taplow street liverpool.

Leasehold properties - 2305 michigan point tower b, 11 michigan avenue, salford, M50 2HJ - title number GM667953 (part of) and all other properties listed in the schedule of the charge attached hereto.



In [8]:
### most titles are between the length of 6-8

redit = rorig.copy()
redit['TitleLen'] = redit['Title Number'].apply(len)

redit['TitleLen'].value_counts()

8    40218
9    13893
7     8377
6     1117
5      130
4       10
Name: TitleLen, dtype: int64

In [9]:
### structure generally 2-3 letters and 5-6 numbers 
redit.loc[redit['TitleLen'] == 8]['Title Number'].sample(5)

30588    MS441144
3107     DT347495
51719    DN718514
59400    HS228847
6773     SF639956
Name: Title Number, dtype: object

In [10]:
redit

Unnamed: 0,Title Number,Tenure,Property Address,District,County,Region,Postcode,Multiple Address Indicator,Price Paid,Proprietor Name (1),Company Registration No. (1),Proprietorship Category (1),Proprietor (1) Address (1),Proprietor (1) Address (2),Proprietor (1) Address (3),TitleLen
0,NT72787,Freehold,"2 Horsham Drive, Top Valley (NG5 9AH)",CITY OF NOTTINGHAM,CITY OF NOTTINGHAM,EAST MIDLANDS,NG5 9AH,N,30000.0,PLACES FOR PEOPLE HOMES LIMITED,IP19447R,Industrial and Provident Society (Company),"80 Cheapside, London EC2V 6EE","4 The Pavilions, Ashton-on-Ribble, Preston PR2...",,7
1,LT25408,Freehold,"98-100 Melton Road, Leicester (LE4 5EB)",LEICESTER,LEICESTER,EAST MIDLANDS,LE4 5EB,N,,JADEMARK LIMITED,2847145,Limited Company or Public Limited Company,"98 - 100 Melton Road, Leicester",,,7
2,P183436,Freehold,"81 Queen Street, Barry (CF62 7EE)",THE VALE OF GLAMORGAN,THE VALE OF GLAMORGAN,WALES,CF62 7EE,N,,HURSTON REAL ESTATE LIMITED,10171850,Limited Company or Public Limited Company,"Bishopstone, 36 Crescent Road, Worthing BN11 1RL",,,7
3,PM26295,Freehold,"Flats 1-16, 2 East Shore Way, Portsmouth (PO3 ...",PORTSMOUTH,PORTSMOUTH,SOUTH EAST,PO3 6GD,N,,JLPPT HOLDCO 6 LIMITED,09134008,Limited Company or Public Limited Company,"Grand Buildings, 1-3 Strand, London WC2N 5HR",,,7
4,LN44425,Freehold,"10 St James's Place, London (SW1A 1NP)",CITY OF WESTMINSTER,GREATER LONDON,GREATER LONDON,SW1A 1NP,N,5500000.0,BEST DRESSED GROUP LIMITED,08820424,Limited Company or Public Limited Company,"Nuffield House, 41-46 Piccadilly, London W1J 0DS",,,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63740,TGL170502,Freehold,"2-12 (even) and 9-43 (odd) Watermill Close, 52...",RICHMOND UPON THAMES,GREATER LONDON,GREATER LONDON,,N,,ISHGUARD LIMITED,9163587,Limited Company or Public Limited Company,"Berkeley House, 304 Regents Park Road, London ...",,,9
63741,WSX291159,Freehold,"89 Lower Street, Pulborough (RH20 2BP)",HORSHAM,WEST SUSSEX,SOUTH EAST,RH20 2BP,N,12250.0,RG SECURITIES LIMITED,06762017,Limited Company or Public Limited Company,"7 Nelson Street, Southend On Sea SS1 1EH",,,9
63742,WSX406286,Leasehold,"Unit 35, Bolney Grange Industrial Park, Bolney...",MID SUSSEX,WEST SUSSEX,SOUTH EAST,RH17 5PB,N,,A & A CONSTRUCTION GROUP LIMITED,02551724,Limited Company or Public Limited Company,"Suite 21, 10 Churchill Square, Kings Hill, Wes...",,,9
63743,SGL301334,Freehold,"108 Manor Road, Wallington (SM6 0DW)",SUTTON,GREATER LONDON,GREATER LONDON,SM6 0DW,N,215000.0,HOLLYGRACE PROPERTIES LIMITED,09967718,Limited Company or Public Limited Company,"Unit 1, Kensington Court, 108 Manor Road, Wall...",,,9


In [11]:
### keep NaN rows due to Tenure and address info available
print(f"Rows with NaN for Price Paid: {len(redit['Price Paid'].isnull())}")

Rows with NaN for Price Paid: 63745


In [12]:
### return single or multiple titles
def hasTitle(inputString):
    ret = re.findall('([A-Z]{2,3}[0-9]{3,5})\w+', str(inputString))
    if len(ret) > 0:
        return ret
    else:
        pass

In [13]:
cedit['hasTitle'] = cedit['particulars.description'].apply(hasTitle)

cedit.loc[~cedit['hasTitle'].isnull()][['CompanyNumber', 'hasTitle']].sample(5)

Unnamed: 0,CompanyNumber,hasTitle
20402,2630203,[WK41745]
8363,10013047,[MX20735]
24551,1166032,[SK19284]
2059,9875397,"[CYM57434, CYM66700]"
29577,8906583,"[NYK35534, NYK35534]"


In [14]:
### create test descriptions
desc1 = 'The freehold property known as 26B piper close mansfield woodhouse mansfield NG19 7GG registered at h m land registry with title number NT463229.'
desc2 = '22 shedden park road, kelso ROX11811.'
desc3 = '243 plaistow road, london E15 3EU. Title number: EGL211881'
desc4 = 'The freehold land known as 128 cardiff road, reading, RG1 8PQ and 8 trafford road, reading, RG1 8JP registered at the land registry under title numbers BK124548 and BK98452.'

In [15]:
def hasAddress(inputString):
#     ret = re.findall('(\d{1,10}( \w+){1,10}( ( \w+){1,10})?( \w+){1,10}[,.](( \w+){1,10}(,)? [A-Z]{2}( [0-9]{5})?)?)', str(inputString))
    ret = re.findall('(\d[0-9]{1,3} .+, [a-zA-Z]+[^\s])', str(inputString))
#     ret = re.findall('([A-Za-z0-9]\.\-\s\,])', str(inputString))

    if len(ret) > 0:
        return ret
    else:
        pass

In [16]:
### ok performance; needs improvement for termination
### will likely need multiple regexs for various forms of input
print(hasAddress(desc1))
print(hasAddress(desc2))
print(hasAddress(desc3))
print(hasAddress(desc4))

None
['22 shedden park road, kelso']
['243 plaistow road, london']
['128 cardiff road, reading, RG1 8PQ and 8 trafford road, reading, RG1']


In [17]:
### ok performance; unable to extract city + postcode, or n-grams greater than 3
print(CommonRegex(desc1).street_addresses)
print(CommonRegex(desc2).street_addresses)
print(CommonRegex(desc3).street_addresses)
print(CommonRegex(desc4).street_addresses)

[]
['22 shedden park road,']
['243 plaistow road,']
['128 cardiff road,', '1 8PQ and 8 trafford road,']


In [18]:
def hasPostcode(inputString):
    ret = re.findall(r'\b[A-Z]{1,2}[0-9][A-Z0-9]? [0-9][ABD-HJLNP-UW-Z]{2}\b', str(inputString))
    if len(ret) > 0:
        return ret
    else:
        pass

In [19]:
print(hasPostcode(desc1))
print(hasPostcode(desc2))
print(hasPostcode(desc3))
print(hasPostcode(desc4))

['NG19 7GG']
None
['E15 3EU']
['RG1 8PQ', 'RG1 8JP']


In [20]:
### hasPostcode as a somewhat indicator of the presence of an address
cedit['hasPostcode'] = cedit['particulars.description'].apply(hasPostcode)

cedit.loc[~cedit.hasPostcode.isnull()]['hasPostcode'].sample(5)

23757    [BS32 9DU]
4508      [SN2 5DN]
5105      [FY5 1DB]
36725     [SP1 1TT]
22739     [S11 8QJ]
Name: hasPostcode, dtype: object

### Matching

In [48]:
def firstTitle(x):
    if x is not None:
        return x[0]

In [50]:
pedit = cedit.copy()

pedit['firstTitle'] = pedit.hasTitle.apply(firstTitle)
pedit = pedit[~pedit['firstTitle'].isnull()]

print(pedit.shape)
pedit.sample(5)

(11919, 19)


Unnamed: 0,CompanyName,CompanyNumber,CompanyStatus,IncorporationDate,SICCode,status,persons_entitled,delivered_on,charge_number,classification.type,classification.description,particulars.type,particulars.description,RegAddress.District,RegAddress.PostCode,hasNum,hasTitle,hasPostcode,firstTitle
35527,CHASE (HERTS) LIMITED,12096590,Active,2019-07-11T00:00:00.000Z,"[{""code"":""41100"",""description"":""Development of...",outstanding,"[{""name"":""Wilson Holdings (London) Limited""}]",2019-09-20T00:00:00.000Z,2,charge-description,A registered charge,brief-description,All that freehold land known as bircherley gre...,WELWYN HATFIELD,AL8 6HG,True,"[HD34749, HD12943]",,HD34749
11048,ECOLOGIA ENVIRONMENTAL SOLUTIONS HOLDINGS LIMITED,7330158,Active,2010-07-29T00:00:00.000Z,"[{""code"":""39000"",""description"":""Remediation ac...",outstanding,"[{""name"":""Ares Management Limited""}]",2019-10-24T00:00:00.000Z,3,charge-description,A registered charge,brief-description,"The real property known as unit 1, kingsgate b...",CHESHIRE WEST AND CHESTER,WA6 0AR,True,[DN29382],[EX14 1YG],DN29382
36464,JENUIN LETTINGS LTD,11722317,Active,2018-12-11T00:00:00.000Z,"[{""code"":""68209"",""description"":""Other letting ...",outstanding,"[{""name"":""Paratus Amc Trading as Foundation Ho...",2019-11-13T00:00:00.000Z,1,charge-description,A registered charge,brief-description,"37 surrey road, barking, essex, IG11 7QT (regi...",BARKING AND DAGENHAM,RM6 6XB,True,[NGL17516],[IG11 7QT],NGL17516
20645,MANC HOUSING LIMITED,10973059,Active,2017-09-20T00:00:00.000Z,"[{""code"":""68100"",""description"":""Buying and sel...",outstanding,"[{""name"":""Manchester Housing Assets Limited""}]",2019-09-24T00:00:00.000Z,33,charge-description,A registered charge,brief-description,11 ash street bury BL9 7BT registered under ti...,MANCHESTER,M3 2PJ,True,"[GM93326, LA29137]",[BL9 7BT],GM93326
9052,CRESCENT ARCH PROPERTIES MANAGEMENT LIMITED,10083850,Active,2016-03-24T00:00:00.000Z,"[{""code"":""68209"",""description"":""Other letting ...",outstanding,"[{""name"":""Paratus Amc Limited""}]",2019-11-07T00:00:00.000Z,6,charge-description,A registered charge,brief-description,"The property known as 48 germander place, cont...",HILLINGDON,HA4 0AH,True,[BM13881],[MK14 7DP],BM13881


In [56]:
### brief test only yielded 11 matches; may be due to lack of complete land registry records
rgg = redit[redit['Title Number'].isin(pedit.firstTitle)]

print(rgg.shape)
rgg.head(2)

(11, 17)


Unnamed: 0,Title Number,Tenure,Property Address,District,County,Region,Postcode,Multiple Address Indicator,Price Paid,Proprietor Name (1),Company Registration No. (1),Proprietorship Category (1),Proprietor (1) Address (1),Proprietor (1) Address (2),Proprietor (1) Address (3),Title Len,TitleLen
232,LAN5875,Freehold,land lying to the east of Firswood Road and la...,WEST LANCASHIRE,LANCASHIRE,NORTH WEST,,N,,G PARK SKELMERSDALE LIMITED,4926989,Limited Company or Public Limited Company,"Sixth Floor, 99 Bishopsgate, London EC2M 3XD",,,7,7
349,EX61590,Freehold,"2 Gilwell Park Close, Colchester (CO3 4SP)",COLCHESTER,ESSEX,SOUTH EAST,CO3 4SP,N,,MCCARTHY & STONE (EXTRA CARE LIVING) LIMITED,6897363,Limited Company or Public Limited Company,"Fourth Floor, 100 Holdenhurst Road, Bournemout...",,,7,7


### Google API

In [61]:
def ggfind(x):

    S = requests.Session()
    SURL = "https://maps.googleapis.com/maps/api/place/findplacefromtext/json?"

    SPARAMS = {
        "key": "AIzaSyAcNS706HQAxaJSraVqMjC2YS9MZblBnrY",
        "input": x,
        "inputtype": "textquery"
    }

    SREQ = S.get(url=SURL, params=SPARAMS)
    if SREQ.status_code == 200:
        SDATA = SREQ.json()
        if len(SDATA['candidates']) > 0:
#             return [i['place_id'] for i in SDATA['candidates']]
            return SDATA['candidates'][0]['place_id']

In [64]:
rgg['ggfind'] = rgg['Property Address'].apply(ggfind)

In [79]:
def ggdetail(x):

    T = requests.Session()
    TURL = "https://maps.googleapis.com/maps/api/place/details/json?"

    TPARAMS = {
        "key": "",
        "place_id": x,
        "fields": 'name,formatted_address,types,geometry'
    }

    if x[:2] == 'Ch':
        TREQ = T.get(url=TURL, params=TPARAMS)
        TDATA = TREQ.json()

        return TDATA['result']

In [80]:
rgg['ggdetail'] = rgg['ggfind'].apply(ggdetail)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [81]:
### only ids starting with 'ch' have details; look into this
rgg[['Property Address', 'ggfind', 'ggdetail']]

Unnamed: 0,Property Address,ggfind,ggdetail
232,land lying to the east of Firswood Road and la...,ChIJV2GoGzoXe0gRE3KYoYL9u7k,"{'formatted_address': 'Firswood Rd, United Kin..."
349,"2 Gilwell Park Close, Colchester (CO3 4SP)",ChIJ4VavQWID2UcRpnYNV60Ug9Y,"{'formatted_address': '2 Gilwell Park Cl, Colc..."
9506,"Flat 26 Monmouth Court, Bassalege Road, Newpor...",EjUyNiwgTW9ubW91dGggQ291cnQsIEJhc3NhbGVnIFJkLC...,
14971,"Heath Rise, Kersfield Road, London (SW15 3HF)",EjBIZWF0aCBSaXNlICYgS2Vyc2ZpZWxkIFJvYWQsIExvbm...,
19171,"141 Morris Green Lane, Bolton (BL3 3JU)",ChIJW4zAz8Cne0gRCESTKeir2ns,"{'formatted_address': '141 Morris Green Ln, Bo..."
30236,"227 Tettenhall Road, Wolverhampton (WV6 0DE)",EigyMjcgVGV0dGVuaGFsbCBSZCwgV29sdmVyaGFtcHRvbi...,
37610,land lying to the east of Firswood Road and la...,ChIJV2GoGzoXe0gRE3KYoYL9u7k,"{'formatted_address': 'Firswood Rd, United Kin..."
40632,"Heath Rise, Kersfield Road, London (SW15 3HF)",EjBIZWF0aCBSaXNlICYgS2Vyc2ZpZWxkIFJvYWQsIExvbm...,
48761,"Flat 26 Monmouth Court, Bassalege Road, Newpor...",EjUyNiwgTW9ubW91dGggQ291cnQsIEJhc3NhbGVnIFJkLC...,
53684,"16 Orchard Avenue, Southall (UB1 1LF)",ChIJG4FnP6tydkgRk2xkUptFycU,"{'formatted_address': '16 Orchard Ave, Southal..."


In [83]:
### look into extract additional property info
print(rgg.loc[349]['ggdetail'])
print('\n')
print(rgg.loc[19171]['ggdetail'])
print('\n')
print(rgg.loc[53684]['ggdetail'])

{'formatted_address': '2 Gilwell Park Cl, Colchester CO3 4SP, UK', 'geometry': {'location': {'lat': 51.8770652, 'lng': 0.8614945}, 'viewport': {'northeast': {'lat': 51.8783478302915, 'lng': 0.8627517302915021}, 'southwest': {'lat': 51.8756498697085, 'lng': 0.860053769708498}}}, 'name': '2 Gilwell Park Cl', 'types': ['premise']}


{'formatted_address': '141 Morris Green Ln, Bolton BL3 3JU, UK', 'geometry': {'location': {'lat': 53.55871339999999, 'lng': -2.450594799999999}, 'viewport': {'northeast': {'lat': 53.56003728029149, 'lng': -2.449308719708498}, 'southwest': {'lat': 53.55733931970849, 'lng': -2.452006680291502}}}, 'name': '141 Morris Green Ln', 'types': ['street_address']}


{'formatted_address': '16 Orchard Ave, Southall UB1 1LF, UK', 'geometry': {'location': {'lat': 51.5104889, 'lng': -0.3779815}, 'viewport': {'northeast': {'lat': 51.5117743802915, 'lng': -0.3766410697084979}, 'southwest': {'lat': 51.5090764197085, 'lng': -0.379339030291502}}}, 'name': '16 Orchard Ave', 'types'