# spaCy Info: Dependency Heads and Entity Types

In [3]:
from spacy.en import English

In [4]:
# Instantiate the English pipeline once; the helper functions defined in
# later cells call this shared module-level `parser`.
parser = English()

In [27]:
# Parse a sample sentence and bind the result to the module-level name `parsed`.
parsed = parser(u"I want a 5 beds house")

In [63]:
def dependencies(sent):
    """Map each token's text in `sent` to the text of its dependency head.

    `sent` is converted with `unicode()` and run through the module-level
    `parser`. The returned dict is keyed by token text (`orth_`), so when
    the same text occurs more than once only the last occurrence's head is
    kept.
    """
    doc = parser(unicode(sent))
    head_of = {}
    for tok in doc:
        head_of[tok.orth_] = tok.head.orth_
    return head_of

In [64]:
# Map every token of a simple request to the text of its dependency head.
dependencies('i need a 3 beds house')

{u'3': u'beds',
 u'a': u'house',
 u'beds': u'house',
 u'house': u'need',
 u'i': u'need',
 u'need': u'need'}

In [128]:
# Look up the head of the number token directly; in the recorded output
# '3' attaches to 'beds'.
dependencies('i need a 3 beds house')['3']

u'beds'

In [65]:
# Hyphenated range: in the recorded output the parser splits '3-5' into
# separate '3', '-' and '5' tokens.
dependencies('i need a 3-5 beds house')

{u'-': u'5',
 u'3': u'beds',
 u'5': u'3',
 u'a': u'house',
 u'beds': u'house',
 u'house': u'need',
 u'i': u'need',
 u'need': u'need'}

In [66]:
# Spelled-out range: in the recorded output '5' attaches to 'beds' while
# '3' and 'to' both attach to '5'.
dependencies('i need a 3 to 5 beds house')

{u'3': u'5',
 u'5': u'beds',
 u'a': u'house',
 u'beds': u'house',
 u'house': u'need',
 u'i': u'need',
 u'need': u'need',
 u'to': u'5'}

In [67]:
# For comparison: plain whitespace splitting keeps '3-5' as a single token.
'i need a 3-5 beds house'.split()

['i', 'need', 'a', '3-5', 'beds', 'house']

In [68]:
# Longer sentence with two quantities: in the recorded output '4000'
# attaches to 'sqft' and '5' to 'beds'. The result is keyed by token text,
# so the two occurrences of 'the' collapse into a single entry.
dependencies('the total area of the house is 4000 sqft, and has 5 beds')

{u',': u'is',
 u'4000': u'sqft',
 u'5': u'beds',
 u'and': u'is',
 u'area': u'is',
 u'beds': u'has',
 u'has': u'is',
 u'house': u'of',
 u'is': u'is',
 u'of': u'area',
 u'sqft': u'is',
 u'the': u'house',
 u'total': u'area'}

In [69]:
def ner(sent):
    """Map each token's text in `sent` to its entity type, or u"O" if it has none.

    `sent` is converted with `unicode()` and run through the module-level
    `parser`. A token whose `ent_type_` is the empty string is reported as
    u"O". The returned dict is keyed by token text (`orth_`), so repeated
    token texts share a single entry (the last occurrence wins).
    """
    doc = parser(unicode(sent))
    labels = {}
    for tok in doc:
        # An empty entity-type string means "no entity"; report it as "O".
        labels[tok.orth_] = tok.ent_type_ or u"O"
    return labels

In [70]:
# Entity types per token: in the recorded output '4000', 'square' and 'feet'
# are labelled QUANTITY and '5' is labelled CARDINAL.
ner('the total area of the house is 4000 square feet, and has 5 beds')

{u',': u'O',
 u'4000': u'QUANTITY',
 u'5': u'CARDINAL',
 u'and': u'O',
 u'area': u'O',
 u'beds': u'O',
 u'feet': u'QUANTITY',
 u'has': u'O',
 u'house': u'O',
 u'is': u'O',
 u'of': u'O',
 u'square': u'QUANTITY',
 u'the': u'O',
 u'total': u'O'}

# Feature Extraction

In [89]:
import re
import numpy as np

In [145]:
# Keyword patterns for the four quantity categories. They are used both with
# `.match` against a single head word and with `.findall` over a whole
# sentence. Raw strings keep `\w` and `\$` as regex escapes instead of
# relying on Python passing unknown string escapes through unchanged.
bed_word = re.compile(r'bds*|bed\w*')
bath_word = re.compile(r'ba(s|th)\w*')
# 'sqft' is the abbreviation used in this notebook's own example sentence;
# 'sqrt' is kept so that input accepted by the earlier pattern still matches.
area_word = re.compile(r'sqft|sqrt|square|feet|space|lot|acre')
# The 'k' / 'm' magnitude suffixes only count when they are not part of a
# longer word: not preceded by a lowercase letter and followed by a word
# boundary (so 'k' and '500k' match, while 'kitchen' or 'room' do not).
price_word = re.compile(
    r'\$|how much|dollar\w*|bucks|cost|(?<![a-z])k\b|thousand|million'
    r'|(?<![a-z])m\b|afford|spend'
)

In [146]:
# Index -> label lookup; the order mirrors the bed/bath/area/price count
# list that tag() feeds to np.argmax.
tagset = dict(enumerate(('BED', 'BATH', 'AREA', 'PRICE')))

In [151]:
def tag(sent):
    """Label each whitespace-separated token of `sent` with a quantity category.

    Only tokens consisting purely of digits are candidates; every other
    token is labelled 'O'. A numeric token is labelled by testing the text
    of its dependency head (from `dependencies`) against the keyword
    patterns in the order bed, bath, area, price. If the head matches none
    of them, the label falls back to the category whose keywords occur most
    often anywhere in `sent`. If no keyword occurs at all, the number is
    labelled 'O' rather than defaulting to the first category.

    Args:
        sent: the sentence to tag, as a string.

    Returns:
        A list of (token, tag) pairs, one per whitespace-separated token,
        in sentence order.

    Note:
        `dependencies` is keyed by token text, so a number that appears
        several times in one sentence is looked up with a single shared head.
    """
    deps = dependencies(sent)
    tokens = sent.split()

    rules = (
        (bed_word, 'BED'),
        (bath_word, 'BATH'),
        (area_word, 'AREA'),
        (price_word, 'PRICE'),
    )

    # The sentence-level fallback does not depend on the token, so compute it
    # once. np.argmax of all-zero counts would pick index 0 ('BED') with no
    # evidence whatsoever; use 'O' in that case instead.
    counts = [len(pattern.findall(sent)) for pattern, _ in rules]
    fallback = tagset[int(np.argmax(counts))] if max(counts) > 0 else 'O'

    tags = []
    for token in tokens:
        if not token.isdigit():
            tags.append('O')
            continue
        # Whitespace tokens and parser tokens can disagree; a missing head is
        # treated as "no match" so the fallback applies instead of a KeyError.
        head = deps.get(token, '')
        for pattern, label in rules:
            if pattern.match(head):
                tags.append(label)
                break
        else:
            tags.append(fallback)

    # Materialize the pairs so the result is a list on Python 3 as well.
    return list(zip(tokens, tags))

In [152]:
# Sample sentences used to exercise tag() in the following cells.
s1 = 'i want to see a 5 bath house'
s2 = 'the total area of the house is 4000 square feet, and has 5 beds'

In [153]:
# Tag the first sample; in the recorded output '5' is labelled BATH.
tag(s1)

[('i', 'O'),
 ('want', 'O'),
 ('to', 'O'),
 ('see', 'O'),
 ('a', 'O'),
 ('5', 'BATH'),
 ('bath', 'O'),
 ('house', 'O')]

In [154]:
# Tag the second sample; in the recorded output '4000' is labelled AREA and
# '5' is labelled BED.
tag(s2)

[('the', 'O'),
 ('total', 'O'),
 ('area', 'O'),
 ('of', 'O'),
 ('the', 'O'),
 ('house', 'O'),
 ('is', 'O'),
 ('4000', 'AREA'),
 ('square', 'O'),
 ('feet,', 'O'),
 ('and', 'O'),
 ('has', 'O'),
 ('5', 'BED'),
 ('beds', 'O')]