### Extract Phone

In [1]:
import pprint

from etk.core import Core

# Extraction engine, plus a pretty-printer used to display the result.
c = Core()
pp = pprint.PrettyPrinter(indent=4)

# A phone can be extracted from either a url or text (tokens); source_type says
# which one is being passed, because a url needs to be tokenized differently.
source_type = 'url'
url = 'http://some_url.com/ad/town/602-228-4192/1/310054'

# Result options handed to the extractor.
output_format = 'obfuscation'  # the alternative is 'list'
include_context = True

extracted_phone = c._extract_phone(url, source_type, include_context, output_format)
pp.pprint(extracted_phone)

[{   'obfuscation': 'False', 'value': '6022284192'}]


In [2]:
import pprint

from etk.core import Core

# Extraction engine, plus a pretty-printer used to display the result.
c = Core()
pp = pprint.PrettyPrinter(indent=4)

# This example extracts an obfuscated phone number: the digits at the end of
# the text are written as a mix of numerals and number words.
text = 'new person in town searching for a great date wiff u  \
        fresh person here searching 4 a great date wiff you Sweet new person in town \
        seeking for a good date with u for80 2sixseven one9zerofor'

# The phone extractor takes tokens rather than raw text, so tokenize in two steps:
# first build the CRF tokens, then reduce them to plain tokens.
# (Original author's note: this looks complicated and needs to be simplified.)
crf_tokens = c.extract_crftokens(text)
tokens = c.extract_tokens_from_crf(crf_tokens)

# Extractor options; this time the input is text tokens instead of a url.
source_type = 'text'
output_format = 'obfuscation'  # the alternative is 'list'
include_context = True

extracted_phone = c._extract_phone(tokens, source_type, include_context, output_format)
pp.pprint(extracted_phone)

[{   'obfuscation': 'True', 'value': '4802671904'}]


### Extract Weight

In [15]:
import pprint

from etk.core import Core

# Extraction engine, plus a pretty-printer used to display the result.
c = Core()
pp = pprint.PrettyPrinter(indent=4)

# Sample text that mentions a weight ("105lbs") next to a height.
text = "Measurements: 105lbs 5\'2\" with a beautiful face"

# The weight extractor is given the raw string directly.
extracted_weight = c._extract_weight(text)
pp.pprint(extracted_weight)


[   {   'context': {   'end': 20, 'start': 14},
        'metadata': {   'unit': 'pound'},
        'value': '105'}]


### Extract Height

In [16]:
import pprint

from etk.core import Core

# Extraction engine, plus a pretty-printer used to display the result.
c = Core()
pp = pprint.PrettyPrinter(indent=4)

# Sample profile text that lists a height in cm alongside other attributes.
text = "Nationality:   Swedish  Height:   155 cm   Weight:   47 Kg   Hair Colour:   Blonde"

# The height extractor is given the raw string directly.
extracted_height = c._extract_height(text)
pp.pprint(extracted_height)

[   {   'context': {   'end': 40, 'start': 34},
        'metadata': {   'unit': 'centimeter'},
        'value': '155'},
    {   'context': {   'end': 40, 'start': 30},
        'metadata': {   'unit': 'centimeter'},
        'value': '155'}]


### Extract Email

In [17]:
import pprint

from etk.core import Core

# Extraction engine, plus a pretty-printer used to display the result.
c = Core()
pp = pprint.PrettyPrinter(indent=4)

text = 'contact me at some_email@gmail.com'

# NOTE(review): the second positional argument (True) presumably asks for match
# context to be included -- confirm against the _extract_email signature.
extracted_email = c._extract_email(text, True)
pp.pprint(extracted_email)

[   {   'context': {   'end': 34, 'obfuscation': False, 'start': 14},
        'value': 'some_email@gmail.com'}]


### Extract using Regex

In [18]:
import pprint

from etk.core import Core

# Extraction engine, plus a pretty-printer used to display the result.
c = Core()
pp = pprint.PrettyPrinter(indent=4)

# Input text and the pattern to run against it; the pattern's single capture
# group picks up the word that follows "my name is".
text = "hi there, my name is jessica, join me at so and so"
regex = "(?:my[\\s]+name[\\s]+is[\\s]+([-a-z0-9@$!]+))"

# Extractor options.
flags = 0  # NOTE(review): presumably means "no regex flags" -- confirm
include_context = True  # return the start and end index of the regex match in the string

extracted_name = c._extract_using_regex(text, regex, include_context, flags)
pp.pprint(extracted_name)

[{   'context': {   'end': 28, 'start': 10}, 'value': 'jessica'}]
