In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
text = nlp('''Dr.Strange loves pav bhaji of mumbai as it costs only 2$ per plate.''')

In [4]:
# Print each sentence of the Doc prefixed with its zero-based position.
for idx, sentence in enumerate(text.sents):
    print(f'{idx}={sentence}')

0=Dr.Strange loves pav bhaji of mumbai as it costs only 2$ per plate.


In [5]:
# Walk the Doc sentence by sentence and print every token on its own line.
for sentence in text.sents:
    for tok in sentence:
        print(tok)

Dr.
Strange
loves
pav
bhaji
of
mumbai
as
it
costs
only
2
$
per
plate
.


In [6]:
import nltk

In [7]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DNK133\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
from nltk.tokenize import sent_tokenize

In [9]:
sent_tokenize('Dr. strange loves pav bhaji of mumbai as it costs only 2$ per plate.')

['Dr. strange loves pav bhaji of mumbai as it costs only 2$ per plate.']

In [10]:
nlp2 = spacy.blank('en')

In [11]:
type(nlp2)

spacy.lang.en.English

In [12]:
token2 = nlp2('''Dr. strange loves pav bhaji of mumbai as it costs only 2$ per plate.''')

In [13]:
type(token2)

spacy.tokens.doc.Doc

In [14]:
for token in token2:
    print(token)

Dr.
strange
loves
pav
bhaji
of
mumbai
as
it
costs
only
2
$
per
plate
.


In [15]:
token2[0]

Dr.

In [16]:
word = token2[4]
word.text

'bhaji'

In [17]:
word

bhaji

### Span object

In [18]:
word.text[1:4]

'haj'

In [19]:
span4 = token2[1:5]

In [20]:
span4

strange loves pav bhaji

In [21]:
type(span4)

spacy.tokens.span.Span

### Token attributes

In [22]:
token2

Dr. strange loves pav bhaji of mumbai as it costs only 2$ per plate.

In [23]:
token = token2[1]

In [24]:
type(token)

spacy.tokens.token.Token

In [25]:
token.is_alpha

True

In [26]:
new_token = nlp2('tony gave two $ to peter')

In [27]:
text = new_token[2]

In [28]:
text.like_num

True

In [29]:
text = new_token[3]

In [30]:
text.is_currency

True

In [31]:
text = ['Dayton high school, 8th grade students information\n',
 '==================================================\n',
 '\n',
 'Name\tbirth day   \temail\n',
 '-----\t------------\t------\n',
 'Virat   5 June, 1882    virat@kohli.com\n',
 'Maria\t12 April, 2001  maria@sharapova.com\n',
 'Serena  24 June, 1998   serena@williams.com \n',
 'Joe      1 May, 1997    joe@root.com\n',
 '\n',
 '\n',
 '\n']

In [32]:
text = ''.join(text)
text



In [33]:
doc = nlp2(text)
doc

Dayton high school, 8th grade students information

Name	birth day   	email
-----	------------	------
Virat   5 June, 1882    virat@kohli.com
Maria	12 April, 2001  maria@sharapova.com
Serena  24 June, 1998   serena@williams.com 
Joe      1 May, 1997    joe@root.com




In [34]:
# Gather the text of every token whose like_email flag is set.
email = [tok.text for tok in doc if tok.like_email]
email

['virat@kohli.com',
 'maria@sharapova.com',
 'serena@williams.com',
 'joe@root.com']

In [35]:
# Create a blank Hindi pipeline and tokenize the sentence with it.
# The sentence must go through nlp3; passing it to the English `nlp` object
# would leave the Hindi pipeline unused.
nlp3 = spacy.blank("hi")
doc3 = nlp3("भैया जी! 5000 ₹ उधार थे वो वापस देदो")

# Show each token next to its is_currency flag.
for token in doc3:
    print(token, token.is_currency)

भैया False
जी False
! False
5000 False
₹ True
उधार False
थे False
वो False
वापस False
देदो False


### Customizing tokenizer

In [36]:
doc = nlp("gimme double cheese extra large healthy pizza")

In [37]:
from spacy.symbols import ORTH

In [38]:
token = [token.text for token in doc]
token

['gimme', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [39]:
# Register a tokenizer special case so that "gimme" is emitted as the two
# tokens "gim" and "me" instead of a single token.
nlp.tokenizer.add_special_case('gimme', [{ORTH: 'gim'}, {ORTH: 'me'}])

# Tokenize the same sentence again and list the token texts to see the split.
doc = nlp("gimme double cheese extra large healthy pizza")
token = [tok.text for tok in doc]
token

['gim', 'me', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

### Sentence Tokenization or Segmentation

In [40]:
doc = nlp2('''To find dates using Spacy in Python, you need to use Spacy's entity recognition capabilities. Spacy provides a pre-trained statistical model that can identify entities like dates, names, locations, etc. You can use this model to extract dates from text.''')

In [41]:
# `doc` was produced by nlp2, a blank pipeline with no component that sets
# sentence boundaries, so iterating doc.sents raises ValueError [E030].
# The error is the point of this cell: catch it and print the message so the
# notebook still runs from top to bottom.
try:
    for sentence in doc.sents:
        print(sentence)
except ValueError as err:
    print(f'{type(err).__name__}: {err}')

ValueError: [E030] Sentence boundaries unset. You can add the 'sentencizer' component to the pipeline with: `nlp.add_pipe('sentencizer')`. Alternatively, add the dependency parser or sentence recognizer, or set sentence boundaries by setting `doc[i].is_sent_start`.

In [42]:
nlp2.pipeline

[]

In [43]:
nlp2.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x17eebc0fec0>

In [44]:
nlp2.pipeline

[('sentencizer', <spacy.pipeline.sentencizer.Sentencizer at 0x17eebc0fec0>)]

In [45]:
doc = nlp2('''To find dates using Spacy in Python, you need to use Spacy's entity recognition capabilities. Spacy provides a pre-trained statistical model that can identify entities like dates, names, locations, etc. You can use this model to extract dates from text.''')

In [46]:
# The sentencizer is now part of nlp2, so doc.sents can be iterated.
# Each item is a sentence, not a single token.
for sentence in doc.sents:
    print(sentence)

To find dates using Spacy in Python, you need to use Spacy's entity recognition capabilities.
Spacy provides a pre-trained statistical model that can identify entities like dates, names, locations, etc.
You can use this model to extract dates from text.


In [47]:
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''

# TODO: Write code here
# Hint: token has an attribute that can be used to detect a url

doc = nlp(text)

In [48]:
doc


Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.

In [49]:
# Keep the text of every token whose like_url flag is set.
url = [tok.text for tok in doc if tok.like_url]
url

['http://www.data.gov/',
 'http://www.science',
 'http://data.gov.uk/.',
 'http://www3.norc.org/gss+website/',
 'http://www.europeansocialsurvey.org/.']

In [54]:
# Sample sentence containing two amount/currency pairs ("two $" and "500 €").
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"

# Exercise (solved in the following cell): print each amount with its currency symbol.
# Hint: Use token.i for the index of a token and token.is_currency for currency symbol detection

# Tokenize the sentence with nlp2 and echo the Doc's text.
doc1 = nlp2(transactions)
doc1.text

'Tony gave two $ to Peter, Bruce gave 500 € to Steve'

In [57]:
# Print every number-like token together with the currency symbol that
# immediately follows it.
for token in doc1:
    # The final token has no successor; skip it so a sentence that ends in a
    # number does not index past the end of the Doc (IndexError).
    if token.i + 1 >= len(doc1):
        continue
    next_token = doc1[token.i + 1]
    if token.like_num and next_token.is_currency:
        print(token.text, next_token.text)

 

two $
500 €
