## Corpora, Tokens, y Types

In [1]:
import spacy
# Display the installed spaCy version so the recorded outputs can be tied to it.
spacy.__version__

'2.1.6'

In [2]:
# Load the English pipeline and tokenize a lower-cased sentence with it.
nlp = spacy.load('en')
text = "Mary, don't slap the green witch"
lowered_doc = nlp(text.lower())
# Convert every spaCy token to its plain string form for display.
list(map(str, lowered_doc))

['mary', ',', 'do', "n't", 'slap', 'the', 'green', 'witch']

In [3]:
from nltk.tokenize import TweetTokenizer

In [4]:
# Tokenize a lower-cased tweet; the recorded output keeps the hashtag,
# the @-mention and the emoticon as single tokens.
tokenizer = TweetTokenizer()
tweet = u"Snow White and the Seven Degrees #MakeAMovieCold@midnight:-)"
lowercased_tweet = tweet.lower()
tokenizer.tokenize(lowercased_tweet)

['snow',
 'white',
 'and',
 'the',
 'seven',
 'degrees',
 '#makeamoviecold',
 '@midnight',
 ':-)']

## Unigrams, Bigrams, Trigrams, ..., N-grams

In [5]:
def n_grams(text, n):
    """Return every contiguous n-gram of ``text`` as a list of slices.

    Args:
        text: A sliceable sequence, e.g. a list of tokens or a plain string.
        n: Size of each n-gram; must be at least 1.

    Returns:
        A list containing ``text[i:i+n]`` for each start index ``i``, in
        order. Each item has whatever type slicing ``text`` produces (a list
        for a list of tokens, a string for a string). The list is empty when
        ``n`` is larger than ``len(text)``.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    # Without this guard n == 0 yields len(text) + 1 empty slices and a
    # negative n yields arbitrary truncated slices, neither of which is an n-gram.
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    return [text[i:i + n] for i in range(len(text) - n + 1)]


# Build the trigrams of a small, already-tokenized example sentence.
cleaned = ["mary", ",", "n't", "slap", "green", "witch", "."]
n_grams(cleaned, n=3)

[['mary', ',', "n't"],
 [',', "n't", 'slap'],
 ["n't", 'slap', 'green'],
 ['slap', 'green', 'witch'],
 ['green', 'witch', '.']]

## Lemmas y Stems

In [6]:
# Reuse the `nlp` pipeline loaded in an earlier cell; loading the model
# again from disk is slow and yields the same pipeline.
doc = nlp(u"he was running late")
# Print each token next to its lemma.
for token in doc:
    print(f"{token} --> {token.lemma_}")

he --> -PRON-
was --> be
running --> run
late --> late


## Categorizando palabras: POS Tagging

In [7]:
# Reuse the `nlp` pipeline loaded in an earlier cell; loading the model
# again from disk is slow and yields the same pipeline.
doc = nlp(u"Mary slapped the green witch.")
# Print each token next to its part-of-speech tag.
for token in doc:
    print(f"{token} - {token.pos_}")

Mary - PROPN
slapped - VERB
the - DET
green - ADJ
witch - NOUN
. - PUNCT


## Categorizando spans: Chunking y NER

In [8]:
# Reuse the `nlp` pipeline loaded in an earlier cell; loading the model
# again from disk is slow and yields the same pipeline.
doc = nlp(u"Mary slapped the green witch.")
# Print each noun chunk of the sentence next to its label.
for chunk in doc.noun_chunks:
    print(f"{chunk} - {chunk.label_}")

Mary - NP
the green witch - NP


In [9]:
import spacy
from spacy import displacy
from collections import Counter

In [10]:
# Reuse the `nlp` pipeline loaded in an earlier cell; loading the model
# again from disk is slow and yields the same pipeline.
# Adjacent string literals are concatenated, so the sentence is unchanged.
doc = nlp(
    'European authorities fined Google a record $5.1 billion on Wednesday '
    'for abusing its power in the mobile phone market '
    'and ordered the company to alter its practices'
)
# List every named entity found in the sentence together with its label.
[(ent.text, ent.label_) for ent in doc.ents]

[('European', 'NORP'),
 ('Google', 'ORG'),
 ('$5.1 billion', 'MONEY'),
 ('Wednesday', 'DATE')]

In [11]:
# Visualize the named entities of `doc` inline in the notebook.
displacy.render(doc, style='ent', jupyter=True)

In [12]:
# Visualize the dependency parse of `doc` inline in the notebook, passing a
# custom `distance` option to the renderer.
displacy.render(
    doc,
    style='dep',
    jupyter=True,
    options={"distance": 120},
)