# Tokenization

In [1]:
# Sample English passage reused by the tokenizer cells below. It contains
# typographic apostrophes (’) and an em dash (—), which the tokenizers
# demonstrated in this notebook handle differently.
text = """Here’s to the crazy ones, the misfits, the rebels, the troublemakers, 
the round pegs in the square holes. The ones who see things differently — they’re not fond of 
rules. You can quote them, disagree with them, glorify
or vilify them, but the only thing you can’t do is ignore them because they
change things. They push the human race forward, and while some may see them
as the crazy ones, we see genius, because the ones who are crazy enough to think
that they can change the world, are the ones who do."""

### .split()

Splits the text wherever whitespace is encountered, so punctuation stays attached to the neighbouring word (e.g. `'ones,'`, `'holes.'`).

In [2]:
# Word tokenization: str.split() with no arguments breaks on runs of whitespace.
whitespace_tokens = text.split()
print(whitespace_tokens)
# Naive sentence tokenization alternative (kept disabled):
# text.split('. ')

['Here’s', 'to', 'the', 'crazy', 'ones,', 'the', 'misfits,', 'the', 'rebels,', 'the', 'troublemakers,', 'the', 'round', 'pegs', 'in', 'the', 'square', 'holes.', 'The', 'ones', 'who', 'see', 'things', 'differently', '—', 'they’re', 'not', 'fond', 'of', 'rules.', 'You', 'can', 'quote', 'them,', 'disagree', 'with', 'them,', 'glorify', 'or', 'vilify', 'them,', 'but', 'the', 'only', 'thing', 'you', 'can’t', 'do', 'is', 'ignore', 'them', 'because', 'they', 'change', 'things.', 'They', 'push', 'the', 'human', 'race', 'forward,', 'and', 'while', 'some', 'may', 'see', 'them', 'as', 'the', 'crazy', 'ones,', 'we', 'see', 'genius,', 'because', 'the', 'ones', 'who', 'are', 'crazy', 'enough', 'to', 'think', 'that', 'they', 'can', 'change', 'the', 'world,', 'are', 'the', 'ones', 'who', 'do.']


### NLTK

NLTK treats each punctuation mark as a separate token.

In [3]:
from nltk.tokenize import word_tokenize

# Word tokenization with NLTK, applied to the sample passage.
nltk_tokens = word_tokenize(text)
print(nltk_tokens)
# Sentence tokenizer alternative (kept disabled):
# from nltk.tokenize import sent_tokenize
# sent_tokenize(text)

['Here', '’', 's', 'to', 'the', 'crazy', 'ones', ',', 'the', 'misfits', ',', 'the', 'rebels', ',', 'the', 'troublemakers', ',', 'the', 'round', 'pegs', 'in', 'the', 'square', 'holes', '.', 'The', 'ones', 'who', 'see', 'things', 'differently', '—', 'they', '’', 're', 'not', 'fond', 'of', 'rules', '.', 'You', 'can', 'quote', 'them', ',', 'disagree', 'with', 'them', ',', 'glorify', 'or', 'vilify', 'them', ',', 'but', 'the', 'only', 'thing', 'you', 'can', '’', 't', 'do', 'is', 'ignore', 'them', 'because', 'they', 'change', 'things', '.', 'They', 'push', 'the', 'human', 'race', 'forward', ',', 'and', 'while', 'some', 'may', 'see', 'them', 'as', 'the', 'crazy', 'ones', ',', 'we', 'see', 'genius', ',', 'because', 'the', 'ones', 'who', 'are', 'crazy', 'enough', 'to', 'think', 'that', 'they', 'can', 'change', 'the', 'world', ',', 'are', 'the', 'ones', 'who', 'do', '.']


### CountVectorizer - sklearn

The default regexp selects tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator).

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Two-document corpus: the long sample passage and a short second quote.
texts = ["""Here’s to the crazy ones, the misfits, the rebels, the troublemakers, 
            the round pegs in the square holes. The ones who see things differently — they’re not fond of 
            rules. You can quote them, disagree with them, glorify
            or vilify them, but the only thing you can’t do is ignore them because they
            change things. They push the human race forward, and while some may see them
            as the crazy ones, we see genius, because the ones who are crazy enough to think
            that they can change the world, are the ones who do.""",
         'I choose a lazy person to do a hard job. Because a lazy person will find an easy way to do it.']

# One row per document, labelled with an author key used as the matrix index below.
df = pd.DataFrame({'author': ['jobs', 'gates'], 'text': texts})

# Bag-of-words term counts, with scikit-learn's built-in English stop words removed.
cv = CountVectorizer(stop_words='english')
cv_matrix = cv.fit_transform(df['text'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases that lack it.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Document-term matrix: rows are authors, columns are vocabulary terms, values are counts.
df_dtm = pd.DataFrame(cv_matrix.toarray(), index=df['author'].values, columns=feature_names)
df_dtm

Unnamed: 0,change,choose,crazy,differently,disagree,easy,fond,forward,genius,glorify,...,round,rules,square,thing,things,think,troublemakers,vilify,way,world
jobs,2,0,3,1,1,0,1,1,1,1,...,1,1,1,1,2,1,1,1,0,1
gates,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


### Spacy

Tokenization in different languages (spaCy treats punctuation marks, and here also line breaks, as separate tokens).

In [5]:
from spacy.lang.es import Spanish

# Spanish language object from spaCy, used here only to tokenize the passage.
nlp = Spanish()

# Spanish sample passage to tokenize.
text_spanish = """Por los locos. Los marginados. Los rebeldes. Los problematicos. 
Los inadaptados. Los que ven las cosas de una manera distinta. A los que no les gustan
las reglas. Y a los que no respetan el “status quo”. Puedes citarlos, discrepar de ellos,
ensalzarlos o vilipendiarlos. Pero lo que no puedes hacer es ignorarlos… Porque ellos
cambian las cosas, empujan hacia adelante la raza humana y, aunque algunos puedan
considerarlos locos, nosotros vemos en ellos a genios. Porque las personas que están
lo bastante locas como para creer que pueden cambiar el mundo, son las que lo logran."""

# Run the passage through the pipeline, then collect the raw text of every token.
doc = nlp(text_spanish)
tokens = [piece.text for piece in doc]

print(tokens)

['Por', 'los', 'locos', '.', 'Los', 'marginados', '.', 'Los', 'rebeldes', '.', 'Los', 'problematicos', '.', '\n', 'Los', 'inadaptados', '.', 'Los', 'que', 'ven', 'las', 'cosas', 'de', 'una', 'manera', 'distinta', '.', 'A', 'los', 'que', 'no', 'les', 'gustan', '\n', 'las', 'reglas', '.', 'Y', 'a', 'los', 'que', 'no', 'respetan', 'el', '“', 'status', 'quo', '”', '.', 'Puedes', 'citarlos', ',', 'discrepar', 'de', 'ellos', ',', '\n', 'ensalzarlos', 'o', 'vilipendiarlos', '.', 'Pero', 'lo', 'que', 'no', 'puedes', 'hacer', 'es', 'ignorarlos', '…', 'Porque', 'ellos', '\n', 'cambian', 'las', 'cosas', ',', 'empujan', 'hacia', 'adelante', 'la', 'raza', 'humana', 'y', ',', 'aunque', 'algunos', 'puedan', '\n', 'considerarlos', 'locos', ',', 'nosotros', 'vemos', 'en', 'ellos', 'a', 'genios', '.', 'Porque', 'las', 'personas', 'que', 'están', '\n', 'lo', 'bastante', 'locas', 'como', 'para', 'creer', 'que', 'pueden', 'cambiar', 'el', 'mundo', ',', 'son', 'las', 'que', 'lo', 'logran', '.']


### Gensim

Gensim is quite strict with punctuation: it splits wherever a punctuation mark is encountered and drops the punctuation from the output.

In [6]:
from gensim.utils import tokenize

# Word tokenization with Gensim; wrap the result in list() so it can be printed in full.
gensim_tokens = list(tokenize(text))
print(gensim_tokens)

['Here', 's', 'to', 'the', 'crazy', 'ones', 'the', 'misfits', 'the', 'rebels', 'the', 'troublemakers', 'the', 'round', 'pegs', 'in', 'the', 'square', 'holes', 'The', 'ones', 'who', 'see', 'things', 'differently', 'they', 're', 'not', 'fond', 'of', 'rules', 'You', 'can', 'quote', 'them', 'disagree', 'with', 'them', 'glorify', 'or', 'vilify', 'them', 'but', 'the', 'only', 'thing', 'you', 'can', 't', 'do', 'is', 'ignore', 'them', 'because', 'they', 'change', 'things', 'They', 'push', 'the', 'human', 'race', 'forward', 'and', 'while', 'some', 'may', 'see', 'them', 'as', 'the', 'crazy', 'ones', 'we', 'see', 'genius', 'because', 'the', 'ones', 'who', 'are', 'crazy', 'enough', 'to', 'think', 'that', 'they', 'can', 'change', 'the', 'world', 'are', 'the', 'ones', 'who', 'do']
