# Processing Raw Text

In [12]:
from __future__ import division
import nltk, re, pprint
from nltk import word_tokenize

from contextlib import closing
from urllib import urlopen
# Download the plain-text edition of "Crime and Punishment" from Project Gutenberg.
url = "http://www.gutenberg.org/files/2554/2554.txt"
# closing() guarantees the HTTP response is released even if read() raises;
# the object returned by Python 2's urlopen() is not itself a context manager.
with closing(urlopen(url)) as response:
    raw = response.read()
# Peek at the first 75 characters to confirm the download looks right.
raw[:75]

'The Project Gutenberg EBook of Crime and Punishment, by Fyodor Dostoevsky\r\n'

## Tokenization 
Tokenization is the process of breaking text down into its basic units (tokens), such as words and punctuation marks.


In [17]:
# Split the downloaded text into a flat list of tokens.
tokens = word_tokenize(raw)
# Display the first ten tokens as a sanity check.
tokens[:10]

['The',
 'Project',
 'Gutenberg',
 'EBook',
 'of',
 'Crime',
 'and',
 'Punishment',
 ',',
 'by']

## Using NLTK text object
You can wrap a list of tokens in an NLTK `Text` object and use the methods it provides.

In [22]:
# Wrap the token list in an nltk.Text so its exploration helpers are available.
text = nltk.Text(tokens)
# Report the collocations found in the text.
text.collocations()

Katerina Ivanovna; Pyotr Petrovitch; Pulcheria Alexandrovna; Avdotya
Romanovna; Rodion Romanovitch; Marfa Petrovna; Sofya Semyonovna; old
woman; Project Gutenberg-tm; Porfiry Petrovitch; Amalia Ivanovna;
great deal; Nikodim Fomitch; young man; Ilya Petrovitch; n't know;
Project Gutenberg; Dmitri Prokofitch; Andrey Semyonovitch; Hay Market


## Using BeautifulSoup to extract relevant content

In [31]:
from bs4 import BeautifulSoup
# Fetch a BBC News article and reduce its HTML to plain text.
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
# closing() releases the HTTP response even if read() or decode() raises.
with closing(urlopen(url)) as response:
    html = response.read().decode('utf8')
# Name the parser explicitly: without it BeautifulSoup uses whichever parser
# happens to be installed, so the extracted text can differ between machines.
raw = BeautifulSoup(html, 'html.parser').get_text()
tokens = word_tokenize(raw)

# The slice bounds were picked by hand to drop the navigation menu; they are
# tied to this page's layout and must be re-tuned if the page changes.
text = nltk.Text(tokens[110:390])
# Show every occurrence of 'gene' with its surrounding context.
text.concordance('gene')

Displaying 5 of 5 matches:
hey say too few people now carry the gene for blondes to last beyond the next 
blonde hair is caused by a recessive gene . In order for a child to have blond
 have blonde hair , it must have the gene on both sides of the family in the g
ere is a disadvantage of having that gene or by chance . They do n't disappear
des would disappear is if having the gene was a disadvantage and I do not thin


## Using FeedParser to access an Atom feed

In [43]:
import feedparser

# Download and parse the Language Log Atom feed.
llog = feedparser.parse("http://languagelog.ldc.upenn.edu/nll/?feed=atom")
# Only the last expression of a cell is displayed; the bare expressions below
# are kept as pointers to the fields worth inspecting interactively.
llog['feed']['title']
len(llog.entries)
# Work with the third entry of the feed.
post = llog.entries[2]
post.title
content = post.content[0].value
content[0:70]
# The entry body is treated as HTML: strip the markup before tokenizing.
# The parser is named explicitly so the result does not depend on which
# optional parsers are installed alongside BeautifulSoup.
word_tokenize(BeautifulSoup(content, 'html.parser').get_text())

[u'Jessica',
 u'Firger',
 u',',
 u'``',
 u'First',
 u'human',
 u'head',
 u'transplant',
 u'two',
 u'years',
 u'away',
 u',',
 u'says',
 u'one',
 u'surgeon',
 u"''",
 u',',
 u'CBS',
 u'News',
 u'2/26/2015',
 u':',
 u'Most',
 u'people',
 u'ca',
 u"n't",
 u'wrap',
 u'their',
 u'head',
 u'around',
 u'the',
 u'concept',
 u'.',
 u'But',
 u'one',
 u'scientist',
 u'believes',
 u'head',
 u'transplants',
 u'in',
 u'humans',
 u'are',
 u'possible',
 u'and',
 u'that',
 u'the',
 u'first',
 u'could',
 u'occur',
 u'as',
 u'early',
 u'as',
 u'2017',
 u'.',
 u'In',
 u'expressions',
 u'of',
 u'the',
 u'form',
 u'X',
 u'transplant',
 u',',
 u'for',
 u'X=kidney',
 u',',
 u'heart',
 u',',
 u'etc.',
 u',',
 u'the',
 u'X',
 u'comes',
 u'from',
 u'a',
 u'donor',
 u',',
 u'and',
 u'is',
 u'installed',
 u'in',
 u'or',
 u'on',
 u'a',
 u'recipient',
 u'.',
 u'If',
 u'Kim',
 u'and',
 u'Leslie',
 u'get',
 u'kidney',
 u'transplants',
 u',',
 u'their',
 u'identities',
 u'remain',
 u'the',
 u'same',
 u'from',
 u'both',

## Working with Unicode
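Python 3 source code is UTF-8 by default, and its built-in `open()` accepts an `encoding` argument, so `codecs` is not needed there. This notebook runs on Python 2, so the cell below uses `codecs.open` to decode the file while reading.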

In [53]:
import codecs

path = nltk.data.find('corpora/unicode_samples/polish-lat2.txt')
f = codecs.open(path, encoding='latin2')

for line in f:
    line = line.strip()
    print line.encode('unicode_escape')
    print line

Pruska Biblioteka Pa\u0144stwowa. Jej dawne zbiory znane pod nazw\u0105
Pruska Biblioteka Państwowa. Jej dawne zbiory znane pod nazwą
"Berlinka" to skarb kultury i sztuki niemieckiej. Przewiezione przez
"Berlinka" to skarb kultury i sztuki niemieckiej. Przewiezione przez
Niemc\xf3w pod koniec II wojny \u015bwiatowej na Dolny \u015al\u0105sk, zosta\u0142y
Niemców pod koniec II wojny światowej na Dolny Śląsk, zostały
odnalezione po 1945 r. na terytorium Polski. Trafi\u0142y do Biblioteki
odnalezione po 1945 r. na terytorium Polski. Trafiły do Biblioteki
Jagiello\u0144skiej w Krakowie, obejmuj\u0105 ponad 500 tys. zabytkowych
Jagiellońskiej w Krakowie, obejmują ponad 500 tys. zabytkowych
archiwali\xf3w, m.in. manuskrypty Goethego, Mozarta, Beethovena, Bacha.
archiwaliów, m.in. manuskrypty Goethego, Mozarta, Beethovena, Bacha.


## Working with regular expressions 

In [55]:
import re
# Keep only the all-lowercase entries of the 'en' word list.
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]

In [57]:
# '$' anchors the match at the end of the string, so only a trailing "ed" counts.
[w for w in wordlist if re.search('ed$',w)] # Find words ending in -ed

[u'abaissed',
 u'abandoned',
 u'abased',
 u'abashed',
 u'abatised',
 u'abed',
 u'aborted',
 u'abridged',
 u'abscessed',
 u'absconded',
 u'absorbed',
 u'abstracted',
 u'abstricted',
 u'accelerated',
 u'accepted',
 u'accidented',
 u'accoladed',
 u'accolated',
 u'accomplished',
 u'accosted',
 u'accredited',
 u'accursed',
 u'accused',
 u'accustomed',
 u'acetated',
 u'acheweed',
 u'aciculated',
 u'aciliated',
 u'acknowledged',
 u'acorned',
 u'acquainted',
 u'acquired',
 u'acquisited',
 u'acred',
 u'aculeated',
 u'addebted',
 u'added',
 u'addicted',
 u'addlebrained',
 u'addleheaded',
 u'addlepated',
 u'addorsed',
 u'adempted',
 u'adfected',
 u'adjoined',
 u'admired',
 u'admitted',
 u'adnexed',
 u'adopted',
 u'adossed',
 u'adreamed',
 u'adscripted',
 u'aduncated',
 u'advanced',
 u'advised',
 u'aeried',
 u'aethered',
 u'afeared',
 u'affected',
 u'affectioned',
 u'affined',
 u'afflicted',
 u'affricated',
 u'affrighted',
 u'affronted',
 u'aforenamed',
 u'afterfeed',
 u'aftershafted',
 u'aftertho

# Stemming

In [61]:
raw = """DENNIS: Listen, strange women lying in ponds distributing swords
     is no basis for a system of government.  Supreme executive power derives from 
     a mandate from the masses, not from some farcical aquatic ceremony."""
    
tokens = nltk.word_tokenize(raw)
print tokens

['DENNIS', ':', 'Listen', ',', 'strange', 'women', 'lying', 'in', 'ponds', 'distributing', 'swords', 'is', 'no', 'basis', 'for', 'a', 'system', 'of', 'government', '.', 'Supreme', 'executive', 'power', 'derives', 'from', 'a', 'mandate', 'from', 'the', 'masses', ',', 'not', 'from', 'some', 'farcical', 'aquatic', 'ceremony', '.']


In [66]:
# Stem every token of the sample passage with the Porter stemmer.
porter = nltk.PorterStemmer()
stems = [porter.stem(t) for t in tokens]
print stems

[u'DENNI', u':', u'Listen', u',', u'strang', u'women', u'lie', u'in', u'pond', u'distribut', u'sword', u'is', u'no', u'basi', u'for', u'a', u'system', u'of', u'govern', u'.', u'Suprem', u'execut', u'power', u'deriv', u'from', u'a', u'mandat', u'from', u'the', u'mass', u',', u'not', u'from', u'some', u'farcic', u'aquat', u'ceremoni', u'.']


In [70]:
# Stem the same tokens with the Lancaster stemmer for comparison.
lancaster = nltk.LancasterStemmer()
# Note: this rebinds `stems`, replacing the Porter result from the previous cell.
stems = [lancaster.stem(t) for t in tokens]
print stems

['den', ':', 'list', ',', 'strange', 'wom', 'lying', 'in', 'pond', 'distribut', 'sword', 'is', 'no', 'bas', 'for', 'a', 'system', 'of', 'govern', '.', 'suprem', 'execut', 'pow', 'der', 'from', 'a', 'mand', 'from', 'the', 'mass', ',', 'not', 'from', 'som', 'farc', 'aqu', 'ceremony', '.']


## Lemmatization

In [73]:
# Lemmatize the same tokens with the WordNet lemmatizer.
wnl = nltk.WordNetLemmatizer()
# lemmatize() is called without a part-of-speech argument.
# NOTE(review): presumably tokens are then treated as nouns, which would explain
# why verb forms come back unchanged in the output — confirm in the NLTK docs.
lemmas = [wnl.lemmatize(t) for t in tokens]
print lemmas

['DENNIS', ':', 'Listen', ',', 'strange', u'woman', 'lying', 'in', u'pond', 'distributing', u'sword', 'is', 'no', 'basis', 'for', 'a', 'system', 'of', 'government', '.', 'Supreme', 'executive', 'power', 'derives', 'from', 'a', 'mandate', 'from', 'the', u'mass', ',', 'not', 'from', 'some', 'farcical', 'aquatic', 'ceremony', '.']


## Sentence Segmentation

In [76]:
# Load the English Punkt sentence tokenizer from the NLTK data directory.
# NOTE(review): this path points at a pickled model; newer NLTK releases are
# reported to have moved away from pickled Punkt data — confirm it still loads
# on the installed version (nltk.sent_tokenize is the higher-level alternative).
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Note: `text` is rebound here to a raw string; earlier cells used it for nltk.Text objects.
text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')
# Split the raw text into a list of sentence strings.
sents = sent_tokenizer.tokenize(text)
# Pretty-print ten consecutive sentences from the middle of the book.
pprint.pprint(sents[171:181])

[u'In the wild events which were to follow this girl had no\npart at all; he never saw her again until all his tale was over.',
 u'And yet, in some indescribable way, she kept recurring like a\nmotive in music through all his mad adventures afterwards, and the\nglory of her strange hair ran like a red thread through those dark\nand ill-drawn tapestries of the night.',
 u'For what followed was so\nimprobable, that it might well have been a dream.',
 u'When Syme went out into the starlit street, he found it for the\nmoment empty.',
 u'Then he realised (in some odd way) that the silence\nwas rather a living silence than a dead one.',
 u'Directly outside the\ndoor stood a street lamp, whose gleam gilded the leaves of the tree\nthat bent out over the fence behind him.',
 u'About a foot from the\nlamp-post stood a figure almost as rigid and motionless as the\nlamp-post itself.',
 u'The tall hat and long frock coat were black; the\nface, in an abrupt shadow, was almost as dark.',
 u'Only a fr