# Processing Raw Text

In [None]:
from __future__ import division
import nltk, re, pprint
from nltk import word_tokenize

# urlopen lives in urllib.request on Python 3 and directly in urllib on
# Python 2; try the Python 3 location first and fall back to the Python 2 one
# so this cell runs on either interpreter.
try:
    from urllib.request import urlopen
except ImportError:
    from urllib import urlopen

# Download the plain-text file from Project Gutenberg.
url = "http://www.gutenberg.org/files/2554/2554.txt"
# read() returns raw bytes; decode once at the I/O boundary so the following
# cells operate on text, matching how the BBC page is decoded further down.
# NOTE(review): assumes the file is served as UTF-8 -- confirm if decoding fails.
raw = urlopen(url).read().decode('utf8')
# Show the first 75 characters as a quick sanity check.
raw[:75]

## Tokenization 
Tokenization is the process of breaking text down into its basic units: words.


In [None]:
# Split the downloaded text into a list of tokens.
tokens = word_tokenize(raw)
# Peek at the first ten tokens.
tokens[:10]

## Using NLTK text object
You can convert a list of tokens into a `Text` object and use the methods it provides.

In [None]:
# Wrap the token list in an nltk.Text object.
text = nltk.Text(tokens)
# Ask the Text object for its collocations.
text.collocations()

## Using BeautifulSoup to extract relevant content

In [None]:
from bs4 import BeautifulSoup
# Download a BBC News page and decode the response bytes to text.
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
html = urlopen(url).read().decode('utf8')
# Name the parser explicitly: without it BeautifulSoup uses whichever parser
# happens to be installed, so the extracted text -- and therefore the
# hand-picked token offsets below -- could differ from one machine to another.
raw = BeautifulSoup(html, 'html.parser').get_text()
tokens = word_tokenize(raw)

# Keep only tokens 110..389 and build a Text object from them.
text = nltk.Text(tokens[110:390])  # manually filter menu
# Show every occurrence of 'gene' in context.
text.concordance('gene')

## Using FeedParser to access an Atom feed

In [None]:
import feedparser

# Parse the Atom feed and inspect its title and number of entries.
llog = feedparser.parse("http://languagelog.ldc.upenn.edu/nll/?feed=atom")
llog['feed']['title']
len(llog.entries)
# Take the third entry and look at its title and the start of its content.
post = llog.entries[2]
post.title
content = post.content[0].value
content[0:70]
# Strip the markup from the entry content before tokenizing. The parser is
# named explicitly so the result does not depend on which optional parsers
# are installed alongside BeautifulSoup.
word_tokenize(BeautifulSoup(content, 'html.parser').get_text())

## Working with Unicode
Python 3 source code is UTF-8 by default. The `codecs` module is not needed in Python 3, because the built-in `open` accepts an `encoding` argument.

In [None]:
import codecs

# Locate the Latin-2 encoded sample file inside the NLTK data directory.
path = nltk.data.find('corpora/unicode_samples/polish-lat2.txt')

# Open the file through a context manager so the handle is closed as soon as
# the loop finishes (the handle was previously left open).
with codecs.open(path, encoding='latin2') as f:
    for line in f:
        line = line.strip()
        # First the escaped form of the line, then the line itself.
        # A single parenthesised argument prints the same on Python 2 and 3.
        print(line.encode('unicode_escape'))
        print(line)

## Working with regular expressions 

In [None]:
import re
# Build the working vocabulary: every all-lowercase entry of the 'en' word list.
english_vocab = nltk.corpus.words.words('en')
wordlist = [entry for entry in english_vocab if entry.islower()]

In [None]:
# Find words ending in -ed: compile the pattern once, then scan the word list.
ed_suffix = re.compile('ed$')
[w for w in wordlist if ed_suffix.search(w)]

## Stemming

In [None]:
# Sample passage whose tokens feed the stemming and lemmatization cells.
raw = """DENNIS: Listen, strange women lying in ponds distributing swords
     is no basis for a system of government.  Supreme executive power derives from 
     a mandate from the masses, not from some farcical aquatic ceremony."""

# Tokenize the passage and show the resulting token list.
tokens = nltk.word_tokenize(raw)
print(tokens)

In [None]:
# Stem every token with the Porter stemmer and show the result.
porter = nltk.PorterStemmer()
stems = list(map(porter.stem, tokens))
print(stems)

In [None]:
# Stem every token with the Lancaster stemmer and show the result.
lancaster = nltk.LancasterStemmer()
stems = []
for token in tokens:
    stems.append(lancaster.stem(token))
print(stems)

## Lemmatization

In [None]:
# Lemmatize every token with the WordNet lemmatizer and show the result.
wnl = nltk.WordNetLemmatizer()
lemmas = list(map(wnl.lemmatize, tokens))
print(lemmas)

## Sentence Segmentation

In [None]:
# Raw text of 'chesterton-thursday.txt' from the Gutenberg corpus.
text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')

# Load the English Punkt tokenizer from NLTK data and split the text into
# sentences.
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sents = sent_tokenizer.tokenize(text)

# Pretty-print sentences 171 through 180.
pprint.pprint(sents[171:181])