# NLTK Project

In [105]:
# Import necessary libraries
import re
import nltk
from nltk import pos_tag, ne_chunk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
# Fetch the NLTK resources used by the cells below. Packages that are
# already present are reported as up to date rather than downloaded again.
nltk.download('punkt')                       # word_tokenize / sent_tokenize
nltk.download('averaged_perceptron_tagger')  # pos_tag
nltk.download('maxent_ne_chunker')           # ne_chunk model
nltk.download('words')                       # word list that ne_chunk needs alongside its model
nltk.download('wordnet')                     # WordNetLemmatizer

[nltk_data] Downloading package punkt to /home/vincent/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/vincent/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/vincent/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vincent/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Preprocessing the corpus

In [106]:
# Load the corpus. The context manager closes the file handle as soon as the
# text has been read instead of leaving it open for the kernel's lifetime.
with open('corpus.txt') as f:
    corpus = f.read()

# Normalize: lower-case the text, then replace every character that is not
# an ASCII letter or digit with a space (this removes all punctuation).
normalized = corpus.lower()
normalized = re.sub(r"[^a-zA-Z0-9]", " ", normalized)

# Tokenize: words come from the normalized text, sentences from the raw
# corpus, which still contains its punctuation.
words = word_tokenize(normalized)
sentences = sent_tokenize(corpus)

# Tag each word with part of speech
tags = pos_tag(words)

# Recognize named entities. The POS tags computed above are reused rather
# than tokenizing and tagging the same normalized text a second time.
# NOTE(review): the input here is lower-cased; named-entity recognition
# presumably works better on the original casing - confirm before relying
# on `tree`.
tree = ne_chunk(tags)

# Reduce words to their stems and (verb) lemmas. One stemmer and one
# lemmatizer are created up front instead of one new instance per word.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stemmed = [stemmer.stem(w) for w in words]
lemmed = [lemmatizer.lemmatize(w, pos='v') for w in words]

## Corpus statistics

In [131]:
# Some basic statistics about the preprocessed corpus
print("Words:", len(words))
# Number of distinct lemmatized forms (size of the set of lemmas)
print("Lemmanized words:", len(set(lemmed)))
print("Sentences:", len(sentences))

# Floor division gives the same integer as int(a / b) for these
# non-negative counts; fall back to 0 when there are no sentences so an
# empty corpus does not raise ZeroDivisionError.
avg_sentence_length = len(words) // len(sentences) if sentences else 0
print("Average sentence length:", avg_sentence_length)

# Frequency distribution over the normalized word tokens
fdist = nltk.FreqDist(words)

# Print the most used words together with their counts
print("5 Most used words:")
for (word, n) in fdist.most_common(5):
    print("   -", word, "(" + str(n) + ")")

# Hapaxes are words that occur exactly once; FreqDist exposes them directly,
# so no manual counting loop is needed.
hapaxes = len(fdist.hapaxes())
print("Hapaxes:", hapaxes)

Words: 1078
Lemmanized words: 383
Sentences: 33
Average sentence length: 32
5 Most used words:
   - and (59)
   - the (53)
   - of (41)
   - to (33)
   - is (29)
Hapaxes: 284
