# NLTK Project

In [2]:
# Import necessary libraries
import re
import nltk
from nltk import pos_tag, ne_chunk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/vincent/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/vincent/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/vincent/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vincent/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Preprocessing the corpus

In [8]:
# Load the corpus
f = open('corpus.txt')
raw = f.read()

# Remove empty lines
raw = re.sub(r"\n\n", "\n", raw)

# Remove pholosopher


# Remove number and philosopther
corpus = ""
for line in raw.split("\n"):
    # Replace all . within the quote with a ,
    
    corpus = corpus + "\n" + re.sub(r"[0-9]+. ", "", line).split(" - ")[0]
print(corpus)

# Normalize
normalized = corpus.lower()
normalized = re.sub(r"[^a-zA-Z0-9]", " ", normalized)

# Tokenize
words = word_tokenize(normalized)
sentences = sent_tokenize(corpus)

# Tag each word with part of speech
tags = pos_tag(words)

# Tokenize, pos tag, then recognize named entities in text
tree = ne_chunk(pos_tag(word_tokenize(normalized)))

# Reduce words to their stems
stemmed = [PorterStemmer().stem(w) for w in words]
lemmed = [WordNetLemmatizer().lemmatize(w, pos='v') for w in words]


Never let the future disturb you. You will meet it, if you have to, with the same weapons of reason which today arm you against the present.
You should … live in such a way that there is nothing which you could not as easily tell your enemy as keep to yourself.
Relentlessly prune bullshit, don’t wait to do things that matter, and savor the time you have.
Success is based off of your willingness to work your ass off no matter what obstacles are in your way.
Problems only exist in the human mind.
Just keep in mind: the more we value things outside our control, the less control we have.
Live your life like you’re the hero in your movie.
Keep your intention pure. Emotions will try to distract you. So keep going. That’s the cure.
If you make happiness your goal, you’ll be disappointed. If you make presence your goal, you’ll be satisfied.
Make the mind tougher by exposing it to adversity.
We should live with the conviction: 'I wasn't born for one particular corner: the whole world‟s my home

## Corpus statistics

In [4]:
# Some basic statistic
print("Words:", len(words))
print("Vocabulary size:", len(set(lemmed)))
print("Sentences:", len(sentences))
print("Average sentence length:", int(len(words)/len(sentences)))

fdist = nltk.FreqDist(words)

# Count and print most used words
print("5 Most used words:")
for (word, n) in fdist.most_common(5):
    print("   -", word, "(" + str(n) + ")")

# Count and print amount of hapaxes
hapaxes = 0
for (word, n) in fdist.items():
    if n == 1:
        hapaxes = hapaxes + 1
print("Hapaxes:", hapaxes)

Words: 1060
Vocabulary size: 402
Sentences: 83
Average sentence length: 12
5 Most used words:
   - the (51)
   - to (42)
   - you (40)
   - is (39)
   - it (24)
Hapaxes: 286
