# NLTK Project

In [17]:
# Import necessary libraries
import re
import random
from collections import Counter
import nltk
from nltk import induce_pcfg
from nltk import pos_tag, ne_chunk
from nltk import PCFG
from nltk import tree
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.parse.generate import generate
# Fetch the NLTK data packages for this notebook. The log output below shows
# that nltk.download() reports a package as "already up-to-date" when it is
# present, so re-running this cell is harmless.
# NOTE(review): ne_chunk and the treebank corpus are not used in the visible
# cells -- confirm the 'maxent_ne_chunker' and 'treebank' downloads are needed.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('wordnet')
nltk.download('treebank')

[nltk_data] Downloading package punkt to /home/vincent/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/vincent/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/vincent/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vincent/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package treebank to /home/vincent/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

## Preprocessing the corpus

In [4]:
# Load the corpus; the context manager closes the file handle as soon as the
# text has been read (the handle was previously left open for the kernel's lifetime)
with open('corpus.txt') as f:
    raw = f.read()

# Collapse each pair of consecutive newlines into a single newline
raw = re.sub(r"\n\n", "\n", raw)

# Per line: strip the numbering, keep only the text before the first " - "
# (the philosopher, per the original comment), lower-case it and delete the
# characters . , : ;  Each cleaned line is stored as "\n" + text + two spaces.
# NOTE(review): the "." in the numbering pattern is unescaped and the pattern
# is not anchored, so any "digits + one character + space" run inside a quote
# is removed as well -- confirm against the corpus format before tightening it.
cleaned = []
for line in raw.split("\n"):
    quote = re.sub(r"[0-9]+. ", "", line).split(" - ")[0].lower()
    for mark in ('.', ',', ':', ';'):
        quote = quote.replace(mark, '')
    cleaned.append("\n" + quote + "  ")
corpus = "".join(cleaned)

# Normalize: lower-case, then turn every character that is not an ASCII
# letter or digit into a space
normalized = re.sub(r"[^a-zA-Z0-9]", " ", corpus.lower())

# Tokenize: words come from the normalized text; sentences are the pieces of
# `corpus` between "  \n" separators, with the first piece discarded
words = word_tokenize(normalized)
pieces = corpus.split("  \n")
sentences = pieces[1:]
print(sentences)

# Tag each word with part of speech
tags = pos_tag(words)

# Reduce words to their Porter stems and to their verb lemmas (pos='v').
# The stemmer and lemmatizer are built once and reused, instead of
# constructing a new object for every single word.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stemmed = [stemmer.stem(w) for w in words]
lemmed = [lemmatizer.lemmatize(w, pos='v') for w in words]

['never let the future disturb you you will meet it if you have to with the same weapons of reason which today arm you against the present', 'you should live in such a way that there is nothing which you could not as easily tell your enemy as keep to yourself', 'relentlessly prune bullshit dont wait to do things that matter and savor the time you have', 'success is based off of your willingness to work your ass off no matter what obstacles are in your way', 'problems only exist in the human mind', 'just keep in mind the more we value things outside our control the less control we have', 'live your life like youre the hero in your movie', 'keep your intention pure emotions will try to distract you so keep going thats the cure', 'if you make happiness your goal youll be disappointed if you make presence your goal youll be satisfied', 'make the mind tougher by exposing it to adversity', 'we should live with the conviction i wasnt born for one particular corner the whole worlds my home cou

## Corpus statistics

In [5]:
# Headline figures: token count, number of distinct verb-lemmatized forms,
# sentence count, and words per sentence truncated to an integer
word_total = len(words)
sentence_total = len(sentences)
print("Words:", word_total)
print("Vocabulary size:", len(set(lemmed)))
print("Sentences:", sentence_total)
print("Average sentence length:", int(word_total / sentence_total))

# Frequency distribution over the raw (un-lemmatized) tokens
fdist = nltk.FreqDist(words)

# The five most frequent tokens with their counts
print("5 Most used words:")
for token, count in fdist.most_common(5):
    print("   -", token, "(" + str(count) + ")")

# Hapaxes: tokens that occur exactly once in the corpus
hapaxes = sum(1 for count in fdist.values() if count == 1)
print("Hapaxes:", hapaxes)

Words: 1042
Vocabulary size: 404
Sentences: 65
Average sentence length: 16
5 Most used words:
   - the (51)
   - to (42)
   - is (39)
   - you (36)
   - it (23)
Hapaxes: 289


## Extract probabilistic lexicon based on tags

In [6]:
# Tag frequencies (the denominators) and (word, tag) pair frequencies
# (the numerators) over the whole tagged corpus
lhs = [tag for _word, tag in tags]
rhs = []
tagcount = Counter(lhs)
rulecount = Counter(tags)

# One "TAG -> 'word' [p]" rule per distinct (word, tag) pair, with
# p = count(word, tag) / count(tag); repeated pairs collapse in the set
lexiconset = sorted({
    tag + " -> '" + word + "' [" + str(rulecount[(word, tag)] / tagcount[tag]) + "]"
    for word, tag in tags
})

# Final lexicon text: every rule on its own line (each preceded by a newline),
# with any "$" in a rule written as "S"
lexicon = ''.join('\n' + re.sub(r"\$", "S", rule) for rule in lexiconset)

## Extract probabilistic phrase structure rules

In [9]:
# Build one "S -> TAG TAG ..." rule per sentence. Every sentence gets the same
# probability, derived from the actual number of sentences rather than a
# hard-coded 1/65, so the S rules still sum to 1 if the corpus changes.
n_sentences = len(sentences)
structure_rules = []
for sentence in sentences:
    # Tags are rewritten the same way as in the lexicon ("$" -> "S"); "." is
    # dropped from the tag names
    rhs = [tag.replace("$", "S").replace(".", "") for word, tag in pos_tag(word_tokenize(sentence))]
    structure_rules.append("S -> " + " ".join(rhs) + " [" + str(1 / n_sentences) + "]\n")

# NOTE(review): these tags come from tagging each sentence separately, while
# the lexicon was built from tagging the whole word list at once -- verify
# that every tag used here also has at least one lexical rule.
grammar = (
    "# Phrase structure rules\n\n"
    + "".join(structure_rules)
    + "\n\n\n# Lexical rules\n"
    + lexicon
)
print(grammar)

# Phrase structure rules

S -> RB VB DT NN NN PRP PRP MD VB PRP IN PRP VBP TO IN DT JJ NNS IN NN WDT NN IN PRP IN DT JJ [0.015384615384615385]
S -> PRP MD VB IN JJ DT NN IN EX VBZ NN WDT PRP MD RB IN RB VB PRPS NN IN NN TO VB [0.015384615384615385]
S -> RB JJ NN NN NN TO VB NNS IN NN CC VB DT NN PRP VBP [0.015384615384615385]
S -> NN VBZ VBN IN IN PRPS NN TO VB PRPS NN IN DT NN WP VBZ VBP IN PRPS NN [0.015384615384615385]
S -> NNS RB VBP IN DT JJ NN [0.015384615384615385]
S -> RB VB IN NN DT JJR PRP NN NNS IN PRPS NN DT JJR NN PRP VBP [0.015384615384615385]
S -> VB PRPS NN IN NN DT NN IN PRPS NN [0.015384615384615385]
S -> VB PRPS NN NN NNS MD VB TO VB PRP RB VB VBG IN DT NN [0.015384615384615385]
S -> IN PRP VBP VB PRPS NN NN VB VBN IN PRP VBP NN PRPS NN NN VB VBN [0.015384615384615385]
S -> VB DT NN NN IN VBG PRP TO VB [0.015384615384615385]
S -> PRP MD VB IN DT NN NN VBP VBN IN CD JJ NN DT JJ NNS PRPS NN NN [0.015384615384615385]
S -> DT JJR PRP VBP DT JJ DT JJR PRP MD VB JJ [0.0153

## Generating sentences

In [122]:
# Parse the grammar text built above (phrase structure rules followed by the
# lexical rules) into an NLTK PCFG object used by the functions below
pcfg = PCFG.fromstring(grammar)

# Select random structure
def pick_rhs(lhs):
    """Randomly pick one right-hand side for ``lhs``, weighted by rule probability.

    Parameters
    ----------
    lhs : Nonterminal
        Left-hand-side symbol whose productions in the module-level ``pcfg``
        are the candidates.

    Returns
    -------
    list
        The right-hand-side symbols of the chosen production.

    Raises
    ------
    IndexError
        If ``pcfg`` contains no production for ``lhs``. The exception type is
        the one ``random.choices`` raised for an empty candidate list; the
        message now names the offending symbol.
    """
    # Indexed lookup by left-hand side instead of scanning every production
    # of the grammar on each call
    candidates = pcfg.productions(lhs=lhs)
    if not candidates:
        raise IndexError("no productions in grammar for nonterminal %r" % (lhs,))
    weights = [rule.prob() for rule in candidates]
    chosen = random.choices(candidates, weights=weights)[0]
    return list(chosen.rhs())

# Generate sentences based on probability
def generate_senteneces(n):
    """Print ``n`` randomly generated sentences, one per line.

    For each sentence a tag sequence is sampled from the ``S`` rules via
    ``pick_rhs``; then, for every tag in that sequence, one word is sampled
    from that tag's rules.

    Parameters
    ----------
    n : int
        Number of sentences to print.
    """
    # Reach Nonterminal through the already-imported `nltk` package: the bare
    # name `Nonterminal` is not imported in the visible import cell, so the
    # previous code raised NameError on a fresh kernel.
    start = nltk.Nonterminal('S')
    for _ in range(n):
        generated = [pick_rhs(tag)[0] for tag in pick_rhs(start)]
        print(" ".join(generated))
    
# Seed the RNG so that "Restart Kernel -> Run All" reproduces the same sentences
random.seed(42)
generate_senteneces(10)
    

not first ears arrive by day will be alive by move
know poor as warriors nor unconquerable as it
himself is better intolerable if a thing to do grounded if any uncertainty of he has more to keep out to be not
sometimes keep it to be born the longer exposing of uncomfortable character relentlessly silently by your able warriors
competition has the wise wait and approval is the chief one
you must allow your corner silently in it makes your position than we must easily look self absurd only nor about
you lives immediately become who it seek but when you speak you
particular hopes should change we who arise weapons wise everlasting youll but satisfied by the nothing by change
not same emotions live in quality should change few if position
what we add shedding most is only what it most wither to be
