# NLTK Project

In [17]:
# Import necessary libraries
import re
import nltk
from nltk import pos_tag, ne_chunk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('wordnet')
nltk.download('treebank')

[nltk_data] Downloading package punkt to /home/vincent/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/vincent/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/vincent/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vincent/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package treebank to /home/vincent/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


True

## Preprocessing the corpus

In [4]:
# Load the corpus
f = open('corpus.txt')
raw = f.read()

# Remove empty lines
raw = re.sub(r"\n\n", "\n", raw)

# Remove number and philosopther
corpus = ""
for line in raw.split("\n"):
    # Replace all . within the quote with a ,
    
    corpus = corpus + "\n" + re.sub(r"[0-9]+. ", "", line).split(" - ")[0]
#print(corpus)

# Normalize
normalized = corpus.lower()
normalized = re.sub(r"[^a-zA-Z0-9]", " ", normalized)

# Tokenize
words = word_tokenize(normalized)
sentences = sent_tokenize(corpus)

# Tag each word with part of speech
tags = pos_tag(words)

# Tokenize, pos tag, then recognize named entities in text
tree = ne_chunk(pos_tag(word_tokenize(normalized)))

# Reduce words to their stems
stemmed = [PorterStemmer().stem(w) for w in words]
lemmed = [WordNetLemmatizer().lemmatize(w, pos='v') for w in words]

## Corpus statistics

In [5]:
# Some basic statistic
print("Words:", len(words))
print("Vocabulary size:", len(set(lemmed)))
print("Sentences:", len(sentences))
print("Average sentence length:", int(len(words)/len(sentences)))

fdist = nltk.FreqDist(words)

# Count and print most used words
print("5 Most used words:")
for (word, n) in fdist.most_common(5):
    print("   -", word, "(" + str(n) + ")")

# Count and print amount of hapaxes
hapaxes = 0
for (word, n) in fdist.items():
    if n == 1:
        hapaxes = hapaxes + 1
print("Hapaxes:", hapaxes)

Words: 1060
Vocabulary size: 402
Sentences: 83
Average sentence length: 12
5 Most used words:
   - the (51)
   - to (42)
   - you (40)
   - is (39)
   - it (24)
Hapaxes: 286


## Extracting sentence structures

In [20]:
from nltk import tree

ruleset = set(rule for tree in nltk.corpus.treebank.parsed_sents()[:10] for rule in tree.productions())
for rule in ruleset:
        print(rule)
trees = map(lambda s: tree.Tree.fromstring(s), structure)
rules = []


"""
# Extract the CFG rules (productions) for the sentence
for tree in trees:
    for production in tree.productions():
        rules.append(production)
print(rules)"""

VBZ -> 'is'
PRT -> RP
SBAR -> -NONE- S
CD -> '55'
JJ -> 'likely'
VP -> MD VP
S -> NP-SBJ VP
NP -> PRP$ NN NN NNS
VBN -> 'used'
JJ -> 'British'
PP-CLR -> TO NP
NP-SBJ -> DT
NNP -> 'Mr.'
VBP -> 'show'
NP -> DT NN NN
NP -> NNS
NP-SBJ -> NNS
NP -> DT JJ NNS
VBZ -> 'enters'
NNS -> 'findings'
-NONE- -> '*'
NP -> NN
NP-SBJ -> NN
NP -> CD NNS
VB -> 'make'
ADJP-PRD -> RB JJ
NNS -> 'lungs'
NP -> QP NNS
JJ -> 'preliminary'
NN -> 'group'
NN -> 'year'
NN -> 'spokewoman'
ADVP-TMP -> RB
DT -> 'no'
S-TPC-2 -> NP-SBJ VP
NP -> NP NNP NNP NNP
'' -> "''"
VP -> VBN NP S-CLR
ADVP-TMP -> NP IN
IN -> 'in'
IN -> 'with'
VP -> VBZ NP-PRD PP-LOC ADVP-TMP
NP -> NP PP
NN -> 'problem'
VP -> VBG PP-CLR
VBD -> 'was'
JJ -> 'former'
NN -> 'unit'
QP -> RBR IN CD
S -> S-TPC-2 , NP-SBJ VP .
IN -> 'of'
IN -> 'before'
IN -> 'among'
PP-LOC -> IN NP
WDT -> 'that'
DT -> 'a'
NN -> 'cigarette'
VP -> VBZ NP
NP-SBJ -> DT NNP NN
S -> S-TPC-1 , NP-SBJ VP .
UCP -> ADJP CC NP
VBZ -> 'has'
RRC -> ADVP-TMP VP
NNS -> 'exposures'
DT -> 'an

'\n# Extract the CFG rules (productions) for the sentence\nfor tree in trees:\n    for production in tree.productions():\n        rules.append(production)\nprint(rules)'

## Creating a PCFG

In [None]:
from nltk import induce_pcfg

S = Nonterminal('S')
grammar_PCFG = induce_pcfg(S, treeData_rules)
print(grammar_PCFG)

## Generating sentences