# NLTK Project

In [38]:
# Import necessary libraries
import re
from collections import Counter
import nltk
from nltk import pos_tag, ne_chunk
from nltk import PCFG
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.parse.generate import generate
# Fetch the NLTK data packages used by the cells below. A package that is
# already installed is only reported as up-to-date.
nltk.download('punkt')                       # tokenizer models (word_tokenize / sent_tokenize)
nltk.download('averaged_perceptron_tagger')  # model behind pos_tag
nltk.download('maxent_ne_chunker')           # model behind ne_chunk
nltk.download('words')                       # word list the NE chunker looks up; without it ne_chunk raises LookupError
nltk.download('wordnet')                     # dictionary used by WordNetLemmatizer
nltk.download('treebank')                    # not referenced by the visible cells; kept as in the original

[nltk_data] Downloading package punkt to /home/vincent/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/vincent/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/vincent/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vincent/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package treebank to /home/vincent/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

## Preprocessing the corpus

In [9]:
# Load the corpus; the context manager closes the file as soon as it is read
with open('corpus.txt', encoding='utf-8') as f:
    raw = f.read()

# Remove empty lines (collapse any run of consecutive newlines into one)
raw = re.sub(r"\n+", "\n", raw)

# Remove the leading number and the trailing philosopher from every line.
# The number pattern is anchored to the start of the line and its dot is
# escaped, so digits that occur inside the quote itself are left untouched.
# Every cleaned line is prefixed with a newline, as before.
corpus = "".join(
    "\n" + re.sub(r"^\s*[0-9]+\.\s+", "", line).split(" - ")[0]
    for line in raw.split("\n")
)

# Normalize: lowercase, then turn every non-alphanumeric character into a space
normalized = corpus.lower()
normalized = re.sub(r"[^a-zA-Z0-9]", " ", normalized)

# Tokenize (words come from the normalized text, sentences from the cleaned
# but still punctuated text)
words = word_tokenize(normalized)
sentences = sent_tokenize(corpus)

# Tag each word with part of speech
tags = pos_tag(words)

# Recognize named entities; reuse the tags computed above instead of
# tokenizing and tagging the same text a second time
tree = ne_chunk(tags)

# Reduce words to their stems / lemmas. The stemmer and lemmatizer are created
# once rather than once per word. Every word is lemmatized as a verb (pos='v').
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stemmed = [stemmer.stem(w) for w in words]
lemmed = [lemmatizer.lemmatize(w, pos='v') for w in words]

## Corpus statistics

In [3]:
# Some basic statistics
token_total = len(words)
sentence_total = len(sentences)
print("Words:", token_total)
print("Vocabulary size:", len(set(lemmed)))  # distinct lemmas, not distinct raw tokens
print("Sentences:", sentence_total)
print("Average sentence length:", int(token_total / sentence_total))

# Frequency of every token in the normalized text
fdist = nltk.FreqDist(words)

# The five most frequent tokens with their counts
print("5 Most used words:")
for (token, count) in fdist.most_common(5):
    print("   -", token, "(" + str(count) + ")")

# Hapaxes are the tokens that occur exactly once
hapaxes = sum(1 for count in fdist.values() if count == 1)
print("Hapaxes:", hapaxes)

Words: 1054
Vocabulary size: 404
Sentences: 83
Average sentence length: 12
5 Most used words:
   - the (51)
   - to (42)
   - you (40)
   - is (39)
   - it (24)
Hapaxes: 288


## Extract probabilistic lexicon based on tags

In [57]:
# Build one lexical rule per distinct (word, tag) pair, in the textual form
#     TAG -> 'word' [p]
# where p = count(word, tag) / count(tag).
lhs = [tag for _, tag in tags]
rhs = []  # never filled in; kept only so the name stays defined

tagcount = Counter(lhs)     # how often each tag occurs
rulecount = Counter(tags)   # how often each (word, tag) pair occurs

# Iterating over the distinct pairs yields each rule exactly once, so no
# de-duplication step is needed. Probabilities are written in fixed-point
# notation: str() of a float below 1e-4 switches to scientific notation
# (e.g. '5e-05'), which the bracketed probability syntax of PCFG.fromstring
# does not accept.
lexiconset = sorted(
    tag + " -> '" + word + "' [" + format(count / tagcount[tag], ".12f") + "]"
    for (word, tag), count in rulecount.items()
)

# Join the rules, one per line (each preceded by a newline, as before).
# '$' is rewritten to 'S' (e.g. PRP$ -> PRPS), presumably because '$' is not
# accepted in nonterminal names by the grammar parser -- TODO confirm.
lexicon = "".join("\n" + rule.replace("$", "S") for rule in lexiconset)

## Creating a PCFG

In [58]:

# The grammar text is just the lexical rules built above; no further
# productions are added in this cell.
grammar = lexicon

## Generating sentences

In [66]:
# Parse the rule text into a probabilistic grammar. No start symbol is passed,
# so the parser picks it from the rules (the left-hand side of the first one).
pcfg = PCFG.fromstring(grammar)

# Print at most ten generated token sequences, one per line
for tokens in generate(pcfg, n=10):
    print(' '.join(tokens))

and
but
nor
or
