# NLTK Project

In [1]:
# Import necessary libraries
import nltk
import random
import re
from collections import Counter
from nltk.corpus import stopwords
from nltk.grammar import Nonterminal
from nltk.grammar import ProbabilisticProduction
from nltk.parse.generate import generate
from nltk.parse import RecursiveDescentParser
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk import induce_pcfg
from nltk import Nonterminal
from nltk import PCFG
from nltk import pos_tag, ne_chunk
from nltk import tree
from nltk import Tree
from typing import Iterator, List, Tuple, Union
# Fetch the NLTK data packages this notebook relies on. Packages that are
# already installed are only reported as up to date (see the cell output).
nltk.download('averaged_perceptron_tagger')  # tagger model, used by pos_tag
nltk.download('maxent_ne_chunker')  # chunker model, presumably for ne_chunk (imported above)
nltk.download('punkt')  # tokenizer models, used by word_tokenize
nltk.download('treebank')  # treebank corpus (not referenced in the visible cells)
nltk.download('wordnet')  # WordNet data, used by WordNetLemmatizer

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/vincent/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/vincent/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package punkt to /home/vincent/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package treebank to /home/vincent/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vincent/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Preprocessing the corpus

In [2]:
# Load the corpus; the context manager closes the file handle after reading
with open('corpus.txt') as f:
    raw = f.read()

# Remove empty lines (collapse every run of blank lines, not only pairs)
raw = re.sub(r"\n{2,}", "\n", raw)

# Remove number and philosopher.
# Each line presumably looks like "<number>. <quote> - <philosopher>"; only
# the lower-cased quote, without the punctuation marks . , : ; is kept.
# The pattern is anchored and its dot escaped so that only the leading
# enumeration is removed, never digits that occur inside the quote itself.
strip_punctuation = str.maketrans('', '', '.,:;')
sentences = []
for line in raw.split("\n"):
    quote = re.sub(r"^\s*[0-9]+\.\s+", "", line).split(" - ")[0]
    quote = quote.lower().translate(strip_punctuation).strip()
    if quote:  # skip blank lines, e.g. from a leading or trailing newline
        sentences.append(quote)

# The cleaned corpus as a single string, one sentence per line
corpus = "\n".join(sentences)

# Normalize: everything that is not a letter or digit becomes a space
normalized = corpus.lower()
normalized = re.sub(r"[^a-zA-Z0-9]", " ", normalized)

# Tokenize
words = word_tokenize(normalized)

# Tag each word with part of speech
tags = pos_tag(words)

# Reduce words to their stems and to their verb lemmas.
# The stemmer and lemmatizer are created once instead of once per word.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stemmed = [stemmer.stem(w) for w in words]
lemmed = [lemmatizer.lemmatize(w, pos='v') for w in words]

# Print normalized corpus to file, one sentence per line
with open("normalized.txt", "w") as f:
    f.writelines(sentence + "\n" for sentence in sentences)

## Corpus statistics

In [3]:
# Headline figures for the corpus
word_count = len(words)
sentence_count = len(sentences)
print("Words:", word_count)
print("Vocabulary size:", len(set(lemmed)))  # number of distinct lemmas
print("Sentences:", sentence_count)
print("Average sentence length:", int(word_count / sentence_count))

fdist = nltk.FreqDist(words)

# The five most frequent words together with their counts
print("5 Most used words:")
for word, n in fdist.most_common(5):
    print(f"   - {word} ({n})")

# Hapaxes are the words that occur exactly once
hapaxes = sum(1 for count in fdist.values() if count == 1)
print("Hapaxes:", hapaxes)

Words: 1042
Vocabulary size: 404
Sentences: 65
Average sentence length: 16
5 Most used words:
   - the (51)
   - to (42)
   - is (39)
   - you (36)
   - it (23)
Hapaxes: 289


## Extract file to annotate

In [4]:
# Write one flat bracketed tree per sentence, "(S (TAG word) (TAG word) ... )",
# pairing every token with the tag assigned by pos_tag.
with open('unannotated.txt', 'w') as f:
    for sentence in sentences:
        tagged_tokens = pos_tag(word_tokenize(sentence))
        leaves = "".join(f"({tag} {word}) " for word, tag in tagged_tokens)
        f.write("(S " + leaves + ")\n")

Now copy `unannotated.txt` to `annotated.txt` and manually annotate the corpus.

## Induce a PCFG from annotated corpus

In [5]:
# Read the manually annotated corpus: one bracketed tree per line.
# Blank lines are skipped because Tree.fromstring rejects an empty string.
# The loop variables are deliberately not called `tree`, so the `tree` module
# imported from nltk at the top of the notebook is not overwritten.
trees = []
with open('annotated.txt') as f:
    for line in f:
        if line.strip():
            trees.append(Tree.fromstring(line))

# Collect the productions used by every annotated tree
productions = []
for parsed in trees:
    productions += parsed.productions()

# Induce a PCFG with start symbol S and keep its rules as sorted strings
grammar = nltk.induce_pcfg(Nonterminal('S'), productions)
listgrammar = sorted(str(prod) for prod in grammar.productions())

# Split the rules: a rule whose text contains a single quote is treated as a
# lexical rule (presumably because terminals are printed in quotes), every
# other rule as a phrase structure rule.
psr = []
lex = []
for rule in listgrammar:
    if "'" in rule:
        lex.append(rule)
    else:
        psr.append(rule)

with open("grammar.txt", "w") as f:
    f.write("# This grammar is sorted alphabetically. The starting node is 'S'\n\n# PHRASE STRUCTURE RULES:\n\n")
    f.writelines(rule + "\n" for rule in psr)

    f.write("\n\n# LEXICAL RULES:\n\n")
    f.writelines(rule + "\n" for rule in lex)

## Generating sentences

In [18]:
# A grammar symbol is either a terminal (plain string) or a Nonterminal
Symbol = Union[str, Nonterminal]


class Generator(nltk.grammar.PCFG):
    """PCFG that samples random sentences from its own productions."""

    def generate(self, n: int) -> Iterator[str]:
        """Yield ``n`` sentences, each derived independently from the start symbol."""
        for _ in range(n):
            yield self._generate_derivation(self.start())

    def _generate_derivation(self, nonterminal: Nonterminal) -> str:
        """Expand ``nonterminal`` recursively and return its words joined by spaces.

        One production is sampled for ``nonterminal``; string symbols on its
        right-hand side are used as they are, all other symbols are expanded
        by a recursive call. Empty derivations are left out so that the
        result contains no doubled spaces.

        NOTE: the recursion has no depth limit, so a strongly recursive
        grammar can exceed Python's recursion limit.
        """
        sentence: List[str] = []
        symbol: Symbol
        derivation: str
        for symbol in self._reduce_once(nonterminal):
            if isinstance(symbol, str):
                derivation = symbol
            else:
                derivation = self._generate_derivation(symbol)
            if derivation != "":
                sentence.append(derivation)
        return " ".join(sentence)

    def _reduce_once(self, nonterminal: Nonterminal) -> Tuple[Symbol, ...]:
        """Return the right-hand side of one sampled production of ``nonterminal``."""
        return self._choose_production_reducing(nonterminal).rhs()

    def _choose_production_reducing(
        self, nonterminal: Nonterminal
    ) -> ProbabilisticProduction:
        """Sample a production with left-hand side ``nonterminal``.

        Each candidate is weighted by its own probability.

        Raises:
            ValueError: if the grammar has no production for ``nonterminal``.
        """
        # Public lookup instead of reaching into the private _lhs_index
        productions: List[ProbabilisticProduction] = self.productions(lhs=nonterminal)
        if not productions:
            raise ValueError(f"no production with left-hand side {nonterminal!r}")
        probabilities: List[float] = [production.prob() for production in productions]
        return random.choices(productions, weights=probabilities)[0]


def generate(n, min_words=9, max_words=31):
    """Return ``n`` random sentences sampled from the induced grammar.

    The grammar is rebuilt from the module-level ``listgrammar``. Sentences
    are sampled one at a time and kept only when their number of
    space-separated words lies between ``min_words`` and ``max_words``
    (both inclusive). Accepted sentences are capitalized and get a final
    full stop.

    NOTE: this function replaces the ``generate`` imported from
    ``nltk.parse.generate`` at the top of the notebook. It also keeps
    sampling until ``n`` sentences are accepted, so it does not terminate if
    the grammar cannot produce a sentence within the length bounds.

    Args:
        n: number of sentences to return.
        min_words: smallest accepted sentence length in words.
        max_words: largest accepted sentence length in words.

    Returns:
        A list of ``n`` sentence strings.
    """
    # Build a fresh rule list instead of inserting into the shared
    # `listgrammar`, which would grow by one rule on every call.
    # The zero-probability rule is placed first to manipulate the starting
    # node: the sorted rule list may begin with a different nonterminal.
    rules = ['S -> S [0]'] + listgrammar
    rawgrammar = "\n".join(rules).replace('$', 'S')  # $ will cause errors
    generator = Generator.fromstring(rawgrammar)

    sentences = []
    while len(sentences) < n:
        sentence = next(generator.generate(1))
        if min_words <= len(sentence.split(' ')) <= max_words:
            sentences.append(sentence.capitalize() + '.')
    return sentences

generated = generate(50)

# Write the sentences as a LaTeX enumerate list, one \item per line.
# Raw strings keep the backslashes literal: in a normal string literal '\b'
# is the backspace escape, so '\begin' would be written as a control
# character followed by "egin", and '\i' / '\e' are invalid escapes.
with open('generated.tex', 'w') as f:
    f.write(r'\begin{enumerate}' + '\n')
    for sentence in generated:
        f.write(r'\item ' + sentence + '\n')
    f.write(r'\end{enumerate}' + '\n')