# DSC360 - Week 4 - Exercise 4.2

We begin the exercise this week by importing the necessary libraries and files.

In [1]:
import pandas as pd
import re
import string
from bs4 import BeautifulSoup
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
from autocorrect import Speller
import contractions

## Using Natural Language Processing (NLP)

**1. In the text, there's a text normalizer created - your assignment is to re-create that normalizer as a Python class that can be re-used (within a .py file). However, unlike the book author's version, pass a Pandas Series (e.g., dataframe['column']) to your normalize_corpus function and use apply/lambda for each cleaning function.**

In [2]:
# Create text normalizer as a class
class TextNormalizer:
    """Reusable text normalizer for a pandas Series of raw documents.

    ``clean_text`` applies, in order: HTML stripping, contraction expansion,
    tokenization, part-of-speech tagging, removal of non-alphabetic tokens and
    English stop words, lowercasing, spelling correction and lemmatization.
    ``normalize`` applies ``clean_text`` to every element of a Series.
    """

    def __init__(self):
        """Load the stop-word list, lemmatizer, spell checker and spaCy model."""
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.spell = Speller()
        # Not used by clean_text itself; kept as an attribute so callers can
        # run spaCy analysis through the same object (normalizer.nlp).
        self.nlp = spacy.load("en_core_web_sm")

    @staticmethod
    def _wordnet_pos_from_tag(tag):
        """Map a Penn Treebank tag (e.g. 'VBD') to a WordNet POS constant.

        Tags that are not adjective/noun/verb/adverb fall back to noun,
        which is also the lemmatizer's own default.
        """
        tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
        return tag_dict.get(tag[:1].upper(), wordnet.NOUN)

    # Define function to obtain part of speech tags
    def get_wordnet_pos(self, word):
        """Return the WordNet POS constant for a single, isolated word.

        Kept for callers that only have one word; clean_text tags whole
        token sequences instead so the tagger can use neighbouring words.
        """
        tag = pos_tag([word])[0][1]
        return self._wordnet_pos_from_tag(tag)

    # Define function to clean specified text
    def clean_text(self, text):
        """Normalize one document and return it as a single space-joined string.

        Args:
            text: The raw document. Missing values (None / NaN / pd.NA) and
                blank strings yield ''. Other non-string scalars are
                converted with ``str`` before cleaning.

        Returns:
            str: Lowercased, spell-corrected, lemmatized tokens joined by
            single spaces.
        """
        # A Series often carries missing values; treat them as empty documents
        # instead of letting the HTML parser raise on a non-string.
        if not isinstance(text, str):
            if pd.api.types.is_scalar(text) and pd.isna(text):
                return ''
            text = str(text)
        if not text.strip():
            return ''

        # Remove HTML tags
        text = BeautifulSoup(text, "html.parser").get_text()
        # Expand contractions
        text = contractions.fix(text)
        # Tokenize, then tag the whole sequence in ONE call. Tagging before
        # stop-word removal and lowercasing gives the tagger real context,
        # and avoids one tagger invocation per word.
        tagged_tokens = pos_tag(word_tokenize(text))

        cleaned = []
        for word, tag in tagged_tokens:
            lowered = word.lower()
            # Drop punctuation/numbers and stop words
            if not word.isalpha() or lowered in self.stop_words:
                continue
            corrected = self.spell(lowered)
            # Lemmatize using the POS assigned in context
            cleaned.append(
                self.lemmatizer.lemmatize(corrected, self._wordnet_pos_from_tag(tag))
            )
        return ' '.join(cleaned)

    # Define normalization function (using Series.apply)
    def normalize(self, text_series):
        """Apply ``clean_text`` to every element of a pandas Series.

        Args:
            text_series: pandas Series of raw documents.

        Returns:
            pandas Series of cleaned strings with the same index.
        """
        return text_series.apply(self.clean_text)

**2. Using your new text normalizer, create a Jupyter Notebook that uses this class to clean up the text found in the file "big.txt" (that text file is in the GitHub for Week 4 repository). Your resulting text should be a (long) single stream of text.**

In [3]:
# Define the file path
# NOTE: machine-specific absolute path -- point this at your local copy of big.txt.
file_path = r'C:\Users\Amanda Heflin\Downloads\big.txt'

# Number of leading characters of the corpus that are normalized as a sample
SAMPLE_CHARS = 1000

# Read file with an explicit encoding so the result does not depend on the
# platform default (cp1252 on Windows, UTF-8 elsewhere).
# NOTE(review): assumes big.txt is UTF-8/ASCII; undecodable bytes are replaced
# rather than raising -- confirm against the actual file.
with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
    text = file.read()

# Convert into pandas series and return sample of cleaned text
text_series = pd.Series([text[:SAMPLE_CHARS]])
normalizer = TextNormalizer()
cleaned_text = normalizer.normalize(text_series)
print(cleaned_text)

0    project gutenberg ebook adventure sherlock hol...
dtype: object


**3. Then, using spaCy *and* NLTK (this will be two different ways of doing the same thing), create code that shows the tokens, lemmas, parts of speech, and dependencies in the first 1,000 characters of "big.txt" (the same text you normalized).**

In [4]:
# NOTE: pos_tag, word_tokenize, wordnet and WordNetLemmatizer are already
# imported in the first cell; only RegexpParser and ne_chunk are new here.
# ne_chunk is not used in the visible part of this cell.
from nltk import ne_chunk, RegexpParser, pos_tag, word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Module-level lemmatizer used by the NLTK half of the analysis below
lemmatizer = WordNetLemmatizer()

# Define function to utilize treebank tagging
def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into the matching WordNet POS constant.

    Tags beginning with J, V, N or R map to adjective, verb, noun and adverb
    respectively; every other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Anything else (determiners, prepositions, ...) is treated as a noun
    return wordnet.NOUN

# Use spaCy for analysis: one (token, lemma, POS, dependency) tuple per token
doc_spacy = normalizer.nlp(cleaned_text.iloc[0])
spacy_results = [(token.text, token.lemma_, token.pos_, token.dep_) for token in doc_spacy]

# Display results
print("spaCy Results:")
for result in spacy_results:
    print(result)

# Tokenization and Part of Speech tagging with NLTK
tokens_nltk = word_tokenize(cleaned_text.iloc[0])
pos_tags = pos_tag(tokens_nltk)
# Lemmatize tokens based on POS tags
lemmas = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]
# Define expanded grammar model that includes adj and adv.
# RegexpParser applies the stages top to bottom in a single pass, so PP must
# be defined BEFORE VP; otherwise the <PP> inside the VP rule can never match.
grammar = """
NP: {<DT>?<JJ.*>*<NN.*>} # Noun Phrase with adj
PP: {<IN><NP>} # Prepositional Phrase
VP: {<VB.*><RB.*><NP|PP>*} # Verb Phrase with adv
"""
parser = RegexpParser(grammar)
chunked = parser.parse(pos_tags)

# Display results including chunked structure for dependency approximation
print("\nNLTK Results:")
for (word, tag), lemma in zip(pos_tags, lemmas):
    print(f"Token: {word}, Lemma: {lemma}, POS: {tag}")

# NLTK has no dependency parser here; top-level chunks stand in for it
print("\nNLTK Dependency Approximation:")
for subtree in chunked:
    if isinstance(subtree, nltk.Tree):
        print(f"{subtree.label()}:{subtree.leaves()}")

spaCy Results:
('project', 'project', 'PROPN', 'compound')
('gutenberg', 'gutenberg', 'PROPN', 'compound')
('ebook', 'ebook', 'PROPN', 'compound')
('adventure', 'adventure', 'NOUN', 'compound')
('sherlock', 'sherlock', 'PROPN', 'compound')
('holmes', 'holmes', 'PROPN', 'compound')
('sir', 'sir', 'PROPN', 'compound')
('arthur', 'arthur', 'PROPN', 'compound')
('conan', 'conan', 'PROPN', 'compound')
('doyle', 'doyle', 'PROPN', 'compound')
('series', 'series', 'PROPN', 'nsubj')
('sir', 'sir', 'PROPN', 'compound')
('arthur', 'arthur', 'PROPN', 'compound')
('conan', 'conan', 'PROPN', 'compound')
('doyle', 'doyle', 'PROPN', 'compound')
('copyright', 'copyright', 'NOUN', 'compound')
('law', 'law', 'NOUN', 'compound')
('change', 'change', 'NOUN', 'relcl')
('world', 'world', 'NOUN', 'dobj')
('sure', 'sure', 'ADV', 'advmod')
('check', 'check', 'VERB', 'ROOT')
('copyright', 'copyright', 'NOUN', 'compound')
('law', 'law', 'NOUN', 'compound')
('country', 'country', 'PROPN', 'compound')
('download', 