In [1]:
import requests
from bs4 import BeautifulSoup

from nltk import tokenize
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

import string

import spacy
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shahv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shahv\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
def fetch_data(data_source, output_file):
    """Download a URL and save the response body to a local file.

    Args:
        data_source: URL requested with an HTTP GET.
        output_file: Path of the file that is (over)written with the response
            text, encoded as UTF-8.

    Returns:
        True if the server answered with status 200 and the file was written;
        False for any other status code (nothing is written in that case).
    """
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(data_source, timeout=30)
    if response.status_code != 200:
        return False
    # The context manager closes the handle even if the write raises.
    with open(output_file, "w+", encoding="utf-8") as file:
        file.write(response.text)
    return True

In [3]:
def clean_file(data_source, output_data):
    """Copy a text file, dropping blank lines and trimming the kept ones.

    Every line of `data_source` is stripped of leading and trailing
    whitespace; lines that are empty after stripping are discarded and the
    rest are written to `output_data`, each terminated by a newline.
    Both files are handled as UTF-8.
    """
    # Read everything first so the input is fully consumed before writing.
    with open(data_source, "r", encoding="utf-8") as src:
        stripped_lines = [raw.strip() for raw in src.readlines()]
    with open(output_data, "w+", encoding="utf-8") as dst:
        dst.writelines(f"{text}\n" for text in stripped_lines if text)

In [4]:
def clean_data(text):
    """Normalise whitespace and typographic characters in a text snippet.

    Surrounding whitespace is trimmed, then inside the text:
    newlines, tabs and em dashes become a single space, curly double
    quotes are removed, and the curly apostrophe becomes a plain "'".
    """
    # One pass over the string; None means "delete this character".
    substitutions = str.maketrans({
        "\n": " ",
        "\t": " ",
        '“': None,
        '”': None,
        '’': "'",
        '—': " ",
    })
    return text.strip().translate(substitutions)

In [5]:
def extract_paragraphs(source_file):
    """Collect the cleaned text of every <p> inside <div class="chapter">.

    Args:
        source_file: Path to a UTF-8 encoded HTML file.

    Returns:
        A list of strings, one per <p> tag found inside any <div> whose
        class list contains "chapter", each passed through `clean_data`.
    """
    # Read through a context manager so the file handle is always closed.
    with open(source_file, "r", encoding="utf-8") as html_file:
        markup = html_file.read().strip()
    soup = BeautifulSoup(markup, 'html.parser')
    # The callable guards against a missing class attribute before matching.
    divs = soup.find_all('div', {'class': lambda x: x and 'chapter' in x.split()})
    return [
        clean_data(p.get_text())
        for div in divs
        for p in div.find_all('p')
    ]

In [6]:
def get_sentences(paragraph):
    """Segment a paragraph into sentences using the loaded spaCy pipeline.

    Returns a list with the whitespace-trimmed text of each sentence span,
    in document order.
    """
    doc = nlp(paragraph)
    return [span.text.strip() for span in doc.sents]

In [7]:
def remove_punctuations(text):
    """Return `text` with every character in `string.punctuation` removed.

    Only the ASCII punctuation set is affected; other characters (including
    typographic quotes and dashes) are left as they are.

    Args:
        text: The string to clean.

    Returns:
        A new string without ASCII punctuation.
    """
    table = str.maketrans('', '', string.punctuation)
    # Translate the whole string in one call instead of character by character.
    return text.translate(table)

In [8]:
def remove_stopwords(text, language=None):
    """Filter stop words out of `text`.

    Args:
        text: String to filter; it is tokenised with NLTK's `word_tokenize`.
        language: Optional name of an NLTK stop-word list (for example
            "english"). With the default None, the stop words of every
            language in the corpus are used, which matches the previous
            behaviour of this function.

    Returns:
        The remaining tokens joined by single spaces. Tokens are compared
        exactly as they appear (no case folding is applied).
    """
    # Load the stop-word list once and use a set for constant-time lookups;
    # re-reading the corpus for every token made this step extremely slow.
    stop_set = set(stopwords.words(language))
    tokens = word_tokenize(text)
    return " ".join(word for word in tokens if word not in stop_set)

In [9]:
def apply_stemming(text):
    """Stem every token of `text` with the Porter stemmer.

    The text is tokenised with `word_tokenize`, each token is stemmed, and
    the stems are returned joined by single spaces.
    """
    stemmer = PorterStemmer()
    return " ".join(map(stemmer.stem, word_tokenize(text)))

In [10]:
def apply_lemmatization(text):
    """Lemmatize every word of `text` with the WordNet lemmatizer.

    Args:
        text: String to process; it is tokenised with `word_tokenize` so the
            lemmatizer sees whole words rather than single characters.

    Returns:
        The lemmatized tokens joined by single spaces, mirroring the output
        format of `apply_stemming`.
    """
    # Iterating over the raw string would feed the lemmatizer one character
    # at a time, which leaves the text effectively unchanged.
    lemmatizer = WordNetLemmatizer()
    lem_tokens = [lemmatizer.lemmatize(word) for word in word_tokenize(text)]
    return " ".join(lem_tokens)

In [11]:
# Remote sources on gutenberg.org: a plain-text and an HTML rendition.
# NOTE(review): TXT_DATA_SOURCE is not referenced by the cells visible here.
TXT_DATA_SOURCE = "https://www.gutenberg.org/files/16/16-0.txt"
HTML_DATA_SOURCE = "https://www.gutenberg.org/files/16/16-h/16-h.htm"
# Local working files: the raw download and the copy with blank lines removed.
# NOTE(review): these live under "results/" while the final output goes to
# "data/"; both directories presumably must exist beforehand - confirm.
DATA_PATH = "results/peter_pan.txt"
CLEAN_DATA_PATH = "results/peter_pan_clean.txt"
# Final output: one preprocessed sentence per line.
EXTRACTED_SENTENCES_PATH = "data/peter_pan_sentences.txt"

In [12]:
# Download the HTML edition. fetch_data returns False on a non-200 response,
# so stop here instead of silently processing a missing or stale file.
if not fetch_data(HTML_DATA_SOURCE, DATA_PATH):
    raise RuntimeError(f"Could not download {HTML_DATA_SOURCE}")
# Drop blank lines, then pull the paragraph text out of the chapter <div>s.
clean_file(DATA_PATH, CLEAN_DATA_PATH)
paragraphs = extract_paragraphs(CLEAN_DATA_PATH)
# Flatten the per-paragraph sentence lists into a single list.
sentences = []
for para in paragraphs:
    sentences.extend(get_sentences(para))

In [13]:
# Sanity check: how many sentences were extracted from all paragraphs.
print(f"Extracted Sentences Count: {len(sentences)}")

Extracted Sentences Count: 3244


### Limiting sentences to roughly 280 characters (~55 words)

In [14]:
def split_lines(sentence, split_char=" ", n=55):
    """Break a sentence into consecutive pieces of at most `n` fields.

    The sentence is split on `split_char`, the resulting fields are grouped
    `n` at a time, and each group is joined back with `split_char`. Unlike a
    single head/tail split, every returned piece respects the limit, however
    long the input is.

    Args:
        sentence: The text to split.
        split_char: Separator used both to split and to re-join (default: a
            single space, i.e. words).
        n: Maximum number of fields per piece (default 55).

    Returns:
        A list of non-empty pieces in their original order; an empty
        sentence yields an empty list.

    Raises:
        ValueError: If `n` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")
    groups = sentence.split(split_char)
    chunks = (
        split_char.join(groups[start:start + n])
        for start in range(0, len(groups), n)
    )
    # Discard empty pieces (e.g. produced by an empty input string).
    return [chunk for chunk in chunks if chunk]

In [15]:
# Apply the length limit to every sentence and flatten the resulting pieces.
limit_sents = [
    piece
    for sent in sentences
    for piece in split_lines(sent)
]

In [16]:
# Sanity check: sentence count after applying the length limit.
print(f"Split Sentences Count: {len(limit_sents)}")

Split Sentences Count: 3259


In [17]:
# Write one preprocessed sentence per line to the output file.
with open(EXTRACTED_SENTENCES_PATH, "w+", encoding="utf-8") as out_file:
    for raw_sentence in limit_sents:
        no_punct = remove_punctuations(raw_sentence)
        # Skip fragments that have two words or fewer once punctuation is gone.
        if len(no_punct.strip().split()) <= 2:
            continue
        # Stop-word removal -> stemming -> lemmatization, in that order.
        normalised = apply_lemmatization(apply_stemming(remove_stopwords(no_punct)))
        out_file.write(f"{normalised}\n")