In [1]:
import requests
from bs4 import BeautifulSoup
from nltk import tokenize
import spacy
nlp = spacy.load("en_core_web_sm")

In [2]:
def fetch_data(data_source, output_file, timeout=30):
    """Download a URL and save the response body as UTF-8 text.

    Args:
        data_source: URL to fetch with an HTTP GET request.
        output_file: Path of the file that receives the response text.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        True if the server answered with status 200 and the file was written,
        False for any other status code (no file is written in that case).

    Raises:
        requests.exceptions.RequestException: On connection errors or when the
            timeout elapses.
    """
    response = requests.get(data_source, timeout=timeout)
    if response.status_code != 200:
        return False
    # The context manager closes the handle even if the write fails.
    with open(output_file, "w+", encoding="utf-8") as file:
        file.write(response.text)
    return True

In [3]:
def clean_file(data_source, output_data):
    """Copy a text file, trimming each line and dropping blank ones.

    Args:
        data_source: Path of the UTF-8 text file to read.
        output_data: Path of the UTF-8 text file to write. Every kept line is
            written stripped of surrounding whitespace and followed by "\\n".
    """
    # Read everything up front so the input is fully consumed before the
    # output file is opened (and truncated).
    with open(data_source, "r", encoding="utf-8") as infile:
        stripped_lines = [raw.strip() for raw in infile.readlines()]
    kept_lines = [text for text in stripped_lines if len(text) > 0]
    with open(output_data, "w+", encoding="utf-8") as outfile:
        outfile.writelines(f"{text}\n" for text in kept_lines)

In [4]:
def clean_data(text):
    """Normalise a raw text fragment.

    Surrounding whitespace is trimmed first; then, inside the remaining text,
    newlines, tabs and em dashes become single spaces, curly double quotes are
    removed, and the curly apostrophe becomes a straight one.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    substitutions = str.maketrans({
        "\n": " ",
        "\t": " ",
        "“": None,
        "”": None,
        "’": "'",
        "—": " ",
    })
    # Trimming happens before the substitutions, so a space that sat just
    # inside a removed quote mark is kept.
    return text.strip().translate(substitutions)

In [5]:
def extract_paragraphs(source_file):
    """Extract the cleaned text of every <p> inside the chapter <div>s.

    Args:
        source_file: Path of a UTF-8 HTML file.

    Returns:
        A list with one cleaned string (see ``clean_data``) per <p> element
        found inside a <div> whose class list contains "chapter", in the
        order the divs and paragraphs are returned by ``find_all``.
    """
    # Read through a context manager so the file handle is always closed.
    with open(source_file, "r", encoding="utf-8") as f:
        html = f.read().strip()
    soup = BeautifulSoup(html, 'html.parser')
    divs = soup.find_all('div', {'class': lambda x: x and 'chapter' in x.split()})
    return [
        clean_data(p.get_text())
        for div in divs
        for p in div.find_all('p')
    ]

In [6]:
def get_sentences(paragraph):
    """Split a paragraph into sentences with the module-level ``nlp`` pipeline.

    Args:
        paragraph: The text to segment.

    Returns:
        A list of sentence strings, each stripped of surrounding whitespace.
    """
    doc = nlp(paragraph)
    return [span.text.strip() for span in doc.sents]

In [7]:
# Source URLs (gutenberg.org, ebook 16). Only the HTML edition is fetched in
# the cells visible here; TXT_DATA_SOURCE is not referenced by them.
TXT_DATA_SOURCE = "https://www.gutenberg.org/files/16/16-0.txt"
HTML_DATA_SOURCE = "https://www.gutenberg.org/files/16/16-h/16-h.htm"
# Local files, relative to the working directory: the downloaded page, the
# copy with blank lines removed, and the final one-sentence-per-line output.
DATA_PATH = "data/peter_pan.txt"
CLEAN_DATA_PATH = "data/peter_pan_clean.txt"
EXTRACTED_SENTENCES_PATH = "data/peter_pan_sentences.txt"

In [8]:
# Download the HTML edition; stop here if the download failed, otherwise the
# following steps would run against a missing or stale local file.
if not fetch_data(HTML_DATA_SOURCE, DATA_PATH):
    raise RuntimeError(f"Failed to download {HTML_DATA_SOURCE}")

# Drop blank lines, pull the paragraph text out of the chapter divs, and
# segment every paragraph into sentences.
clean_file(DATA_PATH, CLEAN_DATA_PATH)
paragraphs = extract_paragraphs(CLEAN_DATA_PATH)
sentences = [sent for para in paragraphs for sent in get_sentences(para)]

In [9]:
# Report how many sentences were segmented out of the extracted paragraphs.
print(f"Extracted Sentences Count: {len(sentences)}")

Extracted Sentences Count: 3244


### Limiting sentences to about 55 words (roughly 280 characters)

In [10]:
def split_lines(sentence, split_char=" ", n=55):
    """Break a sentence into pieces of at most ``n`` tokens each.

    The sentence is split on ``split_char`` and regrouped into consecutive
    runs of ``n`` tokens, so arbitrarily long sentences yield as many pieces
    as needed (not just two).

    Args:
        sentence: The text to split.
        split_char: Separator used both to split and to re-join tokens.
        n: Maximum number of tokens per piece; must be at least 1.

    Returns:
        The non-empty pieces, in their original order.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")
    groups = sentence.split(split_char)
    pieces = (
        split_char.join(groups[start:start + n])
        for start in range(0, len(groups), n)
    )
    # Empty pieces (e.g. from an empty sentence) are dropped.
    return [piece for piece in pieces if piece]

In [11]:
# Flatten the pieces produced by split_lines for every extracted sentence
# into one list, preserving order.
limit_sents = [piece for sent in sentences for piece in split_lines(sent)]

In [12]:
# Report the number of entries after each sentence went through split_lines.
print(f"Split Sentences Count: {len(limit_sents)}")

Split Sentences Count: 3259


In [14]:
# Write one sentence per line, keeping only entries with more than two
# whitespace-separated words.
with open(EXTRACTED_SENTENCES_PATH, "w+", encoding="utf-8") as f:
    f.writelines(
        f"{sent}\n" for sent in limit_sents if len(sent.split()) > 2
    )