In [1]:
import os
import nltk
import pandas as pd
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/thakrav/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/thakrav/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/thakrav/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/thakrav/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
import os
from paragraph_cleaner import ParagraphCleaner
from paragraph_analysis import ParagraphAnalysis
from utils import big_data_dict, styled_print, create_dir, extract_images, extract_paragraphs, \
    random_select_dict, combine_multiple_text_files


In [3]:
%load_ext autoreload
%autoreload 2

## House of Dragon Seasons

In [4]:
hod_root_data_path = "../data/processed-data/url-texts/house-of-dragons"
directories = ["characters", "episodes", "dragons", "differences"]

### Episodes

In [5]:
directories_key = 'episodes'

data_text_dir = create_dir("../data/processed-data", "clean-csvs")

data_dir = os.path.join(hod_root_data_path, directories_key)
txt_file = combine_multiple_text_files(data_dir)
raw_paragraphs = {}
with open(txt_file, "rt") as f:
    for i, line in enumerate(f):
        raw_paragraphs[i] = line
styled_print(f"Found Total {len(raw_paragraphs)} Paragraphs from the {directories_key}", header=True)

[1m› [4mcreating directory ... ../data/processed-data/clean-csvs[0m
[1m› [4mFound Total 0 Paragraphs from the subtitles[0m


In [6]:
data_df = pd.DataFrame(raw_paragraphs.items(), columns=["id", "paragraphs"])
data_df.to_csv(os.path.join(data_text_dir, f"{directories_key}-raw-paragraphs.csv"), index=False, header=True)

In [7]:
styled_print(f"Some Sample Paragraphs from the {directories_key}", header=True)
sampled_paragraphs = random_select_dict(raw_paragraphs, 5)
for key, val in sampled_paragraphs.items():
    styled_print(f"{key} - {val}")

[1m› [4mSome Sample Paragraphs from the subtitles[0m


IndexError: list index out of range

### Clean Paragraphs

In [None]:
paragraph_cleaner = ParagraphCleaner(raw_paragraphs)

In [None]:
cleaned_paragraphs = paragraph_cleaner.clean_paragraphs()
styled_print(f"Found Total {len(cleaned_paragraphs)} Paragraphs from the {directories_key}", header=True)

In [None]:
book_df = pd.DataFrame(cleaned_paragraphs.items(), columns=["id", "paragraphs"])
book_df.to_csv(os.path.join(data_text_dir, f"{directories_key}-clean-paragraphs.csv"), index=False, header=True)

In [None]:
styled_print(f"Some Sample Cleaned Paragraphs from from the {directories_key}", header=True)
sampled_paragraphs = random_select_dict(cleaned_paragraphs, 5)
for key, val in sampled_paragraphs.items():
    styled_print(f"{key} - {val}")

### Analysis of Cleaned Paragraphs

In [None]:
out_dir = create_dir("../data", "analysis-data")
paragraph_analysis = ParagraphAnalysis(
    cleaned_paragraphs, 
    out_dir=out_dir
)
styled_print(f"Found {paragraph_analysis.words_counts} Words.", header=True)

#### Histogram of Paragraph Lengths

In [None]:
paragraph_analysis.characters_per_paragraph_histogram(
    figsize=(3, 3), dpi=200, save_flag=False
)

#### Histogram of Words per Paragraph

In [None]:
paragraph_analysis.words_per_paragraph_histogram(
    figsize=(3, 3), dpi=200, save_flag=False
)

#### Histogram of Avg Word Lenghth per Paragraph

In [None]:
paragraph_analysis.avg_word_len_per_paragraph_histogram(
    figsize=(3, 3), dpi=200, save_flag=False
)

#### Checking Stop Words

In [None]:
stop_words_corpus = paragraph_analysis.get_stop_words_corpus(language='english')
styled_print(f"Found {len(stop_words_corpus.keys())} unique stop words", header=True)

In [None]:
if len(stop_words_corpus.keys()) > 0:
    top_k = 25
    top_k_stop_words = paragraph_analysis.get_top_k_stop_words(stop_words_corpus, top_k=top_k)
    paragraph_analysis.plot_top_k_stop_words(
        top_k_stop_words, figsize=(8, 8), dpi=300, save_flag=False)

#### Checking Non-Stop Words

In [None]:
non_stop_words_corpus = paragraph_analysis.get_non_stop_words_corpus(language='english')
styled_print(f"Found {len(non_stop_words_corpus.keys())} unique non-stop words", header=True)

In [None]:
top_k = 25
top_k_non_stop_words = paragraph_analysis.get_top_k_non_stop_words(non_stop_words_corpus, top_k=top_k)
paragraph_analysis.plot_top_k_non_stop_words(
    top_k_non_stop_words, figsize=(8, 8), dpi=300, save_flag=False)

#### Bi-Gram Analysis

In [None]:
bigrams = paragraph_analysis.get_ngrams(n=2, return_list=True)
styled_print(f"Found {len(bigrams)} Bigrams", header=True)

In [None]:
top_k = 25
top_k_bigrams = paragraph_analysis.get_top_k_ngrams(n=2, top_k=top_k)
paragraph_analysis.plot_top_k_ngrams(
    top_k_bigrams, title=f"Top {top_k} Bigrams Count Plot", figsize=(8, 8), dpi=300, save_flag=False)

#### Trigram Analysis

In [None]:
trigrams = paragraph_analysis.get_ngrams(n=3, return_list=True)
styled_print(f"Found {len(trigrams)} Trigrams", header=True)

In [None]:
top_k = 25
top_k_trigrams = paragraph_analysis.get_top_k_ngrams(n=3, top_k=top_k)
paragraph_analysis.plot_top_k_ngrams(
    top_k_trigrams, 
    title=f"Top {top_k} Trigrams Count Plot", 
    figsize=(8, 8), 
    dpi=300, 
    save_flag=False
)