# Import necessary libraries

In [60]:
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd
import string
import nltk
import glob
import json
import os
import re

# Prepare data

In this section, the texts of the seven Artemis Fowl books are loaded and saved into a dataframe with the columns Name and Text, which hold the book names and the book texts (as strings) respectively.

In [61]:
# Start from an empty dataframe with one column for titles and one for texts
df = pd.DataFrame(columns=['Name', 'Text'])

In [62]:
# Get filenames from the directory called 'data'. The macOS '.DS_Store'
# metadata file is filtered out instead of removed, so the cell also works
# on machines where that file does not exist.
books = [entry for entry in os.listdir('data/') if entry != '.DS_Store']

In [63]:
# Iterate through all book folders, read the first .txt file found in each
# and collect the results into the dataframe.
rows = []
for book in books:
    file_path = glob.glob('data/' + book + '/*.txt')
    if not file_path:
        # Entries without a text file cannot be loaded; report and move on
        print('No .txt file found for ' + book + ' - skipped')
        continue
    # Title = file name without its directory and without the extension
    name = os.path.splitext(os.path.basename(file_path[0]))[0]
    with open(file_path[0], mode='r', encoding='utf-8') as file:
        text = file.read()
    rows.append({'Name': name, 'Text': text})

# DataFrame.append was removed in pandas 2.0, so the rows are gathered first
# and the dataframe is built in one step.
df = pd.DataFrame(rows, columns=['Name', 'Text'])

In [64]:
# Preview the first five rows of the loaded data
df.head(n=5)

Unnamed: 0,Name,Text
0,Colfer Eoin. Artemis Fowl and the Atlantis Com...,Eoin Colfer. Artemis Fowl and the Atlantis Com...
1,Colfer Eoin. Artemis Fowl: The Time Paradox,Eoin Colfer. Artemis Fowl: the time paradox\n\...
2,Colfer Eoin. Artemis Fowl. The Lost Colony,Eoin Colfer\n\nArtemis Fowl. The Lost Colony\n...
3,Colfer Eoin. Artemis Fowl. The Arctic Incident,Eoin Colfer\n\nArtemis Fowl. The Arctic Incide...
4,Colfer Eoin. Artemis Fowl. The Opal Deception,Eoin Colfer\n\nArtemis Fowl. The Opal Deceptio...


# Split text into chapters and paragraphs

A class Book is created to process Artemis Fowl books.

In [65]:
class Book:
    """
    This is a class for processing Artemis Fowl books. 
      
    Attributes: 
        name (str): The book title. 
        text (str): The book text. 
    """
    
    def __init__(self, name, text):
        """
        The constructor for Book class. 
  
        Parameters: 
           name (str): The book title. 
            text (str): The book text.
        """
        self.name = name
        self.text = text
    
    def remove_header(self):
        """
        The function to remove all the information wrote before Chapter 1 in the book. 
          
        Returns: 
            string: A string which contains the book text without the introduction and prologue.
        """
        without_header = self.text[self.text.lower().find('chapter 1'):]
        return without_header
    
    def remove_footer(self):
        """
        The function to remove all the information wrote before the last chapter of the book. 
          
        Returns: 
            string: A string which contains the book text without the epilogue.
        """
        without_footer = self.text[:self.text.lower().find('epilogue')]
        return without_footer
    
    def get_chapters(self):
        """
        The function to split the book text into chapters. 
          
        Returns: 
            list: A list of strings which contains the book text splitted into chapters.
        """
        self.text = self.remove_header()
        self.text = self.remove_footer()
        
        chapters = re.split('chapter ', self.text.lower())
        return [chapter for chapter in chapters if chapter != '']
    
    def count_chapters(self):
        """
        The function to count the number of chapters in the book. 
          
        Returns: 
            integer: An integer number.
        """
        return len(self.get_chapters())
    
    def extract_chapter_title(self, chapter):
        """
        The function to extract a title of the chapter.
          
        Returns: 
            list: A list of strings where the first string is a title and
                  the second string is a chapter text.
        """
        title = chapter.split('\n\n')[0]
        chapter_text = chapter.replace(title + '\n\n', '')
        return [string.capwords(title[title.find(':')+2:]), chapter_text]
    
    def get_paragraphs(self, chapter):
        """
        The function to split the book chapter into paragraphs. 
          
        Returns: 
            list: A list of strings which contains the chapter text splitted into paragraphs.
        """
        paragraphs = chapter.split('\n')
        return [paragraph for paragraph in paragraphs if paragraph != '']
    
    def count_paragraphs(self, chapter):
        """
        The function to count the number of paragraphs in the chapter. 
          
        Returns: 
            integer: An integer number.
        """
        return len(self.get_paragraphs(chapter))
    
    def get_sentences(self, paragraph):
        """
        The function to split the paragraph into sentences. 
          
        Returns: 
            list: A list of strings which contains the paragraph text splitted into sentences.
        """
        sentences = nltk.tokenize.sent_tokenize(paragraph)
        return [sentence for sentence in sentences if sentence != '']
    
    
    def count_sentences(self, paragraph):
        """
        The function to count the number of sentences in the paragraph. 
          
        Returns: 
            integer: An integer number.
        """
        return len(self.get_sentences(paragraph))
    
    def average_sentence_len(self, paragraph):
        """
        The function to count the average length of sentences in the paragraph. 
          
        Returns: 
            float: A float number.
        """
        sentences = self.get_sentences(paragraph)
        return round(sum(map(len, sentences))/len(sentences),2)
    
    def get_words(self, sentence):
        """
        The function to split the paragraph into words (includes removing stop-words and punctuation).
          
        Returns: 
            list: A list of strings which contains the sentence splitted into words.
        """
        splitted = nltk.tokenize.word_tokenize(sentence)
        # Remove puncuation, all words are lowercase 
        words = [word.lower() for word in splitted if word.isalpha()]
        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        words = [word for word in words if not word in stop_words]
        return words
    
    def count_words(self, sentence):
        """
        The function to count the number of words in the sentence. 
          
        Returns: 
            integer: An integer number.
        """
        return len(self.get_words(sentence))
    
    def average_word_len(self, sentence):
        """
        The function to count the average length of words in the sentence. 
          
        Returns: 
            float: A float number.
        """
        words = self.get_words(sentence)
        return round(sum(map(len, words))/len(words), 2)
    
    def count_sentiment(self, sentence):
        """
        The function to count the sentiment score of a text.
          
        Returns: 
            float: A float number.
        """
        sid = SentimentIntensityAnalyzer()
        score = sid.polarity_scores(sentence)
        return score['compound']
    
    def get_chapter_sentiment(self, chapter_text):
        sid = SentimentIntensityAnalyzer()
        text = []
        paragraphs = self.get_paragraphs(chapter_text)
        for paragraph in paragraphs:
            sentences = self.get_sentences(paragraph)
            for sentence in sentences:
                text += self.get_words(sentence)
        return sid.polarity_scores(sentence)['compound']
    
    def get_paragraph_sentiment(self, paragraph):
        sid = SentimentIntensityAnalyzer()
        text = []
        sentences = self.get_sentences(paragraph)
        for sentence in sentences:
            text += self.get_words(sentence)
        return sid.polarity_scores(sentence)['compound']
            
    
    def print_structure(self):
        """
        The function to print the book structure (Root -> Chapters -> Paragraphs).
        """
        chapters = self.get_chapters()
        
        print('Root')
        chapter_num = 1
        for chapter in chapters:
            print('\tChapter ' + str(chapter_num))
            chapter_num += 1
            paragraphs = self.get_paragraphs(chapter)
            paragraph_num = 1
            for paragraph in paragraphs:
                print('\t\tParagraph ' + str(paragraph_num))
                paragraph_num += 1
    
    def create_structure(self):
        """
        The function to create the book structure (Root -> Chapters -> Paragraphs -> Sentences).
        Takes book parts along with their features like text sentiment, 
        the number of words in sentences, the number of sentences in paragraphs,
        average word length in a sentence, average sentence lenth in a paragraph.
        """     
        data = {
          "title": self.name,
          "text": {
            "number_of_chapters": self.count_chapters(),
            "chapters": [ ]
          }
        }
        
        chapters = self.get_chapters()
        chapter_num = 1
        for chapter in chapters:
            chapter_title, chapter_text = self.extract_chapter_title(chapter)
            data["text"]["chapters"].append({
                     "chapter_number": chapter_num,
                     "chapter_title": chapter_title,
                     "number_of_paragraphs": self.count_paragraphs(chapter_text),
                     "chapter_sentiment": self.get_chapter_sentiment(chapter_text),
                     "paragraphs": [ ]
            })
            
            paragraphs = self.get_paragraphs(chapter_text)
            paragraph_num = 1
            for par in paragraphs:
                data["text"]["chapters"][chapter_num-1]["paragraphs"].append({
                         "number_of_sentences": self.count_sentences(par),
                         "average_sentence_len": self.average_sentence_len(par),
                         "paragraph_sentiment": self.get_paragraph_sentiment(par),
                         "sentences": [ ]
                })
                
                sentences = self.get_sentences(par)
                for sent in sentences:
                    if len(self.get_words(sent)) < 1:
                        continue
                    data["text"]["chapters"][chapter_num-1]["paragraphs"][paragraph_num-1]["sentences"].append({
                             "sentence": self.get_words(sent),
                             "average_word_len": self.average_word_len(sent),
                             "num_of_words": self.count_words(sent),
                             "sentence_sentiment_score": self.count_sentiment(sent)
                     })
                paragraph_num +=1
            chapter_num += 1
        return data
  
    def save_structure(self, filename="data.json"):
        """
        The function to save the created book structure into a JSON file.
        """
        with open(filename, "w") as file:
            json.dump(self.create_structure(), file)

# Parse and save texts

In [59]:
# Make sure the output folder exists; otherwise saving the first book fails
os.makedirs('parsed_data', exist_ok=True)

# Process every book and store its structure in a JSON file named after the title
for name, text in zip(df.Name, df.Text):
    book = Book(name, text)
    book.save_structure('parsed_data/' + name + '.json')
    print(name + ' - saved')

Colfer Eoin. Artemis Fowl and the Atlantis Complex - saved
Colfer Eoin. Artemis Fowl: The Time Paradox - saved
Colfer Eoin. Artemis Fowl. The Lost Colony - saved
Colfer Eoin. Artemis Fowl. The Arctic Incident - saved
Colfer Eoin. Artemis Fowl. The Opal Deception - saved
Colfer Eoin. Artemis Fowl - saved
Colfer Eoin. Artemis Fowl: The Eternity Code - saved


As a result of this step, all seven books are processed and the results are saved as JSON files in the folder _parsed_data/_, with each file named after its book title.

# An example of how data is represented

In [69]:
# Build a trimmed-down example of the structure: only the first chapter,
# its first paragraph and the first sentence of that paragraph which still
# contains words after filtering. Chapter and paragraph sentiment are left out.
book = Book(df.Name[3], df.Text[3])

data = {
    "title": book.name,
    "text": {
        "number_of_chapters": book.count_chapters(),
        "chapters": []
    }
}

chapters = book.get_chapters()
if chapters:
    chapter_title, chapter_text = book.extract_chapter_title(chapters[0])
    chapter_entry = {
        "chapter_number": 1,
        "chapter_title": chapter_title,
        "number_of_paragraphs": book.count_paragraphs(chapter_text),
        "paragraphs": []
    }
    data["text"]["chapters"].append(chapter_entry)

    paragraphs = book.get_paragraphs(chapter_text)
    if paragraphs:
        par = paragraphs[0]
        paragraph_entry = {
            "number_of_sentences": book.count_sentences(par),
            "average_sentence_len": book.average_sentence_len(par),
            "sentences": []
        }
        chapter_entry["paragraphs"].append(paragraph_entry)

        for sent in book.get_sentences(par):
            # Skip sentences that consist of stop-words/punctuation only
            if len(book.get_words(sent)) < 1:
                continue
            paragraph_entry["sentences"].append({
                "sentence": book.get_words(sent),
                "average_word_len": book.average_word_len(sent),
                "num_of_words": book.count_words(sent),
                "sentence_sentiment_score": book.count_sentiment(sent)
            })
            # One sentence is enough for the example
            break

In [70]:
# Pretty-print the example structure as indented JSON
rendered = json.dumps(data, indent=6)
print(rendered)

{
      "title": "Colfer Eoin. Artemis Fowl. The Arctic Incident",
      "text": {
            "number_of_chapters": 14,
            "chapters": [
                  {
                        "chapter_number": 1,
                        "chapter_title": "Family Ties",
                        "number_of_paragraphs": 64,
                        "paragraphs": [
                              {
                                    "number_of_sentences": 4,
                                    "average_sentence_len": 76.0,
                                    "sentences": [
                                          {
                                                "sentence": [
                                                      "loss",
                                                      "husband",
                                                      "profound",
                                                      "effect",
                                                      "angeline",
