# Preprocessing

In [25]:
import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

import gensim
from gensim import corpora
from gensim import similarities
from gensim import models
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import re
import os
from os import path

from num2words import num2words
import shutil

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [26]:
# Load every raw ".txt" book found in books_directory into memory, keyed by
# the file name with the ".txt" part removed.
books_directory = "Data/Train/raw"
book_texts = {}

for filename in os.listdir(books_directory):
    if not filename.endswith('.txt'):
        continue
    book_path = os.path.join(books_directory, filename)
    # errors='ignore' drops bytes that are not valid UTF-8 instead of raising.
    with open(book_path, "r", encoding="utf8", errors='ignore') as file:
        book_text = file.read()
    book_texts[filename.replace('.txt','')] = book_text

In [17]:
# book_texts['23609']

In [27]:
def printRoman(number):
    """Return the Roman-numeral spelling of a non-negative integer.

    Args:
        number: Integer to convert. ``0`` yields an empty string; values
            above 3999 simply repeat ``"M"`` as often as needed.

    Returns:
        The numeral as an upper-case string, e.g. ``printRoman(14) == "XIV"``.

    Raises:
        ValueError: If ``number`` is negative. (The previous loop-based
            version never terminated for negative input.)
    """
    if number < 0:
        raise ValueError("printRoman() expects a non-negative integer")

    # Values in descending order, including the subtractive pairs (CM, CD, ...),
    # so a single greedy pass produces the canonical spelling.
    values = (1000, 900, 500, 400, 100, 90, 50, 40, 10, 9, 5, 4, 1)
    symbols = ("M", "CM", "D", "CD", "C", "XC", "L", "XL", "X", "IX", "V", "IV", "I")

    roman_num = ''
    for value, symbol in zip(values, symbols):
        count, number = divmod(number, value)
        roman_num += symbol * count
    return roman_num

In [28]:
# Build the regular expressions used to locate chapter headings.
#
# Each list below holds one heading style (e.g. "\nChapter 7", "\n\nBOOK VII")
# spelled out for the numbers 1..99; `header` is the alternation of all of them.
# `regex_list` is a looser pattern (any all-capitals line, or a generic
# "Chapter N" / "Book N" heading) that chapIndexesbyCapWord() searches with.
num = []
num_word = []
cap_roman = []
roman_num = []
roman_book = []
cap_roman_book = []
prop_roman = []
num_only = []
roman_fullstop = []
roman_only = []
roman_chap_fullstop = []
num_fullstop = []
roman_only_space = []
roman_short = []
# The second pattern is a raw string so that "\d" reaches the regex engine as
# written instead of being an invalid string escape (SyntaxWarning on 3.12+).
regex = ['\n[A-Z ]+\n', r'Chapter \d+|CHAPTER \d+|Chapters \d+|CHAPTER [IVXLCDMivxlcdm]+|Chapter [IVXLCDMivxlcdm]+|Book [IVXLC]+|BOOK [IVXLC]+']
regex_list = "|".join(regex)

# NOTE(review): the "." in 'PROP. ', 'CHAPTER. ' and 'CHAP.' is an unescaped
# regex wildcard; it is left as-is so the set of matches does not change.
for i in range(1, 100):
    arabic = str(i)
    roman = printRoman(i)  # computed once per number instead of nine times

    num.append('\nChapter ' + arabic)
    num_word.append('\n\nChapter ' + num2words(i).capitalize())
    roman_num.append('\n\nChapter ' + roman)
    cap_roman.append('\nCHAPTER ' + roman)
    roman_book.append('\n\nBook ' + roman)
    cap_roman_book.append('\n\nBOOK ' + roman)
    prop_roman.append('\n\nPROP. ' + roman + '[.]')
    num_only.append('\n' + arabic + '\n\n')
    roman_fullstop.append('\n\n' + roman + '[.]')
    roman_only.append('\n\n' + roman + '\n')
    roman_chap_fullstop.append('\nCHAPTER. ' + roman + '[.]')
    roman_only_space.append('\n\n  ' + roman + '\n')
    num_fullstop.append('\n\n' + arabic + '[.]\n')
    roman_short.append('  CHAP.   ' + roman + '[.]\n')

# The concatenation order fixes the order of alternatives inside `header`.
header_list = (
    num + num_word + cap_roman + roman_num + roman_book + cap_roman_book
    + prop_roman + num_only + roman_fullstop + roman_only
    + roman_chap_fullstop + num_fullstop + roman_only_space + roman_short
)
header = "|".join(header_list)

In [29]:
# Sanitized chunk titles that saveChapters() skips instead of writing to disk.
remove_list = ['CONTENTS', 'APPENDIX', 'INDEMNITY', 'PREFACE', 'DEFINITIONS', 'CHAPTER', 'BY', 'ILLUSTRATIONS',
               'INTRODUCTORY', 'COMPRISING', 'OF', 'MATERIALS', 'STORIES']

# start = "*** START OF THIS PROJECT GUTENBERG EBOOK"
# start2 = "***START OF THE PROJECT GUTENBERG EBOOK"

# Per-book marker for where the body text begins, keyed by book id (the raw
# file name without ".txt"). It is passed to remove_end(), which discards
# everything before the first occurrence of the marker.
start_line = {'61': '*** START OF THIS PROJECT GUTENBERG EBOOK THE COMMUNIST MANIFESTO ***',
             '448': '\n\n\nTHE PSYCHOLOGY OF REVOLUTION',
             '852': '*** START OF THIS PROJECT GUTENBERG EBOOK DEMOCRACY AND EDUCATION ***',
             '926': '\n\n\n_Ten Thousand Dreams',
             '1497': '\n\n\nTHE REPUBLIC',
             '1656': '\n\n\nAPOLOGY',
             '1974': '\n\n\nTHE POETICS OF ARISTOTLE',
             '1998': '\n\n\nTHUS SPAKE ZARATHUSTRA',
             '2529': '\n\n\nTHE ANALYSIS OF MIND',
             '3800': '\n\n\nThe Ethics',
             '4280': '\n\n\nTHE CRITIQUE OF PURE REASON',
             '4339': '\n\n\nNERVES AND COMMON SENSE',
             '4363': '\n\n\nBEYOND GOOD AND EVIL',
             '5827': '\n\n\nTHE PROBLEMS OF PHILOSOPHY',
             '6762': '\n\n\nA TREATISE ON GOVERNMENT',
             '7370': '\n\n\nSECOND TREATISE OF GOVERNMENT by JOHN LOCKE',
             '11224': '\n\n\nUTILITARIANISM',
             '13614': '\n\n\nSTUDIES IN THE PSYCHOLOGY OF SEX, VOLUME V',
             '13722': '\n\n\nYOUTH AND SEX',
             '13791': '\n\n\nApplied Psychology',
             '14969': '\n\n\nNERVOUS AND MENTAL DISEASE MONOGRAPH SERIES NO. 7',
             '15489': '\n\n\nDREAM PSYCHOLOGY',
             '16287': '\n\n\n#TALKS TO TEACHERS#',
             '17829': '\n\n\nAPPLIED PSYCHOLOGY',
             '19322': '\n\n\nTHE ANTICHRIST',
             '19560': '\n\n\n  APPLIED EUGENICS',
             '20842': '\n\n\nDREAMS',
             '21077': '\n\n\n\nInternational Scientific Series.',
             '22108': '\n\n\n\n\n\nTHE',
             '23609': '*** START OF THIS PROJECT GUTENBERG EBOOK SEARCHLIGHTS ON HEALTH: ***',
             '23680': '*** START OF THIS PROJECT GUTENBERG EBOOK SEX ***',
             '23860': '***START OF THE PROJECT GUTENBERG EBOOK THE YOUNG MAN\'S GUIDE***',
             '24001': '\n\n\n\nTHE PHYSICAL LIFE OF WOMAN:',
             '24518': '*** START OF THIS PROJECT GUTENBERG EBOOK EXTRAORDINARY POPULAR DELUSIONS ***',
             '26117': '*** START OF THIS PROJECT GUTENBERG EBOOK SKETCHES OF THE FAIR SEX ***',
             '28402': '*** START OF THIS PROJECT GUTENBERG EBOOK THE SEXUAL LIFE OF THE CHILD ***',
             '28458': '*** START OF THIS PROJECT GUTENBERG EBOOK WHAT A YOUNG WOMAN OUGHT TO KNOW ***',
             '31529': '*** START OF THIS PROJECT GUTENBERG EBOOK MODERN MARRIAGE AND HOW TO BEAR IT ***',
             '31671': '*** START OF THIS PROJECT GUTENBERG EBOOK PLAIN TALKS ON AVOIDED SUBJECTS ***',
             '31732': '*** START OF THIS PROJECT GUTENBERG EBOOK THE SEX SIDE OF LIFE ***',
             '34901': '*** START OF THIS PROJECT GUTENBERG EBOOK ON LIBERTY ***',
             '35534': '*** START OF THIS PROJECT GUTENBERG EBOOK HOW TO BE HAPPY THOUGH MARRIED ***',
             '53453': '*** START OF THIS PROJECT GUTENBERG EBOOK INSTINCTS OF THE HERD ***',
             '65145': '***START OF THE PROJECT GUTENBERG EBOOK ESSAYS OF A BIOLOGIST***',
             'analmd10': '\n\n\nTHE ANALYSIS OF MIND',
             'poetc10': '\n\n\nTHE POETICS OF ARISTOTLE',
             'prphi10': '*** START OF THE PROJECT GUTENBERG EBOOK, THE PROBLEMS OF PHILOSOPHY ***',
             'repub11': '\n\n\n\n\nTHE REPUBLIC',
             'spzar10': '\n\n\nFRIEDRICH NIETZSCHE',
             'tgovt10': '*** START OF THE PROJECT GUTENBERG EBOOK, POLITICS ***',
             'trgov10': '*** START OF THE PROJECT GUTENBERG EBOOK, TWO TREATISES OF GOVERNMENT ***',
             'umem10': '*** START OF THE PROJECT GUTENBERG EBOOK, UNCONSCIOUS MEMORY ***',
             'wltnt10': '\n\n\nAN INQUIRY INTO THE NATURE AND CAUSES OF'
}


# Per-book closing passage, keyed by book id (the raw file name without
# ".txt"). It is passed to remove_end() as the text after which the remainder
# of the file is meant to be dropped.
last_line = {'61': 'WORKING MEN OF ALL COUNTRIES, UNITE!',
             '448': 'are conditioned by eternal laws.',
             '852': 'in learning from all the contacts of life is the essential moral\ninterest.',
             '926': 'You will also gain knowledge by travel and sojourn in foreign countries.',
             '1497': 'been describing.',
             '1656': 'to live. Which is better God only knows.',
             '1974': 'the critics and the answers to these objections.',
             '1998': 'ripe. MY day beginneth: ARISE NOW, ARISE, THOU GREAT NOONDAY!"',
             '2529': 'is nearer to what actually exists.',
             '3800': 'neglected? But all things excellent are as difficult as they are\nrare.',
             '4280': 'her ardent desire for knowledge.',
             '4339': 'dependence upon the Lord God Almighty.',
             '4363': 'And Light and Dark were one that wedding-morn.',
             '5827': 'which constitutes its highest good.',
             '6762': 'three boundaries of education, moderation, possibility, and decorum.',
             '7370': 'place it in new hands, as they think good.',
             '11224': 'character of its sanctions.',
             '13614': 'degradation. They had been but the irritations of convalescence.',
             '13722': 'this is a doctor\'s business, not mine.',
             '13791': 'you how to use it with confidence and with the positive assurance of\nsuccess._',
             '14969': 'special intensive somatic sexual manifestation of former years.',
             '15489': 'that past by the indestructible wish.',
             '16287': 'let live, would come into the world!',
             '17829': 'scientific system for success-achievement that will be unfolded in\nsubsequent volumes.',
             '19322': 'last?_--_From today?_--The transvaluation of all values!...',
             '19560': 'factors affects an indefinitely large number of characters.',
             '20842': '"psycho-analysis" was far from having the development that it has\nto-day. (H. B.)',
             '21077': 'the work of the consciousness.',
             '22108': 'disease and death can no longer claim us.',
             '23609': 'affectionate, nor will she trouble her husband with many of her trials or\ndifficulties.',
             '23680': 'element in the human soul.',
             '23860': 'enough to speak of ourselves when we are compelled to do it in our own\ndefence.',
             '24001': 'impressions and affectionate reminiscences.',
             '24518': 'Which Phaeton so rashly brake.',
             '26117': 'A joy to gladness all divine.',
             '28402': 'sexual and general education.',
             '28458': 'worthy, hopeth all things, believeth all things, endureth all things,\nand never faileth.',
             '31529': 'society, and no society as sweet as matrimony!\'',
             '31671': 'is life itself and from whom all life constantly emanates.',
             '31732': 'and perfect health, if we are going to be happy ourselves or make any\none else happy.',
             '34901': 'it has preferred to banish.',
             '35534': 'In that great love\'s great overflow."',
             '53453': 'feet have been upborne, so many dry lips refreshed.',
             '65145': 'made to co-operate.',
             'analmd10': 'psychology is nearer to what actually exists.',
             'poetc10': 'the critics and the answers to these objections.',
             'prphi10': 'universe which constitutes its highest good.',
             'repub11': 'this life and in the pilgrimage of a thousand years which we have been\ndescribing.',
             'spzar10': 'ARISE, THOU GREAT NOONDAY!"',
             'tgovt10': 'moderation, possibility, and decorum.',
             'trgov10': 'hands, as they think good.',
             'umem10': '{184d}  Encyclopaedia Britannica, 9th ed., p. 750.',
             'wltnt10': 'accommodate her future views and designs to the real mediocrity\nof her circumstances.'
            }
             

In [30]:
def remove_end(text, start_line, last_line):
    """Trim ``text`` to the span from ``start_line`` through ``last_line``.

    Args:
        text: Full text of a book.
        start_line: Marker for the beginning of the body. Everything before
            its first occurrence is dropped. If it is empty or not found,
            the head of the text is left untouched.
        last_line: Closing passage of the body. Everything after its first
            occurrence (searched for *after* the start marker) is dropped.
            If it is empty or not found, the tail is left untouched.

    Returns:
        The trimmed text; the markers themselves are kept.
    """
    start = text.find(start_line) if start_line else -1
    if start > 0:
        text = text[start:]

    if last_line:
        # Search the already-trimmed text: an offset taken from the untrimmed
        # text would be too large by `start` characters and would leave
        # trailing matter behind the closing passage.
        end = text.find(last_line)
        if end >= 0:
            text = text[:end + len(last_line)]
    return text

def chapIndexes(text, header):
    """Return the start offsets of chapter headings found in ``text``.

    ``header`` is a regular expression. A match is kept only when the next
    match starts more than 1500 characters later; the final match is always
    kept. When ``header`` matches nothing at all, the result of
    chapIndexesbyCapWord(text) is returned instead.
    """
    starts = [found.start() for found in re.finditer(header, text)]

    if len(starts) == 1:
        kept = [starts[0]]
    elif len(starts) > 1:
        # Pair every match with its successor and keep the well-separated ones.
        kept = [cur for cur, nxt in zip(starts, starts[1:]) if nxt - cur > 1500]
        kept.append(starts[-1])
    else:
        kept = []

    if not kept:
        kept = chapIndexesbyCapWord(text)

    return kept

def chapIndexesbyCapWord(text):
    """Return heading offsets in ``text`` using the module-level ``regex_list``.

    With exactly one match, that offset is returned. With several matches,
    a match is kept only when the next match starts more than 1500
    characters later; unlike chapIndexes(), the final match is not added.
    """
    starts = [found.start() for found in re.finditer(regex_list, text)]

    if len(starts) == 1:
        return [starts[0]]

    # Zero matches fall through here as well and yield an empty list.
    return [cur for cur, nxt in zip(starts, starts[1:]) if nxt - cur > 1500]

def splitbyChapters(text, chap_index):
    """Cut ``text`` into chapters at the offsets listed in ``chap_index``.

    Args:
        text: The text to split.
        chap_index: Ascending start offsets of the chapters within ``text``.

    Returns:
        A list with one string per offset, each running from its offset to
        the next one; the last chapter runs to the end of ``text``.
        When ``chap_index`` is empty the original ``text`` string is returned
        unchanged (callers distinguish the two cases by type).
    """
    if not chap_index:
        return text

    # Closing the last span with len(text) keeps the final chapter, which the
    # earlier pairwise-only loop silently dropped whenever there was more
    # than one offset.
    bounds = list(chap_index) + [len(text)]
    return [text[begin:stop] for begin, stop in zip(bounds, bounds[1:])]

def directory(input_name):
    """Create ``input_name`` as an empty directory and return its path.

    Any existing directory tree at that path is deleted first.
    """
    target = input_name
    already_there = os.path.exists(target)
    if already_there:
        # Start from a clean slate: remove the old tree and all its contents.
        shutil.rmtree(target)
    os.makedirs(target)
    return target

def saveChapters(dir,split_text, id):
    """Write each chapter chunk to ``<dir>/<id>/<title>.txt``.

    The file name is taken from the first non-empty line of the chunk, cut at
    the first ".", with spaces turned into underscores and the characters
    ``: ? "`` and NUL removed. Chunks whose resulting name is listed in the
    module-level ``remove_list`` are not written.

    Args:
        dir: Output root; created (with parents) when missing.
        split_text: Iterable of chapter strings.
        id: Book id, used as the sub-folder name.

    Returns:
        The list of file names (without ".txt") that were written, in order.
    """
    folder = os.path.join(dir, id)
    # exist_ok lets the cell be re-run without failing on an existing folder.
    os.makedirs(folder, exist_ok=True)

    chapter_list = []
    for chunk in split_text:
        lines = [line for line in chunk.split("\n") if line != '']
        if not lines:
            # Empty or newline-only chunk: there is no title line to name it by.
            continue

        title = lines[0].split('.')[0]
        form_name = title.replace(" ", "_").replace(":", "").replace("?", "").replace('"', "").replace('\x00', "")
        form_name = form_name.strip()
        if form_name in remove_list:
            continue

        # os.path.join keeps absolute output roots working; prefixing the
        # path with "./" turned them into broken relative paths.
        with open(os.path.join(folder, f'{form_name}.txt'), "w", encoding='utf-8') as f:
            f.write(chunk)
        chapter_list.append(form_name)
    return chapter_list

def savecleanBooks(dir, text, id):
    """Write ``text`` to ``<dir>/<id>.txt`` as UTF-8.

    Args:
        dir: Output directory; created (with parents) when missing.
        text: Cleaned book text to store.
        id: Book id, used as the file name without extension.
    """
    os.makedirs(dir, exist_ok=True)
    # os.path.join keeps absolute directories working; prefixing the path
    # with "./" turned them into broken relative paths.
    with open(os.path.join(dir, f'{id}.txt'), "w", encoding='utf-8') as f:
        f.write(text)

In [31]:
exception_list = ['23609', '23680', '24001', '24518', '31732'] # these books cannot be split into chapters

# Clean every loaded book, store the cleaned text, and (unless the book is
# listed in exception_list) split it into chapters and store those as well.
for book_id, text in book_texts.items():
    cleaned_text = remove_end(text, start_line[book_id], last_line[book_id])
    savecleanBooks("Data/Train/Train_Cleaned", cleaned_text, book_id)
    if book_id in exception_list:
        continue

    # Offsets must come from the same string that gets sliced. Computing them
    # on the raw text shifted every chapter boundary by the length of the
    # front matter that remove_end() had already cut away.
    chap_index = chapIndexes(cleaned_text, header)
    split_text = splitbyChapters(cleaned_text, chap_index)
    # splitbyChapters returns the plain string when no chapter was found.
    if isinstance(split_text, list) and len(split_text) > 0:
        saveChapters('Data/Train/Train_Chapters', split_text, book_id)

In [88]:
# to split one book

# id = '1656'
# text = book_texts[id]
# cleaned_text = remove_end(text, start_line[id], last_line[id])
# savecleanBooks("Data/Train/Train_Cleaned", cleaned_text, id)
# chap_index = chapIndexes(text, header)
# split_text = splitbyChapters(cleaned_text, chap_index)
# if ((type(split_text) == list) & (len(split_text)>0)):
#     saveChapters('Data/Train/Train_Chapters', split_text, id)

# PDF Extraction

## PDF Reader

In [7]:
from PyPDF2 import PdfReader, PdfWriter
def extract_text_from_pdf(pdf_file):
    """Return the extracted text of every page of ``pdf_file`` as a list.

    The list has one entry per page, in page order, each being whatever
    ``page.extract_text()`` returns for that page.
    """
    with open(pdf_file, 'rb') as handle:
        reader = PdfReader(handle, strict=False)
        # Extraction has to finish while the file is still open.
        return [page.extract_text() for page in reader.pages]

In [8]:
# Extract the per-page text of the sample PDF and show the first 50 pages.
book = 'Data/PDF_Test/Plain Talks on Avoided Subjects, by Henry Newell Guernsey.pdf'
extracted_text = extract_text_from_pdf(book)
extracted_text[:50]

['The Project Gutenberg eBook of Plain Talks on Avoided Subjects\nThis ebook is for the use of anyone anywhere in the United States and most other parts\nof the world at no cost and with almost no restrictions whatsoever . You may copy it,\ngive it away or re-use it under the terms of the Project Gutenber g License included with\nthis ebook or online at www .gutenber g.org. If you are not located in the United States,\nyou’ll have to check the laws of the country where you are located before using this\neBook.\nTitle: Plain Talks on Avoided Subjects\nAuthor : Henry N. Guernsey\nRelease date : March 17, 2010 [eBook #31671]\nMost recently updated: January 6, 2021\nLanguage : English\nCredits : Produced by Jana Srna and the Online Distributed\nProofreading Team at https://www .pgdp.net (This book was\nproduced from scanned images of public domain material\nfrom the Google Print project.)\n*** ST ART OF THE PROJECT  GUTENBERG EBOOK PLAIN TALKS ON AVOIDED\nSUBJECTS ***\nPLAIN T ALKS\nON\nA\

In [14]:
# Find chapter offsets page by page, then split the concatenated PDF text.
raw = ""
current_length = 0
chap_indexes = []

# NOTE(review): header2 is not defined in the visible part of this notebook;
# confirm that an earlier cell defines it.
for text in extracted_text:
    # chapIndexes() returns offsets within this page; adding the length of all
    # previous pages turns them into offsets within the concatenated text.
    chap_indexes.extend(current_length + index for index in chapIndexes(text, header2))
    current_length += len(text)

raw = "".join(extracted_text)

# Named pdf_last_line so that the module-level `last_line` dict used by the
# training-set cell is not overwritten with a string.
# pdf_last_line = "the long pilgrimage of war so many weary feet have been upborne, so many dry\nlips refreshed."
pdf_last_line = 'in from the Lord who is life itself and from whom all life constantly emanates.'
# remove_end() takes (text, start_line, last_line); an empty start marker
# leaves the head of the text untouched.
cleaned_text = remove_end(raw, '', pdf_last_line)
split_text = splitbyChapters(cleaned_text, chap_indexes)
if isinstance(split_text, list) and len(split_text) > 1:
    # saveChapters() takes (dir, split_text, id).
    # NOTE(review): the output root is assumed from the legacy default path
    # mentioned in saveChapters(); confirm the intended directory.
    saveChapters('Data/Train_Chapters', split_text, 'book2')

In [15]:
# Number of chapter start offsets collected across all PDF pages.
len(chap_indexes)

18

In [16]:
# Display the chapter chunks produced from the PDF text.
split_text

['CHAPTER X.O\x00\x00\x00\x00\x00 \x00\x00 \x00\x00\x00 S\x00\x00, 117I',
 "CHAPTER I.\nI\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00.\nN the creation of the world and all that therein is, we should consider it an axiom that\n“Everything was created for use.” All individual substances, or beings, that come to our\nnotice bear certain relations to one another, have connection one with another , and are\ndependent upon and useful to each other; and nothing could possibly exist or subsist without\nthis co-rel ation: connection with and use to each other . This is a law which needs only a little\nreflection to be accepted as a truth in every particular—in the greatest as well as in the least\ncreated form. This is more plainly seen in the animal kingdom than in the mineral or\nvegetable, because its members associa te and finally become conjoined in pairs. Man and\nwoman, who represent the crown and glory of all created beings, in whom are embodied all\nthe lower orders, were and are still

In [17]:
# Take the first 20 characters at the 15th collected chapter offset and show
# the file name that the chapter-naming rules would derive from them.
index = chap_indexes[14]
i = raw[index:index+20]

name = [x for x in i.split("\n") if x != '']
# One translation pass: "." and " " become "_"; ":", "?", '"' and NUL are removed.
_name_table = str.maketrans({".": "_", " ": "_", ":": None, "?": None, '"': None, '\x00': None})
form_name = name[0].translate(_name_table)
form_name

'42_'

## Convert to HTML file

In [19]:
from bs4 import BeautifulSoup

# Read the HTML file. The context manager closes the handle, which the plain
# open() call never did. The document declares charset=utf-8 in its <head>,
# so it is decoded as UTF-8 explicitly; undecodable bytes are still dropped.
with open("Data/PDF_Test/Instincts of the Herd in Peace and War, by W. Trotter.html", 'r',
          encoding='utf-8', errors='ignore') as HTMLFile:
    index = HTMLFile.read()

# Creating a BeautifulSoup object and specifying the parser
Parse = BeautifulSoup(index, 'lxml')

# Printing the <head> and the first <h1>, <h2>, <h3> and <li> elements
print(Parse.head)
print(Parse.h1)
print(Parse.h2)
print(Parse.h3)
print(Parse.li)

<head><meta content="text/html; charset=utf-8" http-equiv="Content-Type"/><title>file_1679724103427</title><style type="text/css"> * {margin:0; padding:0; text-indent:0; }
 .s1 { color: black; font-family:"Times New Roman", serif; font-style: normal; font-weight: normal; text-decoration: none; font-size: 17pt; }
 .s2 { color: black; font-family:"Times New Roman", serif; font-style: normal; font-weight: normal; text-decoration: none; font-size: 12pt; }
 .s3 { color: #2100CC; font-family:"Times New Roman", serif; font-style: normal; font-weight: normal; text-decoration: underline; font-size: 12pt; }
 .s4 { color: #2100CC; font-family:"Times New Roman", serif; font-style: normal; font-weight: normal; text-decoration: none; font-size: 12pt; }
 .a { color: black; font-family:"Times New Roman", serif; font-style: normal; font-weight: normal; text-decoration: none; font-size: 12pt; }
 .s5 { color: #DD2400; font-family:"Times New Roman", serif; font-style: normal; font-weight: normal; text-dec