In [1]:
#
# Copyright © 2019 Sunho Kim. All rights reserved.
#

In [2]:
cd ..

/home/sunho/Documents/dev/gorani-reader/backend/dataserver


In [3]:
from pyspark.sql import SparkSession, DataFrame

# Obtain the Spark session for this notebook (getOrCreate returns an existing
# session when one is already active) and keep a handle on its SparkContext
# for the RDD-based steps below.
spark = (
    SparkSession.builder
    .appName('Create Book')
    .getOrCreate()
)
sc = spark.sparkContext

In [4]:
# parameters
# Directory whose immediate children are scanned for '.epub' files to process.
lib_path = './in'

In [8]:
from os import listdir
from os.path import isfile, join
# Paths of all regular files directly inside lib_path whose name ends in
# '.epub' (sub-directories are not descended into).
files = list(filter(
    isfile,
    (join(lib_path, name) for name in listdir(lib_path) if name.endswith('.epub')),
))
print(files)

['./in/1.epub']


In [12]:
import pyspark.sql.functions as F
from gorani.spark import write_data, read_data_all
from ebooklib import epub
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

def get_id(file):
    """Extract the numeric book id embedded in an EPUB file name.

    The id is the run of digits that immediately precedes the ``.epub``
    extension, e.g. ``'./in/12.epub'`` -> ``12`` and
    ``'./in/book42.epub'`` -> ``42``.

    Args:
        file: Path (or bare file name) of the EPUB file.

    Returns:
        The id as an ``int``.

    Raises:
        ValueError: If no digits directly precede ``.epub`` in ``file``.
    """
    import re
    # Raw string: '\.' inside a plain string literal is an invalid escape.
    # search() replaces match() with a leading '.+?': that prefix needed at
    # least one character before the digits, so a bare '12.epub' lost its
    # first digit and was parsed as 2.
    match = re.search(r'([0-9]+)\.epub', file)
    if match is None:
        raise ValueError('cannot extract a numeric book id from %r' % (file,))
    return int(match.group(1))

def clean_html(html):
    """Reduce an HTML document to its text content, one chunk per line.

    ``<script>`` and ``<style>`` elements are removed before the text is
    extracted. The text is then split into lines, each line is split on
    double spaces, every piece is stripped, and empty pieces are dropped.

    Args:
        html: HTML markup to clean.

    Returns:
        The remaining non-empty text chunks joined with ``'\\n'``.
    """
    from bs4 import BeautifulSoup
    # Name the parser explicitly. Without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns about it), so the same
    # document could be parsed differently from one machine to the next.
    # 'html.parser' ships with Python and is therefore always available.
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup(['script', 'style']):
        tag.extract()

    text = soup.get_text()
    lines = (line.strip() for line in text.splitlines())
    # Two consecutive spaces are treated as a separator between phrases.
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    return '\n'.join(chunk for chunk in chunks if chunk)

def convert_to_book_words(df, id):
    """Build the per-book word-frequency rows and hand them to write_data.

    Args:
        df: DataFrame with a ``word`` column, one row per occurrence.
        id: Book id stored in every output row as ``book_id``.

    The resulting frame has the columns ``word``, ``n`` (number of rows
    with that word) and ``book_id``; it is passed to ``write_data`` under
    the name ``'book_words'``.
    """
    counts = df.groupBy('word').count()
    book_words = counts.select('word', F.col('count').alias('n'))
    book_words = book_words.withColumn('book_id', F.lit(id))
    write_data('book_words', book_words)

def covert_to_book(df, title, id):
    """Collapse a book's words into a single row and hand it to write_data.

    (The name is a misspelling of "convert"; it is kept because existing
    callers use it.)

    Args:
        df: DataFrame with a ``word`` column.
        title: Book title, stored in the ``name`` column.
        id: Book id, stored in the ``id`` column.

    The resulting one-row frame has the columns ``content`` (list of all
    values of ``word``), ``id`` and ``name``; it is passed to ``write_data``
    under the name ``'books'``.

    NOTE(review): ``collect_list`` does not guarantee element order after a
    shuffle, so ``content`` is not necessarily in reading order -- confirm
    whether consumers rely on the order.
    """
    book_row = df.agg(F.collect_list('word').alias('content'))
    book_row = book_row.withColumn('id', F.lit(id))
    book_row = book_row.withColumn('name', F.lit(title))
    write_data('books', book_row)

# Loop-invariant setup: none of these depend on the book being processed,
# so build them once instead of once per file.
lemmatizer = WordNetLemmatizer()
engstopwords = set(stopwords.words('english'))
# Dictionary used to keep only known words.
words = read_data_all(spark, 'words', cache=True)

for file in files:
    book = epub.read_epub(file)

    # EPUB items -> HTML documents -> plain text -> normalised word tokens.
    # Lower-case BEFORE lemmatising: the WordNet lookup is case-sensitive, so
    # a capitalised token such as 'Dogs' would otherwise be left unreduced.
    # toDF('string') yields a single string column, addressed as 'value' below.
    df = (
        sc.parallelize(book.items)
        .filter(lambda item: isinstance(item, epub.EpubHtml))
        .map(lambda item: item.get_content())
        .map(clean_html)
        .flatMap(word_tokenize)
        .map(lambda s: s.lower())
        .map(lambda s: lemmatizer.lemmatize(s))
        .filter(lambda s: s not in engstopwords)
        .toDF('string')
    )

    # Filter out the words not in the dictionary. Cached because the result
    # is consumed twice (word counts and book content).
    sig_df = df.join(words, df['value'] == words['word'], 'inner')\
        .select('word').cache()

    id = get_id(file)
    convert_to_book_words(sig_df, id)
    covert_to_book(sig_df, book.title, id)

    # Release this book's cached rows so they do not pile up across files.
    # NOTE(review): assumes write_data has finished with sig_df when it returns.
    sig_df.unpersist()

print('success')

success
