# Fetch Project Gutenberg books from a category

## Step 1, get book ids

- go to http://m.gutenberg.org/ebooks/search.mobile/?query=Erotic+%21+bsxErotic&sort_order=downloads

- scroll to the bottom and click "show more" a few times
- paste the JavaScript below into the browser's JS console and run it
- the book ids should now be copied to your clipboard; paste them into the `ids` list below


```js
// Collects the ids of all books currently shown on the search page above.
// Paste this into the browser's JS console while that page is open.
// Elements carrying both the "table" and "link" classes are the book entries.
a_elems = document.getElementsByClassName("table link")
hrefs = Array.from(a_elems)
  .map(e=>e.href) // take each element's link target
  .filter(e=>e) // drop elements without a link
ids = hrefs.map(e=>/(\d+)\.mobile/.exec(e)) // match the numeric id in front of ".mobile"
  .filter(e=>e) // drop links that did not match
  .map(e=>e[1]) // keep only the captured id
copy(ids) // console helper: copy the id list to the clipboard
```

In [None]:
import requests
import os
import re
import bs4
import time
import json
from tqdm import tqdm_notebook as tqdm

dest_dir = 'data/corpus/erotic_gutenberg'

In [None]:
# Project Gutenberg book ids (as strings) to download.
# Paste the list copied from the browser console in step 1 here.
ids = [
  "30254",
  "30360",
  "28520",
  "25305",
  "14005",
  "28522",
  "31284",
  "28521",
  "29827",
  "52059",
  "14323",
  "13610",
  "57284",
  "13972",
  "52205",
  "54672",
  "13614",
  "28718",
  "44877",
  "26804",
  "45150",
  "37491",
  "43438",
  "48943",
  "53807",
  "26456",
  "26808",
  "13971",
  "42406",
  "43823",
  "39220",
  "56779",
  "26809",
  "18610",
  "44181",
  "42212",
  "26806",
  "42586",
  "47892",
  "43822",
  "49855",
  "26562",
  "26739",
  "26807",
  "20568",
  "40877",
  "54419",
  "53944",
  "40557",
  "29049",
  "25543",
  "40902",
  "41301",
  "56491",
  "28789",
  "40496"
]

In [None]:
# from https://github.com/motoom/gutenberg-ebook-scraping/blob/master/gutenberg.py

# Repetitive stuff I don't want to read a 1000 times on my eBook reader.
remove = ["Produced by","End of the Project Gutenberg","End of Project Gutenberg"]

# Header fields that may follow "Title: " without a blank line in between.
# They end the subtitle search instead of being glued onto the title.
_HEADER_PREFIXES = (
    "Author: ",
    "Language: ",
    "Translator: ",
    "Editor: ",
    "Illustrator: ",
    "Release Date: ",
    "Posting Date: ",
)


def _flush_paragraph(paragraph, outlines, extra):
    """Store a finished paragraph in `outlines`, or in `extra` if it is fluff."""
    paragraph = paragraph.strip()
    if not paragraph:
        return
    # str.startswith needs a tuple; an empty `remove` list matches nothing.
    if paragraph.startswith(tuple(remove)):
        extra.append(paragraph)
        return
    outlines.append(paragraph)
    outlines.append("")  # blank separator line after every paragraph


def beautify(text):
    """Reformat a raw Project Gutenberg etext and extract its metadata.

    Lines between the START and END markers are joined into one-line
    paragraphs (blank lines separate paragraphs). Paragraphs starting with
    one of the terms in `remove` are dropped from the content.

    Args:
        text: Full text of the ebook file as a single string.

    Returns:
        A dict with the keys:
            content:  paragraphs joined by newlines, each followed by an
                      empty line.
            title:    value of the "Title: " header plus any subtitle lines
                      that directly follow it, joined with ", ".
            author:   value of the "Author: " header ("" if absent).
            language: value of the "Language: " header ("" if absent).
            extra:    everything that was not kept as content (header lines,
                      marker lines, removed boilerplate paragraphs).

    Missing title or START/END markers are reported with a printed warning
    only; they never raise.
    """
    outlines = []
    extra = []
    title = ""
    author = ""
    language = ""
    collect = False
    lookforsubtitle = False
    startseen = endseen = False
    paragraph = ""

    for line in (raw.strip() for raw in text.split('\n')):
        if line.startswith("Author: "):
            author = line[len("Author: "):]
        if line.startswith("Language: "):
            language = line[len("Language: "):]
        if line.startswith("Title: "):
            title = line[len("Title: "):]
            lookforsubtitle = True
            continue

        is_start = (
            "*** START" in line
            or "***START" in line
            or line.startswith("*END THE SMALL PRINT!")
        )
        is_end = ("*** END" in line) or ("***END" in line)

        if lookforsubtitle:
            # A blank line, another header field or a marker ends the title.
            if (not line or is_start or is_end
                    or line.startswith(_HEADER_PREFIXES)):
                lookforsubtitle = False
            else:
                title += ", " + line.strip(".")

        if is_start:
            collect = startseen = True
            paragraph = ""
            extra.append(line)
            continue
        if is_end:
            # Keep a final paragraph that is not followed by a blank line.
            _flush_paragraph(paragraph, outlines, extra)
            paragraph = ""
            endseen = True
            extra.append(line)
            break
        if not collect:
            extra.append(line)
            continue
        if line:
            paragraph += " " + line
        else:
            _flush_paragraph(paragraph, outlines, extra)
            paragraph = ""

    # No END marker (or no trailing blank line): keep what was collected.
    _flush_paragraph(paragraph, outlines, extra)

    # Report on anomalous situations, but don't make it a showstopper.
    if not title:
        print ("    Problem: No title found\n")
    if not startseen:
        print ("    Problem: No '*** START' seen\n")
    if not endseen:
        print ("    Problem: No '*** END' seen\n")

    return dict(
        content='\n'.join(outlines),
        title=title,
        author=author,
        language=language,
        extra=extra
    )
        


In [None]:
# Download every plain-text file of each book in `ids`, clean it with
# beautify() and store the result as JSON in `dest_dir`.

# The output directory must exist before the first file is written.
if not os.path.isdir(dest_dir):
    os.makedirs(dest_dir)

for bid in tqdm(ids):

    # First download the directory listing of the book.
    index_url = "http://www.gutenberg.org/files/{bid:}".format(bid=bid)
    # A timeout keeps a stalled connection from hanging the whole run.
    r = requests.get(index_url, timeout=60)
    r.raise_for_status()
    soup = bs4.BeautifulSoup(r.content, "html5lib")
    # href=True skips anchors without an href (they would raise KeyError).
    hrefs = [e['href'] for e in soup.find_all('a', href=True)]
    links = [h for h in hrefs if h.endswith('.txt')]

    # Download each text file that has not been saved yet.
    for link in links:
        txt_url = index_url + '/' + link
        # Use only the file name so a link containing a path cannot point
        # outside of (or into a missing sub-directory of) dest_dir.
        stem = os.path.splitext(os.path.basename(link))[0]
        outfile = os.path.join(dest_dir, stem + '.json')
        if os.path.isfile(outfile):
            continue
        r = requests.get(txt_url, timeout=60)
        r.raise_for_status()
        info = beautify(r.text)
        # Keep English books only, and skip files whose body came out empty.
        if info['language'] == 'English' and info['content'].strip():
            with open(outfile, 'w') as f:
                json.dump(info, f)

    time.sleep(0.5) # avoid ddos/ban

# Step 2, turn into CSV, like ROCStories

In [None]:
import uuid
import pandas as pd
import nltk
# nltk.download('punkt')

# Build a ROCStories-like table: every row is a run of consecutive sentences
# taken from one paragraph of a downloaded book.
dest_dir = 'data/corpus/erotic_gutenberg'
max_len = 400   # each sentence is truncated to this many characters
num_sent = 6    # a paragraph needs more than this many sentences to be used
story_len = 5   # sentences stored per row (sentence1 .. sentence5)
# NOTE(review): the number of stories per paragraph is len // num_sent (6)
# while stories are cut at a stride of story_len (5), so the trailing
# sentences of long paragraphs are never used. Kept as-is; confirm intent.
story_cols = ['storyid', 'sentence1', 'sentence2', 'sentence3', 'sentence4', 'sentence5', 'AnswerRightEnding']

data=[]
# sorted(): os.listdir order is arbitrary, and the row order decides the
# train/validation split later on.
for infile in sorted(os.listdir(dest_dir)):
    if not infile.endswith('.json'):
        continue  # ignore anything that is not one of the saved books
    path = os.path.join(dest_dir, infile)
    with open(path) as f:
        info = json.load(f)
    for paragraph in info['content'].split('\n\n'):
        sentances = nltk.sent_tokenize(paragraph)
        if len(sentances) <= num_sent:
            continue
        for i in range(len(sentances)//num_sent):
            start = i * story_len
            story = dict(storyid=uuid.uuid4().hex, AnswerRightEnding=1)
            for k in range(story_len):
                story['sentence%d' % (k + 1)] = sentances[start + k][:max_len]
            data.append(story)

# Passing columns= fixes the column order and also works when data is empty.
df = pd.DataFrame(data, columns=story_cols)
df

In [None]:
# # Test: concat small sentances
# # And split large ones
# sent = []
# for s in sentances:
#     if len(s)>10:
#         sent.append(s)
#     else:
#         sent[-1]+=' '+s
# sent

In [None]:
# Render matplotlib figures inside the notebook.
%matplotlib inline
# Histogram (55 bins) of the character length of the first sentence of each row.
df['sentence1'].str.len().plot.hist(bins=55)
# Length of the longest first sentence; shown as the cell output.
df['sentence1'].str.len().max()

In [None]:
# Sequential split without shuffling: the first 70% of the rows are used
# for training, the remaining rows for validation.
val_idx = int(len(df)*0.7)
df_train, df_val = df.iloc[:val_idx], df.iloc[val_idx:]

In [None]:
# Write both splits as CSV files; the DataFrame index is not written.
train_csv = 'data/erotic_gutenberg_TRAIN.csv'
val_csv = 'data/erotic_gutenberg_VAL.csv'
df_train.to_csv(train_csv, index=False)
df_val.to_csv(val_csv, index=False)

In [None]:
# import csv
# def _rocstories(path):
#     with open(path, encoding='utf_8') as f:
#         f = csv.reader(f)
#         st = []
#         ct1 = []
#         ct2 = []
#         y = []
#         for i, line in enumerate(tqdm(list(f), ncols=80, leave=False)):
#             if i > 0:
#                 s = ' '.join(line[1:5])
#                 c1 = line[5]
#                 c2 = line[6]
#                 st.append(s)
#                 ct1.append(c1)
#                 ct2.append(c2)
#                 y.append(int(line[-1])-1)
#         return st, ct1, ct2, y
    
# _rocstories('data/erotic_gutenberg_TRAIN.csv')