Note: parts of this notebook are based on AO3Scraper by radiolarian (GitHub).

## Import libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import argparse
import time
import os
import csv
import sys
from unidecode import unidecode

import warnings
warnings.simplefilter('ignore')


We ultimately want the following fields for each work:

- author
- title
- word count
- tags
- date published (?)

From the tags we additionally want:

- fandom
- pairing
- genre (a best guess where possible; otherwise left as unknown)

## Functions for extracting tags and stats from the already-downloaded page HTML


In [2]:
# error handling taken from ao3 scraper

def access_denied(soup):
    """Report whether the parsed page is something other than a readable work.

    Args:
        soup: parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        True when the page carries a "flash error" element or has no
        "work meta group" element; False otherwise.

    Side effects:
        Prints a hint when the page has no work metadata and its ``h3.heading``
        reads "Sorry!".
    """
    if soup.find(class_="flash error"):
        return True

    if not soup.find(class_="work meta group"):
        heading = soup.find("h3", class_="heading")
        # Not every page without work metadata has this heading; without the
        # None check the lookup of .text raised AttributeError instead of
        # reporting the page as inaccessible.
        if heading is not None and heading.text == "Sorry!":
            print("This content is restricted to logged in users only")
        return True

    return False

In [3]:
#taken from ao3 scraper
def get_tag_info(category, meta):
    """Return the tag texts listed under one tag category of a work.

    Looks up the ``<dd>`` whose class is ``"<category> tags"`` inside ``meta``
    and collects the text of every element with class ``"tag"`` in it, passed
    through ``unidecode``. If the lookup fails with AttributeError (for
    example because the ``<dd>`` is not present), an empty list is returned.
    """
    wanted_class = str(category) + ' tags'
    try:
        matches = meta.find("dd", class_=wanted_class).find_all(class_="tag")
    except AttributeError:
        return []

    names = []
    for match in matches:
        names.append(unidecode(match.text))
    return names

In [77]:
def get_meta_info(fic_id,soup):
    """Collect the metadata of one work page into a flat dictionary.

    Args:
        fic_id: identifier of the work; stored unchanged under 'fic_id'.
        soup: parsed work page. It is expected to contain a
            ``dl`` with class "work meta group" and a ``div`` with class
            "preface group".

    Returns:
        dict with the keys, in this order: 'fic_id', 'author', 'title',
        'words', 'published', 'status', 'chapters', 'rating', 'category',
        'fandom', 'relationship', 'character', 'warning', 'freeform'.
        - 'author' is a list (a work may have several authors, or none linked).
        - 'title' and the stat fields are strings, or None when the element
          is not on the page.
        - 'status' is 'oneshot' when the page has no status entry,
          'Completed' when the status label says so, otherwise
          'Updated: <date text>'.
        - each tag field is the list returned by get_tag_info.
    """
    #categories of info we want
    work_id_names = ['fic_id']
    headmeta_names = ['author', 'title']
    stat_names = ['words', 'published', 'status', 'chapters']
    tag_names = ['rating', 'category', 'fandom', 'relationship', 'character', 'warning', 'freeform']

    # Pre-seed every field so the key order (and therefore the CSV column
    # order) is fixed. The previous fromkeys(work_id_names, <list>) call
    # produced {'fic_id': [field names]} instead of one key per field.
    meta_dict = dict.fromkeys(work_id_names + headmeta_names + stat_names + tag_names)

    #get the work meta and the heading/preface meta from the soup
    meta = soup.find("dl", class_="work meta group")
    headmeta = soup.find("div", class_="preface group")

    #add the fic id as a field
    meta_dict['fic_id'] = fic_id

    #get the author(s): only the links that are direct children of the byline
    byline = headmeta.find("h3", class_="byline heading")
    if byline is None:
        meta_dict['author'] = []
    else:
        meta_dict['author'] = [link.get_text() for link in byline.find_all("a", recursive=False)]

    #get the title; get_text() is used because .string is None whenever the
    #heading contains more than a single text node
    title_node = headmeta.find("h2", class_="title heading")
    meta_dict['title'] = None if title_node is None else unidecode(title_node.get_text()).strip()

    # get the info for listed stats
    for stat in stat_names:
        value_node = meta.find("dd", class_=stat)
        if stat == 'status':
            label_node = meta.find("dt", class_=stat)
            if label_node is None:
                # no status entry on the page
                meta_dict[stat] = 'oneshot'
                continue
            # Normalise the label before comparing; an exact match against
            # 'Completed' fails on surrounding whitespace or a trailing colon.
            # NOTE(review): the live label is believed to read "Completed:" —
            # confirm against current AO3 markup.
            label = unidecode(label_node.text).strip().rstrip(':')
            if label == 'Completed':
                meta_dict[stat] = 'Completed'
            else:
                date_text = '' if value_node is None else unidecode(value_node.text)
                meta_dict[stat] = 'Updated: ' + date_text
        else:
            # a stat that is absent from the page is recorded as None instead
            # of aborting the whole scrape with AttributeError
            meta_dict[stat] = None if value_node is None else unidecode(value_node.text)

    #get the info for listed tags
    for cat in tag_names:
        meta_dict[cat] = get_tag_info(cat, meta)

    return meta_dict

In [78]:
def scrape_from_ao3(fic_id):
    """Download one work page and return its metadata dictionary.

    Args:
        fic_id: work identifier; it is interpolated into the work URL.

    Returns:
        The dictionary produced by get_meta_info, or None when the request
        fails, the server answers with a status code >= 400, or
        access_denied() reports the page as not readable.

    Side effects:
        Prints progress and error messages.
    """
    print('Scraping:', fic_id)
    url = 'https://archiveofourown.org/works/'+str(fic_id)+'?view_adult=true'
    print(url)

    # One request only. Previously a second, header-less requests.get(url)
    # replaced this response and discarded the User-Agent header. The timeout
    # keeps a stalled connection from hanging the cell indefinitely.
    # (The former get_args() call was dropped: none of its results were used.)
    try:
        req = requests.get(url, headers={'User-Agent':'Chrome'}, timeout=30)
    except requests.RequestException as err:
        print("Error scraping fic_id:", fic_id, " and now exiting")
        print(err)
        return None

    if req.status_code>=400:
        print("Error scraping fic_id:", fic_id, " and now exiting")
        return None

    soup = BeautifulSoup(req.text, 'html.parser')

    if access_denied(soup):
        print('Access Denied')
        return None

    print("Getting Info")
    return get_meta_info(fic_id,soup)

In [26]:
def get_args():
    """Parse the scraper's command-line options from ``sys.argv``.

    Unrecognised arguments are ignored (``parse_known_args``).

    Returns:
        A 9-tuple ``(fic_ids, csv_out, headers, restart, is_csv,
        only_first_chap, lang, include_bookmarks, metadata_only)``:
        - fic_ids: list of the positional ``ids`` values.
        - csv_out, headers, restart: option values as strings.
        - is_csv: True when exactly one id was given and it contains '.csv'.
        - only_first_chap: True when ``--firstchap`` got a non-empty value.
        - lang: the ``--lang`` string, or False when it is empty.
        - include_bookmarks, metadata_only: the two boolean flags.
    """
    cli = argparse.ArgumentParser(description='Scrape and save some fanfic, given their AO3 IDs.')
    cli.add_argument(
        'ids', metavar='IDS', nargs='+',
        help='a single id, a space seperated list of ids, or a csv input filename')

    # (flag, default, help) for every option that takes a string value
    valued_options = (
        ('--csv', 'fanfics.csv', 'csv output file name'),
        ('--header', '', 'user http header'),
        ('--restart', '', 'work_id to start at from within a csv'),
        ('--firstchap', '', 'only retrieve first chapter of multichapter fics'),
        ('--lang', '', 'only retrieves fics of certain language (e.g English), make sure you use correct spelling and capitalization or this argument will not work'),
    )
    for flag, default, help_text in valued_options:
        cli.add_argument(flag, default=default, help=help_text)

    # (flag, help) for the boolean switches
    switches = (
        ('--bookmarks', 'retrieve bookmarks; '),
        ('--metadata-only', 'only retrieve metadata'),
    )
    for flag, help_text in switches:
        cli.add_argument(flag, action='store_true', help=help_text)

    parsed, _ignored = cli.parse_known_args()

    ids = parsed.ids
    single_csv_input = len(ids) == 1 and '.csv' in ids[0]
    first_chapter_only = str(parsed.firstchap) != ""
    # an empty language filter is reported as False rather than ''
    language = str(parsed.lang) or False

    return (
        ids,
        str(parsed.csv),
        str(parsed.header),
        str(parsed.restart),
        single_csv_input,
        first_chapter_only,
        language,
        parsed.bookmarks,
        parsed.metadata_only,
    )


In [28]:

# Manual smoke test: scrape_from_ao3 issues a real HTTP request for the chosen work id.
# Candidate work ids (uncomment exactly one assignment to switch):
# fic_id=32262178  # should be access denied
#fic_id=61766950  # should be allowed
#fic_id=33751156  # utsdih
fic_id=61766950 #to the world
#fic_id=12005586 #rwm
# ahot: https://archiveofourown.org/works/30248028
scrape_from_ao3(fic_id)

# Already saved (presumably from earlier runs): rwm_soup, rwm_meta, ahot_soup, ahot_meta, utsdih_soup, utsdih_meta

Scraping: 33751156
http://archiveofourown.org/works/33751156?view_adult=true


KeyboardInterrupt: 

In [275]:
# Persist the parsed pages with IPython's %store magic so they can be restored
# later with `%store -r`.
# NOTE(review): ahot_soup, utsdih_soup and rwm_soup are not assigned in any
# visible cell — confirm where they are created before running this on a fresh kernel.

%store ahot_soup
%store utsdih_soup
%store rwm_soup

Stored 'ahot_soup' (BeautifulSoup)
Stored 'utsdih_soup' (BeautifulSoup)
Stored 'rwm_soup' (BeautifulSoup)


In [45]:
## Write metadata to csv
# In a folder called Data with a file called data_csv.csv
def csv_writer(meta_dict):
    """Append one metadata record to ./Data/data_csv.csv.

    The ./Data directory is created when missing. A header row, taken from
    the keys of ``meta_dict``, is written only when the file does not exist
    yet or is still empty.

    Args:
        meta_dict: mapping of column name to value for a single row.
    """
    dir_path = "./Data"
    file_name = "data_csv.csv"
    file_path = os.path.join(dir_path, file_name)

    os.makedirs(dir_path, exist_ok=True)

    # An existing but empty file still needs its header row.
    needs_header = (not os.path.exists(file_path)) or os.path.getsize(file_path) == 0

    # newline="" is what the csv module requires of files it writes to;
    # without it rows get an extra "\r" on platforms that translate "\n".
    # The explicit encoding keeps non-ASCII values (e.g. author names) from
    # depending on the platform's default codec.
    with open(file_path, "a", newline="", encoding="utf-8") as data_f:
        data_w = csv.DictWriter(data_f, fieldnames=list(meta_dict))
        if needs_header:
            data_w.writeheader()
        data_w.writerow(meta_dict)


# GUI

In [None]:
from nicegui import ui

In [2]:
# Smoke test for the GUI layer: create a single NiceGUI label element.
ui.label('Hello NiceGUI!')

<nicegui.elements.label.Label at 0x7fd1745794f0>