


Notes
- Add 5 second delays before requesting from AO3's server are in compliance with the AO3 terms of service.
- Cannot scrape restricted fics, which require login. Can use https://pypi.org/project/ao3/ to log in but cannot use it to scrape restricted fics.
- Some accounts no longer exist. i.e. ``ButterflyPup``.
- account name ``orphan_account`` is a default pseud of the Orphan Account for works that are no longer associated with their creator's account. Skipped it for now.

In [1]:
import pandas as pd
import json
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import pprint
from datetime import datetime
import numpy as np
import requests
from bs4 import BeautifulSoup
import argparse
import time
import os
import csv
import sys
from unidecode import unidecode
from itertools import groupby
from operator import itemgetter

In [2]:
# path_to_stories = sys.argv[-1]
path_to_stories = 'sampledata/ao3_harrypotter_text_stories.csv'

In [3]:
def get_authors_from_storiescsv(path_to_stories):
    '''
    param: path to stories.csv
    return: a Series of author_keys with duplicates removed
    '''
    return pd.read_csv(path_to_stories,usecols = ['author_key'])['author_key'].drop_duplicates()

In [4]:
authorschema = ('author_key',
                'pseudos',
                'bday',
                'author_fic_ids',
                'joined_on',
                'live_in',
                'author_work_count',
                'author_bookmark_count',
                'bio',
                'linked_social_media')

pseudoschema = ('author_key',
                   'pseudo',
                   'pd_work_count',
                   'pd_bookmark_count',
                   'pd_fic_ids')

In [5]:
def find_bookmark_work_counts(soup, author_key, pseudo = None):
    
    
    if pseudo == None:
        url = "/users/" + author_key
    else:
        url = "/users/" + author_key + "/pseuds/" + pseudo
        
    # get the # of fics this author/pseudo has bookmarked
    try:
        bookmark_cnt = soup.find("a", attrs = {'href': lambda x: x and x.lower() == url.lower() + "/bookmarks"}).text 
        bookmark_cnt = bookmark_cnt[11:-1]
    except:
        print('Error finding bookmarks of', url + "/bookmarks")
    
    # get the # of fics this author/pseduo has created (incl. WIP)
    try:
        work_cnt = soup.find("a", attrs = {'href': lambda x: x and x.lower() == url.lower() + "/works"}).text # i.e. "Works (1)"
        work_cnt = work_cnt[7:-1]
    except:
        print('Error finding bookmarks of' + url + "/works")
    
    return bookmark_cnt, work_cnt

In [6]:
def scrape_an_author(author_key,
                     authorschema = authorschema,
                     pseudoschema = pseudoschema):
    '''
    param: author_key (str)
    return: df of the author, linked_social_media left as None
    each row is an author identity (author_key and psudo pair)
    '''
    
    author = pd.DataFrame(columns= authorschema)
    pseudoddf = pd.DataFrame(columns= pseudoschema)
    
    # ==== profile the author ==== #
    # get start_date, location
    r1 = requests.get("https://archiveofourown.org/users/" + author_key + "/profile",
                      headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0'})
    soup1 = BeautifulSoup(r1.content, "html.parser")
    
    try:
        # if the author_key exists
        umeta = [dd.text for dd in soup1.find("dl", class_="meta").findAll("dd")]
        # print('Found author %s.'%author_key)
    except:
        print ('Cannot find author %s.'%author_key)
        return None, None
    
    start_date = umeta[1]
    try:
        location = umeta[2]
    except:
        location = ""
        
    # get all pseudos of the author_key
    pseudos = [i['href'].split('/')[-1] for i in soup1.find("dd", class_ = "pseuds").findAll("a")]
    
    # find the author's bio, if any
    biosoup = soup1.find("blockquote", class_ = "userstuff")
    if biosoup is not None:
        try:
            bio = biosoup.text
        except Exception as e:
            print(author_key, e)
            bio = "<!ERROR!>"
    else:
        bio = ""
        
    # find the author's bday, if listed
    try:
        bday = soup1.find("dd", class_="birthday").text
    except:
        bday = ""
    
    author_bookmark_cnt, author_work_cnt = find_bookmark_work_counts(soup1, author_key)
    
    author = author.append({'author_key': author_key,
                            'bday': bday,
                            'pseudos': pseudos,
                            'joined_on': start_date,
                            'live_in': location,
                            'author_work_count': author_work_cnt,
                            'author_bookmark_count': author_bookmark_cnt,
                            'fic_ids': None, #todo
                            'bio': bio,
                            'linked_social_media': None},
                             ignore_index = True)
    
    # ==== profile the author ==== #
    for pseudo in pseudos:
        
        r2 = requests.get("https://archiveofourown.org/users/" + author_key + "/pseuds/" + pseudo,
                      headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0'})
        soup2 = BeautifulSoup(r2.content, "html.parser")
        
        pd_bookmark_cnt, pd_work_cnt = find_bookmark_work_counts(soup2, author_key, pseudo)
        
        pseudoddf = pseudoddf.append({'author_key': author_key,
                                      'pseudo': pseudo,
                                      'pd_work_count': pd_work_cnt,
                                      'pd_bookmark_count': pd_bookmark_cnt,
                                      'pd_fic_ids': None
                                     },
                                     ignore_index = True)

    return author, pseudoddf


In [7]:
# Test cases:
# - allmylovesatonce 39 works
# - alexdesro08 2 pseudos, no works finished
# - liketolaugh 103 works (Interview study participant)

# scrape_an_author('allmylovesatonce')
# scrape_an_author('liketolaugh')
# scrape_an_author('alexdesro08')
# scrape_an_author('Tara')[1]
scrape_an_author('Falmarien')

(  author_key                     pseudos bday author_fic_ids   joined_on  \
 0  Falmarien  [falmarien, Navi, saoirse]                 NaN  2011-12-03   
 
         live_in author_work_count author_bookmark_count  \
 0  Cerin Amroth                25                   216   
 
                                                  bio linked_social_media  \
 0  tumblr: rainblownfieldsdreamwidth: falmarienol...                None   
 
   fic_ids  
 0    None  ,
   author_key     pseudo pd_work_count pd_bookmark_count pd_fic_ids
 0  Falmarien  falmarien            10               207       None
 1  Falmarien       Navi            15                 9       None
 2  Falmarien    saoirse             0                 0       None)

In [10]:
author_key_list = get_authors_from_storiescsv(path_to_stories)

authors = pd.DataFrame(columns= authorschema)
pseudos = pd.DataFrame(columns= pseudoschema)

cnt = 0

for author_key in author_key_list:
    if cnt >= 100:
        break
    
    if author_key == 'orphan_account':
        # skip ``orphan_account`` 
        # which is a default pseud of the Orphan Account for works that are no longer associated with their creator's account.
        pass
    else:
        author, pseudo = scrape_an_author(author_key)
        authors = authors.append(author)
        pseudos= pseudos.append(pseudo)
        cnt += 1
        print(cnt)

authors.to_csv('sampleoutput/sampleoutput_authors.csv', mode='w+', header=False, index=False)
pseudos.to_csv('sampleoutput/sampleoutput_pseudos.csv', mode='w+', header=False, index=False)


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
Cannot find author ButterflyPup.
85
86
87
88
89
90
91
92
93
94
95
96
97
98
Cannot find author Aisling_Isobel.
99
100
