In [1]:
import pandas as pd
import AO3
from bs4 import BeautifulSoup
import requests
import re
import time
from dotenv import dotenv_values
from datetime import datetime

In [None]:
# Query AO3 through the AO3 package: Austen fandom, word count constrained to
# 5000-10000, completion_status = 'T' (presumably "complete works only" - confirm
# against the package docs). Prints the hit count, then each result.
word_range = AO3.utils.Constraint(5000, 10000)
search = AO3.Search(
    fandoms = 'AUSTEN Jane - Works',
    word_count = word_range,
    completion_status = 'T',
)
search.update()
print(search.total_results)
for found_work in search.results:
    print(found_work)

In [None]:
# build a Work object for work id 55189105; the following cells read its
# chapter count, title and HTML download
work = AO3.Work(55189105)

In [None]:
# show the work's nchapters attribute (presumably the chapter count - confirm in the AO3 package docs)
print(work.nchapters)

In [None]:
# Save the work's HTML download to the current directory, named after its title.
# Titles are free text and can contain characters that are not valid in file
# names ('/', ':', '?', ...), so those are replaced with '_' before building the path.
safe_title = re.sub(r'[\\/:*?"<>|]', '_', work.title)
# 'wb' because the download is written out as raw bytes
with open(f'{safe_title}.html', 'wb') as file:
    file.write(work.download('html'))

I like the idea of the `AO3` package, but it's not very usable because there isn't enough documentation on which methods are available (e.g., the docs say to use `workid` for search results, but how do you actually get it?).

In [None]:
# file is output from running ao3downloader
# use this to get work_id and as metadata (get new csv based on updated search and choose log-in method)
# NOTE: the path is relative to the notebook's working directory and names one
# specific timestamped export, so it must be updated whenever the csv is regenerated
work_metadata = pd.read_csv('../ao3downloader-main/downloads/links_05132024191458.csv')

In [None]:
# preview the first rows to see which columns the csv provides
work_metadata.head()

In [None]:
# fetch the work's HTML download through the AO3 package so it can be parsed below
text = work.download('html')

In [None]:
# Parse the downloaded HTML. The parser is named explicitly so the resulting tree
# does not depend on which optional parsers happen to be installed (and to avoid
# BeautifulSoup's "no parser was explicitly specified" warning).
soup = BeautifulSoup(text, 'html.parser')

In [None]:
# flatten the whole parsed document to a plain string (markup removed, all text kept)
clean_text = soup.text

In [None]:
# clean soup string with regex - include text after 'Summary'.
# There's no clear universal way to drop the stuff at the end (kudos, etc)

In [None]:
# try trad webscraping (could just use one of the packages to get the list of work ids and/or urls)
# url format is archiveofourown.org/works/{work_id}?view_full_work=true

url = 'https://archiveofourown.org/works/55189105?view_full_work=true'

# requests has no default timeout; without one a stalled connection blocks the cell forever
response = requests.get(url, timeout = 60)

In [None]:
# check the HTTP status of the request before parsing the body
response.status_code

In [None]:
# Parse the fetched page. The parser is named explicitly so the resulting tree
# does not depend on which optional parsers happen to be installed (and to avoid
# BeautifulSoup's "no parser was explicitly specified" warning).
soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
# display the parsed page to look for the element that holds the story text (large output)
soup

In [None]:
# select <div id="chapters" role="article">, the container holding the story text
# returns a single element with all the text instead of a separate element for each chapter
test_text = soup.find_all('div', id = 'chapters', role = 'article')

In [None]:
# display the matched container(s) to inspect their structure
test_text

In [None]:
# loop over test_text, each chapter is a separate list
# don't need to only look for p tags
# NOTE(review): attrs is given a set literal ({'p1'}) rather than a dict; presumably this is
# meant as a filter on the CSS class 'p1' - confirm, and prefer class_ = 'p1' if so
test_text[0].find_all('p', attrs = {'p1'})

In [None]:
# Walk every text node inside the first matched container and keep the ones whose
# direct parent tag is in the allowlist (only 'p' for now). Result: a list of strings.
allowlist = ['p']

text_elements = []
for node in test_text[0].find_all(string = True):
    if node.parent.name in allowlist:
        text_elements.append(node)

In [None]:
# display the extracted strings to check that only story text was kept
text_elements

In [None]:
# join the list of strings with a space separating each
# (the result is only displayed here, not stored in a variable)
' '.join(text_elements)

In [None]:
# don't need to loop over a single work any more
# Collect the paragraph text from every matched container and join it into one string.
allowed_tags = ['p']
clean_text = []

for chapter_block in test_text:
    # x_para = chapter_block.find_all('p', attrs = {'p1'})
    # Filter on allowed_tags, which is defined in this cell, so the cell no longer
    # depends on `allowlist` having been created by an earlier cell.
    text_elements = [t for t in chapter_block.find_all(string = True) if t.parent.name in allowed_tags]
    clean_text.extend(text_elements)

joined_text = ' '.join(clean_text)

### Plan for getting all desired works
1. use one of the packages to get list of work_id
2. f-string url variable so work_id can be inserted as a string
3. authenticate to website so I can download restricted fics
4. for loop over work_id list
    - soup response.text
    - 'test_text' code line (change variable)
    - 'text_elements' code line (change variable)
    - ' '.join(text_elements) saved as variable
    - save resulting string as txt file in data/fanfic_texts
        - name is work_id.txt
5. metadata saved (from package?)

In [None]:
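# # this didn't fail but it never fully ran - not sure why
# # NOTE(review): the POST below passes the form fields with `params =` (URL query string);
# # a login form normally expects them in the request body (`data =`) - try that first when revisiting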

# # how to authenticate to the website
# login_url = 'https://archiveofourown.org/users/login'
# credentials = dotenv_values('cred.env')

# username = credentials['username'] 
# password = credentials['password'] 

# session = requests.Session()

# login_sess = session.get(login_url)

# log_soup = BeautifulSoup(login_sess.text, 'html.parser')
# auth_token = log_soup.find('input', {'name' : 'authenticity_token'})['value']

# params = {
#     'authenticity_token' : auth_token,
#     'user[login]' : username,
#     'user[password]' : password,
# }

# log_res = session.post(login_url, params = params)

# # check success - look for word or phrase if it's not 'Welcome'
# if 'Successfully logged in' in log_res.text:
#     print('success')
# else:
#     print('failed, try again')

In [None]:
# new approach - use the session module from AO3 and then get and clean text with custom code
# credentials are read from cred.env so they never appear in the notebook
credentials = dotenv_values('cred.env')
username = credentials['username']
password = credentials['password']

s = AO3.Session(username, password)

# test with one link to see if it works
url = 'https://archiveofourown.org/works/35698438?view_full_work=true'
# Fetch through the logged-in session. A bare requests.get() is a separate,
# anonymous request and never uses the login performed above.
# NOTE(review): relies on AO3.Session exposing get(); confirm with the installed ao3_api version
response = s.get(url)
print(response.status_code)
# name the parser explicitly so the result does not depend on installed parsers
soup = BeautifulSoup(response.text, 'html.parser')
author_text = soup.find_all('div', id = 'chapters', role = 'article')
allowlist = ['p']
# keep only text nodes that sit directly inside an allowed tag
author_elements = [t for t in author_text[0].find_all(string = True) if t.parent.name in allowlist]
' '.join(author_elements)

In [None]:
# Log in through the AO3 package with the credentials stored in cred.env and
# print the session's `bookmarks` attribute as a quick check that the login worked.
credentials = dotenv_values('cred.env')
username, password = credentials['username'], credentials['password']

session = AO3.Session(username, password)
print(f"Bookmarks: {session.bookmarks}")

---------------------------------------

Downloading fics from AO3 without an authentication token. This method currently retrieves around 350 works with the filters put in place. Authenticating would get almost 50 additional texts (with the same filters). I'm having a lot of trouble getting the authentication token, so I'm using the list of works that don't require authentication to build out the code. If I can fix the auth code I'll use it, but I think the texts I have will be sufficient if I can't get more.

- Set up correct search with desired filters.
- Using ao3downloader, get metadata using the search link.

In [2]:
# loading metadata into a dataframe
# (csv produced by ao3downloader; the 'link' column is used below to derive work_id)
work_metadata = pd.read_csv('../data/fanfic_texts/links_05182024053844.csv')

- 'link' column has the link to the work, but only goes to the first chapter
- AO3 does have an option on the website to get all chapters on a single page which will reduce the number of calls needed to get everything
- extract work_id from the link column - split on / and take the last section

In [3]:
# Pull the numeric work id out of each link by matching '/works/<digits>'.
# Taking the last '/'-separated piece only works when the link ends exactly with
# the id; a trailing slash, a query string or a '/chapters/<n>' suffix would yield
# the wrong value. Links that contain no work id become NaN instead of garbage.
work_metadata['work_id'] = work_metadata['link'].str.extract(r'/works/(\d+)', expand = False)

- url variable: f-string with insert for work_id
- set up iteration over work_metadata['work_id']
- code to get response from url and parse out desired text
    - would be good to expand beyond 'p' tags: this tag gets the paragraphs of the actual text but misses things like chapter headers (although maybe that's good, and I should strip stuff like that out of the Austen texts as well)

In [19]:
def get_elements(text):
    """Join the text of every allowed tag found in the chapter container(s).

    Parameters
    ----------
    text : list
        Result of ``soup.find_all('div', id = 'chapters', role = 'article')``.
        Each item must support ``find_all``. An empty list is accepted.

    Returns
    -------
    str
        The stripped text of each matching tag, in document order, joined by
        single spaces. Empty string when ``text`` is empty or nothing matches.
    """
    # tag name -> attributes (attr: exact value) the tag must carry to be kept
    allow_tags = {
        'p': {}
        # 'div': {'class': 'chapter', 'id': re.compile(r'^chapter-\d+$')}   # doesn't do anything because there's no text element in this tag
    }

    elements = []
    for container in text:
        # Walk the tags themselves rather than their text nodes. Going through
        # text nodes added a paragraph once per direct string child (duplicating
        # paragraphs with inline markup such as <em>) and skipped paragraphs
        # whose text sits entirely inside an inline tag.
        for tag in container.find_all(list(allow_tags)):
            required_attrs = allow_tags[tag.name]
            if all(tag.get(attr) == value for attr, value in required_attrs.items()):
                tag_text = tag.get_text().strip()
                # skip whitespace-only paragraphs so they don't add stray spaces
                if tag_text:
                    elements.append(tag_text)

    return ' '.join(elements)

    # allowlist = ['p']
    # elements = [t for t in text[0].find_all(string = True) if t.parent.name in allowlist]
    # ' '.join(elements)

    # return elements


In [5]:
def write_file(filename, elements):
    """Write ``elements`` (converted with ``str``) to ``filename`` as UTF-8 text.

    Any existing file at ``filename`` is overwritten.
    """
    with open(filename, mode = 'wt', encoding = 'utf-8') as out_file:
        text_to_save = str(elements)
        out_file.write(text_to_save)

In [20]:
# test with limited sample
work_ids = work_metadata['work_id'][:1]

# For each work id: fetch the full-work page, extract the paragraph text and save
# it as ../data/fanfic_texts/<work_id>.txt. Timing is printed per work, and the
# loop pauses 5 seconds between requests.
for work_id in work_ids:          # work_metadata['work_id']:
    start_time = datetime.now()
    print(start_time)
    url = f'https://archiveofourown.org/works/{work_id}?view_full_work=true'
    print(work_id)
    try:
        # timeout: a stalled connection must not block the whole run
        res = requests.get(url, timeout = 60)
        print(res.status_code)
        # stop on 4xx/5xx instead of parsing (and saving) an error page
        res.raise_for_status()
        # name the parser explicitly so the result does not depend on installed parsers
        soup = BeautifulSoup(res.text, 'html.parser')
        text = soup.find_all('div', id = 'chapters', role = 'article')
        if text:
            e = get_elements(text)
            filename = f'../data/fanfic_texts/{work_id}.txt'
            write_file(filename, e)
        else:
            # page has no chapters container (presumably a work that needs login)
            print(f'no chapter text found for {work_id}, nothing written')
    except requests.RequestException as err:
        # one bad request should not abort the remaining downloads
        print(f'request failed for {work_id}: {err}')
    end_time = datetime.now()
    print(end_time)
    print(f'run-time for {work_id}: {end_time - start_time}')
    time.sleep(5)

2024-05-20 20:05:40.368404
55697983
200
2024-05-20 20:05:40.657150
run-time for 55697983: 0:00:00.288746


In [23]:
# Full run: for each work id in the metadata, fetch the full-work page, extract
# the paragraph text and save it as ../data/fanfic_texts/<work_id>.txt. Timing is
# printed per work, and the loop pauses 5 seconds between requests.
for work_id in work_metadata['work_id']:
    start_time = datetime.now()
    print(start_time)
    url = f'https://archiveofourown.org/works/{work_id}?view_full_work=true'
    print(work_id)
    try:
        # timeout: a stalled connection must not block the whole run
        res = requests.get(url, timeout = 60)
        print(res.status_code)
        # stop on 4xx/5xx instead of parsing (and saving) an error page
        res.raise_for_status()
        # name the parser explicitly so the result does not depend on installed parsers
        soup = BeautifulSoup(res.text, 'html.parser')
        text = soup.find_all('div', id = 'chapters', role = 'article')
        if text:
            e = get_elements(text)
            filename = f'../data/fanfic_texts/{work_id}.txt'
            write_file(filename, e)
        else:
            # page has no chapters container (presumably a work that needs login)
            print(f'no chapter text found for {work_id}, nothing written')
    except requests.RequestException as err:
        # one bad request should not abort the remaining downloads
        print(f'request failed for {work_id}: {err}')
    end_time = datetime.now()
    print(end_time)
    print(f'run-time for {work_id}: {end_time - start_time}')
    time.sleep(5)

2024-05-20 20:07:37.235832
55697983
200
2024-05-20 20:07:37.562332
run-time for 55697983: 0:00:00.326500
2024-05-20 20:07:42.563283
55785940
200
2024-05-20 20:07:42.908961
run-time for 55785940: 0:00:00.345678
2024-05-20 20:07:47.914066
55435072
200
2024-05-20 20:07:48.278353
run-time for 55435072: 0:00:00.364287
2024-05-20 20:07:53.283508
55736635
200
2024-05-20 20:08:40.278413
run-time for 55736635: 0:00:46.994905
2024-05-20 20:08:45.294218
55189105
200
2024-05-20 20:08:45.832403
run-time for 55189105: 0:00:00.538185
2024-05-20 20:08:50.837550
53512534
200
2024-05-20 20:08:51.513441
run-time for 53512534: 0:00:00.675891
2024-05-20 20:08:56.518595
53268571
200
2024-05-20 20:08:57.249043
run-time for 53268571: 0:00:00.730448
2024-05-20 20:09:02.254229
53699089
200
2024-05-20 20:09:02.980287
run-time for 53699089: 0:00:00.726058
2024-05-20 20:09:07.985477
30955361
200
2024-05-20 20:09:08.535901
run-time for 30955361: 0:00:00.550424
2024-05-20 20:09:13.541092
54270448
200
2024-05-20 20:0