In [1]:
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import re
import time
import json

options = Options()
options.add_argument('--headless')
options.add_argument('--disable-gpu')

# Scraping donjon

In [2]:
def scrape_dj(url):
    '''add card data to list'''
    all_items = []
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    all_rows = driver.find_elements_by_xpath("//tr")
    for row in all_rows[4:]:
        item_dict = {}
        row_html = row.get_attribute('innerHTML')[4:-5]
        split = row_html.split('</td><td>')
        item_dict = {'item':split[0],'type':split[1],
                     'rarity':split[2],'attunement':split[3],
                     'notes':split[4],'source':split[5]}
        all_items.append(item_dict)  
    driver.quit()
    return all_items
    
url = 'https://donjon.bin.sh/5e/magic_items/'
dj_items = scrape_dj(url)

dj_df = pd.DataFrame(dj_items)
dj_df

Unnamed: 0,item,type,rarity,attunement,notes,source
0,Armor of Gleaming,Armor,Common,,medium or heavy,xge 136
1,Cast-Off Armor,Armor,Common,,"light, medium, or heavy",xge 136
2,Shield of Expression,Armor,Common,,shield,xge 139
3,Smoldering Armor,Armor,Common,,,xge 139
4,Adamantine Armor,Armor,Uncommon,,"medium or heavy, but not hide",dmg 150
...,...,...,...,...,...,...
449,Crook of Rao,Wondrous Item,Artifact,yes,,tce 123
450,Demonomicon of Iggwilv,Wondrous Item,Artifact,yes,,tce 125
451,Luba's Tarokka of Souls,Wondrous Item,Artifact,yes,,tce 129
452,Mighty Servant of Leuk-o,Wondrous Item,Artifact,yes,,tce 131


# Scraping Roll20

In [3]:
def scrape_basic(url):
    '''open all expansion tabs and add data to list'''
    print('SETTING UP DRIVER....')
    driver = webdriver.Chrome(options=options)
    driver.get(url)    
    time.sleep(5)
    print('Done.')
    print('OPENING DROPDOWNS....')
    buttons = driver.find_elements_by_class_name('dropdown-toggle')
    with tqdm(total=len(buttons)) as pbar:    
        for button in buttons[7:]:
            time.sleep(.5)
            driver.execute_script("arguments[0].click();", button)
            pbar.update(1)
    print('Done.')
    print('FINDING CARD ELEMENTS....')
    item_list = [item.text for item in driver.find_elements_by_class_name('card')]
    driver.quit()
    print('Done.\n------')
    return item_list
    
url = 'https://roll20.net/compendium/dnd5e/Items%20List#content'
item_list = scrape_basic(url)
print(len(item_list))

SETTING UP DRIVER....
Done.
------
OPENING DROPDOWNS....


HBox(children=(IntProgress(value=0, max=780), HTML(value='')))


Done.
------
FINDING CARD ELEMENTS....
Done.


772


In [64]:
stats_list = ['Item Rarity','Weight','Modifiers',
              'AC','Save','Stealth','Damage',
              'Damage Type','Duration','Secondary Damage',
              'Range','Properties']
all_items = []
for item in item_list:
    item_dict = {}
    split = item.split('\n')
    item_dict['item'] = split[0]
    if split[1] == '(equipment pack)':
        item_dict['type'] = 'equipment pack'
        item_dict['notes'] = 'pack contents: ' + split[4]
    elif '(' in split[1]:
        item_dict['type'] = split[1].split(' (')[0]
        item_dict['notes'] = split[1].split(' (')[1].split(')')[0]
    else:
        item_dict['type'] = split[1]
        item_dict['notes'] = ''
    for stat in stats_list:
        for x in split[2:]:
            if stat.lower() in item_dict and item_dict[stat.lower()] != '':
                continue
            elif f'{stat}: ' in x:
                item_dict[stat.lower()] = x.split(f'{stat}: ')[1]
            else:
                item_dict[stat.lower()] = ''
    all_items.append(item_dict)
    
df = pd.DataFrame(all_items)
df = df.applymap(lambda s:s.lower() if type(s) == str else s)
df = df.sort_values(by='item')
df

Unnamed: 0,item,type,notes,item rarity,weight,modifiers,ac,save,stealth,damage,damage type,duration,secondary damage,range,properties
0,abacus,adventuring gear,,,2,,,,,,,,,,
1,acid,adventuring gear,consumable,,1,,,,,2d6,acid,,,20/60,improvised weapons
2,adamantine armor,armor,"medium or heavy, but not hide",,,,,,,,,,,,
3,alchemist's fire,adventuring gear,consumable,,1,,,dexterity,,1d4,fire,,,20/60,improvised weapons
4,alchemist's supplies,adventuring gear,artisan's tools,standard,8,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
766,wings of flying,adventuring gear,wondrous item,rare,,,,,,,,,,,
767,woodcarver's tools,adventuring gear,artisan's tools,standard,5,,,,,,,,,,
768,wooden shield,shield,shield,standard,6,,2,,,,,,,,
769,wooden staff,adventuring gear,arcane focus,standard,4,,,,,,,,,,


In [65]:
item_names = df.item.unique().tolist()

url_prefix = 'https://roll20.net/compendium/dnd5e/'

item_urls = []
for x in item_names:
    item_urls.append(url_prefix + x.replace(' +','%20%2B').replace(' ', '%20'))
    
len(item_urls)

771

In [66]:
def scrape_text(item_urls):
    '''takes list of urls and extracts item text'''
    item_data = []
    print('SCRAPING ITEM DATA....')
    with tqdm(total=len(item_urls)) as pbar:    
        for url in item_urls:
            driver = webdriver.Chrome(options=options)
            driver.get(url)
            # get item info
            item_info = driver.find_element(By.ID,value='pagecontent').text
            item_data.append(item_info)
            driver.quit()
            pbar.update(1)
    print('Done.\n------')
    return item_data

all_items = scrape_text(item_urls)
len(all_items)

SCRAPING ITEM DATA....


HBox(children=(IntProgress(value=0, max=771), HTML(value='')))


Done.
------


771

In [67]:
df = df.drop_duplicates()
df['details'] = all_items
df['attunement'] = np.where(df.details.str.contains('Requires Attunement'), 1, 0)
df = df[['item','type','notes','item rarity','weight',
         'attunement','details','modifiers','ac',
         'save','stealth','damage','damage type',
         'duration','secondary damage','range',
         'properties']].rename(columns={'item rarity':'item_rarity',
                                         'damage type':'damage_type'})
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,item,type,notes,item rarity,weight,modifiers,ac,save,stealth,damage,damage type,duration,secondary damage,range,properties,details,attunement
0,abacus,adventuring gear,,,2.0,,,,,,,,,,,,0
1,acid,adventuring gear,consumable,,1.0,,,,,2d6,acid,,,20/60,improvised weapons,"As an action, you can splash the contents of t...",0
2,adamantine armor,armor,"medium or heavy, but not hide",,,,,,,,,,,,,This suit of armor is reinforced with adamanti...,0
3,alchemist's fire,adventuring gear,consumable,,1.0,,,dexterity,,1d4,fire,,,20/60,improvised weapons,"This sticky, adhesive fluid ignites when expos...",0
4,alchemist's supplies,adventuring gear,artisan's tools,standard,8.0,,,,,,,,,,,These Special tools include the items needed t...,0


In [69]:
df.to_csv('../output/roll_20_items.csv', index=False)