In [1]:
import pandas
from bs4 import BeautifulSoup
import splinter
import re
import os

In [2]:
monsters_to_import = [
    'http://dndroll.wikidot.com/creatures:giant-poisonous-snake',
    'http://dndroll.wikidot.com/creatures:ape',
    'http://dndroll.wikidot.com/creatures:spider'
]
database = 'monsters_wikidot.csv'

In [3]:
def get_AC(line):
    return re.match(r'[^0-9]*([0-9]+)', line).group(1)
def get_HP(line):
    return re.match(r'[^0-9]*(.*)', line).group(1)
def get_stat(row):
    tds = row.find_all("td")
    return f'{tds[1].text} ({tds[2].text})'
def get_CR(line):
    CRstr = re.match(r'.*CR\s*([0-9/]+)', line).group(1)
    match_div = re.match(r'([0-9])+/([0-9])+', CRstr)
    if match_div:
        return float(match_div.group(1)) / float(match_div.group(2))
    else:
        return float(CRstr)
def abbreviate(string):
    s2 = string.replace("Skills ", "")\
               .replace("Senses ","")\
               .replace("Languages ","")\
               .replace("Challenge", "CR")\
               .replace("—","")\
               .replace("\n\n","\n")\
               .replace("\n","<br>")\
               .replace(" ft.","'")\
               .replace("<br>",", ")
    s2 = re.sub(r'Passive Perception [0-9]*<br>', "", s2)
    return s2

def get_monster(url):
    monster = {}
    monster['url'] = url
    browser = splinter.Browser("firefox")
    browser.visit(url)
    soup = BeautifulSoup(browser.html, 'html.parser')
    browser.quit()
    main_block = soup.find("div", id="main-content")
    monster['name'] = main_block.find("div", id="page-title").text.strip()
    stats_table = main_block.find("table")
    page_block = main_block.find("div", id="page-content")
    img = page_block.find("img", class_="image")
    if img == None:
        monster['img'] = ""
    else:
        monster['img'] = img["src"]
    if page_block.find("p").findNext("table") == None: # no paragraph before stats table
        for line in page_block.text.split("\n"):  
            line = line.strip()
            if line == "":
                continue
            if not 'meta' in monster.keys():
                monster['meta'] = line
                print(f"setting meta to {monster['meta']}")
            if re.match(r'.*Armor\s+Class', line):
                monster['AC'] = get_AC(line)
            elif re.match(r'.*Hit\s+Points', line):
                monster['HP'] = get_HP(line)
            elif re.match(r'.*ft\.', line) or re.match(r'.*feet', line):
                monster['movement'] = line
            elif re.match(r'.*Ability', line):
                break;
    else:
        first_paragraph = main_block.find("div", id="page-content").find("p")
        fp_lines = first_paragraph.text.split("\n")
        monster['meta'] = fp_lines[0]
        monster['AC'] = get_AC(fp_lines[1])
        monster['HP'] = get_HP(fp_lines[2])
        monster['movement'] = fp_lines[3]
        
    rows= stats_table.find_all("tr")
    stats = [get_stat(row) for row in rows[1:7]]
    monster['str'] = stats[0]
    monster['dex'] = stats[1]
    monster['con'] = stats[2]
    monster['int'] = stats[3]
    monster['wis'] = stats[4]
    monster['cha'] = stats[5]
    second_paragraph = stats_table.findNext("p")
    tidbits = abbreviate(second_paragraph.text)
    monster['tidbits'] = abbreviate(second_paragraph.text)
    monster['CR'] = get_CR(monster['tidbits'])
    
    other_paragraphs = second_paragraph.find_all_next("p")
    monster['attributes'] = ""
    for paragraph in other_paragraphs:
        monster['attributes'] += "<br>" + abbreviate(paragraph.text)
    monster['attributes'] = monster['attributes'][4:]
    return monster

In [4]:
old_monsters_df = None
if os.path.exists(database):
    old_monsters_df = pandas.read_csv(database)
else:
    old_monsters_df = pandas.DataFrame()
monsters_list = []
for url in monsters_to_import:
    if 'url' in old_monsters_df.columns and url in list(old_monsters_df['url']):
        old_row = old_monsters_df.loc[old_monsters_df['url'] == url]
        print(f"using old data for {old_row['name']}")
        monsters_list.append(old_row.to_dict())
    else:
        print(f"downloading {url}")
        monsters_list.append(get_monster(url))
monsters_df = pandas.DataFrame(monsters_list)
monsters_df.to_csv(database, index=False)
monsters_df

using old data for 0    Giant Poisonous Snake
Name: name, dtype: object
using old data for 1    Ape
Name: name, dtype: object
using old data for 2    Spider
Name: name, dtype: object


Unnamed: 0.1,Unnamed: 0,url,name,img,meta,AC,HP,movement,str,dex,con,int,wis,cha,tidbits,CR,attributes
0,{0: 0},{0: 'http://dndroll.wikidot.com/creatures:gian...,{0: 'Giant Poisonous Snake'},{0: nan},"{0: 'Medium beast, unaligned'}",{0: 14},{0: '11 (2d8 + 2)'},"{0: 'Speed 30 ft., swim 30 ft.'}",{0: '10 (+0)'},{0: '18 (+4)'},{0: '13 (+1)'},{0: '2 (-4)'},{0: '10 (+0)'},{0: '3 (-4)'},"{0: 'Perception +2, Blindsight 10', Passive Pe...",{0: 0.25},"{0: 'Bite. Melee Weapon Attack: +6 to hit, rea..."
1,{1: 1},{1: 'http://dndroll.wikidot.com/creatures:ape'},{1: 'Ape'},{1: nan},"{1: 'Medium beast, unaligned'}",{1: 12},{1: '19 (3d8 + 6)'},"{1: 'Speed 30 ft., climb 30 ft.'}",{1: '16 (+3)'},{1: '14 (+2)'},{1: '14 (+2)'},{1: '6 (-2)'},{1: '12 (+1)'},{1: '7 (-2)'},"{1: 'Athletics +5, Perception +3, Passive Perc...",{1: 0.5},{1: 'Multiattack. The ape makes two fist attac...
2,{2: 2},{2: 'http://dndroll.wikidot.com/creatures:spid...,{2: 'Spider'},{2: 'https://media-waterdeep.cursecdn.com/avat...,"{2: 'Tiny beast, unaligned'}",{2: 12},{2: '1 (1d4 - 1)'},"{2: 'Speed 20 ft., climb 20 ft.'}",{2: '2 (-4)'},{2: '14 (+2)'},{2: '8 (-1)'},{2: '1 (-5)'},{2: '10 (+0)'},{2: '2 (-4)'},"{2: 'Stealth +4, Darkvision 30', Passive Perce...",{2: 0.0},{2: 'Spider Climb. The spider can climb diffic...


In [5]:
old_monsters_df.index

RangeIndex(start=0, stop=3, step=1)

In [6]:
'http://dndroll.wikidot.com/creatures:giant-poisonous-snake' in old_monsters_df['url']

False

In [7]:
monsters_to_import[0] in old_monsters_df['url']

False

In [8]:
old_monsters_df

Unnamed: 0.1,Unnamed: 0,url,name,img,meta,AC,HP,movement,str,dex,con,int,wis,cha,tidbits,CR,attributes
0,0,http://dndroll.wikidot.com/creatures:giant-poi...,Giant Poisonous Snake,,"Medium beast, unaligned",14,11 (2d8 + 2),"Speed 30 ft., swim 30 ft.",10 (+0),18 (+4),13 (+1),2 (-4),10 (+0),3 (-4),"Perception +2, Blindsight 10', Passive Percept...",0.25,"Bite. Melee Weapon Attack: +6 to hit, reach 10..."
1,1,http://dndroll.wikidot.com/creatures:ape,Ape,,"Medium beast, unaligned",12,19 (3d8 + 6),"Speed 30 ft., climb 30 ft.",16 (+3),14 (+2),14 (+2),6 (-2),12 (+1),7 (-2),"Athletics +5, Perception +3, Passive Perceptio...",0.5,Multiattack. The ape makes two fist attacks.<b...
2,2,http://dndroll.wikidot.com/creatures:spider,Spider,https://media-waterdeep.cursecdn.com/avatars/t...,"Tiny beast, unaligned",12,1 (1d4 - 1),"Speed 20 ft., climb 20 ft.",2 (-4),14 (+2),8 (-1),1 (-5),10 (+0),2 (-4),"Stealth +4, Darkvision 30', Passive Perception...",0.0,Spider Climb. The spider can climb difficult s...
