# Goal
Create a dataset of D&D monsters from the SRD.

## Data Collection
Data is sourced from the System Reference Document (SRD), which contains a compendium of over 300 D&D monsters together with their game data.


*This work includes material from the System Reference Document 5.2.1 (“SRD 5.2.1”) by Wizards of the Coast LLC, available at https://www.dndbeyond.com/srd. The SRD 5.2.1 is licensed under the Creative
Commons Attribution 4.0 International License, available at https://creativecommons.org/licenses/by/4.0/legalcode.*

In [9]:
%%bash
# Stop the cell at the first failing command, so a failed download is not masked by pip's exit status.
set -e
# Download the SRD PDF. --fail makes curl exit with an error on an HTTP error response
# instead of saving the error page as srd.pdf; --location follows redirects, if any.
curl --fail --location --output srd.pdf https://media.dndbeyond.com/compendium-images/srd/5.2/SRD_CC_v5.2.1.pdf
# pypdf is used in the following cells to extract text from the PDF.
pip install pypdf



  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100 5890k  100 5890k    0     0  11.7M      0 --:--:-- --:--:-- --:--:-- 11.7M


In [10]:
from pypdf import PdfReader

#open the downloaded SRD and keep only the pages from the monster section onward
#(monster descriptions start on page 258, i.e. zero-based index 257)
first_monster_page = 257
reader = PdfReader('srd.pdf')
pages = reader.pages[first_monster_page:]

fragments = [] #fragments of text, appended to by vis() in extraction order

#function called as the 'visitor' during text extraction
def vis(text, um, tm, font_dict, font_size):
  """Collect one extracted text fragment into the module-level `fragments` list.

  The signature follows pypdf's `visitor_text` callback; only `text`, `tm`
  and `font_dict` are used here.

  - Fragments in any font other than GillSans-SemiBold are kept unchanged.
  - Fragments in GillSans-SemiBold are kept only when tm[0] == 14.55, which
    marks a monster name; those are prefixed with '~MONSTERNAME~'.
    Other text in that font is dropped, except that a bare newline is
    still kept so line structure is not lost.
  """
  #a missing font dictionary, or one without '/BaseFont', is treated as a normal font
  in_heading_font = bool(font_dict) and font_dict.get('/BaseFont')=='/CFGFPT+GillSans-SemiBold'

  if not in_heading_font:
    #all other fonts are read normally (each fragment, including '\n', exactly once)
    fragments.append(text)
    return

  if text=='\n':
    #preserve line breaks even when they arrive in the heading font
    fragments.append(text)
  if tm[0]==14.55:
    #text fragment is a monster name, tag it
    fragments.append('~MONSTERNAME~'+text)

#extract text from all pages; the visitor collects the pieces into `fragments`
for page in pages:
  page.extract_text(visitor_text=vis)
srd_text = ''.join(fragments)

#truncate the beginning: drop everything before the first tagged monster name
first_monster = srd_text.find('~MONSTERNAME~')
if first_monster == -1:
  #find() returns -1 when nothing was tagged; slicing with -1 would silently
  #keep only the last character of the text
  raise ValueError('no ~MONSTERNAME~ tag found in the extracted text; check the font/size test in vis()')
srd_text = srd_text[first_monster:]

In [11]:
import numpy as np

#split the text into lines, dropping blank lines and the page footer
srd_lines = [
  line
  for line in srd_text.split('\n')
  if line and line != 'System Reference Document 5.2.1'
]

#group the lines into one block per monster: a line carrying the
#~MONSTERNAME~ tag opens a new block, and the tag itself is stripped
blocks = []
for entry in srd_lines:
  if '~MONSTERNAME~' in entry:
    blocks.append([])
  blocks[-1].append(entry.replace('~MONSTERNAME~',''))
blocks[-1]

['Wolf',
 'Medium Beast, Unaligned',
 'AC 12   Initiative +2 (12)',
 'HP 11 (2d8 + 2)',
 'Speed 40 ft.',
 'MOD SAVE MOD SAVE MOD SAVE',
 'Str \t 14 +2 +2 Dex  15 +2 +2 Con  12 +1 +1',
 'Int  3 −4 −4 WIS 12 +1 +1 Cha  6 −2 −2',
 'Skills Perception +5, Stealth +4',
 'Senses Darkvision 60 ft.; Passive Perception 15',
 'Languages None',
 'CR 1/4 (XP 50; PB +2)',
 'Traits',
 'Pack Tactics. The wolf has Advantage on attack rolls ',
 'against a creature if at least one of the wolf’s allies is ',
 'within 5 feet of the creature and the ally doesn’t have ',
 'the Incapacitated condition.',
 'Actions',
 'Bite. Melee Attack Roll: +4, reach 5 ft. Hit: 5 (1d6 + 2) ',
 'Piercing damage. If the target is a Medium or smaller ',
 'creature, it has the Prone condition.']

In [12]:
#extracts meaningful data from a block and returns a row
def interpret_block(block):
  """Turn one monster's block of text lines into a flat dict (one dataset row).

  The first four lines are read by position: name, "size/type, alignment",
  "AC ... Initiative ...", and "HP ...". The remaining lines are scanned for
  the speed line, the two ability-score lines, the CR/PB line, and a few
  feature flags.

  Returns a dict with keys Name, SizeType, Align, AC, Init, HP, Multiattack,
  LegRes, LegAct and, when the matching lines are found, Speed, Scores, CR
  and PB. String values have surrounding whitespace removed.

  Raises IndexError if the block has fewer than four lines (or the line with
  'Str' is the last one), and ValueError if line 1 has no comma or line 2
  does not contain 'Initiative' exactly once.
  """
  out = {}

  out['Name'] = block[0]
  #alignment is whatever follows the LAST comma, so an extra comma in the
  #size/type part cannot break the two-way unpacking
  out['SizeType'], out['Align'] = block[1].rsplit(',', 1)
  out['AC'], out['Init'] = block[2].replace('AC','').split('Initiative')
  out['HP'] = block[3].replace('HP','')

  block = block[4:]
  out['Multiattack'] = False
  out['LegRes'] = False
  out['LegAct'] = False
  for i,line in enumerate(block):
    #only the first matching line counts for Speed, Scores and CR/PB
    if 'Speed' in line and 'Speed' not in out:
      out['Speed'] = line.replace('Speed','')
    if 'Str' in line and 'Scores' not in out:
      #the six scores are spread over two consecutive lines
      out['Scores'] = line+' '+block[i+1]
    if 'PB +' in line and 'CR' not in out:
      #e.g. 'CR 1/4 (XP 50; PB +2)': CR sits between 'CR' and '(',
      #PB between 'PB +' and the closing ')'
      out['CR'] = line.partition('CR')[2].split('(')[0]
      out['PB'] = line.split('PB +')[1].strip().rstrip(')')
    if 'Multiattack.' in line:
      out['Multiattack'] = True
    if 'Legendary Resistance' in line:
      out['LegRes'] = True
    if 'Legendary Actions' in line:
      out['LegAct'] = True

  #remove leading/trailing whitespace (key order is preserved)
  return {k: v.strip() if isinstance(v, str) else v for k, v in out.items()}

#spot-check the parser on a single block; the notebook displays the returned row
interpret_block(blocks[99])

{'Name': 'Adult Green Dragon',
 'SizeType': 'Huge Dragon (Chromatic)',
 'Align': 'Lawful Evil',
 'AC': '19',
 'Init': '+11 (21)',
 'HP': '207 (18d12 + 90)',
 'Multiattack': True,
 'LegRes': True,
 'LegAct': True,
 'Speed': '40 ft., Fly 80 ft., Swim 40 ft.',
 'Scores': 'Str  23 +6 +6 Dex  12 +1 +6 Con  21 +5 +5 Int  18 +4 +4 WIS 15 +2 +7 Cha  18 +4 +4',
 'CR': '15',
 'PB': '5'}

In [13]:
import pandas as pd

#run every monster block through the parser and tabulate the resulting rows
monster_rows = list(map(interpret_block, blocks))
data = pd.DataFrame(monster_rows)
data.head(3)

Unnamed: 0,Name,SizeType,Align,AC,Init,HP,Multiattack,LegRes,LegAct,Speed,Scores,CR,PB
0,Aboleth,Large Aberration,Lawful Evil,17,+7 (17),150 (20d10 + 40),True,True,True,"10 ft., Swim 40 ft.",Str 21 +5 +5 Dex 9 −1 +3 Con 15 +2 +6 Int ...,10,4
1,Air Elemental,Large Elemental,Neutral,15,+5 (15),90 (12d10 + 24),True,False,False,"10 ft., Fly 90 ft. (hover)",Str 14 +2 +2 Dex 20 +5 +5 Con 14 +2 +2 Int ...,5,3
2,Animated Armor,Medium Construct,Unaligned,18,+2 (12),33 (6d8 + 6),True,False,False,25 ft.,Str 14 +2 +2 Dex 11 +0 +0 Con 13 +1 +1 Int ...,1,2


In [14]:
#some preliminary cleaning
def parse_scores_string(scores:str) -> dict:
  """Split a combined ability-score string into one entry per ability.

  `scores` is expected to contain the labels Str, Dex, Con, Int, Wis, Cha
  (any letter case) in that order, each followed by its numbers, e.g.
  'Str 21 +5 +5 Dex 9 ...'.

  Returns a dict keyed 'Str', 'Dex', 'Con', 'Int', 'Wis', 'Cha' whose values
  are the numbers after each label joined with '/', e.g. '21/+5/+5'.

  Raises IndexError if one of the labels is missing from `scores`.
  """
  lowered = scores.lower()  #labels are matched case-insensitively ('WIS' vs 'Wis')
  labels = ['Str', 'Dex', 'Con', 'Int', 'Wis', 'Cha']

  out = {}
  for label, following in zip(labels, labels[1:] + [None]):
    #text after this label, up to the next label (or the end for the last one)
    segment = lowered.split(label.lower())[1]
    if following is not None:
      segment = segment.split(following.lower())[0]
    #split() collapses any run of spaces/tabs, so uneven spacing cannot
    #produce empty fields such as '14//+2'
    out[label] = '/'.join(segment.split())
  return out

#spot-check: parse the ability-score string of the first row
parse_scores_string(data['Scores'].iloc[0])

{'Str': '21/+5/+5',
 'Dex': '9/−1/+3',
 'Con': '15/+2/+6',
 'Int': '18/+4/+8',
 'Wis': '15/+2/+6',
 'Cha': '18/+4/+4'}

In [15]:
#create a cleaner version of the data: replace the raw 'Scores' text
#with one column per ability
parsed_scores = [parse_scores_string(raw) for raw in data['Scores']]
score_columns = pd.DataFrame(parsed_scores)
data_cleaner = data.drop(columns='Scores').join(score_columns)

#save as csv
data_cleaner.to_csv('monsters.csv',index=False)
data_cleaner.head()

Unnamed: 0,Name,SizeType,Align,AC,Init,HP,Multiattack,LegRes,LegAct,Speed,CR,PB,Str,Dex,Con,Int,Wis,Cha
0,Aboleth,Large Aberration,Lawful Evil,17,+7 (17),150 (20d10 + 40),True,True,True,"10 ft., Swim 40 ft.",10,4,21/+5/+5,9/−1/+3,15/+2/+6,18/+4/+8,15/+2/+6,18/+4/+4
1,Air Elemental,Large Elemental,Neutral,15,+5 (15),90 (12d10 + 24),True,False,False,"10 ft., Fly 90 ft. (hover)",5,3,14/+2/+2,20/+5/+5,14/+2/+2,6/−2/−2,10/+0/+0,6/−2/−2
2,Animated Armor,Medium Construct,Unaligned,18,+2 (12),33 (6d8 + 6),True,False,False,25 ft.,1,2,14/+2/+2,11/+0/+0,13/+1/+1,1/−5/−5,3/−4/−4,1/−5/−5
3,Animated Flying Sword,Small Construct,Unaligned,17,+4 (14),14 (4d6),False,False,False,"5 ft., Fly 50 ft. (hover)",1/4,2,12/+1/+1,15/+2/+4,11/+0/+0,1/−5/−5,5/−3/−3,1/−5/−5
4,Animated Rug of Smothering,Large Construct,Unaligned,12,+4 (14),27 (5d10),False,False,False,10 ft.,2,2,17/+3/+3,14/+2/+2,10/+0/+0,1/−5/−5,3/−4/−4,1/−5/−5
