In [33]:
import os
import re
import shutil
import sys
import zipfile
from enum import Enum, EnumMeta
from io import BytesIO, StringIO
from random import sample
from typing import List, Dict, Optional, Union

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import display



In [130]:
class WebResource(Enum):
    """Locally cached IESDP file-format pages, parsed into per-game field tables.

    Each member's value is the page URL. Nothing is downloaded: when a member is
    created (i.e. while this class is being defined) ``__init__`` opens the
    matching text file under ``Resources/Web Resources/File formats/`` and parses
    it, so those files must already exist relative to the working directory.

    Attributes set on every member:
        local_url: path of the cached copy of the page.
        resource_type: member-name prefix before the first underscore (e.g. ``'GAM'``).
        html: the parsed page (``BeautifulSoup``).
        games: game names listed under the page's "Applies to:" header.
        allgame_str: ``games`` joined with ``'|'``.
        segments: DataFrame with columns ``Segment`` and ``Table Reference``.
        segment_tables: ``{game: {table reference: cleaned DataFrame}}``.
    """

    SAV_V10 = r'https://gibberlings3.github.io/iesdp/file_formats/ie_formats/sav_v1.htm'
    GAM_V11 = r'https://gibberlings3.github.io/iesdp/file_formats/ie_formats/gam_v1.1.htm'  # BG1, IWD, PST
    GAM_V20 = r'https://gibberlings3.github.io/iesdp/file_formats/ie_formats/gam_v2.0.htm'  # BG2
    GAM_V22 = r'https://gibberlings3.github.io/iesdp/file_formats/ie_formats/gam_v2.2.htm'  # IWD2
    CHR_V10 = r'https://gibberlings3.github.io/iesdp/file_formats/ie_formats/chr_v1.htm'  # BG1
    CHR_V12 = r'https://gibberlings3.github.io/iesdp/file_formats/ie_formats/chr_v1.2.htm'  # PST
    CHR_V20 = r'https://gibberlings3.github.io/iesdp/file_formats/ie_formats/chr_v2.htm'  # BG2
    CHR_V22 = r'https://gibberlings3.github.io/iesdp/file_formats/ie_formats/chr_v2.2.htm'  # IWD2
    CHR_V90 = r'https://gibberlings3.github.io/iesdp/file_formats/ie_formats/chr_v9.htm'  # IWD
    CRE_V10 = r'https://gibberlings3.github.io/iesdp/file_formats/ie_formats/cre_v1.htm'  # BG1, BG2, BGEE
    CRE_V12 = r'https://gibberlings3.github.io/iesdp/file_formats/ie_formats/cre_v1.2.htm'  # PST
    CRE_V22 = r'https://gibberlings3.github.io/iesdp/file_formats/ie_formats/cre_v2.2.htm'  # IWD2
    CRE_V90 = r'https://gibberlings3.github.io/iesdp/file_formats/ie_formats/cre_v9.htm'  # IWD

    def __init__(self, url: str):
        """Load and parse the cached copy of ``url``.

        Raises:
            OSError: if the cached file cannot be opened.
        """
        # 'gam_v1.1.htm' -> 'gam_v11': drop the 4-character extension, then any remaining dots.
        page_name = url.split('/')[-1][:-4].replace('.', "")
        self.local_url = 'Resources/Web Resources/File formats/' + page_name + '.txt'
        self.resource_type = self.name.split('_')[0]
        with open(self.local_url, 'r') as f:
            # NOTE(review): no parser is named, so bs4 uses whichever one is installed.
            # Pin one (e.g. 'lxml') once it is confirmed which parser these pages were tested with.
            self.html = BeautifulSoup(f.read())
        self.games = self.get_games()
        self.allgame_str = '|'.join(self.games)
        self.segments = self.get_segments()
        self.segment_tables = self.get_tables()

    def get_games(self):
        """Return the game names listed in the page's "Applies to:" section.

        Comma-separated entries that contain a colon are skipped.
        """
        header = self.html.find('div',
                                {'class': 'fileHeader'},
                                string=re.compile(r'\s*Applies to:\s*'))
        tag = header.find_next('div', {'class': 'indent1'})
        return [name.strip() for name in tag.text.split(',') if ':' not in name]

    def get_segments(self):
        """Return the "Detailed Description" list as a DataFrame.

        Columns:
            Segment: stripped text of each ``<li>``.
            Table Reference: target of the item's first link without the leading
                ``#``, or NaN when the item has no in-page link.
        """
        header = self.html.find('div',
                                {'class': 'fileHeader'},
                                string=re.compile(r'\s*Detailed Description\s*'))
        tag = header.find_next('div', {'class': 'indent1'})

        names, refs = [], []
        for item in tag.find_all('li'):
            names.append(item.text.strip())
            link = item.find('a')
            # .get() so an <a> without an href counts as "no reference" instead of raising KeyError.
            href = link.get('href', '') if link is not None else ''
            refs.append(href[1:] if href.startswith('#') else np.nan)
        # Built column-wise so an empty list still yields a frame with both columns.
        return pd.DataFrame({'Segment': names, 'Table Reference': refs})

    def get_tables(self):
        """Parse every referenced table and group the results per game.

        Rows whose cells all hold the same value are treated as separators: their
        text becomes the ``Game`` label of the rows below them (rows above the
        first separator are labelled ``'ALL'``) and the separator rows themselves
        are dropped.

        Returns:
            ``{game: {table reference: cleaned DataFrame}}`` for each game in
            ``self.games``.
        """
        table_results = []
        for ref in self.segments['Table Reference'].dropna():
            # The anchor name is escaped so characters such as '.' are matched literally.
            anchor = self.html.find('a', {'name': re.compile(rf'\w*{re.escape(ref)}')})
            desc_tag = anchor.parent
            # Two hops: presumably skips the whitespace node after the description -- TODO confirm.
            table_tag = desc_tag.next_sibling.next_sibling
            if not table_tag.find('table'):
                continue
            # Wrapped in StringIO because passing literal HTML to read_html is deprecated.
            table_df = pd.read_html(StringIO(str(table_tag)))[0]
            # Computed before 'Game' is added so the new column cannot affect the count.
            is_separator = table_df.nunique(axis=1) == 1
            # Keep the label only on separator rows, then carry it down to the rows it heads.
            table_df['Game'] = table_df['Offset'].where(is_separator).ffill().fillna('ALL')
            table_df = table_df.loc[~is_separator]
            table_results.append((ref, desc_tag.text.strip(), table_df))
        return {game: self.filter_df(table_results, game) for game in self.games}

    def filter_df(self, table_results: List, game: str):
        """Select the tables and rows that apply to ``game``.

        A table is kept when its description mentions ``game`` or mentions none
        of ``self.games``. Within a kept table, rows are kept when their ``Game``
        label contains ``game`` or the word "all" (case-insensitive).

        Args:
            table_results: ``(reference, description, DataFrame)`` tuples; each
                frame needs a ``Game`` column.
            game: game name, matched as a plain substring (so ``'IWD'`` also
                matches a label containing ``'IWD2'``).

        Returns:
            ``{reference: cleaned DataFrame}`` with ``Game`` removed.
        """
        # Names are escaped so a regex metacharacter in one cannot change the match.
        any_game = re.compile('|'.join(re.escape(name) for name in self.games))
        all_label = re.compile('all', flags=re.I)

        selected = {}
        for ref, desc, df in table_results:
            if game not in desc and any_game.search(desc):
                continue  # the description names other games only
            labels = df['Game']
            keep = labels.str.contains(game, regex=False) | labels.str.contains(all_label)
            rows = df.loc[keep].reset_index(drop=True).drop('Game', axis=1)
            selected[ref] = self.clean_df(rows)
        return selected

    @staticmethod
    def parse_size(s: str):
        """Split a size cell such as ``'4 (dword)'`` or ``'8*4 (bytes)'``.

        Returns:
            ``pd.Series([size, type])`` where ``size`` is the integer (or the
            product for the ``a*b`` form) and ``type`` is the text in brackets;
            ``pd.Series([nan, nan])`` when ``s`` is not a string or does not match.
        """
        if not isinstance(s, str):
            # Missing cells arrive as float NaN, which re.search() would reject with TypeError.
            return pd.Series([np.nan, np.nan])
        match = re.search(r'(?P<num1>\d+)(\s*(?P<mul>\*)\s*(?P<num2>\d+))?\s*\((?P<type>[a-zA-Z0-9* ]+)\)', s)
        if match is None:
            return pd.Series([np.nan, np.nan])
        size = int(match.group('num1'))
        if match.group('mul'):
            size *= int(match.group('num2'))
        return pd.Series([size, match.group('type')])

    @classmethod
    def clean_df(cls, df: pd.DataFrame):
        """Add ``Span``, ``datatype``, ``Start`` and ``End`` columns to ``df``.

        ``df`` is modified in place and also returned. ``Start`` is the ``Offset``
        cell read as hexadecimal (NaN unless the cell is exactly ``0x`` followed
        by hex digits); ``End`` is ``Start + Span``.

        Raises:
            IndexError: if no column name matches ``Size (data type)``.
        """
        df.columns = df.columns.map(lambda x: x.replace('data type', 'datatype'))
        size_col = [c for c in df.columns if re.search(r'Size\s*\(data\s*type\)', c, flags=re.I)][0]
        df[['Span', 'datatype']] = df[size_col].apply(cls.parse_size)

        def to_start(offset):
            # Non-string cells (e.g. NaN) and non-hex text both become NaN.
            if isinstance(offset, str) and re.fullmatch(r'0x[0-9a-fA-F]+', offset):
                return int(offset, base=16)
            return np.nan

        df['Start'] = df['Offset'].apply(to_start)
        df['End'] = df['Start'] + df['Span']
        return df

    @classmethod
    def collect(cls, keywords: 'Union[str, re.Pattern]' = re.compile('|'.join([
            'Party Gold', 'PC structs', 'Party Reputation', 'Character Name', 'CRE resource data', '^Name$',
            'Name of Protagonist', 'CRE Structure', 'Long Name', 'Short Name', 'Creature Flags', 'XP of the creature',
            'Gold Carried', 'Permanent Status Flags', 'Maximum Hit Point', 'Hide in Shadows', 'Armor Class',
            'THAC0', 'Number of Attacks', 'Save Versus', r'Resist\s\w+\s\(0-100\)', 'Detect Illusion', 'Set Traps',
            'Lore', 'Lockpicking', 'Move Silently', 'Disarm Traps', 'Pick Pockets', 'Luck', 'Proficiency',
            'Tracking Target', 'Turn Undead',
            'Tracking', 'Stealth', r'Current\s\w+\sClass Experience', r'Level\s\w+\sClass', '^Sex$', 'Strength',
            'Strength % Bonus',
            'Intelligence', 'Wisdom', 'Dexterity', 'Constitution', 'Charisma', 'Morale', 'Morale Break',
            'Racial Enemy', 'Kit Information',
            r'General \(General\.IDS\)', r'Race \(Race\.IDS\)', r'Class \(Class\.IDS\)',
            r'Specific \(Specific\.IDS\)', r'Gender \(Gender\.IDS\)',
            'Alignment', 'Known Spells', 'Spell Memorization info', 'Memorized Spells', 'Offset to Item',
            'Count to Item',
            'Offset to Effects', 'Count to Effects', 'Highest Attained Level', r'XP \(\w+ class\)',
            'Item Slots Offset',
            'Items Offset', 'Items Count',
        ]), flags=re.I)):
        """Gather the rows whose ``Description`` matches ``keywords`` from every member.

        Args:
            keywords: regular expression (string or compiled) searched for in each
                row's ``Description``; the default is case-insensitive.

        Returns:
            One concatenated DataFrame with three extra columns: ``Game``, ``File``
            (member-name prefix) and ``Segment`` (table reference after its last
            underscore). ``Span``, ``Start`` and ``End`` may contain NaN, so they
            are not cast to ``int``.

        Raises:
            ValueError: from ``pd.concat`` if there are no tables at all.
        """
        dfs = []
        for member_name, member in cls.__members__.items():
            file_type = member_name.split('_')[0]
            for game, tables in member.segment_tables.items():
                for seg, df in tables.items():
                    # na=False: a missing description is a non-match rather than an invalid mask.
                    hits = df['Description'].str.contains(keywords, na=False)
                    dfs.append(df[hits].assign(Game=game, File=file_type, Segment=seg.split('_')[-1]))
        return pd.concat(dfs)
                               
                    
            
    

In [131]:
# Combined table of the keyword-matched rows from every WebResource member (default keyword set).
df1 = WebResource.collect()