In [4]:
# Disabled shell commands for obtaining the data: download the 7z archive of the
# wiki XML dump into ../data, extract it there, then delete the archive.
# Uncomment the three lines below to run them.
#!wget -P ../data https://s3.amazonaws.com/wikia_xml_dumps/g/ge/gensinimpact_pages_current.xml.7z
#!7za x ../data/gensinimpact_pages_current.xml.7z -o../data
#!rm ../data/gensinimpact_pages_current.xml.7z

In [5]:
import re
import mwparserfromhell
from lxml import etree as ET
from pprint import pp

In [6]:
# Path to the extracted wiki XML dump (adjust it if your copy lives elsewhere)
xml_file = '../data/gensinimpact_pages_current.xml'

# Parse the whole XML file into a tree and keep a handle on its root element
tree = ET.parse(xml_file)
root = tree.getroot()

In [7]:
# Collect every <page> element below the root; the root's own namespace map is
# passed so the path lookup is resolved against the document's namespaces.
pages = root.findall(".//page", namespaces=root.nsmap)

In [8]:
def parse_page(page_element):
    """
    Parses a page element and returns a dictionary with its data.

    Each child element is stored under its tag name with the leading
    ``{namespace}`` part removed, using the first rule that applies:

    * the child has non-blank text: the cleaned text is stored (any nested
      elements of that child are ignored);
    * the child has child elements: it is parsed recursively into a dict;
    * the child has attributes: its attribute mapping is stored;
    * otherwise the child is left out of the result.

    Text cleaning collapses whitespace runs into single spaces, removes the
    ``Meta``, ``About``, ``Stub``, ``CustomTabs`` and ``See also`` templates
    and ``{{CharacterTabs}}``, and renames a few leading templates. If the
    cleaned text then starts with ``{{<words> Infobox``, ``<words>`` is stored
    under the extra key ``'class'`` in the same dictionary as the text.

    Args:
        page_element: The XML element representing a page.

    Returns:
        A dictionary containing the page data.
    """
    page_data = {}
    # Iterate the element itself: ``getchildren()`` is deprecated in lxml and
    # no longer exists on the standard-library ElementTree elements.
    for child in page_element:
        tag = child.tag
        # Comments and processing instructions have a non-string tag, which
        # ``re.sub`` cannot handle; they carry no page data, so skip them.
        if not isinstance(tag, str):
            continue
        # use regex to remove the namespace link between brackets
        tag = re.sub(r'\{.*\}', '', tag)
        # Handle elements with text content
        if child.text and len(child.text.strip()) > 0:
            # strip and remove extra spaces
            text = child.text.strip()
            text = re.sub(r"\s+", " ", text).strip()

            # Drop boilerplate templates, one after the other in this order.
            # The match is non-greedy, so it ends at the first "}}".
            for template_name in ('Meta', 'About', 'Stub', 'CustomTabs', 'See also'):
                text = re.sub(
                    r'\{\{\s*' + template_name + r'.*?\}\}',
                    '',
                    text,
                    flags=re.IGNORECASE,
                ).strip()

            text = re.sub(r'^\{\{Character\s+\{\{', '{{CharacterTabs}} {{', text, flags=re.IGNORECASE).strip()
            # Text that *starts* with a character infobox is relabelled as an
            # NPC infobox; text that starts with {{CharacterTabs}} is not
            # touched by this rule and keeps its "Character Infobox".
            text = re.sub(r'^\{\{Character Infobox', '{{NPC Infobox', text, flags=re.IGNORECASE).strip()

            # remove charactertabs
            text = text.replace('{{CharacterTabs}}', '')
            # NOTE(review): these two replacements apply to every occurrence in
            # the text and also to templates already named "{{Wish Infobox",
            # which become "{{Wish Infobox Infobox" (class 'Wish Infobox').
            # Kept as-is because downstream code may rely on these labels;
            # confirm whether that is intended.
            text = text.replace('{{Wish', '{{Wish Infobox')
            text = text.replace('{{Event', '{{Wish Infobox')

            text = text.strip()

            # The words in front of a leading "Infobox" become the class.
            match = re.match(r"^\{\{((\w*\s)*)Infobox", text)
            if match:
                page_data['class'] = match.group(1).strip()

            page_data[tag] = text
        # Handle nested elements recursively
        elif len(child):
            page_data[tag] = parse_page(child)
        # Handle elements with attributes
        else:
            attributes = child.attrib
            if attributes:
                page_data[tag] = attributes
    return page_data

In [9]:
def parse_page_text(text):
    """
    Flattens the template parameters found in a page's wikitext into one dict.

    Every parameter of every template returned by ``filter_templates()`` is
    stored as ``name -> value`` (both converted to stripped strings). When the
    same parameter name occurs more than once, the last occurrence wins.
    Parameters of the "Other Languages" template (name compared
    case-insensitively) are kept apart under the ``"other_languages"`` key.

    Args:
        text: The wikitext of a page.

    Returns:
        A dictionary of parameter names to values, plus an
        ``"other_languages"`` entry holding a dictionary of its own.
    """
    parsed = mwparserfromhell.parse(text)

    translations = {}
    fields = {}
    for tpl in parsed.filter_templates():
        # The template name is the same for all of its parameters, so the
        # destination dictionary is chosen once per template.
        if tpl.name.lower().strip() == "other languages":
            target = translations
        else:
            target = fields
        for prm in tpl.params:
            target[str(prm.name).strip()] = str(prm.value).strip()

    fields["other_languages"] = translations
    return fields

In [37]:
def parse_all_pages(pages):
    """
    Parses every page element and returns the cleaned records of the kept pages.

    A page is skipped when its title contains ":" or when its revision text
    starts with "#redirect" (case-insensitive).

    Args:
        pages: An iterable of XML page elements as accepted by parse_page.

    Returns:
        A list of dictionaries, one per kept page. Each one is the result of
        parse_page without its 'ns' and 'revision' entries, plus:
            'class': the class found by parse_page, or 'Special' if there is none.
            'id':    the 'id' of the revision.
            'data':  the result of parse_page_text for the revision text.

    Raises:
        KeyError: If a page has no 'title', or a kept page lacks 'revision',
            its 'text' or its 'id'.
    """
    parsed_pages = []
    for page in pages:
        parsed_page = parse_page(page)

        # Skip pages with ":" in the title before touching the revision.
        if ":" in parsed_page['title']:
            continue
        revision = parsed_page['revision']
        if re.match(r'^#redirect', revision['text'], re.IGNORECASE):
            continue

        # parse_page stores 'class' next to the text it was found in, i.e. in
        # the revision dictionary. Move it up, defaulting to 'Special', without
        # a bare ``except`` that would also hide unrelated errors.
        if 'class' not in parsed_page:
            parsed_page['class'] = revision.pop('class', 'Special')

        # only keep relevant fields and get rid of revision
        # NOTE(review): this overwrites a page-level 'id' (if parse_page found
        # one) with the revision's 'id' - confirm that this is the intended id.
        parsed_page["id"] = revision["id"]
        parsed_page["data"] = parse_page_text(revision["text"])
        parsed_page.pop("ns", None)  # tolerate pages without an <ns> element
        del parsed_page["revision"]
        parsed_pages.append(parsed_page)
    return parsed_pages

Every item inside == title == or \<\!--title--\> is considered a separate segment. The first segment, which is the main page description, does not start with == title ==.

In [38]:
# Number of <page> elements found in the dump
len(pages)

277971

In [39]:
# Parse all page elements and keep the cleaned records returned by parse_all_pages
parsed_pages = parse_all_pages(pages)

In [40]:
# Spot-check a single parsed record
parsed_pages[1]

{'title': 'Kaeya',
 'id': '1512130',
 'class': 'Character',
 'data': {'image': '<gallery> Kaeya Card.png|Card Character Kaeya Full Wish.png|Wish Character Kaeya Game.png|In-Game </gallery>',
  'type': 'Playable <!--Playable Character Information-->',
  'quality': '4',
  'weapon': 'Sword',
  'element': 'Cryo',
  'name': 'Kaeya <!--Character Information-->',
  'realname': 'Kaeya Alberich',
  'birthday': 'November 30th',
  'constellation': 'Pavo Ocellus',
  'region': 'Mondstadt',
  'region2': "Khaenri'ah",
  'affiliation': 'Knights of Favonius',
  'affiliation2': "Khaenri'ah",
  'affiliation3': 'Alberich Clan',
  'dish': 'Fruity Skewers',
  'namecard': 'Kaeya: Pavo Ocellus',
  'obtainType': 'Standard Wish',
  'obtain': "*Complete [[Crash Course]] *[[Wanderlust Invocation]] (Wish) *[[Paimon's Bargains]]",
  'releaseDate': '2020-09-28',
  'title': 'Frostwind Swordsman <!--Titles-->',
  'title2': 'Quartermaster of the Knights',
  'titleRef2': '<ref>[https://twitter.com/GenshinImpact/status/1

In [41]:
# Sanity check: collect every parsed page that has no 'class' entry.
# An explicit membership test replaces the former bare ``except``, which
# would also have hidden unrelated errors.
problems = []
for page in parsed_pages:
    if 'class' not in page:
        problems.append(page)
len(problems)

0

In [42]:
import pickle

# Save the parsed pages to disk, using the highest pickle protocol available
with open('../data/genshin_database.pickle', 'wb') as handle:
    pickle.dump(parsed_pages, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [43]:
# Read the saved pages back from disk.
# NOTE: pickle.load can execute arbitrary code - only load pickle files you trust.
with open('../data/genshin_database.pickle', 'rb') as handle:
    genshin_db = pickle.load(handle)

In [44]:
# Spot-check a single record of the reloaded data
genshin_db[1]

{'title': 'Kaeya',
 'id': '1512130',
 'class': 'Character',
 'data': {'image': '<gallery> Kaeya Card.png|Card Character Kaeya Full Wish.png|Wish Character Kaeya Game.png|In-Game </gallery>',
  'type': 'Playable <!--Playable Character Information-->',
  'quality': '4',
  'weapon': 'Sword',
  'element': 'Cryo',
  'name': 'Kaeya <!--Character Information-->',
  'realname': 'Kaeya Alberich',
  'birthday': 'November 30th',
  'constellation': 'Pavo Ocellus',
  'region': 'Mondstadt',
  'region2': "Khaenri'ah",
  'affiliation': 'Knights of Favonius',
  'affiliation2': "Khaenri'ah",
  'affiliation3': 'Alberich Clan',
  'dish': 'Fruity Skewers',
  'namecard': 'Kaeya: Pavo Ocellus',
  'obtainType': 'Standard Wish',
  'obtain': "*Complete [[Crash Course]] *[[Wanderlust Invocation]] (Wish) *[[Paimon's Bargains]]",
  'releaseDate': '2020-09-28',
  'title': 'Frostwind Swordsman <!--Titles-->',
  'title2': 'Quartermaster of the Knights',
  'titleRef2': '<ref>[https://twitter.com/GenshinImpact/status/1

In [45]:
# Count the records titled "Kaeya" without building a temporary list
sum(1 for p in genshin_db if p["title"] == "Kaeya")

1

In [46]:
# List the distinct page classes. They are sorted because the iteration order
# of a set of strings can change from one kernel session to the next, which
# made the printed output irreproducible.
class_names = set()
for element in genshin_db:
    class_names.add(element['class'])
classes = sorted(class_names)
print(classes)

['Furnishing', 'Wildlife', 'Book Collection', 'Quest', 'Special', 'Artifact Set', 'Soundtrack', 'Genius Invokation TCG', 'Book', 'Location', 'Achievement Category', 'Traveler', 'Weapon', 'Manga', 'Status', 'Chapter', 'Wish Infobox Series', 'Genius Invokation TCG Skill', 'Achievement Set', 'Food', 'Talent', 'Character', 'Weapon Series', 'Hidden Exploration Objectives', 'Outfit', 'Element', 'Item', 'Enemy', 'Album', 'Act', 'Achievement', 'Component', 'Constellation Overview', 'Constellation', 'Terminology', 'Artifact', 'Domain', 'NPC', 'Faction', 'Wish Infobox', 'Anime', 'Wish']
