Idea:
- Extract entities from the quests
- Replace entities with masks
- Train a causal language model to generate quests containing masks
- Use masked language modeling to fill masks in quests

https://huggingface.co/transformers/usage.html#named-entity-recognition

In [1]:
import json

import pandas as pd
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
import torch

from english_words import english_words_lower_set
from nltk.tokenize import word_tokenize

In [2]:
# Load the ";"-separated quest objectives, then discard the leftover
# "Unnamed: 0" column before previewing the first rows.
df = pd.read_csv("../data/quests/quest_objectives.csv", sep=";")
df = df.drop(columns=["Unnamed: 0"])
df.head()

Unnamed: 0,source,id,title,objectives
0,skyrim,55074,A Blade in the Dark,Talk to Delphine \nLocate the dragon burial si...
1,skyrim,55212,A Chance Arrangement,Talk to Brynjolf about joining the Thieves Gui...
2,skyrim,51114,A Cornered Rat,Talk to Brynjolf \nSearch the Ratway for Esber...
3,skyrim,52017,A Daedra's Best Friend,Talk with Barbas \nTravel with Barbas to the s...
4,skyrim,63392,A False Front (Imperials),Find the Stormcloak courier \nRetrive the Stor...


In [3]:
# Default named-entity-recognition pipeline; grouped_entities=True makes it
# return whole entities (see 'entity_group' in the output below) instead of
# one result per token.
nlp = pipeline(task="ner", grouped_entities=True)

In [4]:
# Use the objectives of the second quest as a worked example.
sentence = df["objectives"].iloc[1]
print(sentence)

Talk to Brynjolf about joining the Thieves Guild 
Meet Brynjolf during daytime 
Steal Madesi's ring from his strongbox 
Plant Madesi's ring 
Speak to Brynjolf


In [5]:
# Run the NER pipeline on the example objectives to inspect the entity groups it returns.
nlp(sentence)

[{'entity_group': 'I-PER', 'score': 0.9526254137357076, 'word': 'Brynjolf'},
 {'entity_group': 'I-ORG',
  'score': 0.9813089668750763,
  'word': 'Thieves Guild'},
 {'entity_group': 'I-PER', 'score': 0.9145381450653076, 'word': 'Brynjolf'},
 {'entity_group': 'I-PER', 'score': 0.9662201702594757, 'word': 'Madesi'},
 {'entity_group': 'I-PER', 'score': 0.9338123500347137, 'word': 'Madesi'},
 {'entity_group': 'I-PER', 'score': 0.813558836778005, 'word': 'Brynjolf'}]

In [14]:
# Placeholder token substituted for each entity type returned by the NER pipeline.
entity_map = {
    "I-PER": "xperx",
    "I-ORG": "xorgx",
    "I-LOC": "xlocx",
    "I-MISC": "xmiscx"
}

# Maps each entity text found in the quest objectives to its placeholder token.
token_entity_map = {}

# The loop variable is deliberately not called `sentence`, so the example
# sentence defined in the earlier cell is not overwritten. Missing objectives
# are skipped because the pipeline only accepts text.
for objectives in df["objectives"].dropna().values:
    for ent in nlp(objectives):
        # Depending on the transformers version the grouped label may be
        # "I-PER", "B-PER" or plain "PER"; normalise to the "I-" form used as
        # keys of `entity_map` so none of them raises a KeyError.
        label = "I-" + ent["entity_group"].split("-")[-1]
        mask = entity_map.get(label)
        # Ignore labels without a placeholder and single-character entities.
        if mask is not None and len(ent["word"]) > 1:
            token_entity_map[ent["word"]] = mask

In [15]:
# Save the entity -> placeholder mapping as JSON.
with open("../data/assets/entities.json", mode="w") as outfile:
    outfile.write(json.dumps(token_entity_map))

---

In [16]:
def replace_sentence(sentence, token_map):
    """Replace every known entity in ``sentence`` with its mask token.

    The sentence is tokenized with ``word_tokenize`` and scanned left to
    right. At each position the longest run of tokens matching a key of
    ``token_map`` is replaced by that key's value, so multi-word entities
    such as ``"Thieves Guild"`` are masked as a unit instead of being left
    untouched by a token-by-token lookup.

    Args:
        sentence: Text to mask.
        token_map: Mapping from entity text (one or more whitespace-separated
            words) to the replacement token.

    Returns:
        The tokens, with entities replaced, joined by single spaces; the
        original spacing between tokens is therefore not preserved.
    """
    # Index entities by their word sequence so they can be compared against
    # slices of the token list.
    phrases = {}
    for entity, mask in token_map.items():
        words = tuple(entity.split())
        if words:
            phrases[words] = mask
    longest = max(map(len, phrases), default=0)

    tokens = word_tokenize(sentence)
    replaced = []
    position = 0
    while position < len(tokens):
        # Try the longest candidate first so "Thieves Guild" wins over "Thieves".
        for size in range(min(longest, len(tokens) - position), 0, -1):
            candidate = tuple(tokens[position:position + size])
            if candidate in phrases:
                replaced.append(phrases[candidate])
                position += size
                break
        else:
            # No entity starts at this token: keep it unchanged.
            replaced.append(tokens[position])
            position += 1
    return ' '.join(replaced)

In [17]:
# Mask the entities in each quest's objectives and store the result in a new column.
df["objectives_entities"] = df["objectives"].map(
    lambda objectives: replace_sentence(objectives, token_entity_map)
)

In [18]:
# Spot-check the masking on one randomly chosen quest.
df["objectives_entities"].sample().values

array(['Befriend the xorgx xmiscx xlocx Assist five citizens xmiscx xlocx Purchase xlocx Return to the xorgx'],
      dtype=object)

In [19]:
# Write the quests, including the masked objectives, as a ";"-separated CSV
# without the index column.
df.to_csv(
    path_or_buf="../data/quests/quest_objectives_entities.csv",
    index=False,
    sep=";",
)

---