# Dungleon Study

## Preparation

### Install Python packages

In [None]:
%pip install -qq --upgrade interpret gensim umap-learn umap-learn[plot]

### Fetch data
Reference: https://github.com/woctezuma/dungleon

#### Representations of characters

In [None]:
# Markdown characters, e.g. "":fr:"" for the frog
!curl -o characters.md https://raw.githubusercontent.com/woctezuma/dungleon/main/data/characters.md

# Emojis, e.g. "🐸" for the frog
!curl -o emojis.txt https://raw.githubusercontent.com/woctezuma/dungleon/main/data/emojis.txt

# Letters, e.g. "F" for the frog
!curl -o letters.txt https://raw.githubusercontent.com/woctezuma/dungleon/main/data/letters.txt

# Words, e.g. "Frog" for the frog
!curl -o characters.txt https://raw.githubusercontent.com/woctezuma/dungleon/main/data/characters.txt

#### Solutions represented with Markdown characters

In [None]:
# Name of the solutions file: used both as the remote file name (under data/ in
# the repository) and as the local file name it is saved to and loaded from.
SOLUTION_FNAME = "solutions.seen.md"

In [None]:
!curl -o {SOLUTION_FNAME} https://raw.githubusercontent.com/woctezuma/dungleon/main/data/{SOLUTION_FNAME}

## Data pre-processing



### Define functions

In [None]:
def load_file(fname, encoding='utf-8'):
  """Read a text file and return its lines with surrounding whitespace removed.

  Args:
    fname: Path of the file to read.
    encoding: Text encoding of the file. Defaults to UTF-8 so that the emoji
      files are decoded the same way on every platform, instead of relying on
      the locale's default encoding.

  Returns:
    A list with one stripped string per line of the file. Blank lines are kept
    (as empty strings) so that parallel files stay aligned line by line.
  """
  with open(fname, 'r', encoding=encoding) as f:
    # Iterating over the file object yields its lines; no need for readlines().
    return [line.strip() for line in f]

def build_dictionary(doc_markdown, doc_emojis, doc_letters, doc_words):
  """Chain the four representations of each character in a single lookup table.

  The inputs are parallel sequences: item k of each one describes the same
  character. For every character, each representation is mapped to the next
  one, in a cycle: markdown -> emoji -> letter -> word -> markdown.

  Iteration stops at the shortest input. If the same string occurs more than
  once as a key, the assignment made last wins.

  Returns:
    A dict mapping each representation to the next one in the cycle.
  """
  lookup = {}
  for row in zip(doc_markdown, doc_emojis, doc_letters, doc_words):
    # Rotate the row by one position to pair each item with its successor.
    successors = row[1:] + row[:1]
    for key, value in zip(row, successors):
      lookup[key] = value
  return lookup

def to_solution_as_emojis(solution_as_markdown, mapping=None):
  """Convert a solution from Markdown text to a list of emojis, for compactness.

  Args:
    solution_as_markdown: Whitespace-separated Markdown characters,
      e.g. ":fr: :ba:".
    mapping: Lookup table from Markdown character to emoji. When omitted, the
      notebook-level dictionary `d` is used, as before; passing it explicitly
      makes the function usable without that global.

  Returns:
    The list of emojis, one per Markdown character, in the same order.

  Raises:
    KeyError: If a Markdown character is missing from the lookup table.
  """
  if mapping is None:
    mapping = d
  return [mapping[markdown] for markdown in solution_as_markdown.split()]

def show_solution_as_emojis(texts, num_to_display = 10):
  """Print the first solutions, one per line, in sorted order.

  Args:
    texts: Solutions, each one a sequence of strings (emojis).
    num_to_display: Maximum number of solutions to print, taken from the start
      of `texts`. Sorting is applied after this selection.
  """
  limit = min(len(texts), num_to_display)
  rendered = [''.join(solution) for solution in texts[:limit]]
  rendered.sort()
  print('\n'.join(rendered))

### Create a dictionary

In [None]:
# Build the lookup table from the four parallel files fetched earlier.
d = build_dictionary(
  doc_markdown=load_file('characters.md'),
  doc_emojis=load_file('emojis.txt'),
  doc_letters=load_file('letters.txt'),
  doc_words=load_file('characters.txt'),
)

# Sanity check: follow the frog through the whole cycle
# (markdown -> emoji -> letter -> word -> markdown).
for frog_key in (":fr:", "🐸", "F", "Frog"):
  print(d[frog_key])

### Reformat solutions

In [None]:
# One list of emojis per line of the solutions file.
texts = list(map(to_solution_as_emojis, load_file(SOLUTION_FNAME)))

# Preview a handful of converted solutions.
show_solution_as_emojis(texts, num_to_display=10)

### Create a dataset of masked solutions

An emoji is hidden and will have to be predicted.

In [None]:
# Placeholder put in the hidden slot; it is wrapped in a list so that it can be
# concatenated with slices of a solution.
dummy_character = ['N/A']
# One label per slot (position) of a solution.
feature_names = ["🇦", "🇧", "🇨", "🇩", "🇪"]

# For each slot label: {"X": masked solutions, "y": hidden emojis}.
masked_data = {}

for i, slot in enumerate(feature_names):
  # Features: every solution with slot i replaced by the placeholder.
  X = [t[:i] + dummy_character + t[(i+1):] for t in texts]
  # Targets: the emoji which was hidden at slot i.
  y = [t[i] for t in texts]

  masked_data[slot] = {"X": X, "y": y}

## "Explainable Boosting Machine" with InterpretML
Reference: https://interpret.ml/docs/ebm.html

In [None]:
# 0-based position of the slot whose hidden emoji will be predicted.
slot_no = 0
assert slot_no >= 0 and slot_no < len(feature_names)

# Label of that slot, used as the key into masked_data.
slot = feature_names[slot_no]
print(f"Slot: {slot}")

In [None]:
from interpret.glassbox import ExplainableBoostingClassifier
from interpret import show

# Training data for the selected slot: masked solutions as features,
# hidden emoji as target.
X = masked_data[slot]["X"]
y = masked_data[slot]["y"]

# One feature per slot; the masked slot holds the same placeholder ('N/A') in every row.
ebm = ExplainableBoostingClassifier(feature_names=feature_names)
ebm.fit(X, y)

# Global explanation of the fitted model, rendered with InterpretML's show().
ebm_global = ebm.explain_global()
show(ebm_global)

## Word2Vec with Gensim

Reference: https://radimrehurek.com/gensim/auto_examples/core/run_corpora_and_vector_spaces.html

In [None]:
import logging

# Log at INFO level with timestamp, level name and message
# (presumably to surface gensim's progress messages in the cells below).
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
from gensim import corpora

# Vocabulary built from the solutions (each emoji is a token).
dictionary = corpora.Dictionary(texts)
# Bag-of-words form of every solution.
corpus = list(map(dictionary.doc2bow, texts))

### Transformations

Reference: https://radimrehurek.com/gensim/auto_examples/core/run_topics_and_transformations.html

In [None]:
from gensim import models

# Number of latent dimensions / topics for both LSI and LDA.
num_topics = 2

# TF-IDF weighting of the bag-of-words corpus.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# LSI on top of the TF-IDF corpus.
lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics)
corpus_lsi = lsi_model[corpus_tfidf]

# LDA on the raw bag-of-words corpus. LDA training is randomized, so the seed is
# fixed to get the same topics on every "Restart & Run All".
lda_model = models.LdaModel(corpus, id2word=dictionary, num_topics=num_topics, random_state=42)
corpus_lda = lda_model[corpus]

### Word2Vec

Reference: https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html

In [None]:
import numpy as np

# Dimension of the emoji embeddings.
vector_size = 5

# min_count=1 keeps every emoji, even those seen only once.
# Training is randomized: a fixed seed together with a single worker thread makes
# the run repeatable within one interpreter. (Across interpreter launches, gensim
# additionally requires PYTHONHASHSEED to be set.)
model = models.Word2Vec(texts, min_count=1, vector_size=vector_size, seed=42, workers=1)

# One embedding per emoji, and the emojis in the matching order.
vectors = np.asarray(model.wv.vectors)
labels = np.asarray(model.wv.index_to_key)

# NOTE(review): with the cycle markdown -> emoji -> letter -> word built by
# build_dictionary, d[em] is the *letter* of the emoji, not its word. If full
# names such as "Frog" are wanted here, use d[d[em]] instead — confirm intent.
labels_with_names = np.array([ d[em] for em in labels])

### Clustering

In [None]:
import sklearn.cluster as cluster

n_clusters = 4

# KMeans starts from random centroids; fixing random_state keeps the cluster
# assignment (and the cluster numbering) identical between runs.
kmeans_labels = cluster.KMeans(n_clusters=n_clusters, n_init='auto', random_state=42).fit_predict(vectors)

# Print the emojis of each cluster, one cluster per line.
for cref in sorted(set(kmeans_labels)):
  cluster_content = [n for n,c in zip(labels, kmeans_labels) if cref == c]
  cluster_content_as_str = ' '.join(cluster_content)
  print(f'Cluster n°{cref}:\t{cluster_content_as_str}')

### Visualization

References:
- https://github.com/lmcinnes/umap
- https://matplotlib.org/stable/tutorials/colors/colormaps.html

In [None]:
import umap
import umap.plot

# Size of the local neighbourhood used by UMAP. A small value focuses on local
# structure; the largest usable value would be vectors.shape[0] - 1.
n_neighbors = 3

# The smaller the distance, the closer to each other points are allowed to be.
min_dist = 0.3

# UMAP is stochastic: a fixed random_state makes the layout reproducible.
mapper = umap.UMAP(n_neighbors=n_neighbors,
                   min_dist=min_dist,
                   metric='cosine',
                   random_state=42).fit(vectors)

# Use the coordinates computed during fit: these are the ones drawn by
# umap.plot.points(), so the text labels land exactly on the plotted points.
embedding = mapper.embedding_

ax = umap.plot.points(mapper, labels=labels_with_names, show_legend=False)

# Annotate every point with its label.
for coord, name in zip(embedding, labels_with_names):
  ax.text(coord[0], coord[1], name, horizontalalignment='center')

## Characters

- :ar:	🏹    	Archer
- :ba:	🦇    	Bat
- :ch:	💰    	Chest
- :co:	🟡    	Coins
- :dr:	🐲    	Dragon
- :fr:	🐸    	Frog
- :go:	👺    	Blade Orc
- :ki:	👑    	The King
- :ma:	🧙‍♀️ 	Mage
- :ne:	👿    	Necromancer
- :or:	👹    	Axe Orc
- :sk:	💀    	Skeleton
- :sp:	🕷    	Spider
- :th:	👤    	Bandit
- N/A	🚫    	N/A
- :ty:	🏆    	Relic
- :vi:	👨‍🌾  	Villager
- :wa:	🤺    	Knight
- :wi:	🧙‍♂️ 	Sorcerer
- :zo:	🧟    	Zombie