In [1]:
import numpy as np
import pandas as pd

# Load data

In [2]:
import requests

url = 'https://raw.githubusercontent.com/vorpalhex/srd_spells/master/spells.json'
r = requests.get(url)
spells = r.json()

In [3]:
spells_df = pd.DataFrame(spells)

In [4]:
spells_df.head()

Unnamed: 0,casting_time,classes,components,description,duration,higher_levels,level,name,range,ritual,school,tags,type
0,1 action,"[sorcerer, wizard]","{'material': False, 'raw': 'V, S', 'somatic': ...",You hurl a bubble of acid. Choose one creature...,Instantaneous,,cantrip,Acid Splash,60 feet,False,Conjuration,"[sorcerer, wizard, cantrip]",Conjuration cantrip
1,1 action,"[ranger, wizard]","{'material': True, 'materials_needed': ['a tin...",You set an alarm against unwanted intrusion. C...,8 hours,,1,Alarm,30 feet,True,abjuration,"[ranger, wizard, level1]",1st-level abjuration (ritual)
2,1 action,"[bard, druid, ranger]","{'material': True, 'materials_needed': ['a mor...",This spell lets you convince a beast that you ...,24 hours,When you cast this spell using a spell slot of...,1,Animal Friendship,30 feet,False,enchantment,"[bard, druid, ranger, level1]",1st-level enchantment
3,1 action,"[bard, cleric]","{'material': True, 'materials_needed': ['a dro...",Up to three creatures of your choice that you ...,"Concentration, up to 1 minute",When you cast this spell using a spell slot of...,1,Bane,30 feet,False,enchantment,"[bard, cleric, level1]",1st-level enchantment
4,1 action,"[bard, sorcerer, warlock, wizard]","{'material': False, 'raw': 'V, S', 'somatic': ...",You extend your hand and trace a sigil of ward...,1 Round,,cantrip,Blade Ward,Self,False,Abjuration,"[bard, sorcerer, warlock, wizard, cantrip]",Abjuration cantrip


Some data cleaning is necessary.

In [5]:
spells_df = spells_df.fillna('')

In [6]:
spells_df[spells_df.level=='wind']

Unnamed: 0,casting_time,classes,components,description,duration,higher_levels,level,name,range,ritual,school,tags,type
166,10 minutes,"[cleric, druid, wizard]","{'material': True, 'materials_needed': ['burni...",You take control of the weather within 5 miles...,"Concentration, up to 8 hours",,wind,Control Weather,Self (5-mile radius),False,,"[cleric, druid, wizard, level8]",Wind


In [7]:
spells_df.loc[166, 'level'] = '8'

In [8]:
spells_df['level'] = spells_df.level.apply(lambda x: '0' if x == 'cantrip' else x)

# Text analysis methods
This follows Patrick Harrison's [**Modern NLP in Python**](https://github.com/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb) tutorial.

In [9]:
import spacy

nlp = spacy.load('en')

In [10]:
spells_df.shape

(379, 13)

In [11]:
spell = nlp(spells_df.description.iloc[10])

In [12]:
for i, x in enumerate(spell.sents):
    print(i, x)

0 A dazzling array of flashing, colored light springs from your hand.
1 Roll 6d10; the total is how many hit points of creatures this spell can effect.
2 Creatures in a 15-foot cone originating from you are affected in ascending order of their current hit points (ignoring unconscious creatures and creatures that can't see).


3 Starting with the creature that has the lowest current hit points, each creature affected by this spell is blinded until the spell ends.
4 Subtract each creature's hit points from the total before moving on to the creature with the next lowest hit points.
5 A creature's hit points must be equal to or less than the remaining total for that creature to be affected.


In [13]:
for i, ent in enumerate(spell.ents):
    print ('{}: {} - {}'.format(i, ent, ent.label_))

0: Roll 6d10 - PERSON
1: 15-foot - CARDINAL


### Phrase modeling

Due to the highly structured nature of the corpus, phrase modeling will be an effective method for reducing dimensionality.

In [14]:
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.models.word2vec import LineSentence

Using TensorFlow backend.


In [15]:
def punct_space(token):
    return token.is_punct or token.is_space

def entry_to_lines(filename):
    with open(filename, encoding='utf_8') as f:
        for entry in f:
            yield entry.replace('\\n', '\n')
    return
            
def lemmatized_sentence_corpus(filename):
    for parsed_review in nlp.pipe(entry_to_lines(filename),
                                  batch_size=10000, n_threads=4):
        for sent in parsed_review.sents:
            yield u' '.join([token.lemma_ for token in sent if not punct_space(token)])

In [16]:
import tempfile

raw_spells = tempfile.NamedTemporaryFile(mode='w')
with open(raw_spells.name, 'w') as f:
    for x in spells_df.description:
        if type(x) is str:
            f.write(x)
        f.write('\n')

In [17]:
unigram = tempfile.NamedTemporaryFile(mode='w')

with open(unigram.name, 'w') as f:
    for sentence in lemmatized_sentence_corpus(raw_spells.name):
        f.write(sentence + '\n')

unigram_sentences = LineSentence(unigram.name)

In [18]:
bigram_model = Phrases(unigram_sentences)

bigram = tempfile.NamedTemporaryFile(mode='w')

with open(bigram.name, 'w') as f:
    for unigram_sentence in unigram_sentences:
        bigram_sentence = u' '.join(bigram_model[unigram_sentence])
        f.write(bigram_sentence + '\n')

bigram_sentences = LineSentence(bigram.name)



In [19]:
trigram_model = Phrases(bigram_sentences)

trigram = tempfile.NamedTemporaryFile(mode='w')

with open(trigram.name, 'w') as f:
    for bigram_sentence in bigram_sentences:
        trigram_sentence = u' '.join(trigram_model[bigram_sentence])
        f.write(trigram_sentence + '\n')

trigram_sentences = LineSentence(trigram.name)



### Word vector embedding

In [20]:
total_words = sum(len(x) for x in trigram_sentences)

In [21]:
from gensim.models import Word2Vec

word2vec_file = tempfile.NamedTemporaryFile(mode='w')

spell2vec = Word2Vec(trigram_sentences, size=100, window=5, min_count=5, workers=4)
spell2vec.save(word2vec_file.name)

# perform another 11 epochs of training
for i in range(11):
    spell2vec.train(trigram_sentences, total_words=total_words, epochs=1)
    spell2vec.save(word2vec_file.name)

print(u'{} training epochs so far.'.format(spell2vec.train_count))

spell2vec.init_sims()

12 training epochs so far.


In [22]:
len(spell2vec.wv.vocab)

966

In [23]:
ordered_vocab = [(term, voc.index, voc.count) for term, voc in spell2vec.wv.vocab.items()]
ordered_vocab = sorted(ordered_vocab, key=lambda x: -x[2])
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)
word_vectors = pd.DataFrame(spell2vec.wv.syn0norm[term_indices, :],
                            index=ordered_terms)
word_vectors.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
-PRON-,0.075745,-0.142208,-0.040497,-0.160118,0.01247,-0.069556,-0.04972,-0.026434,0.017886,0.012021,...,0.061241,-0.062969,0.043747,-0.25172,0.084236,0.17,-0.143195,0.239713,0.10145,-0.083604
the,0.044114,-0.108344,-0.133454,-0.036152,-0.052254,-0.125946,-0.113244,0.005527,-0.055453,0.2128,...,-0.101534,0.059824,-0.133685,-0.122317,0.023074,0.009635,-0.045694,0.144171,-0.018857,-0.036276
a,-0.1956,0.039141,0.00284,-0.015285,-0.011551,-0.025759,-0.121911,0.174815,-0.052116,0.011435,...,0.024179,-0.065814,-0.085807,0.021879,-0.032526,-0.200594,0.027359,0.270756,-0.071218,0.17025
of,0.063009,0.020924,0.046436,0.018964,0.039105,-0.024931,-0.031004,-0.040194,-0.03288,0.086891,...,-0.022015,-0.140763,-0.098863,-0.020423,0.005102,-0.043631,0.018367,0.104076,0.040446,0.097542
creature,-0.079563,-0.155791,-0.057823,-0.094326,-0.024565,-0.048661,-0.087624,0.047916,-0.098794,0.064731,...,0.076434,-0.056793,-0.135326,-0.232481,0.066032,-0.003538,-0.085275,0.318385,-0.033788,0.079978


#### [Todo]: visualize, and clean up the code above. Figure out how to use the model to create the lemmatized text for each example, and to prepare it in a dataframe for doc2vec.

# Doc2Vec
We'll follow the gensim Doc2Vec [tutorial](https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-lee.ipynb).

#### Goal
learn a vector embedding for each document in the corpus, and train a multinomial classifier on the vector embeddings.

In [24]:
def read_corpus(fname, training=False):
    with open(fname, 'r') as f:
        for i, line in enumerate(f):
            if training:
                yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(line), [i])
            else:
                yield gensim.utils.simple_preprocess(line)

In [25]:
spells_df.head()

Unnamed: 0,casting_time,classes,components,description,duration,higher_levels,level,name,range,ritual,school,tags,type
0,1 action,"[sorcerer, wizard]","{'material': False, 'raw': 'V, S', 'somatic': ...",You hurl a bubble of acid. Choose one creature...,Instantaneous,,0,Acid Splash,60 feet,False,Conjuration,"[sorcerer, wizard, cantrip]",Conjuration cantrip
1,1 action,"[ranger, wizard]","{'material': True, 'materials_needed': ['a tin...",You set an alarm against unwanted intrusion. C...,8 hours,,1,Alarm,30 feet,True,abjuration,"[ranger, wizard, level1]",1st-level abjuration (ritual)
2,1 action,"[bard, druid, ranger]","{'material': True, 'materials_needed': ['a mor...",This spell lets you convince a beast that you ...,24 hours,When you cast this spell using a spell slot of...,1,Animal Friendship,30 feet,False,enchantment,"[bard, druid, ranger, level1]",1st-level enchantment
3,1 action,"[bard, cleric]","{'material': True, 'materials_needed': ['a dro...",Up to three creatures of your choice that you ...,"Concentration, up to 1 minute",When you cast this spell using a spell slot of...,1,Bane,30 feet,False,enchantment,"[bard, cleric, level1]",1st-level enchantment
4,1 action,"[bard, sorcerer, warlock, wizard]","{'material': False, 'raw': 'V, S', 'somatic': ...",You extend your hand and trace a sigil of ward...,1 Round,,0,Blade Ward,Self,False,Abjuration,"[bard, sorcerer, warlock, wizard, cantrip]",Abjuration cantrip


# [TODO]: Techniques to apply

* Bag of words - construct weighted encoding of key words, train on weights
    * Tf–idf or counter
* word2vec - construct word embeddings, average, and train on average
    * consider using an auto-encoder
* feature engineering

use a more sophisticated loss function (ordinal regression - c.f. mord on pypi doesn't work)

Lee, Yoonkyung, Yi Lin, and Grace Wahba. "Multicategory support
vector machines: Theory and application to the classification of
microarray data and satellite radiance data." Journal of the
American Statistical Association 99.465 (2004): 67-81

## Bag of words with logistic regression

In [26]:
X = spells_df['description'] + spells_df['higher_levels']
y = spells_df['level']

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', LogisticRegression()),
])

pipeline.fit(X, y)

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [28]:
pipeline.score(X, y)

0.75197889182058042

This is surprisingly good! I suspect that it's overfitting, though. I need to visualize the worse offenders.

In [29]:
pred = pd.DataFrame(pipeline.predict_proba(X), columns=pipeline.classes_)
pred['level'] = y
pred['level_hat'] = pipeline.predict(X)
pred['name'] = spells_df['name']

In [30]:
pred[pred.level != pred.level_hat].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,level,level_hat,name
24,0.207831,0.21308,0.129044,0.087321,0.078354,0.088914,0.072199,0.041186,0.044081,0.037991,0,1,Eldritch Blast
49,0.18516,0.116862,0.210702,0.129589,0.072867,0.079722,0.063907,0.049409,0.048579,0.043203,0,2,Light
55,0.131843,0.165125,0.155568,0.134159,0.075335,0.10694,0.087367,0.053545,0.047564,0.042553,0,1,Message
71,0.149884,0.151201,0.191585,0.129379,0.068979,0.106455,0.06396,0.048477,0.045328,0.044753,0,2,Shillelagh
86,0.058,0.264448,0.250306,0.110267,0.072967,0.072915,0.064827,0.040513,0.030334,0.035424,2,1,Aid


## word2vec with logistic regression

### Continuous bag of words (CBOW) vs Skip-gram
Skip-gram tends to perform better then CBOW on large datasets, while CBOW generally outperforms Skip-gram on small datasets; see this [Tensorflow tutorial](https://www.tensorflow.org/tutorials/word2vec) for references.

In [31]:
import tempfile

corpus_file = tempfile.NamedTemporaryFile(mode='w')
with open(corpus_file.name, 'w') as f:
    for x in spells_df.description:
        if type(x) is str:
            f.write(x)
        f.write('\n')

unigram_file = tempfile.NamedTemporaryFile(mode='w')
with open(unigram_file.name, 'w') as f:
    for sentence in lemmatized_sentence_corpus(corpus_file.name):
        f.write(sentence + '\n')

unigram_sentences = LineSentence(unigram_file.name)

In [32]:
def build_models(corpus, passes=1):
    models = list()
    fname = make_lemmatized_corpus(corpus)

    for i in range(passes):
        sentences = LineSentence(fname)
        model, fname = create_model(sentences)
        models.append(model)
    return models


def make_lemmatize_corpus(corpus):
    tmp_file = tempfile.NamedTemporaryFile(mode='w')
    with open(tmp_file.name, 'w') as f:
        for sentence in lemmatized_sentence_corpus(corpus_file.name):
            f.write(sentence + '\n')
    return tmp_file.name


def create_model(sentences):
    phrases = Phrases(sentences)
    model = Phraser(phrases)
    tmp_file = tempfile.NamedTemporaryFile(mode='w')
    with open(tmp_file.name, 'w') as f:
        for sentence in sentences:
            sentence = u' '.join(model[sentence])
            f.write(sentence + '\n')
    return model, tmp_file.name

In [33]:
total_words = sum(len(x) for x in trigram_sentences)
word2vec_file = tempfile.NamedTemporaryFile(mode='w')

spell2vec = Word2Vec(trigram_sentences, size=100, window=5, min_count=5, workers=4)
spell2vec.save(word2vec_file.name)

# perform another 11 epochs of training
for i in range(11):
    spell2vec.train(trigram_sentences, total_words=total_words, epochs=1)
    spell2vec.save(word2vec_file.name)

print(u'{} training epochs so far.'.format(spell2vec.train_count))

spell2vec.init_sims()
len(spell2vec.wv.vocab)

12 training epochs so far.


966

## Using only Keras

In [34]:
corpus_file.name

'/var/folders/4d/2_vxy65n171blyghwfcl628h0000gn/T/tmp3jl4pn5r'

1. tokenize
1. embed


In [35]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on

AttributeError: 'Tokenizer' object has no attribute 'fit_on'

In [None]:
from keras import preprocessing


In [None]:
from keras.layers import Embedding