In [1]:
import numpy as np
import pandas as pd

# Load data

In [2]:
import requests

url = 'https://raw.githubusercontent.com/vorpalhex/srd_spells/master/spells.json'
r = requests.get(url)
spells = r.json()

In [3]:
spells_df = pd.DataFrame(spells)

In [4]:
spells_df.head()

Unnamed: 0,casting_time,classes,components,description,duration,higher_levels,level,name,range,ritual,school,tags,type
0,1 action,"[sorcerer, wizard]","{'material': False, 'raw': 'V, S', 'somatic': ...",You hurl a bubble of acid. Choose one creature...,Instantaneous,,cantrip,Acid Splash,60 feet,False,Conjuration,"[sorcerer, wizard, cantrip]",Conjuration cantrip
1,1 action,"[ranger, wizard]","{'material': True, 'materials_needed': ['a tin...",You set an alarm against unwanted intrusion. C...,8 hours,,1,Alarm,30 feet,True,abjuration,"[ranger, wizard, level1]",1st-level abjuration (ritual)
2,1 action,"[bard, druid, ranger]","{'material': True, 'materials_needed': ['a mor...",This spell lets you convince a beast that you ...,24 hours,When you cast this spell using a spell slot of...,1,Animal Friendship,30 feet,False,enchantment,"[bard, druid, ranger, level1]",1st-level enchantment
3,1 action,"[bard, cleric]","{'material': True, 'materials_needed': ['a dro...",Up to three creatures of your choice that you ...,"Concentration, up to 1 minute",When you cast this spell using a spell slot of...,1,Bane,30 feet,False,enchantment,"[bard, cleric, level1]",1st-level enchantment
4,1 action,"[bard, sorcerer, warlock, wizard]","{'material': False, 'raw': 'V, S', 'somatic': ...",You extend your hand and trace a sigil of ward...,1 Round,,cantrip,Blade Ward,Self,False,Abjuration,"[bard, sorcerer, warlock, wizard, cantrip]",Abjuration cantrip


In [42]:
spells_df = spells_df.fillna('')

# Text analysis methods
This follows Patrick Harrison's [**Modern NLP in Python**](https://github.com/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb) tutorial.

In [5]:
import spacy

nlp = spacy.load('en')

In [6]:
spells_df.shape

(379, 13)

In [7]:
spell = nlp(spells_df.description.iloc[10])

In [8]:
for i, x in enumerate(spell.sents):
    print(i, x)

0 A dazzling array of flashing, colored light springs from your hand.
1 Roll 6d10; the total is how many hit points of creatures this spell can effect.
2 Creatures in a 15-foot cone originating from you are affected in ascending order of their current hit points (ignoring unconscious creatures and creatures that can't see).


3 Starting with the creature that has the lowest current hit points, each creature affected by this spell is blinded until the spell ends.
4 Subtract each creature's hit points from the total before moving on to the creature with the next lowest hit points.
5 A creature's hit points must be equal to or less than the remaining total for that creature to be affected.


In [9]:
for i, ent in enumerate(spell.ents):
    print ('{}: {} - {}'.format(i, ent, ent.label_))

0: Roll 6d10 - PERSON
1: 15-foot - CARDINAL


### Phrase modeling

Due to the highly structured nature of the corpus, phrase modeling will be an effective method for reducing dimensionality.

In [10]:
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

Using TensorFlow backend.


In [11]:

def punct_space(token):
    return token.is_punct or token.is_space

def entry_to_lines(filename):
    with open(filename, encoding='utf_8') as f:
        for entry in f:
            yield entry.replace('\\n', '\n')
    return
            
def lemmatized_sentence_corpus(filename):
    for parsed_review in nlp.pipe(entry_to_lines(filename),
                                  batch_size=10000, n_threads=4):
        for sent in parsed_review.sents:
            yield u' '.join([token.lemma_ for token in sent if not punct_space(token)])

In [12]:
import tempfile

raw_spells = tempfile.NamedTemporaryFile(mode='w')
with open(raw_spells.name, 'w') as f:
    for x in spells_df.description:
        if type(x) is str:
            f.write(x)
        f.write('\n')

In [13]:
unigram = tempfile.NamedTemporaryFile(mode='w')

with open(unigram.name, 'w') as f:
    for sentence in lemmatized_sentence_corpus(raw_spells.name):
        f.write(sentence + '\n')

unigram_sentences = LineSentence(unigram.name)

In [14]:
bigram_model = Phrases(unigram_sentences)

bigram = tempfile.NamedTemporaryFile(mode='w')

with open(bigram.name, 'w') as f:
    for unigram_sentence in unigram_sentences:
        bigram_sentence = u' '.join(bigram_model[unigram_sentence])
        f.write(bigram_sentence + '\n')

bigram_sentences = LineSentence(bigram.name)



In [15]:
trigram_model = Phrases(bigram_sentences)

trigram = tempfile.NamedTemporaryFile(mode='w')

with open(trigram.name, 'w') as f:
    for bigram_sentence in bigram_sentences:
        trigram_sentence = u' '.join(trigram_model[bigram_sentence])
        f.write(trigram_sentence + '\n')

trigram_sentences = LineSentence(trigram.name)



### Word vector embedding

In [16]:
total_words = sum(len(x) for x in trigram_sentences)

In [17]:
from gensim.models import Word2Vec

word2vec_file = tempfile.NamedTemporaryFile(mode='w')

spell2vec = Word2Vec(trigram_sentences, size=100, window=5, min_count=5, workers=4)
spell2vec.save(word2vec_file.name)

# perform another 11 epochs of training
for i in range(11):
    spell2vec.train(trigram_sentences, total_words=total_words, epochs=1)
    spell2vec.save(word2vec_file.name)

print(u'{} training epochs so far.'.format(spell2vec.train_count))

spell2vec.init_sims()

12 training epochs so far.


In [20]:
len(spell2vec.wv.vocab)

966

In [21]:
ordered_vocab = [(term, voc.index, voc.count) for term, voc in spell2vec.wv.vocab.items()]
ordered_vocab = sorted(ordered_vocab, key=lambda x: -x[2])
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)
word_vectors = pd.DataFrame(spell2vec.wv.syn0norm[term_indices, :],
                            index=ordered_terms)
word_vectors.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
-PRON-,-0.013314,-0.009932,-0.018664,-0.176529,-0.144393,0.0988,0.035082,0.050201,0.048908,0.01607,...,0.208038,-0.091442,-0.17275,0.01844,-0.016496,0.199059,0.04444,0.027788,-0.097366,-0.012778
the,0.018035,-0.076436,0.03391,-0.016733,-0.017778,0.099923,0.11411,0.029634,-0.117731,0.068414,...,0.196702,-0.044928,-0.133564,-0.007312,-0.152144,0.085482,-0.032751,-0.053783,0.040127,0.034605
a,0.035959,-0.028454,0.025207,-0.099713,0.131321,0.094496,-0.021451,-0.05335,0.004977,-0.195385,...,0.079844,0.026852,-0.172453,-0.099108,0.041437,0.021292,0.042967,0.09291,0.048738,0.130596
of,-0.025458,-0.010637,0.082763,-0.117139,0.070152,0.14434,-0.125088,0.16563,0.045662,-0.105419,...,0.094386,0.046887,-0.165871,0.13691,-0.089316,0.014339,-0.058216,0.038772,0.002747,0.065026
creature,-0.006428,-0.014887,0.033444,-0.163029,-0.021579,0.238921,-0.00967,0.191085,0.054034,-0.117876,...,0.194915,-0.077747,-0.1684,0.009497,0.060857,0.152564,-0.002145,0.010235,-0.018746,0.058937


#### [Todo]: visualize, and clean up the code above. Figure out how to use the model to create the lemmatized text for each example, and to prepare it in a dataframe for doc2vec.

# Doc2Vec
We'll follow the gensim Doc2Vec [tutorial](https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-lee.ipynb).

#### Goal
learn a vector embedding for each document in the corpus, and train a multinomial classifier on the vector embeddings.

In [None]:
def read_corpus(fname, training=False):
    with open(fname, 'r') as f:
        for i, line in enumerate(f):
            if training:
                yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(line), [i])
            else:
                yield gensim.utils.simple_preprocess(line)

In [29]:
spells_df.head()

Unnamed: 0,casting_time,classes,components,description,duration,higher_levels,level,name,range,ritual,school,tags,type
0,1 action,"[sorcerer, wizard]","{'material': False, 'raw': 'V, S', 'somatic': ...",You hurl a bubble of acid. Choose one creature...,Instantaneous,,cantrip,Acid Splash,60 feet,False,Conjuration,"[sorcerer, wizard, cantrip]",Conjuration cantrip
1,1 action,"[ranger, wizard]","{'material': True, 'materials_needed': ['a tin...",You set an alarm against unwanted intrusion. C...,8 hours,,1,Alarm,30 feet,True,abjuration,"[ranger, wizard, level1]",1st-level abjuration (ritual)
2,1 action,"[bard, druid, ranger]","{'material': True, 'materials_needed': ['a mor...",This spell lets you convince a beast that you ...,24 hours,When you cast this spell using a spell slot of...,1,Animal Friendship,30 feet,False,enchantment,"[bard, druid, ranger, level1]",1st-level enchantment
3,1 action,"[bard, cleric]","{'material': True, 'materials_needed': ['a dro...",Up to three creatures of your choice that you ...,"Concentration, up to 1 minute",When you cast this spell using a spell slot of...,1,Bane,30 feet,False,enchantment,"[bard, cleric, level1]",1st-level enchantment
4,1 action,"[bard, sorcerer, warlock, wizard]","{'material': False, 'raw': 'V, S', 'somatic': ...",You extend your hand and trace a sigil of ward...,1 Round,,cantrip,Blade Ward,Self,False,Abjuration,"[bard, sorcerer, warlock, wizard, cantrip]",Abjuration cantrip


In [35]:
corpus = tempfile.NamedTemporaryFile(mode='w')`b



cantrip
1
1
1
cantrip
1
1
1
cantrip
1
1
1
1
1
1
1
cantrip
1
1
1
1
1
1
cantrip
cantrip
1
1
1
1
1
1
1
cantrip
1
cantrip
1
1
cantrip
1
1
1
1
1
1
1
1
1
1
1
cantrip
1
1
cantrip
1
cantrip
cantrip
cantrip
cantrip
cantrip
cantrip
1
1
cantrip
1
3
cantrip
cantrip
1
1
1
1
cantrip
cantrip
1
1
cantrip
1
cantrip
cantrip
1
1
cantrip
1
cantrip
1
1
2
2
2
2
2
2
2
2
2
2
2
3
4
4
4
3
4
3
4
3
3
3
3
3
2
4
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
2
2
2
3
3
3
3
3
3
3
3
3
3
3
3
3
4
4
4
4
4
3
2
2
2
2
2
2
2
2
8
8
8
9
8
wind
8
8
8
8
9
9
8
8
9
8
9
8
9
8
9
9
8
9
9
9
8
8
9
9
8
9
9
9
7
7
4
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
2
2
5
5
6
5
5
6
6
6
5
5
5
5
5
5
6
5
5
5
6
6
2
5
5
6
4
5
4
5
5
2
2
2
4
6
2
6
2
4
2
5
2
6
6
4
5
2
4
6
3
4
4
5
4
6
2
4
5
6
2
6
6
5
4
5
2
5
2
2
3
4
2
6
2
2
5
6
2
5
2
5
2
6
2
5
4
2
6
5
4
6
5
2
2
5
2
2
5
5
2
2
2
2
4
4
6
4
5
5
5
6
5
6
3
4
6
5
5
6
3
3
2
6
3
6
2
1
2
2
cantrip
2
cantrip
cantrip
cantrip
cantrip
cantrip
cantrip
cantrip
1
1
1
2
2
1
5
6
3
4
3
6
6
6
6
5
6
3
5
3
4
7
5
4
4
3


# [TODO]: Techniques to apply

* Bag of words - construct weighted encoding of key words, train on weights
    * Tf–idf or counter
* word2vec - construct word embeddings, average, and train on average
* feature engineering

In [43]:
spells_df[['level', 'description', 'higher_levels']].head()

Unnamed: 0,level,description,higher_levels
0,cantrip,You hurl a bubble of acid. Choose one creature...,
1,1,You set an alarm against unwanted intrusion. C...,
2,1,This spell lets you convince a beast that you ...,When you cast this spell using a spell slot of...
3,1,Up to three creatures of your choice that you ...,When you cast this spell using a spell slot of...
4,cantrip,You extend your hand and trace a sigil of ward...,


In [51]:
X = spells_df['description'] + spells_df['higher_levels']
y = spells_df['level']

In [52]:
y.head()

0    cantrip
1          1
2          1
3          1
4    cantrip
Name: level, dtype: object

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

In [54]:
vx = vectorizer.fit_transform(X)

In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', SGDClassifier()),
])

pipeline.fit(vx, y)



AttributeError: lower not found