##
#  Iain O'Brien's Attempt at creating a DnD Character Class Predictor
### This will take in values from a data source and attempt to guess what class the character actually is.
### I'll do this by feeding in a data set of public characters and, through analysis and modelling, attempting to teach the system the rules of DnD character creation.
### Using various modelling techniques, including Logistic Regression, Decision Trees, Random Forests, and Support Vector Machines, I'll attempt to refine the output for accuracy.
### The initial model was shown to struggle with magic casters, especially the 3 Charisma-based casters, so this was accounted for and has now been grouped with the rest of the data tidying.
##

In [None]:
import json
import pandas as pd
from pandas.io.json import json_normalize
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn import tree 
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, cross_val_predict
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV 
import sklearn.metrics as metrics
from sklearn.ensemble import RandomForestClassifier

# Get the dataset from GitHub

In [None]:
# Raw character dump published in the dnddata GitHub repository.
# On Azure you can instead run '!wget <url>' and build the DataFrame from the
# downloaded local copy with pd.read_json('dnd_chars_all.json').
url = "https://raw.githubusercontent.com/oganm/dnddata/master/data-raw/dnd_chars_all.json"
df = pd.read_json(url)

# If the frame came back wider than it is long, flip it over its main diagonal
# (rows become columns and columns become rows).
# NOTE(review): presumably this leaves one character per row - confirm against the JSON layout.
if df.shape[1] > df.shape[0]:
    df = df.T

#
# Unpack the data and drop unnecessary items (IP, hash keys, IRL locations, weapons and spells)
#

In [None]:
# Flatten the raw frame into `clean_df`: one flat column per field, keyed by a
# default 0..n-1 index.
#
# Columns skipped entirely: uploader data, personality choices, character name
# and alias, and the specific weapons and spells - none of them feed the
# predictions. (The alias is still pulled out of 'name' below as the first
# column, and the spell list is reduced to a count at the end.)
ignore = ['ip', 'finger', 'date', 'location', 'hash', 'choices', 'weapons', 'name', 'spells', 'alias']

# Seed clean_df with element 0 of each 'alias' value found inside the 'name' records.
# NOTE(review): presumably every value is a one-element list - confirm against the data.
clean_df = pd.json_normalize(df['name'])
clean_df = clean_df['alias'].str[0]

for key in df.keys():
    if key in ignore:
        continue
    try: # nested records: flatten them into one column per sub-field
        data = pd.json_normalize(df[key])
        for col in data.keys(): # keep element 0 of every value in each flattened column
            data[col] = data[col].str[0]
    except AttributeError:
        # Fallback when the flattening path raises AttributeError: treat the
        # column as plain values, take element 0 directly from the raw column
        # and re-key it 0..n-1 so it lines up with clean_df.
        data = df[key].str[0]
        data = data.reset_index(drop=True)

    # Append the new column(s) side by side.
    clean_df = pd.concat([clean_df, data], axis=1)

# Replace the dropped spell list with its length, re-keyed 0..n-1 like the rest.
num_spells = df['spells'].apply(len).reset_index(drop=True)

clean_df['num_spells'] = num_spells

#
# Add a column for the primary class of the character, so whatever class has the highest level is considered primary.
#

In [176]:
# Derive the modelling columns on clean_df, then keep only characters that
# have at least one class level recorded (no_null_df).

# One '<Class>.level' column per playable class.
class_list = [
    'Artificer.level', 'Barbarian.level', 'Bard.level', 'Cleric.level',
    'Druid.level', 'Fighter.level', 'Monk.level', 'Paladin.level',
    'Ranger.level', 'Rogue.level', 'Sorcerer.level', 'Warlock.level',
    'Wizard.level',
]
class_df = clean_df[class_list]

# Primary class = the column holding the highest level in each row; strip the
# '.level' suffix so only the class name is left.
primary_class = class_df.idxmax(axis=1)
class_primary = list(primary_class.str.split('.').str[0])
clean_df['class_primary'] = class_primary

# Level the character has reached in that primary class.
primary_class_level = class_df.max(axis=1)
clean_df['class_level_one'] = primary_class_level

# Average HP per character level.
clean_df['hp_level'] = clean_df['HP'] / clean_df['level']

# One-hot encode the categoricals (casting stat and race) for preprocessing.
castingStat = pd.get_dummies(clean_df["castingStat"], prefix='castingStat')
clean_df = pd.concat([clean_df, castingStat], axis=1)

# processedRace holds the "root" race rather than the sub-race.
processedRace = pd.get_dummies(clean_df["processedRace"], prefix='processedRace')
clean_df = pd.concat([clean_df, processedRace], axis=1)

# Remove duplicate rows from the processed dataset, keeping the first of each.
clean_df = clean_df.drop_duplicates(keep='first', ignore_index=True)

# Drop characters with no class at all. DataFrame.count(axis=1) counts the
# non-null level columns per row natively, instead of calling a Python lambda
# once per row.
num_classes = clean_df[class_list].count(axis=1)
no_null_df = clean_df[num_classes >= 1]

no_null_df

Unnamed: 0,alias,race,processedRace,background,Rogue.class,Rogue.subclass,Rogue.level,Wizard.class,Wizard.subclass,Wizard.level,...,processedRace_Satyr,processedRace_Shifter,processedRace_Simic hybrid,processedRace_Tabaxi,processedRace_Tiefling,processedRace_Triton,processedRace_Turtle,processedRace_Vedalken,processedRace_Warforged,processedRace_Yaun-Ti
0,pedantic_kalam,Warforged,Warforged,Entertainer,Rogue,Thief,5.0,,,,...,0,0,0,0,0,0,0,0,1,0
1,zen_brown,Mountain Dwarf,Dwarf,Guild Artisan,,,,Wizard,,1.0,...,0,0,0,0,0,0,0,0,0,0
2,charming_payne,Triton,Triton,Faction Agent,,,,,,,...,0,0,0,0,0,1,0,0,0,0
3,charming_poitras,Human,Human,Acolyte,,,,,,,...,0,0,0,0,0,0,0,0,0,0
4,suspicious_jones,Ghastly Halfling,Halfling,Sailor,,,,,,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9325,nervous_goldstine,Half-Elf,Half-Elf,Mercenary Veteran,,,,,,,...,0,0,0,0,0,0,0,0,0,0
9326,gallant_hugle,Human,Human,Far Traveler,,,,,,,...,0,0,0,0,0,0,0,0,0,0
9327,condescending_lederberg,Dragonborn,Dragonborn,Noble,,,,,,,...,0,0,0,0,0,0,0,0,0,0
9328,relaxed_cori,Meadowguard,,Acolyte,,,,,,,...,0,0,0,0,0,0,0,0,0,0


# We'll get rid of some fields that won't help the algorithm, and organise the rest into a human readable order (leaving the processedrace field at the end)

In [None]:
# Columns kept for modelling, in a human-readable order, with the one-hot
# race columns last.
safe = [
    'race', 'class_primary', 'class_level_one', 'background',
    'feats',
    'HP', 'AC', 'hp_level',
    'Str', 'Dex', 'Con', 'Int', 'Wis', 'Cha',
    'processedAlignment',
    'skills',
    'castingStat_Cha', 'castingStat_Con', 'castingStat_Dex', 'castingStat_Int', 'castingStat_Str', 'castingStat_Wis',
    'num_spells',
] + list(processedRace.columns)

# Take an explicit copy: later cells add columns to this frame ('is_caster',
# 'spells_level'), and assigning into a selection of no_null_df triggers
# pandas' SettingWithCopyWarning.
characters_with_class = no_null_df[safe].copy()

characters_with_class

In [None]:
# Single box plot of the average hit points per level over every classed character.
hp_per_level = characters_with_class['hp_level']

dia, ax = plt.subplots()
dia.suptitle('HP per Level')
ax.boxplot(hp_per_level)
ax.set_ylabel('HP gained per level')


# Starting the actual exploratory analysis
# Going to start by looking at how AC is distributed across classes

In [None]:
# One horizontal box per primary class, with armour class along the x axis.
ac_df = clean_df.loc[:, ['AC', 'class_primary']]

ax = sns.boxplot(data=ac_df, x="AC", y="class_primary")
ax.set(ylabel='Primary Class')

# Build the model
## Build a model to attempt to predict/suggest a class based on character stats
## Take a 70/30 split for training/test data.

In [None]:
# Inspect the six ability scores, HP and AC for extreme outliers.
attributes = ['Str', 'Wis', 'Con', 'Cha', 'Int', 'Dex']
attributes_data = clean_df[attributes]

# Summary statistics for the ability scores. (A second, unprinted describe()
# call on the same columns used to follow; its result was discarded.)
print(attributes_data.describe())

# IQR / box-plot outlier analysis: one box per ability score.
attr_dfs = [clean_df[attr] for attr in attributes]
plt.boxplot(attr_dfs, labels=attributes)
plt.show()

# HP and AC each get their own plot; afterwards print the y-value at the far
# end of each of the two whiskers.
for col in ("HP", "AC"):
    box = plt.boxplot(clean_df[col], labels=[col])
    plt.show()
    whisker_ends = [whisker.get_ydata()[1] for whisker in box['whiskers']]
    print(whisker_ends)

In [None]:
# Classes labelled as spellcasters for the binary target. The spellings must
# match the values of 'class_primary', which come from the '<Class>.level'
# column names - so it is 'Sorcerer', not 'Sorceror' (the misspelling meant
# every Sorcerer was labelled 0).
casters = ['Artificer', 'Bard', 'Cleric', 'Druid', 'Sorcerer', 'Warlock', 'Wizard']

# 1 if the primary class is a caster, else 0. assign() builds a new frame, so
# nothing is written into a slice of another DataFrame.
characters_with_class = characters_with_class.assign(
    is_caster=characters_with_class['class_primary'].isin(casters).astype(int)
)

attributes = ['Str', 'Dex', 'Con', 'Int', 'Wis', 'Cha']
castingStat = [f'castingStat_{attribute}' for attribute in attributes]
# The binary model only sees AC, HP per level and the raw ability scores; the
# castingStat columns are deliberately not part of `features` here.
features = ['AC', 'hp_level'] + attributes

X = characters_with_class[features]
Y = characters_with_class['is_caster']

# 70/30 train/test split with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

# Check the two label counts are balanced in each split.
print(Y_train.groupby(Y_train).size())
print(Y_test.groupby(Y_test).size())

In [None]:
# Binary logistic regression on the caster / non-caster target: a plain fit
# first, then the 10-fold cross-validated variant for comparison.
blrf = LogisticRegression(solver='liblinear', random_state=42)
blrf.fit(X_train, Y_train)

print('training accuracy', blrf.score(X_train, Y_train))
print('testing accuracy', blrf.score(X_test, Y_test))

blrfCV = LogisticRegressionCV(cv=10, solver='liblinear', random_state=42)
blrfCV.fit(X_train, Y_train)

print('CV training accuracy', blrfCV.score(X_train, Y_train))
print('CV testing accuracy', blrfCV.score(X_test, Y_test))

In [None]:
# Refit the cross-validated model with F1 as its scoring metric (it was very
# marginally more accurate than the plain fit) and drill into its errors with
# a confusion matrix.
blrfCV = LogisticRegressionCV(cv=10, solver='liblinear', scoring='f1', random_state=42)
blrfCV.fit(X_train, Y_train)

print('f1 training score', blrfCV.score(X_train, Y_train))
print('f1 testing score', blrfCV.score(X_test, Y_test))

Y_pred = blrfCV.predict(X_test)
conf_matrix = metrics.confusion_matrix(Y_test, Y_pred)

print('\nConfusion Matrix:')
print(conf_matrix)
print('')

# Unpack the 2x2 matrix row by row: true/false negatives and positives.
tn, fp, fn, tp = conf_matrix.ravel()
print('TPR: ', tp / (tp + fn))
print('FPR: ', fp / (fp + tn))

In [None]:
# Compute FPR and TPR over all decision thresholds to see whether moving the
# boundary would give better results. This uses blrfCV - the model the
# confusion matrix was built from - rather than the plain blrf fit, so the
# curve and the matrix describe the same classifier.

# Probability of each class for every test observation.
c_prob = blrfCV.predict_proba(X_test)
print(c_prob)

# Probability that the observation is class 1 (caster).
pred = c_prob[:, 1]
print(pred)

# FPR / TPR arrays and the thresholds they were evaluated at.
fpr, tpr, threshold = metrics.roc_curve(Y_test, pred)
roc_auc = metrics.auc(fpr, tpr)

fig = plt.figure(figsize=[10, 7])
ax = plt.subplot(111)
ax.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.title('Receiver Operating Characteristic')
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal for reference
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.grid(True)
x_ticks = np.arange(0, 11, 1)
plt.xticks(x_ticks / 10.)  # ticks every 0.1
plt.show()


# Hmmm, doesn't seem like this is our best fit.
## Let's try a decision tree instead. At least we can counter overfitting with hyperparameters then.

In [None]:
# Binary decision tree; depth and leaf size are capped to limit overfitting.
tree_params = dict(criterion='gini', max_depth=25, min_samples_leaf=20)
bdtc = tree.DecisionTreeClassifier(**tree_params)
bdtc.fit(X_train, Y_train)

print('BDTC testing accuracy: ', bdtc.score(X_test, Y_test))
print('BDTC training accuracy: ', bdtc.score(X_train, Y_train))

## That's definitely better, but does it hold up to cross validation?

In [None]:
# Based on https://scikit-learn.org/stable/modules/grid_search.html#model-specific-cross-validation

# 10-fold cross-validation of the hand-picked tree, scored with macro-averaged
# F1; one bar per fold.
bdtc = tree.DecisionTreeClassifier(min_samples_leaf=20, max_depth=25, criterion='gini')
scoring = cross_val_score(bdtc, X_train, Y_train, scoring='f1_macro', cv=10)

plt.bar(range(len(scoring)), scoring)
plt.show()
print(f'Mean f1 score of 10 fold cross validation: {scoring.mean():.3f}, (std: {scoring.std():.3f})')

if True:  # slow: please give this time to run
    # Hyper-parameter tuning with an exhaustive grid search.
    param_grid = {
        "criterion": ['gini', 'entropy'],
        "max_depth": [None, 5, 10, 20],
        "min_samples_leaf": [1, 5, 10, 20],
        "min_samples_split": [2, 5, 10, 20],
    }
    dtc = tree.DecisionTreeClassifier()

    grid_search = GridSearchCV(dtc, param_grid=param_grid, scoring='f1', cv=10)
    grid_search.fit(X_train, Y_train)
    print('Best fit params: ', grid_search.best_params_)
    print('Resulting best score: ', grid_search.best_score_)

    # Cross-validate a tree with fixed settings again.
    # NOTE(review): these values are hard-coded rather than read from
    # grid_search.best_params_ (max_depth=25 is not even in the grid) - confirm.
    dtc = tree.DecisionTreeClassifier(min_samples_split=10, min_samples_leaf=20, max_depth=25, criterion='entropy')
    scoring2 = cross_val_score(dtc, X_train, Y_train, scoring='f1_macro', cv=10)
    plt.bar(range(len(scoring2)), scoring2)
    plt.show()
    print(f'mean f1 score of 10 fold cross validation: {scoring2.mean():.3f}, (std: {scoring2.std():.3f})')


In [None]:
# Cross-examine the tuned tree with a confusion matrix on the held-out split.
dtc = tree.DecisionTreeClassifier(min_samples_split=10, min_samples_leaf=20, max_depth=25, criterion='entropy')
dtc.fit(X_train, Y_train)

Y_preds = dtc.predict(X_test)
conf_matrix_2 = metrics.confusion_matrix(Y_test, Y_preds)

print('Confusion Matrix #2:')
print(conf_matrix_2)
print('')

# Flattened 2x2 matrix: true/false negatives and positives.
tn, fp, fn, tp = conf_matrix_2.ravel()
print('TPR: ', tp / (tp + fn))
print('FPR: ', fp / (fp + tn))

In [None]:
# ROC curve for the single decision tree, built from its class-1 probabilities.
probs = dtc.predict_proba(X_test)
preds = probs[:, 1]

fpr, tpr, threshold = metrics.roc_curve(Y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc='lower right')
ax.plot([0, 1], [0, 1], 'r--')  # chance diagonal for reference
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
ax.grid(True)
x_ticks = np.arange(0, 11, 1)
ax.set_xticks(x_ticks / 10.)  # ticks every 0.1
plt.show()


In [None]:
# Multi-class problem: predict the primary class itself rather than the binary
# caster flag. The casting-stat dummies join the feature set here.
attrs = ['Str', 'Dex', 'Con', 'Int', 'Wis', 'Cha']
castStat = ['castingStat_' + attr for attr in attrs]
features = ['AC', 'hp_level', *attrs, *castStat]

x = characters_with_class[features]
y = characters_with_class['class_primary']

# 70/30 split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Per-class counts in each split.
print(y_train.groupby(y_train).size())
print(y_test.groupby(y_test).size())

In [None]:
# Multi-class logistic regression on the primary-class target.
mclrf = LogisticRegression(multi_class='auto', solver='liblinear', random_state=42)
mclrf.fit(x_train, y_train)

print('test acc: ', mclrf.score(x_test, y_test))
print('training acc: ', mclrf.score(x_train, y_train))

# Per-class breakdown of the test predictions.
y_pred = mclrf.predict(x_test)
print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred, digits=3))

# Class-membership probabilities for every test row.
probs_mc = mclrf.predict_proba(x_test)
print(probs_mc)

# NOTE(review): with more than two classes, column 1 is only the probability
# of the second label in mclrf.classes_ - confirm this is what was intended.
preds = probs_mc[:, 1]
print(preds)

In [None]:
# The linear model struggles to tell Bard / Sorcerer / Warlock apart, which is
# understandable. Add a "spells known per primary-class level" feature to help,
# and fit a RandomForestClassifier with a cross-validated grid search.
# The grid search takes a while; that is expected.

# Spell counts per class (printed - a bare expression in the middle of a cell
# shows nothing).
print(characters_with_class.groupby('class_primary')['num_spells'].describe())

spells_level = characters_with_class['num_spells'].divide(characters_with_class['class_level_one'])
# assign() returns a new frame, so nothing is written into a slice of another
# DataFrame.
characters_with_class = characters_with_class.assign(spells_level=spells_level)
castingStat = ['castingStat_Int', 'castingStat_Wis', 'castingStat_Cha']
# Check that worked before proceeding.
print(characters_with_class.groupby('class_primary')['spells_level'].describe())

features = ['AC', 'hp_level', 'spells_level'] + attributes + castingStat
X = characters_with_class[features]
Y = characters_with_class['class_primary']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

param_grid = {
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300],
}
# Seed the forest so the search (and the model taken from it) is reproducible.
rando_for = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rando_for, param_grid=param_grid, cv=10, scoring='f1_macro')
grid_search.fit(X_train, Y_train)
print('Best Params: ', grid_search.best_params_)
print('Best_Score: ', grid_search.best_score_)

# GridSearchCV refits the winning configuration on all of X_train by default
# (refit=True); use that estimator directly instead of re-typing the
# hyper-parameters by hand, which silently goes stale when the search result
# changes.
mcrfc = grid_search.best_estimator_

y_pred = mcrfc.predict(X_test)

print(metrics.confusion_matrix(Y_test, y_pred))

print(metrics.classification_report(Y_test, y_pred, digits=3))

In [None]:
# Multi-class decision tree.
# Start from the same settings as the binary tree to reduce variance.
mcdtc = tree.DecisionTreeClassifier(criterion='gini', max_depth=25, min_samples_leaf=20)
scores = cross_val_score(mcdtc, x_train, y_train, cv=10, scoring='f1_macro')
plt.bar(range(len(scores)), scores)
plt.show()
print(f'mean f1 score of 10 fold cross validation: {scores.mean():.3f}, (std: {scores.std():.3f})')

if True:  # slow: again, this takes a while
    param_grid = {
        "criterion": ['gini', 'entropy'],
        "max_depth": [None, 5, 10, 20],
        "min_samples_leaf": [1, 5, 10, 20],
        "min_samples_split": [2, 5, 10, 20],
    }
    dt = tree.DecisionTreeClassifier()

    grid_search = GridSearchCV(dt, param_grid=param_grid, cv=10, scoring='f1_macro')
    grid_search.fit(x_train, y_train)
    print('Best Params: ', grid_search.best_params_)
    print('Best score: ', grid_search.best_score_)

# 10-fold CV again with the tuned settings, printing the mean macro-F1.
# NOTE(review): the values are hard-coded rather than read from
# grid_search.best_params_ - confirm they still match the search output.
mcdtc = tree.DecisionTreeClassifier(criterion='gini', max_depth=25, min_samples_leaf=5, min_samples_split=20, random_state=42)
scores = cross_val_score(mcdtc, x_train, y_train, cv=10, scoring='f1_macro')
plt.bar(range(len(scores)), scores)
plt.show()
# The closing parenthesis after the std value was missing from this message.
print(f'mean f1 score of 10 fold cv: {scores.mean():.3f}, (std: {scores.std():.3f})')

In [None]:
# Fit the tuned multi-class tree on the training split and report how it does
# on the held-out characters.
mcdtc = tree.DecisionTreeClassifier(
    criterion='gini',
    max_depth=25,
    min_samples_leaf=5,
    min_samples_split=20,
    random_state=42,
)
mcdtc.fit(x_train, y_train)
y_pred = mcdtc.predict(x_test)

print('Confusion Matrix #3')
print(metrics.confusion_matrix(y_test, y_pred))
print('')
print('Success Metrics')
print(metrics.classification_report(y_test, y_pred, digits=3))

In [None]:
# Rank the multi-class tree's features by importance and plot them.
importance = mcdtc.feature_importances_

# Pair each training column with its importance. Built straight from the two
# aligned sequences - going through a set() would scramble the pairing order
# for no benefit.
importance_df = pd.DataFrame({
    'feature': list(x_train.columns),
    'importance': importance,
})

# Least important first, so the most important bar ends up at the top of the
# horizontal chart. reset_index returns a new frame, so its result has to be
# kept.
importance_df = importance_df.sort_values('importance', ascending=True)
importance_df = importance_df.reset_index(drop=True)

plt.figure()
plt.title("MC Features by Importance")
plt.barh(range(importance_df.shape[0]), width=importance_df['importance'], align="center")
plt.yticks(range(importance_df.shape[0]), importance_df['feature'])
plt.ylim([-1, importance_df.shape[0]])
plt.show()

In [None]:
# Support Vector Machines
## Bring in some new libraries to help with SVM's

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from sklearn.svm import SVC

In [None]:
# Standardise the features and feed them to a linear-kernel SVM, chained in a
# single pipeline.
mcsvm = make_pipeline(StandardScaler(), SVC(kernel='linear'))
mcsvm.fit(x_train, y_train)

y_pred = mcsvm.predict(x_test)

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred, digits=3))

In [None]:
# Pre-scale the splits for the estimators that are not wrapped in a pipeline.
# The scaler is fitted on the training split only, and that same mean/variance
# is applied to the test split. Standardising the test split with its own
# statistics would put the two splits on different scales.
scaler = StandardScaler().fit(x_train)
X_train_scaled = scaler.transform(x_train)
X_test_scaled = scaler.transform(x_test)

In [None]:
# Grid search over the SVM kernel (and polynomial degree).

if True:  # slow
    # Only search combinations that produce different models:
    #  - `degree` is ignored by every kernel except 'poly', so it is varied
    #    for 'poly' only;
    #  - `decision_function_shape` only changes the shape of
    #    decision_function() (SVC always trains one-vs-one internally), so it
    #    cannot change the predictions being scored and is left at its default.
    # This cuts the search from 24 candidates to 6 without changing which
    # kernel/degree wins.
    param_grid = [
        {"kernel": ['linear', 'rbf', 'sigmoid']},
        {"kernel": ['poly'], "degree": [2, 3, 4]},
    ]
    mcsvm = SVC()
    grid_search = GridSearchCV(mcsvm, param_grid=param_grid, cv=10, scoring='f1_macro')
    grid_search.fit(X_train_scaled, y_train)
    print('Best params: ', grid_search.best_params_)
    print('Best result: ', grid_search.best_score_)

In [None]:
# Final SVM, built with the parameters reported by the grid search and wrapped
# in a pipeline so scaling is part of the model.
# NOTE(review): this is trained on X_train / Y_train (the feature set that
# includes 'spells_level'), while the grid search above ran on the x_train
# features - confirm that is intended.
svm_step = SVC(kernel='rbf', degree=2, decision_function_shape='ovr')
mcsvm = make_pipeline(StandardScaler(), svm_step)
mcsvm.fit(X_train, Y_train)

y_pred = mcsvm.predict(X_test)

print(metrics.confusion_matrix(Y_test, y_pred))

print(metrics.classification_report(Y_test, y_pred, digits=3))

In [None]:
# Try the model on one of my own characters.
# First: a Half-Orc Druid (Wisdom caster, bulky due to the Half-Orc race).
#
# The column names and their order must match the frame the SVM pipeline was
# fitted on: AC, hp_level, spells_level, the six ability scores, then the
# three 'castingStat_*' dummies. They were previously spelt 'castStat_*',
# which scikit-learn rejects as a feature-name mismatch.
my_char = pd.DataFrame({
    'AC': [13],
    'hp_level': [31/4.],
    'spells_level': [7/4.],
    'Str': [14],
    'Dex': [10],
    'Con': [15],
    'Int': [8],
    'Wis': [17],
    'Cha': [13],
    'castingStat_Int': [0], # 0/1 for no/yes
    'castingStat_Wis': [1],
    'castingStat_Cha': [0]
})

print(mcsvm.predict(my_char))