In [1]:
# learn.py - Copyright Vess 2023
# Take the scraped dataset from the Pathfinder 2e Dashboard website and throw it into the blender of machine learning.

# Known Bugs, Issues, and Limitations
# 1. Expects data to be formatted a certain way. Will break otherwise.

In [2]:
# Imports.
import numpy as np
import operator
import pandas as pd
import pickle
import pprint
import random
import time
import warnings

# From imports.
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multioutput import MultiOutputRegressor
from sklearn.utils import all_estimators
from sklearn.utils.estimator_checks import _construct_instance

# Stop it from truncating the displays.
pd.set_option("display.max_columns", 100)

# Some models get angsty about non-finetuned options.
warnings.filterwarnings("ignore")

In [3]:
# Read in our data and see some preliminary statistics.
pf_data = pd.read_csv("statblock-data.csv")

pf_data.head()

Unnamed: 0,name,level,size,law,moral,ac,hp,perception,fortitude,reflex,willpower,strength,dexterity,constitution,intelligence,wisdom,charisma,acrobatics,arcana,athletics,crafting,deception,diplomacy,intimidation,medicine,nature,occultism,performance,religion,society,stealth,survival,thievery
0,Graylok Artillerists,8,4,1,1,26,100,16,19,16,13,5,3,5,3,2,0,0,0,21,18,0,0,16,0,0,0,0,0,0,0,0,0
1,Ancient Tupilaq,11,3,2,2,32,145,22,17,15,12,7,5,5,-5,5,-5,0,0,24,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Ashen Swale,7,3,2,1,23,120,13,18,12,15,1,1,3,2,2,4,12,15,0,0,17,0,17,0,15,0,0,0,0,0,13,12
3,Skrik Nettle,6,4,2,2,22,130,16,10,17,14,4,5,2,-4,4,0,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Canker Cultist,3,3,1,1,19,45,11,7,9,12,4,3,1,1,3,3,9,0,10,0,0,0,10,0,0,8,0,8,0,9,0,0


In [4]:
pf_data.describe()

Unnamed: 0,level,size,law,moral,ac,hp,perception,fortitude,reflex,willpower,strength,dexterity,constitution,intelligence,wisdom,charisma,acrobatics,arcana,athletics,crafting,deception,diplomacy,intimidation,medicine,nature,occultism,performance,religion,society,stealth,survival,thievery
count,3153.0,3153.0,3153.0,3153.0,3153.0,3153.0,3153.0,3153.0,3153.0,3153.0,3153.0,3153.0,3153.0,3153.0,3153.0,3153.0,3153.0,3153.0,3153.0,3153.0,3153.0,3153.0,3153.0,3153.0,3153.0,3153.0,3153.0,3153.0,3153.0,3153.0,3153.0,3153.0
mean,7.628608,3.322867,1.861402,1.545829,25.730733,137.048842,15.97431,16.161116,15.893118,15.120203,3.87282,3.4621,3.411354,0.095782,2.80019,1.725024,9.105297,2.490961,12.526166,1.414526,5.33365,3.385664,7.640343,0.909927,2.506819,2.685062,1.168728,3.170314,2.650174,9.538535,3.936886,1.356169
std,5.837861,1.061456,0.648219,0.604454,8.785342,114.559965,8.931971,8.739609,8.234526,8.926829,3.228109,2.265648,2.223652,3.48333,2.09802,3.322483,10.64403,7.71679,11.429547,5.279594,10.045597,8.420595,11.186016,4.53163,7.146982,7.753996,5.265726,8.361926,7.400506,10.240626,7.993423,4.921626
min,-1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,-1.0,0.0,-5.0,-5.0,-2.0,-5.0,-5.0,-5.0,0.0,0.0,0.0,0.0,0.0,-5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0
25%,3.0,3.0,1.0,1.0,18.0,45.0,9.0,9.0,9.0,8.0,2.0,2.0,2.0,-3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,6.0,3.0,2.0,1.0,24.0,105.0,14.0,15.0,14.0,13.0,4.0,4.0,3.0,0.0,2.0,2.0,6.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0
75%,12.0,4.0,2.0,2.0,32.0,210.0,22.0,22.0,21.0,21.0,6.0,5.0,5.0,2.0,4.0,4.0,16.0,0.0,21.0,0.0,7.0,0.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,4.0,0.0
max,25.0,6.0,3.0,3.0,54.0,600.0,48.0,47.0,43.0,48.0,12.0,12.0,12.0,10.0,12.0,12.0,47.0,45.0,50.0,43.0,49.0,48.0,48.0,40.0,50.0,43.0,39.0,45.0,43.0,45.0,46.0,45.0


In [5]:
# For splitting our data into training and test sets.
# Pre: Takes our input data and our list of columns to exclude.
# Post: Returns a fit pipe for xtrain, a fit pipe for xtest, and then ytrain and ytest.
def split_data(input_data, input_list):
    exclude_final = input_data.columns.difference(input_list)
    data_x = input_data[exclude_final].to_numpy()
    data_y = input_data[input_list].to_numpy()
    state_var = random.randint(0, 420) # Blaze it.
    data_xtrain, data_xtest, data_ytrain, data_ytest = train_test_split(data_x, data_y, test_size=0.1, random_state=state_var)
    return data_xtrain, data_xtest, data_ytrain, data_ytest

In [6]:
# We don't need names from this point forward for our data.
pf_data = pf_data.drop("name", axis=1)

In [7]:
# Target keep is what we are providing to our models.
target_keep = {
    "level": None, # 2
    "size": 3, # 3 | 1 Tiny, 2 Small, 3 Medium, 4 Large, 5 Huge, 6 Gargantuan.
    "law": 2, # 2 | 1 Chaotic, 2 Neutral, 3 Lawful.
    "moral": 1, # 1 | 1 Evil, 2 Neutral, 3 Good.
    "ac": 17, # 17
    "hp": None, # 34
    "perception": 7, # 7
    "fortitude": 9, # 9
    "reflex": 8, # 8
    "willpower": 5, # 5
    "strength": 4, # 4
    "dexterity": 2, # 2
    "constitution": 3, # 3
    "intelligence": -1, # -1
    "wisdom": 1, # 1
    "charisma": 0, # 0
    "acrobatics": 6, # 6
    "arcana": 0, # 0
    "athletics": 7, # 7
    "crafting": 0, # 0
    "deception": 0, # 0
    "diplomacy": 0, # 0
    "intimidation": 4, # 4
    "medicine": 0, # 0
    "nature": 0, # 0
    "occultism": 0, # 0
    "performance": 0, # 0
    "religion": 0, # 0
    "society": 0, # 0
    "stealth": 6, # 6
    "survival": 0, # 0
    "thievery": 0, # 0
}

"""target_keep = {
    "level": 2, # 2
    "size": None, # 3 | 1 Tiny, 2 Small, 3 Medium, 4 Large, 5 Huge, 6 Gargantuan.
    "law": None, # 2 | 1 Chaotic, 2 Neutral, 3 Lawful.
    "moral": None, # 1 | 1 Evil, 2 Neutral, 3 Good.
    "ac": None, # 17
    "hp": 34, # 34
    "perception": None, # 7
    "fortitude": None, # 9
    "reflex": None, # 8
    "willpower": None, # 5
    "strength": None, # 4
    "dexterity": None, # 2
    "constitution": None, # 3
    "intelligence": None, # -1
    "wisdom": None, # 1
    "charisma": None, # 0
    "acrobatics": None, # 6
    "arcana": None, # 0
    "athletics": None, # 7
    "crafting": None, # 0
    "deception": None, # 0
    "diplomacy": None, # 0
    "intimidation": None, # 4
    "medicine": None, # 0
    "nature": None, # 0
    "occultism": None, # 0
    "performance": None, # 0
    "religion": None, # 0
    "society": None, # 0
    "stealth": None, # 6
    "survival": None, # 0
    "thievery": None, # 0
}"""

# Target diff is what were are not providing to our models.
target_diff = [temp_point for temp_point in target_keep.keys() if target_keep[temp_point] == None]

In [8]:
# Split the data out into training and test sets.
data_xtrain, data_xtest, data_ytrain, data_ytest = split_data(pf_data, target_diff)

In [9]:
# Grab our list of regressor models.
reg_list = all_estimators(type_filter="regressor")

# Some regressors get mad that we ask too much of them.
reg_ban = ["CCA", "GammaRegressor", "GaussianProcessRegressor", "IsotonicRegression", "KernelRidge", "MultiTaskElasticNet", "MultiTaskElasticNetCV", "MultiTaskLasso", "MultiTaskLassoCV", "PLSCanonical", "PLSRegression", "PoissonRegressor", "QuantileRegressor", "RegressorChain"]
# Or if we ask too little.
reg_ban.extend(["OrthogonalMatchingPursuitCV"])
#These regressors are just fat...
reg_ban.extend(["ExtraTreesRegressor", "RandomForestRegressor"])
# These regressors are just slow...
reg_ban.extend(["HistGradientBoostingRegressor", "MLPRegressor", "NuSVR", "SVR", "TheilSenRegressor"])
# These regressors are just dogwater.
reg_ban.extend(["RadiusNeighborsRegressor", "SGDRegressor"])
# These regressors we don't need.
reg_ban.extend(["DummyRegressor", "MultiOutputRegressor", "StackingRegressor", "VotingRegressor"])

# Create a final list that excludes the ban list.
reg_final = [temp_reg for temp_reg in reg_list if temp_reg[0] not in reg_ban]

In [10]:
# For keeping score from our regressors.
score_list = []

time_before = time.time()

# Iterate through each model...
for temp_reg in enumerate(reg_final, start=1):
    # Create an instance of the model.
    model_instance = _construct_instance(temp_reg[1][1])
    model_reg = MultiOutputRegressor(model_instance)
    # Train it, score it, and time it.
    print(str(temp_reg[0]) + ". Training: " + temp_reg[1][0], end=", ")
    time_start = time.time()
    model_reg.fit(data_xtrain, data_ytrain)
    model_score = model_reg.score(data_xtest, data_ytest)
    time_end = time.time()
    time_diff = round(time_end - time_start, 2)
    print(str(model_score) + ", " + str(time_diff) + "s.")
    score_list.append((model_score, temp_reg[1][0], model_instance))
    
time_after = time.time()
time_diff = round(time_after - time_before, 2)
print("Total training time: " + str(time_diff))

1. Training: ARDRegression, 0.964227384438856, 0.03s.
2. Training: AdaBoostRegressor, 0.9472317663148004, 0.51s.
3. Training: BaggingRegressor, 0.968087845371375, 0.27s.
4. Training: BayesianRidge, 0.964270979965545, 0.02s.
5. Training: DecisionTreeRegressor, 0.9341995451707517, 0.04s.
6. Training: ElasticNet, 0.9627241280636425, 0.05s.
7. Training: ElasticNetCV, 0.963710325994287, 0.18s.
8. Training: ExtraTreeRegressor, 0.9341419339676111, 0.02s.
9. Training: GradientBoostingRegressor, 0.9695700607022472, 0.95s.
10. Training: HuberRegressor, 0.9617321151243156, 0.11s.
11. Training: KNeighborsRegressor, 0.9265411549489211, 0.06s.
12. Training: Lars, 0.9180731368207954, 0.01s.
13. Training: LarsCV, 0.9639397356944701, 0.06s.
14. Training: Lasso, 0.9627384514371264, 0.06s.
15. Training: LassoCV, 0.9640090472165532, 0.2s.
16. Training: LassoLars, 0.9627371470012726, 0.01s.
17. Training: LassoLarsCV, 0.9642600214565155, 0.07s.
18. Training: LassoLarsIC, 0.9642932804744646, 0.02s.
19. Train

In [11]:
# Get and show the top three.
score_final = sorted(score_list, key=operator.itemgetter(0))
score_top = score_final[-3:]
print("Training:")
print(score_top)

# Create a bag of the top three.
bag_list = []
for temp_model in score_top:
    bag_list.append((temp_model[1], temp_model[2]))

# Time it and train a top three voting regressor.
time_start = time.time()
model_bag = VotingRegressor(estimators=bag_list)
model_reg_final = MultiOutputRegressor(model_bag)
model_reg_final.fit(data_xtrain, data_ytrain)
model_score = model_reg_final.score(data_xtest, data_ytest)
time_end = time.time()
time_diff = round(time_end - time_start, 2)
print(str(model_score) + ", " + str(time_diff) + "s.")

Training:
[(0.9642932804744646, 'LassoLarsIC', LassoLarsIC()), (0.968087845371375, 'BaggingRegressor', BaggingRegressor()), (0.9695700607022472, 'GradientBoostingRegressor', GradientBoostingRegressor())]
0.972462241747661, 1.26s.


In [12]:
# Let's free up some memory.
score_list = None
score_final = None
score_top = None
bag_list = None

In [13]:
# Grab our list of classifier models.
class_list = all_estimators(type_filter="classifier")

# Some classifiers get mad that we ask too much of them.
class_ban = ["CategoricalNB", "ClassifierChain", "ComplementNB", "GaussianProcessClassifier", "GradientBoostingClassifier", "HistGradientBoostingClassifier", "LogisticRegressionCV", "MultinomialNB", "NuSVC", "QuadraticDiscriminantAnalysis", "RadiusNeighborsClassifier"]
#These classifiers are just bad...
class_ban.extend(["AdaBoostClassifier", "BernoulliNB", "GaussianNB", "LinearDiscriminantAnalysis", "LinearSVC", "LogisticRegression", "NearestCentroid", "OneVsRestClassifier", "OutputCodeClassifier", "PassiveAggressiveClassifier", "Perceptron", "RidgeClassifier", "RidgeClassifierCV", "SGDClassifier", "SVC"])
# These classifiers are just slow...
class_ban.extend(["CalibratedClassifierCV", "LabelSpreading", "MLPClassifier", "OneVsOneClassifier"])
# These classifiers we don't need.
class_ban.extend(["DummyClassifier", "MultiOutputClassifier", "StackingClassifier", "VotingClassifier"])

# Create a final list that excludes the ban list.
class_final = [temp_class for temp_class in class_list if temp_class[0] not in class_ban]

In [14]:
# For keeping score from our classifiers.
score_list = []

time_before = time.time()

# Iterate through each model...
for temp_class in enumerate(class_final, start=1):
    # Create an instance of the model.
    model_instance = _construct_instance(temp_class[1][1])
    model_class = MultiOutputClassifier(model_instance)
    # Train it, score it, and time it.
    print(str(temp_class[0]) + ". Training: " + temp_class[1][0], end=", ")
    time_start = time.time()
    model_class.fit(data_xtrain, data_ytrain)
    model_score = model_class.score(data_xtest, data_ytest)
    time_end = time.time()
    time_diff = round(time_end - time_start, 2)
    print(str(model_score) + ", " + str(time_diff) + "s.")
    score_list.append((model_score, temp_class[1][0], model_instance))
    
time_after = time.time()
time_diff = round(time_after - time_before, 2)
print("Total training time: " + str(time_diff))

1. Training: BaggingClassifier, 0.2721518987341772, 0.55s.
2. Training: DecisionTreeClassifier, 0.22468354430379747, 0.08s.
3. Training: ExtraTreeClassifier, 0.16455696202531644, 0.02s.
4. Training: ExtraTreesClassifier, 0.25, 1.61s.
5. Training: KNeighborsClassifier, 0.09177215189873418, 0.08s.
6. Training: LabelPropagation, 0.16455696202531644, 1.76s.
7. Training: RandomForestClassifier, 0.2879746835443038, 1.61s.
Total training time: 5.72


In [15]:
# Get and show the top three.
score_final = sorted(score_list, key=operator.itemgetter(0))
score_top = score_final[-3:]
print("Training:")
print(score_top)

# Create a bag of the top three.
bag_list = []
for temp_model in score_top:
    bag_list.append((temp_model[1], temp_model[2]))

# Time it and train a top three voting regressor.
time_start = time.time()
model_bag = VotingClassifier(estimators=bag_list)
model_class_final = MultiOutputClassifier(model_bag)
model_class_final.fit(data_xtrain, data_ytrain)
model_score = model_class_final.score(data_xtest, data_ytest)
time_end = time.time()
time_diff = round(time_end - time_start, 2)
print(str(model_score) + ", " + str(time_diff) + "s.")

Training:
[(0.25, 'ExtraTreesClassifier', ExtraTreesClassifier()), (0.2721518987341772, 'BaggingClassifier', BaggingClassifier()), (0.2879746835443038, 'RandomForestClassifier', RandomForestClassifier())]
0.2721518987341772, 3.8s.


In [16]:
# Let's free up some memory.
score_list = None
score_final = None
score_top = None
bag_list = None

In [17]:
# Prep our test target to "fill in the blanks" for.
target_test = [[temp_item for temp_item in target_keep.values() if temp_item != None]]
print(target_test)

[[3, 2, 1, 17, 7, 9, 8, 5, 4, 2, 3, -1, 1, 0, 6, 0, 7, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 6, 0, 0]]


In [18]:
# For matching up our target prediction with names.
# Pre: Takes a printer handle, our target mapping, and target predictions.
# Post: Returns none.
def match_target(input_printer, input_map, input_predict):
    target_name = [temp_name for temp_name in input_map.keys() if input_map[temp_name] == None]
    target_final = []
    for temp_name in enumerate(target_name):
        target_final.append((temp_name[1], input_predict[0][temp_name[0]]))
    input_printer.pprint(target_final)
    return

In [19]:
handle_printer = pprint.PrettyPrinter(indent=4)

# Now let's make some predictions!
target_predict = model_reg_final.predict(target_test)
match_target(handle_printer, target_keep, target_predict)
print()
target_predict = model_class_final.predict(target_test)
match_target(handle_printer, target_keep, target_predict)

[('level', -0.5640074149237634), ('hp', 27.64057304905558)]

[('level', -1), ('hp', 9)]
