# Train and evaluate all models

## Models:
* Linear regression
* Ridge
* Lasso
* LAD regression
* Huber regression
* Linear SVM
* KNN
* Kernel SVM
* Random Forest
* LightGBM
* Ordered model (probit and logit)

## Features
* Full feature set + AOO (the `aoo` column)

In [1]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from training.creating_dataset import load_and_preprocess_data, min_max_scale_data
from training.constants import FEATURES, DATASET_FILES, ORDERED_CHARACTERISTICS_FULL


# Build the path of every dataset file, relative to this notebook's directory.
dataset_paths = [f"../../pathfinder_2e_remaster_data/{f}" for f in DATASET_FILES]

# Request the base feature list extended with the extra "aoo" column.
requested_features = FEATURES + ["aoo"]
bestiaries = load_and_preprocess_data(dataset_paths, requested_features)

# NOTE(review): scaling is applied to the whole frame before the train/test
# split performed in a later cell - confirm this is intended.
bestiaries = min_max_scale_data(bestiaries)

# Keep the ordered characteristic columns plus "book" (used later to filter
# and split) and the "level" column.
selected_columns = ORDERED_CHARACTERISTICS_FULL + ["book", "level"]
bestiaries = bestiaries[selected_columns]
bestiaries.head()

Unnamed: 0,str,dex,con,int,wis,cha,ac,hp,perception,fortitude,...,cold_weakness,cold-iron_weakness,evil_weakness,fire_weakness,good_weakness,slashing_weakness,splash-damage_weakness,aoo,book,level
0,0.588235,0.588235,0.357143,0.2,0.470588,0.529412,0.509434,0.252087,0.156522,0.425532,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Pathfinder #164: Hands of the Devil,8
1,0.588235,0.411765,0.428571,0.6,0.411765,0.411765,0.396226,0.123539,0.130435,0.297872,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Pathfinder #163: Ruins of Gauntlight,5
2,0.529412,0.470588,0.214286,0.4,0.470588,0.352941,0.320755,0.078464,0.069565,0.170213,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Pathfinder #163: Ruins of Gauntlight,3
3,0.352941,0.529412,0.214286,0.333333,0.470588,0.352941,0.358491,0.048414,0.104348,0.170213,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Pathfinder #164: Hands of the Devil,3
4,0.588235,0.588235,0.5,0.466667,0.529412,0.588235,0.490566,0.198664,0.156522,0.361702,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Pathfinder #164: Hands of the Devil,8


In [2]:
import os
from training.splitting_dataset import split_dataframe, get_date_books_mapping

# get_date_books_mapping() / split_dataframe() are run from inside the
# `training` directory (presumably they resolve files relative to it - TODO
# confirm), so switch there temporarily and always switch back, even when one
# of the calls raises; otherwise a failed run would leave the kernel in the
# wrong working directory and break every later relative path.
original_cwd = os.getcwd()
os.chdir("../../training")
try:
    books_dates_map = get_date_books_mapping()

    # Each entry of the "books" column is an iterable of book names; flatten
    # them into one list. Iterating the column directly yields its values and
    # avoids `Series.iteritems()`, which was removed in pandas 2.0.
    books_to_include = [
        book for row in books_dates_map["books"] for book in row
    ]

    # Keep only rows whose book appears in the mapping, then split.
    bestiaries = bestiaries[bestiaries["book"].isin(books_to_include)]
    X_train, X_test, y_train, y_test = split_dataframe(bestiaries)
finally:
    os.chdir(original_cwd)

In [8]:
from training.train_and_evaluate_models import train_and_evaluate_models


# Names of the models to train, passed in this order.
regression_models = [
    "linear_regression",
    "linear_regression_ridge",
    "linear_regression_lasso",
    "lad_regression",
    "huber_regression",
    "linear_svm",
    "kernel_svm",
    "knn",
    "random_forest",
    "lightgbm",
]

# Two grids in steps of 0.05: multiples 1..19 (0.05-0.95) and 5..15 (0.25-0.75).
# Presumably candidate rounding thresholds - confirm against
# train_and_evaluate_models.
threshold_grids = [
    [0.05 * i for i in range(first, past_last)]
    for first, past_last in ((1, 20), (5, 16))
]

results = train_and_evaluate_models(
    regression_models,
    X_train,
    y_train,
    X_test,
    y_test,
    thresholds=threshold_grids,
)

  res = _minimize_bfgs(f, x0, args, fprime, callback=callback, **opts)


         Current function value: 0.521839
         Iterations: 114
         Function evaluations: 165
         Gradient evaluations: 158




==== Ordered Model probit ====
	--> no_rounding
		--> train
			--> rmse: 0.47296288709376
			--> mae: 0.18690213392200147
		--> test
			--> rmse: 0.6966305460192359
			--> mae: 0.3161764705882353
	--> round 0.5
		--> train
			--> rmse: 0.47296288709376
			--> mae: 0.18690213392200147
			--> accuracy: 0.8263428991905813
		--> test
			--> rmse: 0.6966305460192359
			--> mae: 0.3161764705882353
			--> accuracy: 0.7242647058823529



  res = _minimize_bfgs(f, x0, args, fprime, callback=callback, **opts)


         Current function value: 0.478363
         Iterations: 226
         Function evaluations: 294
         Gradient evaluations: 282
==== Ordered Model logit ====
	--> no_rounding
		--> train
			--> rmse: 0.456334695083245
			--> mae: 0.1758646063281825
		--> test
			--> rmse: 0.7174300539794393
			--> mae: 0.3161764705882353
	--> round 0.5
		--> train
			--> rmse: 0.456334695083245
			--> mae: 0.1758646063281825
			--> accuracy: 0.8359087564385578
		--> test
			--> rmse: 0.7174300539794393
			--> mae: 0.3161764705882353
			--> accuracy: 0.7242647058823529





In [4]:
from training.train_and_evaluate_models import print_results


# Print the evaluation results stored under each model name in `results`.
for model_name, model in results.items():
    print_results(model_name, model)

==== linear_regression ====
	--> no_rounding
		--> train
			--> rmse: 0.49936950177820355
			--> mae: 0.3492769345532107
		--> test
			--> rmse: 0.8056380232526472
			--> mae: 0.4848424178772823
	--> round 0.5
		--> train
			--> rmse: 0.48449088759427017
			--> mae: 0.2141280353200883
			--> accuracy: 0.7939661515820456
		--> test
			--> rmse: 0.8043996665398437
			--> mae: 0.39705882352941174
			--> accuracy: 0.6470588235294118
	--> best_single_threshold_0.05_0.95
		--> threshold
			--> 0.4
		--> train
			--> rmse: 0.48144374486558517
			--> mae: 0.2097130242825607
			--> accuracy: 0.7991169977924945
		--> test
			--> rmse: 0.8112263773411942
			--> mae: 0.40808823529411764
			--> accuracy: 0.6360294117647058
	--> best_multiple_thresholds_0.05_0.95
		--> thresholds
			--> -1: 0.6957370643540216
			--> 0: 0.5578239201179518
			--> 1: 0.5381821891943244
			--> 2: 0.41794065967045957
			--> 3: 0.3857337585870097
			--> 4: 0.4084343291117734
			--> 5: 0.4573472740162834
			--> 6: 0.378192

In [3]:
def print_results2(model_name: str, results: dict):
    """Print one model's results in a compact, tab-separated layout.

    A header line with ``model_name`` is printed first. Then, for every key of
    ``results`` except ``"model"``, the key is printed followed by the name of
    each nested entry. Entries named ``"threshold"`` or ``"thresholds"`` only
    get their name printed; for every other entry the values of its mapping
    are printed on one line, each followed by a tab (metric names are omitted).

    Args:
        model_name: Label shown in the header line.
        results: Mapping of rounding-type name to a mapping of set name to a
            mapping of measure name to value.
    """
    print(f"==== {model_name} ====")
    for rounding_name, per_set in results.items():
        # The "model" entry is not a metrics mapping, so it is not printed.
        if rounding_name != "model":
            print(f"\t--> {rounding_name}")
            for split_name, metrics in per_set.items():
                print(f"\t\t--> {split_name}")
                # Threshold entries get only their heading, no value row.
                if split_name not in ("threshold", "thresholds"):
                    # Every value keeps a trailing tab, including the last.
                    print("".join(f"{metric_value}\t" for metric_value in metrics.values()))

In [None]:
# Print every entry of `results` in the compact tab-separated layout.
for model_name, model in results.items():
    print_results2(model_name, model)

In [5]:
from training.train_and_evaluate_models import train_and_evaluate_models


# Ordered models to train, passed in this order.
ordered_models = ["ordered_model_probit", "ordered_model_logit"]

# Two grids in steps of 0.05: multiples 1..19 (0.05-0.95) and 5..15 (0.25-0.75).
# Presumably candidate rounding thresholds - confirm against
# train_and_evaluate_models.
threshold_grids = [
    [0.05 * i for i in range(first, past_last)]
    for first, past_last in ((1, 20), (5, 16))
]

# Rebinds `results`, replacing the results of any earlier training cell.
results = train_and_evaluate_models(
    ordered_models,
    X_train,
    y_train,
    X_test,
    y_test,
    thresholds=threshold_grids,
)

  res = _minimize_bfgs(f, x0, args, fprime, callback=callback, **opts)


         Current function value: 0.521839
         Iterations: 114
         Function evaluations: 165
         Gradient evaluations: 158


[I 2024-05-30 11:58:47,687] A new study created in memory with name: no-name-aaf865b5-0f0c-49f8-9dd8-56c07cce129f
[I 2024-05-30 11:58:47,697] Trial 0 finished with value: 0.18690213392200147 and parameters: {'level_-1': 0.3856487559113005, 'level_0': 0.8425093732116675, 'level_1': 0.8399272543786441, 'level_2': 0.05357555462657461, 'level_3': 0.2812589686489586, 'level_4': 0.3970154835187818, 'level_5': 0.33617017183578957, 'level_6': 0.1789255523037075, 'level_7': 0.3134296185198607, 'level_8': 0.7739021906373924, 'level_9': 0.4065517998362221, 'level_10': 0.3883635412881215, 'level_11': 0.5607113564953162, 'level_12': 0.13181737257729387, 'level_13': 0.48645378498766356, 'level_14': 0.3490645650684755, 'level_15': 0.5314089495659703, 'level_16': 0.8956494541471033, 'level_17': 0.9292912897633718, 'level_18': 0.20197281738392125, 'level_19': 0.6253386580508851, 'level_20': 0.613721853434315}. Best is trial 0 with value: 0.18690213392200147.
[I 2024-05-30 11:58:47,705] Trial 1 finished

         Current function value: 0.478363
         Iterations: 226
         Function evaluations: 294
         Gradient evaluations: 282


[I 2024-05-30 12:00:09,296] A new study created in memory with name: no-name-f3e491e3-e6e7-4ac3-940c-8f9ee4132b1c
[I 2024-05-30 12:00:09,302] Trial 0 finished with value: 0.1758646063281825 and parameters: {'level_-1': 0.4173954567983788, 'level_0': 0.4231453463791577, 'level_1': 0.13924862731322302, 'level_2': 0.4923640838560765, 'level_3': 0.20784745984497371, 'level_4': 0.25149765120201073, 'level_5': 0.3953335978565657, 'level_6': 0.40168563488043785, 'level_7': 0.8667975753284973, 'level_8': 0.16593631957595595, 'level_9': 0.513694942410958, 'level_10': 0.9442893070874807, 'level_11': 0.4665906486998363, 'level_12': 0.6028157691575129, 'level_13': 0.32610773060375386, 'level_14': 0.8550153642634786, 'level_15': 0.5028115725657407, 'level_16': 0.9180426396024068, 'level_17': 0.09163759350609046, 'level_18': 0.3560464743859851, 'level_19': 0.4587410916490782, 'level_20': 0.8191934097329653}. Best is trial 0 with value: 0.1758646063281825.
[I 2024-05-30 12:00:09,307] Trial 1 finished

In [6]:
# Print every entry of `results` in the compact tab-separated layout.
for model_name, model in results.items():
    print_results2(model_name, model)

==== ordered_model_probit ====
	--> no_rounding
		--> train
0.47296288709376	0.18690213392200147	
		--> test
0.6966305460192359	0.3161764705882353	
	--> round 0.5
		--> train
0.47296288709376	0.18690213392200147	0.8263428991905813	
		--> test
0.6966305460192359	0.3161764705882353	0.7242647058823529	
	--> best_single_threshold_0.05_0.95
		--> threshold
		--> train
0.47296288709376	0.18690213392200147	0.8263428991905813	
		--> test
0.6966305460192359	0.3161764705882353	0.7242647058823529	
	--> best_multiple_thresholds_0.05_0.95
		--> thresholds
		--> train
0.47296288709376	0.18690213392200147	0.8263428991905813	
		--> test
0.6966305460192359	0.3161764705882353	0.7242647058823529	
	--> best_graph_thresholds_0.05_0.95
		--> thresholds
		--> train
0.47296288709376	0.18690213392200147	0.8263428991905813	
		--> test
0.6966305460192359	0.3161764705882353	0.7242647058823529	
	--> best_single_threshold_0.25_0.75
		--> threshold
		--> train
0.47296288709376	0.18690213392200147	0.8263428991905813	