# Classical regression models

* KNN
* Kernel SVM
* Random Forest

In [1]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from training.creating_dataset import load_and_preprocess_data
from training.constants import FEATURES, DATASET_FILES, ORDERED_CHARACTERISTICS_FULL


# Load every dataset file, keeping only the configured FEATURES.
dataset_paths = [f"../../pathfinder_2e_remaster_data/{f}" for f in DATASET_FILES]
bestiaries = load_and_preprocess_data(dataset_paths, FEATURES)

# Every column except the source book and the target level is rescaled to [0, 1].
# NOTE(review): the scaler is fit on the full dataset, i.e. before the
# train/test split performed in a later cell, so the test rows influence the
# scaling. Confirm this is intended.
columns = [col for col in bestiaries.columns if col not in ["book", "level"]]
scaler = MinMaxScaler()
min_max_df = pd.DataFrame(
    scaler.fit_transform(bestiaries[columns]),
    columns=columns,
    index=bestiaries.index,
)

# Re-attach the unscaled metadata columns, then fix the column order.
min_max_df["book"] = bestiaries["book"]
min_max_df["level"] = bestiaries["level"]
bestiaries = min_max_df[ORDERED_CHARACTERISTICS_FULL + ["book", "level"]]
bestiaries.head()

Unnamed: 0,str,dex,con,int,wis,cha,ac,hp,perception,fortitude,...,area-damage_weakness,cold_weakness,cold-iron_weakness,evil_weakness,fire_weakness,good_weakness,slashing_weakness,splash-damage_weakness,book,level
0,0.588235,0.588235,0.357143,0.2,0.470588,0.529412,0.509434,0.252087,0.156522,0.425532,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Pathfinder #164: Hands of the Devil,8
1,0.588235,0.411765,0.428571,0.6,0.411765,0.411765,0.396226,0.123539,0.130435,0.297872,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Pathfinder #163: Ruins of Gauntlight,5
2,0.529412,0.470588,0.214286,0.4,0.470588,0.352941,0.320755,0.078464,0.069565,0.170213,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Pathfinder #163: Ruins of Gauntlight,3
3,0.352941,0.529412,0.214286,0.333333,0.470588,0.352941,0.358491,0.048414,0.104348,0.170213,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Pathfinder #164: Hands of the Devil,3
4,0.588235,0.588235,0.5,0.466667,0.529412,0.588235,0.490566,0.198664,0.156522,0.361702,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Pathfinder #164: Hands of the Devil,8


In [2]:
import os
from training.splitting_dataset import split_dataframe, get_date_books_mapping

# The mapping and the split are executed from the `training` directory
# (presumably because those helpers use relative paths - confirm). The original
# working directory is restored in `finally`, so a failure in this cell does not
# leave the kernel stranded in `training/`.
notebook_dir = os.getcwd()
os.chdir("../../training")
try:
    books_dates_map = get_date_books_mapping()

    # Flatten the per-row collections of book titles into one list.
    # Series.iteritems() was removed in pandas 2.0; items() is the replacement.
    books_to_include = [
        book for _, row in books_dates_map["books"].items() for book in row
    ]

    # Keep only rows whose book appears in the mapping, then split.
    bestiaries = bestiaries[bestiaries["book"].isin(books_to_include)]
    X_train, X_test, y_train, y_test = split_dataframe(bestiaries)
finally:
    os.chdir(notebook_dir)

In [21]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV


# Exhaustive 5-fold grid search for k-nearest-neighbours regression, scored by
# negative MSE: 5 leaf sizes x 2 weightings x 3 metrics x 50 neighbour counts.
# NOTE(review): with KNeighborsRegressor's default p=2, "minkowski" should be
# the same distance as "euclidean", and leaf_size is documented as affecting
# speed/memory rather than predictions - the grid may be larger than needed.
hyper_params = {
    "leaf_size": [50, 60, 70, 80, 90],
    "weights": ["uniform", "distance"],
    "metric": ["minkowski", "manhattan", "euclidean"],
    "n_neighbors": list(range(1, 51)),
}
clf = KNeighborsRegressor()

search_options = dict(
    scoring="neg_mean_squared_error",
    cv=5,
    verbose=2,
    return_train_score=True,
    n_jobs=-1,
)
model = GridSearchCV(clf, hyper_params, **search_options)
model.fit(X_train, y_train)

Fitting 5 folds for each of 1500 candidates, totalling 7500 fits


In [22]:
# Parameter combination selected by the most recently fitted grid search
# (the KNN search when the cells are run top to bottom).
model.best_params_

{'leaf_size': 50,
 'metric': 'manhattan',
 'n_neighbors': 5,
 'weights': 'distance'}

In [23]:
from sklearn.metrics import mean_absolute_error, mean_squared_error


# Predict on the held-out set with the current grid search's best estimator.
y_pred = model.predict(X_test)

# Printed in order: mean absolute error, then mean squared error.
for metric in (mean_absolute_error, mean_squared_error):
    print(metric(y_test, y_pred))

1.2435195397196206
2.817641490858506


In [24]:
# Training-set error, for comparison with the held-out error.
# If the selected model uses weights="distance", each training point is its own
# zero-distance neighbour, so an error of ~0 here is expected.
y_pred = model.predict(X_train)

# Printed in order: mean absolute error, then mean squared error.
for metric in (mean_absolute_error, mean_squared_error):
    print(metric(y_train, y_pred))

0.0
0.0


In [5]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV


# Coarse sweep of the RBF-kernel SVR regularisation strength:
# nine values of C from 1e-3 to 1e5, one per power of ten.
hyper_params = {"C": [1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4, 1e5]}
svm = SVR(kernel="rbf", max_iter=10000)

search_options = dict(
    scoring="neg_mean_squared_error",
    cv=5,
    verbose=2,
    return_train_score=True,
    n_jobs=-1,
)
model = GridSearchCV(svm, hyper_params, **search_options)
model.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits




In [6]:
# Parameter combination selected by the most recently fitted grid search
# (the coarse SVR sweep over C when the cells are run top to bottom).
model.best_params_

{'C': 10.0}

In [25]:
# Finer sweep of C for the RBF-kernel SVR: 100 evenly spaced values from 1 to 100.
hyper_params = {"C": np.linspace(1, 100, 100)}
svm = SVR(kernel="rbf", max_iter=10000)

search_options = dict(
    scoring="neg_mean_squared_error",
    cv=5,
    verbose=2,
    return_train_score=True,
    n_jobs=-1,
)
model = GridSearchCV(svm, hyper_params, **search_options)
model.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [26]:
# Parameter combination selected by the most recently fitted grid search
# (the SVR sweep over C in [1, 100] when the cells are run top to bottom).
model.best_params_

{'C': 7.0}

In [27]:
# Finest sweep of C for the RBF-kernel SVR: 100 evenly spaced values from 1 to 10.
hyper_params = {"C": np.linspace(1, 10, 100)}
svm = SVR(kernel="rbf", max_iter=10000)

search_options = dict(
    scoring="neg_mean_squared_error",
    cv=5,
    verbose=2,
    return_train_score=True,
    n_jobs=-1,
)
model = GridSearchCV(svm, hyper_params, **search_options)
model.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [28]:
# Parameter combination selected by the most recently fitted grid search
# (the SVR sweep over C in [1, 10] when the cells are run top to bottom).
model.best_params_

{'C': 6.363636363636364}

In [29]:
from sklearn.metrics import mean_absolute_error, mean_squared_error


# Predict on the held-out set with the current grid search's best estimator.
y_pred = model.predict(X_test)

# Mean absolute error on the held-out set (displayed as the cell result).
mean_absolute_error(y_true=y_test, y_pred=y_pred)

0.5594337917196204

In [30]:
# Mean squared error for the predictions currently held in `y_pred`
# (the held-out predictions when the cells are run top to bottom).
mean_squared_error(y_true=y_test, y_pred=y_pred)

1.0202996727869298

In [31]:
# Training-set error, for comparison with the held-out error.
y_pred = model.predict(X_train)

# Printed in order: mean absolute error, then mean squared error.
for metric in (mean_absolute_error, mean_squared_error):
    print(metric(y_train, y_pred))

0.19529077488239519
0.09230465868082151


In [37]:
from sklearn.ensemble import RandomForestRegressor
from training.constants import RANDOM_STATE


# 5-fold grid search for a random forest regressor. Unlike the KNN and SVR
# searches in this notebook, this one is scored by negative MAE, not MSE.
# n_estimators is left at the estimator's default; the sweep
#     "n_estimators": [int(x) for x in np.linspace(start=100, stop=800, num=8)]
# is currently disabled.
hyper_params = dict(
    max_features=[0.3],
    max_depth=[100, 200, 500],
    criterion=["squared_error", "absolute_error"],
)
rf = RandomForestRegressor(n_jobs=-1, random_state=RANDOM_STATE)

search_options = dict(
    scoring="neg_mean_absolute_error",
    cv=5,
    return_train_score=True,
    n_jobs=-1,
)
model = GridSearchCV(rf, hyper_params, **search_options)
model.fit(X_train, y_train)

In [38]:
# Parameter combination selected by the most recently fitted grid search
# (the random forest search when the cells are run top to bottom).
model.best_params_

{'criterion': 'squared_error', 'max_depth': 100, 'max_features': 0.3}

In [39]:
from sklearn.metrics import mean_absolute_error, mean_squared_error


# Predict on the held-out set with the current grid search's best estimator.
y_pred = model.predict(X_test)

# Mean absolute error on the held-out set (displayed as the cell result).
mean_absolute_error(y_true=y_test, y_pred=y_pred)

0.4423897058823529

In [41]:
# Mean squared error for the predictions currently held in `y_pred`
# (the held-out predictions when the cells are run top to bottom).
mean_squared_error(y_true=y_test, y_pred=y_pred)

0.5033687499999999

In [42]:
# Training-set error, for comparison with the held-out error.
y_pred = model.predict(X_train)

# Printed in order: mean absolute error, then mean squared error.
for metric in (mean_absolute_error, mean_squared_error):
    print(metric(y_train, y_pred))

0.08296759941089833
0.01791509572901326
