# Using the QLattice with small data sets

In many cases, a researcher has collected a fairly small dataset of a few hundred individuals. In this notebook we apply a systematic approach to test the performance of the QLattice against the usual go-to technologies for fitting and ML.

We compare to other **interpretable** models:
- Linear models (both with and without LASSO)
- Decision Trees

And to other **ensemble models** which are more black-box:
- Random Forest
- Gradient Boosting

In [None]:
import pmlb
import pandas as pd
import feyn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import time


In [None]:
# Total number of datasets PMLB lists, classification and regression combined.
print(sum(len(names) for names in (pmlb.classification_dataset_names, pmlb.regression_dataset_names)))

In [None]:
#for name in pmlb.regression_dataset_names:
#    df = pmlb.fetch_data(name, local_cache_dir="/tmp/pmlb_data")
#    print(f"('{name}', {len(df)}, {len(df.columns)}),")   

In [None]:
# Hard-coded catalogue of datasets, one (name, n, fcount) tuple per dataset.
# The tuples have the same shape as the output of the commented-out listing loop
# in the previous cell, which prints (name, len(df), len(df.columns)) -- so
# "n" is presumably the row count and "fcount" the total column count
# (including the target column); TODO confirm against PMLB.
datasets = pd.DataFrame([
('1027_ESL', 488, 5),
('1028_SWD', 1000, 11),
('1029_LEV', 1000, 5),
('1030_ERA', 1000, 5),
('1089_USCrime', 47, 14),
('1096_FacultySalaries', 50, 5),
('1191_BNG_pbc', 1000000, 19),
('1193_BNG_lowbwt', 31104, 10),
('1196_BNG_pharynx', 1000000, 11),
('1199_BNG_echoMonths', 17496, 10),
('1201_BNG_breastTumor', 116640, 10),
('1203_BNG_pwLinear', 177147, 11),
('1595_poker', 1025010, 11),
('192_vineyard', 52, 3),
('195_auto_price', 159, 16),
('197_cpu_act', 8192, 22),
('201_pol', 15000, 49),
('207_autoPrice', 159, 16),
('210_cloud', 108, 6),
('215_2dplanes', 40768, 11),
('218_house_8L', 22784, 9),
('225_puma8NH', 8192, 9),
('227_cpu_small', 8192, 13),
('228_elusage', 55, 3),
('229_pwLinear', 200, 11),
('230_machine_cpu', 209, 7),
('294_satellite_image', 6435, 37),
('344_mv', 40768, 11),
('4544_GeographicalOriginalofMusic', 1059, 118),
('485_analcatdata_vehicle', 48, 5),
('503_wind', 6574, 15),
('505_tecator', 240, 125),
('519_vinnie', 380, 3),
('522_pm10', 500, 8),
('523_analcatdata_neavote', 100, 3),
('527_analcatdata_election2000', 67, 15),
('529_pollen', 3848, 5),
('537_houses', 20640, 9),
('542_pollution', 60, 16),
('547_no2', 500, 8),
('556_analcatdata_apnea2', 475, 4),
('557_analcatdata_apnea1', 475, 4),
('560_bodyfat', 252, 15),
('561_cpu', 209, 8),
('562_cpu_small', 8192, 13),
('564_fried', 40768, 11),
('573_cpu_act', 8192, 22),
('574_house_16H', 22784, 17),
('579_fri_c0_250_5', 250, 6),
('581_fri_c3_500_25', 500, 26),
('582_fri_c1_500_25', 500, 26),
('583_fri_c1_1000_50', 1000, 51),
('584_fri_c4_500_25', 500, 26),
('586_fri_c3_1000_25', 1000, 26),
('588_fri_c4_1000_100', 1000, 101),
('589_fri_c2_1000_25', 1000, 26),
('590_fri_c0_1000_50', 1000, 51),
('591_fri_c1_100_10', 100, 11),
('592_fri_c4_1000_25', 1000, 26),
('593_fri_c1_1000_10', 1000, 11),
('594_fri_c2_100_5', 100, 6),
('595_fri_c0_1000_10', 1000, 11),
('596_fri_c2_250_5', 250, 6),
('597_fri_c2_500_5', 500, 6),
('598_fri_c0_1000_25', 1000, 26),
('599_fri_c2_1000_5', 1000, 6),
('601_fri_c1_250_5', 250, 6),
('602_fri_c3_250_10', 250, 11),
('603_fri_c0_250_50', 250, 51),
('604_fri_c4_500_10', 500, 11),
('605_fri_c2_250_25', 250, 26),
('606_fri_c2_1000_10', 1000, 11),
('607_fri_c4_1000_50', 1000, 51),
('608_fri_c3_1000_10', 1000, 11),
('609_fri_c0_1000_5', 1000, 6),
('611_fri_c3_100_5', 100, 6),
('612_fri_c1_1000_5', 1000, 6),
('613_fri_c3_250_5', 250, 6),
('615_fri_c4_250_10', 250, 11),
('616_fri_c4_500_50', 500, 51),
('617_fri_c3_500_5', 500, 6),
('618_fri_c3_1000_50', 1000, 51),
('620_fri_c1_1000_25', 1000, 26),
('621_fri_c0_100_10', 100, 11),
('622_fri_c2_1000_50', 1000, 51),
('623_fri_c4_1000_10', 1000, 11),
('624_fri_c0_100_5', 100, 6),
('626_fri_c2_500_50', 500, 51),
('627_fri_c2_500_10', 500, 11),
('628_fri_c3_1000_5', 1000, 6),
('631_fri_c1_500_5', 500, 6),
('633_fri_c0_500_25', 500, 26),
('634_fri_c2_100_10', 100, 11),
('635_fri_c0_250_10', 250, 11),
('637_fri_c1_500_50', 500, 51),
('641_fri_c1_500_10', 500, 11),
('643_fri_c2_500_25', 500, 26),
('644_fri_c4_250_25', 250, 26),
('645_fri_c3_500_50', 500, 51),
('646_fri_c3_500_10', 500, 11),
('647_fri_c1_250_10', 250, 11),
('648_fri_c1_250_50', 250, 51),
('649_fri_c0_500_5', 500, 6),
('650_fri_c0_500_50', 500, 51),
('651_fri_c0_100_25', 100, 26),
('653_fri_c0_250_25', 250, 26),
('654_fri_c0_500_10', 500, 11),
('656_fri_c1_100_5', 100, 6),
('657_fri_c2_250_10', 250, 11),
('658_fri_c3_250_25', 250, 26),
('659_sleuth_ex1714', 47, 8),
('663_rabe_266', 120, 3),
('665_sleuth_case2002', 147, 7),
('666_rmftsa_ladata', 508, 11),
('678_visualizing_environmental', 111, 4),
('687_sleuth_ex1605', 62, 6),
('690_visualizing_galaxy', 323, 5),
('695_chatfield_4', 235, 13),
('706_sleuth_case1202', 93, 7),
('712_chscase_geyser1', 222, 3),
('banana', 5300, 3),
('titanic', 2201, 4),
],
columns=["name","n","fcount"])

In [None]:
# Keep only the datasets that have at least 1000 observations, then show how
# many remain.
chosen_datasets = datasets.loc[datasets["n"].ge(1000)]
len(chosen_datasets)

In [None]:
# Overview of the chosen datasets: observation count against "fcount" on
# log-log axes.
chosen_datasets.plot.scatter(
    x="n",
    y="fcount",
    loglog=True,
    ylabel="Number of features",
    xlabel="Number of observations",
    figsize=(4, 4),
)

# Utility functions

In [None]:
def get_pmlb_data(name, randomseed, trainsize=100):
    """Fetch a PMLB dataset and split it into a train and a test part.

    Args:
        name: PMLB dataset name passed to ``pmlb.fetch_data``.
        randomseed: ``random_state`` for the split, making it reproducible.
        trainsize: ``train_size`` handed to ``train_test_split``
            (default 100).

    Returns:
        The ``[train, test]`` pair produced by ``train_test_split``.
    """
    frame = pmlb.fetch_data(name, local_cache_dir="pmlb_data")
    split = train_test_split(
        frame,
        train_size=trainsize,
        random_state=randomseed,
    )
    return split

from sklearn import svm, tree, linear_model, ensemble

def X(df):
    """Return all columns of ``df`` except the last one (the model inputs)."""
    all_but_last = slice(None, -1)
    return df.iloc[:, all_but_last]

def y(df):
    """Return the last column of ``df`` (the prediction target) as a Series."""
    last_column = -1
    return df.iloc[:, last_column]

def fit_and_r2_score(model, train, test):
    """Fit ``model`` on the training frame and score it on both frames.

    The last column of each frame is the target and the remaining columns are
    the inputs (see ``X`` and ``y``).

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``score(X, y)``.
        train: DataFrame used for fitting.
        test: DataFrame held out for evaluation.

    Returns:
        Tuple ``(train_score, test_score)`` as returned by ``model.score``.
    """
    model.fit(X(train), y(train))
    train_score = model.score(X(train), y(train))
    test_score = model.score(X(test), y(test))
    return train_score, test_score

# Compare to the usual suspects

In [None]:
# Resume from the on-disk cache of earlier runs when it exists; otherwise start
# from an empty table with the columns the rest of the notebook expects, so a
# fresh checkout runs without editing this cell by hand.
try:
    results = pd.read_csv("results-cache-100.csv")
except FileNotFoundError:
    results = pd.DataFrame(columns=["dataset", "model", "randomseed", "train_r2", "test_r2"])

In [None]:
# Display the results table loaded above.
results

In [None]:
#results = results[~(results["model"].str.startswith("ensemble.RandomForestRegressor(n_estimators=25)"))].reindex()

In [None]:
# Baseline estimators compared against the QLattice, grouped by family:
# linear models, single decision trees, random forests and gradient boosting.
# Each one is identified in ``results`` by ``str(model)``.
models = [
    linear_model.LinearRegression(),
    *(linear_model.Lasso(alpha=alpha, max_iter=100000) for alpha in (0.01, 0.05, 0.10)),
    *(tree.DecisionTreeRegressor(max_depth=depth) for depth in (1, 2, 4, 6)),
    *(ensemble.RandomForestRegressor(n_estimators=count) for count in (400, 200, 100, 50)),
    *(ensemble.GradientBoostingRegressor(n_estimators=count) for count in (400, 200, 100, 50)),
]

In [None]:
def fit_comparison(name, randomseed):
    """Fit every baseline in ``models`` on one dataset/seed and record the scores.

    One row per model is added to the global ``results`` table, keyed by
    ``str(model)``. Models that already have a row for this dataset and seed
    are skipped, so the function can be re-run against a partially filled
    cache.

    Args:
        name: PMLB dataset name.
        randomseed: Seed used for the train/test split.
    """
    global results

    print("Seed %i, Dataset: %s"%(randomseed,name), end="")
    train, test = get_pmlb_data(name, randomseed)
    print(" ... fetched", end="")
    for m in models:
        already_run = (
            (results["dataset"] == name)
            & (results["model"] == str(m))
            & (results["randomseed"] == randomseed)
        ).any()
        if already_run:
            continue

        r2_train, r2_test = fit_and_r2_score(m, train, test)
        row = {
            "dataset": name,
            "model": str(m),
            "randomseed": randomseed,
            "train_r2": r2_train,
            "test_r2": r2_test,
        }
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported replacement. Appending per model (rather than once at the
        # end) keeps partial progress if the run is interrupted.
        results = pd.concat([results, pd.DataFrame([row])], ignore_index=True)

    print(" ... and fitted")


In [None]:
# Run the baseline comparison on every chosen dataset. range(0,1) yields only
# seed 0; widen the range to repeat the experiment with more seeds.
for name in chosen_datasets["name"]:
    for randomseed in range(0,1):
        fit_comparison(name, randomseed)

# Fit a qgraph for each data set

In [None]:
# QLattice handle shared by fit_qgraph, which resets it per dataset, draws
# regressors from it and feeds the best graphs back via ql.update.
ql = feyn.QLattice()

In [None]:
def fit_qgraph(edges, criterion, randomseed):
    """Fit a QLattice regressor on every chosen dataset and record its scores.

    Rows are added to the global ``results`` table under the model key
    ``"QG-{edges}-{criterion}"``. Datasets that already have a row for that
    key and seed are skipped.

    Args:
        edges: Limit handed to ``feyn.filters.MaxEdges``.
        criterion: Forwarded to ``qg.fit`` as ``criterion`` (the notebook uses
            ``"bic"`` and ``"aic"``).
        randomseed: Seed for the train/test split and for ``ql.reset``.
    """
    global results

    key = f"QG-{edges}-{criterion}"

    for name in chosen_datasets["name"]:
        already_run = (
            (results["dataset"] == name)
            & (results["model"] == key)
            & (results["randomseed"] == randomseed)
        ).any()
        if already_run:
            continue

        train, test = get_pmlb_data(name, randomseed)
        ql.reset(randomseed)

        # All columns are registered; the last column is the output.
        qg = ql.get_regressor(train.columns, train.columns[-1]).filter(feyn.filters.MaxEdges(edges))

        # 100 rounds of: fit the graphs, report the current best, and feed the
        # best graph back into the QLattice.
        for _ in range(100):
            qg.fit(train, threads=10, criterion=criterion)
            print(key, randomseed, name, qg[0]._paramcount)
            print("Train:\t", qg[0].r2_score(train), "\nTest:\t", qg[0].r2_score(test))
            ql.update(qg.best())

        # Extra fitting passes on the top graph only (presumably to fine-tune
        # its parameters before scoring).
        for _ in range(1000):
            qg[0].fit(train)

        row = {
            "dataset": name,
            "model": key,
            "randomseed": randomseed,
            "train_r2": qg[0].r2_score(train),
            "test_r2": qg[0].r2_score(test),
        }
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported replacement.
        results = pd.concat([results, pd.DataFrame([row])], ignore_index=True)

        time.sleep(6) # Protect my poor cpu

# Fit all QGraphs

In [None]:
# Fit QGraphs filtered with MaxEdges(11) under both the "bic" and the "aic"
# criterion. range(0,1) yields only seed 0.
for randomseed in range(0,1):
    fit_qgraph(11, "bic", randomseed=randomseed)
    fit_qgraph(11, "aic", randomseed=randomseed)

In [None]:
def hist_among(models=None, datasets=None):
    """Draw one horizontal bar chart of test/train R2 per (dataset, seed) run.

    Rows whose ``test_r2`` is not greater than -1 are left out (presumably so
    that badly failing fits do not dominate the axis). Within each chart the
    models are ordered by ``test_r2``.

    Args:
        models: Model keys to include. Defaults to every model in ``results``.
        datasets: Dataset names to plot. Defaults to every dataset in
            ``results``.
    """
    if models is None:
        models = results["model"].unique()
    if datasets is None:
        datasets = results["dataset"].unique()

    # Apply the model selection up front; previously ``models`` was resolved
    # but never used, so every model was plotted regardless of the argument.
    shown = results[results["model"].isin(models) & (results["test_r2"] > -1)]

    for dataset in datasets:
        subset = shown[shown["dataset"] == dataset]
        for seed in subset["randomseed"].unique():
            subsubset = subset[subset["randomseed"] == seed].sort_values(by="test_r2")
            subsubset.plot.barh(
                x="model",
                y=["test_r2", "train_r2"],
                title=dataset + " " + str(seed),
                figsize=(8, 6),
            )
                

In [None]:
# Plot every model on every dataset (None selects all of each).
hist_among(None,None)

In [None]:
# Persist the accumulated results so later runs can skip finished fits.
# (Plain string: the previous f-string had no placeholders.)
results.to_csv("results-cache-100.csv", index=False)

# Compare all models

In [None]:
def rank_among(models=None, rankpositions = None):
    """Award points to models by their test-R2 rank within each (dataset, seed) run.

    For every dataset/seed combination in ``results`` the selected models are
    ordered by descending ``test_r2``. The best one earns ``rankpositions``
    points, the next one point fewer, and so on down to 1 point for position
    ``rankpositions``. With ``rankpositions=1`` the total is therefore the
    number of first places.

    Args:
        models: Model keys to compare. Defaults to every model in ``results``.
        rankpositions: Number of top positions that earn points. Defaults to
            ``len(models) - 1``.

    Returns:
        DataFrame with columns ``model`` and ``points``, sorted by ascending
        ``points``.
    """
    if models is None:
        models = results["model"].unique()

    if rankpositions is None:
        rankpositions = len(models)-1

    # Only consider results for the chosen models, best test score first.
    res = results[results["model"].isin(models)].sort_values(by="test_r2", ascending=False)

    points = {m: 0 for m in models}

    # groupby keeps the row order inside each group, so each group is still
    # sorted by descending test_r2.
    for _, subset in res.groupby(["dataset", "randomseed"], sort=False):
        # Slicing clamps to the rows available, so a run with fewer results
        # than ``rankpositions`` no longer raises IndexError. max() keeps a
        # non-positive ``rankpositions`` from turning into a negative slice.
        top_models = subset["model"].iloc[:max(rankpositions, 0)]
        for rank, m in enumerate(top_models):
            points[m] += rankpositions - rank

    return pd.DataFrame(list(points.items()), columns=["model", "points"]).sort_values(by="points", ascending=True)

In [None]:
# With rankpositions=1 only the best model of each run scores (1 point), so
# the bars show the number of first places per model.
rank_among(rankpositions=1).plot.barh(x="model", y="points", label="First places", figsize=(8,6), xlabel="")

In [None]:
# First-place counts per model as a table.
rank_among(rankpositions=1)

In [None]:
# The top five models of each run score 5, 4, 3, 2 and 1 points respectively.
rank_among(rankpositions=5).plot.barh(x="model", y="points", label="Points", figsize=(8,6), xlabel="")

In [None]:
# Top-five points per model as a table.
rank_among(rankpositions=5)

In [None]:
# First-place counts when only the AIC QGraph and one model from each baseline
# family compete. The names must match the "model" keys stored in results.
rank_among(
    models=["QG-11-aic", 
            "GradientBoostingRegressor(n_estimators=400)", 
            "Lasso(alpha=0.1, max_iter=100000)",
            "RandomForestRegressor(n_estimators=400)",
            "DecisionTreeRegressor(max_depth=1)"
           ],
    rankpositions=1
).plot.barh(x="model", y="points", label="First places", figsize=(8,1.6), xlabel="")

In [None]:
# First-place counts for the AIC QGraph against one model from each baseline
# family, shown as a table.
rank_among(
    models=["QG-11-aic", 
            "GradientBoostingRegressor(n_estimators=400)", 
            "Lasso(alpha=0.1, max_iter=100000)",
            "RandomForestRegressor(n_estimators=400)",
            "DecisionTreeRegressor(max_depth=1)"
           ],
    rankpositions=1
)

In [None]:
# First-place counts among the AIC QGraph, a Lasso model and a depth-1
# decision tree only.
rank_among(
    models=["QG-11-aic", 
            "Lasso(alpha=0.1, max_iter=100000)",
            "DecisionTreeRegressor(max_depth=1)"
           ],
    rankpositions=1
)

In [None]:
# Number of result rows recorded for each random seed.
results["randomseed"].value_counts()