In [1]:
%reload_ext autoreload
%autoreload 2
import pandas as pd
from scipy import stats
import torch

from common import load_x264, split_data, split_data_cv, evaluate_ii, evaluate_cc

In [2]:
## Configuration
# Seed handed to split_data / split_data_cv as `random_state`
random_seed = 33154

# Enter names of performance columns to consider.
# NOTE: only the first entry (performances[0]) is pivoted into the
# rank/regret lookup tensors; additional entries only affect the
# per-pair maps and their row-wise averages.
performances = ["rel_kbs"]

# Number of nearest neighbours to consider (topk_values, passed as `n_neighbors`).
# topr_values is passed as `n_recs` -- presumably the number of recommendations
# evaluated per query; TODO confirm against evaluate_cc / evaluate_ii.
# Make multiples to allow better budget comparison
topk_values = (1, 3, 5, 15, 25)
topr_values = (1, 3, 5, 15, 25)

# Directory passed to load_x264 as `data_dir`
data_dir = "../data"

In [3]:
## Load and prepare data
perf_matrix, input_features, config_features, all_performances = load_x264(data_dir=data_dir)

print("Loaded data x264")
print(f"perf_matrix:{perf_matrix.shape}")
print(f"input_features:{input_features.shape}")
print(f"config_features:{config_features.shape}")

# Single random split into train/test inputs and train/test configurations
data_split = split_data(perf_matrix, random_state=random_seed)
train_inp, train_cfg = data_split["train_inp"], data_split["train_cfg"]
test_inp, test_cfg = data_split["test_inp"], data_split["test_cfg"]

# Lookup of performance measurements keyed by (inputname, configurationID)
key_columns = ["inputname", "configurationID"]
input_config_map = (
    perf_matrix[key_columns + performances]
    .sort_values(key_columns)
    .set_index(key_columns)
)

measured_pairs = input_config_map.index
all_input_names = pd.Series(measured_pairs.get_level_values("inputname").unique())
all_config_ids = pd.Series(measured_pairs.get_level_values("configurationID").unique())


def _relative_regret(group):
    """Absolute distance to the group's minimum, divided by |minimum|."""
    best = group.min()
    return (group - best).abs() / abs(best)


def _min_rank(group):
    """Rank values within the group; ties share the lowest rank (method="min")."""
    return stats.rankdata(group, method="min")


# Both maps are computed per input, i.e. relative to the configurations
# measured on the same inputname.
per_input = input_config_map.groupby("inputname")

regret_map = per_input.transform(_relative_regret)
average_mape = regret_map.mean(axis=1)

rank_map = per_input.transform(_min_rank)
average_ranks = rank_map.mean(axis=1)

Loaded data x264
perf_matrix:(258687, 45)
input_features:(1287, 21)
config_features:(201, 48)
Training data: 63.64%
Both new: 4.09%
Config new: 16.31%
Input new: 15.96%


In [4]:
## Prepare necessary torch tensors
def _lookup_tensor(pair_frame):
    """Pivot an (inputname, configurationID)-indexed frame into a 2-D tensor.

    Rows follow `inputname`, columns follow `configurationID`; only the first
    configured performance column (performances[0]) is used.
    """
    wide = pair_frame.reset_index().pivot_table(
        index="inputname", columns="configurationID", values=performances[0]
    )
    return torch.from_numpy(wide.values)


# Lookup tables for input/configuration performances (built from ALL pairs,
# not restricted to the training split)
rank_arr = _lookup_tensor(rank_map)
regret_arr = _lookup_tensor(regret_map)

# Feature matrices as float32 tensors
input_arr = torch.from_numpy(input_features.values).float()
config_arr = torch.from_numpy(config_features.values).float()

# Boolean membership masks (numpy arrays) derived from the random split
input_ids = input_features.index
train_input_mask = input_ids.isin(train_inp)
test_input_mask = input_ids.isin(test_inp)

config_ids = config_features.index
train_config_mask = config_ids.isin(train_cfg)
test_config_mask = config_ids.isin(test_cfg)

# Training-only views of the feature tensors
train_input_arr = input_arr[train_input_mask]
train_config_arr = config_arr[train_config_mask]

In [5]:
# Result buckets: one list per (split, mode, metric); each receives one entry per k.
train_cc_rank, train_cc_ratio, train_cc_regret = [], [], []
test_cc_rank, test_cc_ratio, test_cc_regret = [], [], []

train_ii_rank, train_ii_ratio, train_ii_regret = [], [], []
test_ii_rank, test_ii_ratio, test_ii_regret = [], [], []

# Database (reference_mask): always the training partition.
# Query (query_mask): training partition for train_*, test partition for test_*.

for topk in topk_values:
    # Keyword arguments shared by all four evaluations at this k
    knn_kwargs = dict(
        rank_arr=rank_arr,
        regret_arr=regret_arr,
        n_neighbors=topk,
        n_recs=topr_values,
    )

    # Result positions 0 / 1 / 2 are stored as rank / regret / ratio.
    train_cc = evaluate_cc(
        config_arr,
        query_mask=torch.from_numpy(train_config_mask),
        reference_mask=torch.from_numpy(train_config_mask),
        **knn_kwargs,
    )
    for bucket, position in ((train_cc_rank, 0), (train_cc_regret, 1), (train_cc_ratio, 2)):
        bucket.append(train_cc[position].numpy())

    test_cc = evaluate_cc(
        config_arr,
        query_mask=torch.from_numpy(test_config_mask),
        reference_mask=torch.from_numpy(train_config_mask),
        **knn_kwargs,
    )
    for bucket, position in ((test_cc_rank, 0), (test_cc_regret, 1), (test_cc_ratio, 2)):
        bucket.append(test_cc[position].numpy())

    train_ii = evaluate_ii(
        input_arr,
        query_mask=torch.from_numpy(train_input_mask),
        reference_mask=torch.from_numpy(train_input_mask),
        **knn_kwargs,
    )
    for bucket, position in ((train_ii_rank, 0), (train_ii_regret, 1), (train_ii_ratio, 2)):
        bucket.append(train_ii[position].numpy())

    test_ii = evaluate_ii(
        input_arr,
        query_mask=torch.from_numpy(test_input_mask),
        reference_mask=torch.from_numpy(train_input_mask),
        **knn_kwargs,
    )
    for bucket, position in ((test_ii_rank, 0), (test_ii_regret, 1), (test_ii_ratio, 2)):
        bucket.append(test_ii[position].numpy())

In [31]:
def prepare_df(results, topr_values, topk_values, extra_info=None):
    """Arrange per-k result rows into a table indexed by ``k``.

    Parameters
    ----------
    results : sequence of array-likes
        One row per entry of ``topk_values``; each row holds one value per
        entry of ``topr_values``.
    topr_values : sequence
        Labels of the value columns. They end up on the second column level,
        underneath a top-level label ``"r"``.
    topk_values : sequence
        Row labels, one per row of ``results``. They become the index, which
        is named ``"k"``.
    extra_info : mapping, optional
        Additional columns to append after the value columns. Each value is
        assigned to the whole column, so a scalar is repeated on every row.
        Defaults to no extra columns.

    Returns
    -------
    pandas.DataFrame
        Frame with index ``k`` and two-level columns ``("r", <topr value>)``,
        followed by one column per ``extra_info`` key.
    """
    df = pd.DataFrame(results, columns=topr_values)
    df["k"] = topk_values
    # Reassign instead of mutating in place, so every step yields a fresh frame
    df = df.set_index("k")
    df.columns = pd.MultiIndex.from_product([["r"], df.columns])

    # `None` default avoids sharing one mutable dict between calls
    for name, value in (extra_info or {}).items():
        df[name] = value

    return df

# TODO Share results in README

# (title, per-k result rows) in display order: cc before ii, train before test,
# and ratio / best rank / best regret within each group.
single_split_reports = (
    ("train cc ratio\n", train_cc_ratio),
    ("train cc best rank\n", train_cc_rank),
    ("train cc best regret\n", train_cc_regret),
    ("test cc ratio\n", test_cc_ratio),
    ("test cc best rank\n", test_cc_rank),
    ("test cc best regret\n", test_cc_regret),
    ("train ii ratio\n", train_ii_ratio),
    ("train ii best rank\n", train_ii_rank),
    ("train ii best regret\n", train_ii_regret),
    ("test ii ratio\n", test_ii_ratio),
    ("test ii best rank\n", test_ii_rank),
    ("test ii best regret\n", test_ii_regret),
)

for title, rows in single_split_reports:
    print(title, prepare_df(rows, topr_values, topk_values), "\n")

train cc ratio
            r                                        
           1         3         5        15        25
k                                                   
1        NaN       NaN       NaN       NaN       NaN
3   0.145963  0.218427  0.239130  0.314079  0.346832
5   0.167702  0.259834  0.275155  0.359317  0.400932
15  0.266193  0.367938  0.392547  0.468708  0.518740
25  0.264493  0.388630  0.416667  0.504193  0.563499 

train cc best rank
            r                                        
           1         3         5        15        25
k                                                   
1   0.071658  0.072469  0.072397  0.072614  0.073522
3   0.073328  0.074039  0.074065  0.074167  0.074892
5   0.071861  0.073113  0.073677  0.073892  0.074566
15  0.070357  0.071391  0.071944  0.072607  0.073641
25  0.071489  0.072606  0.073149  0.074081  0.074983 

train cc best regret
            r                                        
           1         3         5     

In [37]:
# Run cross-validation over all train/test splits

# The feature tensors do not depend on the split, so convert them once
# instead of once per fold.
input_arr = torch.from_numpy(input_features.values).float()
config_arr = torch.from_numpy(config_features.values).float()

# (metric label, position in the result returned by evaluate_cc / evaluate_ii)
_METRIC_POSITIONS = (("rank", 0), ("regret", 1), ("ratio", 2))


def evaluate_fold(fold):
    """Evaluate cc and ii recommendations for one train/test split.

    All fold-specific state (masks, result buckets) is local to this function,
    so the single-split variables defined by earlier cells are not overwritten
    by the cross-validation run.

    Parameters
    ----------
    fold : mapping
        One split as yielded by ``split_data_cv``; the keys ``"train_inp"``,
        ``"train_cfg"``, ``"test_inp"`` and ``"test_cfg"`` are read.

    Returns
    -------
    list of pandas.DataFrame
        Twelve frames built with ``prepare_df``, ordered cc/train, cc/test,
        ii/train, ii/test and rank, regret, ratio within each. Every frame is
        tagged with ``metric``, ``mode`` and ``split`` columns.
    """
    # Boolean membership masks (numpy arrays) for this fold
    train_input_mask = input_features.index.isin(fold["train_inp"])
    test_input_mask = input_features.index.isin(fold["test_inp"])
    train_config_mask = config_features.index.isin(fold["train_cfg"])
    test_config_mask = config_features.index.isin(fold["test_cfg"])

    # (mode, split, evaluator, features, query mask, reference mask)
    # The reference (database) is always the training partition; the query is
    # the training partition for "train" and the test partition for "test".
    scenarios = (
        ("cc", "train", evaluate_cc, config_arr, train_config_mask, train_config_mask),
        ("cc", "test", evaluate_cc, config_arr, test_config_mask, train_config_mask),
        ("ii", "train", evaluate_ii, input_arr, train_input_mask, train_input_mask),
        ("ii", "test", evaluate_ii, input_arr, test_input_mask, train_input_mask),
    )

    # One bucket per (mode, split, metric). Dict insertion order fixes the
    # order of the returned frames.
    collected = {
        (mode, split, metric): []
        for mode, split, *_ in scenarios
        for metric, _ in _METRIC_POSITIONS
    }

    # k stays the outer loop so that each bucket gets exactly one row per k,
    # in the order of topk_values (prepare_df relies on that alignment).
    for topk in topk_values:
        for mode, split, evaluate_fn, features, query_mask, reference_mask in scenarios:
            outcome = evaluate_fn(
                features,
                rank_arr=rank_arr,
                regret_arr=regret_arr,
                n_neighbors=topk,
                n_recs=topr_values,
                query_mask=torch.from_numpy(query_mask),
                reference_mask=torch.from_numpy(reference_mask),
            )
            for metric, position in _METRIC_POSITIONS:
                collected[(mode, split, metric)].append(outcome[position].numpy())

    return [
        prepare_df(
            rows,
            topr_values,
            topk_values,
            {"metric": metric, "mode": mode, "split": split},
        )
        for (mode, split, metric), rows in collected.items()
    ]


dfs = []
for fold in split_data_cv(perf_matrix, random_state=random_seed):
    dfs.extend(evaluate_fold(fold))

# Average every (mode, split, metric, k) cell over the folds
full_df = pd.concat(dfs)
full_df.groupby(["mode", "split", "metric", "k"]).mean()

Training data: 63.64%
Both new: 4.09%
Config new: 16.31%
Input new: 15.96%
Training data: 64.04%
Both new: 3.99%
Config new: 15.91%
Input new: 16.06%
Training data: 64.10%
Both new: 3.97%
Config new: 15.93%
Input new: 16.00%
Training data: 64.10%
Both new: 3.97%
Config new: 15.93%
Input new: 16.00%
Training data: 64.10%
Both new: 3.97%
Config new: 15.93%
Input new: 16.00%


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,r,r,r,r,r
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,1,3,5,15,25
mode,split,metric,k,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
cc,test,rank,1,0.116415,0.11729,0.117167,0.116163,0.115278
cc,test,rank,3,0.11397,0.114951,0.114819,0.114587,0.114102
cc,test,rank,5,0.113905,0.114558,0.114533,0.113986,0.113526
cc,test,rank,15,0.114405,0.114948,0.115084,0.114135,0.11352
cc,test,rank,25,0.113169,0.11365,0.113676,0.112941,0.112412
cc,test,ratio,1,,,,,
cc,test,ratio,3,0.211524,0.27561,0.295427,0.368496,0.398429
cc,test,ratio,5,0.191555,0.269238,0.291262,0.368943,0.409874
cc,test,ratio,15,0.257247,0.341307,0.372592,0.448832,0.498616
cc,test,ratio,25,0.264182,0.364316,0.39738,0.486665,0.54454


In [43]:
# Persist the fold-averaged results, one row per (mode, split, metric, k).
# To paste the table elsewhere instead, call `cv_summary.to_clipboard()`.
summary_keys = ["mode", "split", "metric", "k"]
cv_summary = full_df.groupby(summary_keys).mean()
cv_summary.to_csv("knn_config_recommendation.csv")