In [1]:
import pickle
from pathlib import Path

import numpy as np
import yaml
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsRegressor
from yaml import Loader

In [2]:
# Locations of the dataset splits.
# Edit these three paths to point at wherever the dataset lives.

train_path, test_path, val_path = (
    'split_set/train',
    'split_set/test',
    'split_set/val',
)

# Layer-type names counted in each model description string to build the
# bag-of-words architecture features (one count per entry, in this order).
architecture_vocab = [
    'Conv2d',
    'Linear',
    'MaxPool2d',
    'BatchNorm2d',
    'Dropout2d',
    'ReLU',
    'SELU',
    'LeakyReLU',
    'Flatten',
    'Tanh',
    'BatchNorm1d',
    'Dropout',
    'Softmax',
]

# Optimizer names; not needed as a feature because all models use Adam.
optimizer_vocab = [
    'SGD',
    'Adam',
    'Adadelta',
    'Adagrad',
]

In [41]:
# load training set
#
# Reads every meta_data.yml found (recursively) under train_path and builds:
#   train_data : object array of per-model dicts (raw metrics + feature vector)
#   train_X    : float matrix, one row per model
#                (layer-name counts followed by train_loss and train_error)
#   train_y    : float vector of the corresponding test_error values

# Path.glob yields entries in an arbitrary, filesystem-dependent order.
# Sorting makes the row order of train_X / train_y reproducible across runs
# and machines (this matters for order-sensitive steps such as KNN tie-breaking).
train_files = sorted(Path(train_path).glob('**/meta_data.yml'))
if not train_files:
    # Fail here with a clear message instead of letting an empty array reach
    # the model-fitting cell and fail there with an unrelated-looking error.
    raise FileNotFoundError(
        "no meta_data.yml files found under {!r}; check train_path".format(train_path)
    )

train_data = []

for filename in train_files:
    with open(filename, 'r') as f:
        # NOTE(security): yaml.Loader can construct arbitrary Python objects,
        # so only load metadata files from a trusted source. If the files hold
        # plain scalars/strings only, yaml.SafeLoader would be the safer choice
        # -- TODO confirm against the dataset before switching.
        yamldata = yaml.load(f, Loader=Loader)

    data_item = {
        'mdl_str': yamldata['arch_and_hp'],
        'train_loss': yamldata['train_loss'],
        'train_error': yamldata['train_error'],
        'val_loss': yamldata['val_loss'],
        'val_error': yamldata['val_error'],
        'test_loss': yamldata['test_loss'],
        'test_error': yamldata['test_error'],
    }

    # Model architecture as a bag of words: number of occurrences of each
    # vocabulary entry in the description string, plus the final training
    # loss and error.
    # NOTE(review): str.count matches substrings, so 'ReLU' also counts any
    # 'LeakyReLU' and 'Dropout' also counts any 'Dropout2d'. The other loading
    # cells use the same encoding, so it is left unchanged here to keep the
    # train/test/val features consistent.
    data_item['mdl_vect'] = [data_item['mdl_str'].count(vocab) for vocab in architecture_vocab]
    data_item['mdl_vect'] += [data_item['train_loss'], data_item['train_error']]

    train_data.append(data_item)

train_data = np.array(train_data)

train_X = np.array([item['mdl_vect'] for item in train_data], dtype=float)
train_y = np.array([item['test_error'] for item in train_data], dtype=float)

In [42]:
# load test set
#
# Reads every meta_data.yml found (recursively) under test_path and builds:
#   test_data : object array of per-model dicts (raw metrics + feature vector)
#   test_X    : float matrix, one row per model
#               (layer-name counts followed by train_loss and train_error)
#   test_y    : float vector of the corresponding test_error values

# Path.glob yields entries in an arbitrary, filesystem-dependent order.
# Sorting keeps the row order of test_X / test_y reproducible across runs
# and machines.
test_files = sorted(Path(test_path).glob('**/meta_data.yml'))
if not test_files:
    # Fail here with a clear message instead of letting an empty array reach
    # the prediction cell and fail there with an unrelated-looking error.
    raise FileNotFoundError(
        "no meta_data.yml files found under {!r}; check test_path".format(test_path)
    )

test_data = []

for filename in test_files:
    with open(filename, 'r') as f:
        # NOTE(security): yaml.Loader can construct arbitrary Python objects,
        # so only load metadata files from a trusted source. If the files hold
        # plain scalars/strings only, yaml.SafeLoader would be the safer choice
        # -- TODO confirm against the dataset before switching.
        yamldata = yaml.load(f, Loader=Loader)

    data_item = {
        'mdl_str': yamldata['arch_and_hp'],
        'train_loss': yamldata['train_loss'],
        'train_error': yamldata['train_error'],
        'val_loss': yamldata['val_loss'],
        'val_error': yamldata['val_error'],
        'test_loss': yamldata['test_loss'],
        'test_error': yamldata['test_error'],
    }

    # Model architecture as a bag of words: number of occurrences of each
    # vocabulary entry in the description string, plus the final training
    # loss and error.
    # NOTE(review): str.count matches substrings, so 'ReLU' also counts any
    # 'LeakyReLU' and 'Dropout' also counts any 'Dropout2d'. The other loading
    # cells use the same encoding, so it is left unchanged here to keep the
    # train/test/val features consistent.
    data_item['mdl_vect'] = [data_item['mdl_str'].count(vocab) for vocab in architecture_vocab]
    data_item['mdl_vect'] += [data_item['train_loss'], data_item['train_error']]

    test_data.append(data_item)

test_data = np.array(test_data)

test_X = np.array([item['mdl_vect'] for item in test_data], dtype=float)
test_y = np.array([item['test_error'] for item in test_data], dtype=float)

In [43]:
# load validation set
#
# Reads every meta_data.yml found (recursively) under val_path and builds:
#   val_data : object array of per-model dicts (raw metrics + feature vector)
#   val_X    : float matrix, one row per model
#              (layer-name counts followed by train_loss and train_error)
#   val_y    : float vector of the corresponding test_error values
#              (the target is test_error, exactly as for the other splits)

# Path.glob yields entries in an arbitrary, filesystem-dependent order.
# Sorting keeps the row order of val_X / val_y reproducible across runs
# and machines.
val_files = sorted(Path(val_path).glob('**/meta_data.yml'))

val_data = []

for filename in val_files:
    with open(filename, 'r') as f:
        # NOTE(security): yaml.Loader can construct arbitrary Python objects,
        # so only load metadata files from a trusted source. If the files hold
        # plain scalars/strings only, yaml.SafeLoader would be the safer choice
        # -- TODO confirm against the dataset before switching.
        yamldata = yaml.load(f, Loader=Loader)

    data_item = {
        'mdl_str': yamldata['arch_and_hp'],
        'train_loss': yamldata['train_loss'],
        'train_error': yamldata['train_error'],
        'val_loss': yamldata['val_loss'],
        'val_error': yamldata['val_error'],
        'test_loss': yamldata['test_loss'],
        'test_error': yamldata['test_error'],
    }

    # Model architecture as a bag of words: number of occurrences of each
    # vocabulary entry in the description string, plus the final training
    # loss and error.
    # NOTE(review): str.count matches substrings, so 'ReLU' also counts any
    # 'LeakyReLU' and 'Dropout' also counts any 'Dropout2d'. The other loading
    # cells use the same encoding, so it is left unchanged here to keep the
    # train/test/val features consistent.
    data_item['mdl_vect'] = [data_item['mdl_str'].count(vocab) for vocab in architecture_vocab]
    data_item['mdl_vect'] += [data_item['train_loss'], data_item['train_error']]

    val_data.append(data_item)

val_data = np.array(val_data)

val_X = np.array([item['mdl_vect'] for item in val_data], dtype=float)
val_y = np.array([item['test_error'] for item in val_data], dtype=float)

In [44]:
# Fit the three baseline regressors on the training features.
# Each estimator is built first and fitted in a second step; the original
# one-liners bound the result of .fit(), which scikit-learn documents as the
# estimator itself, so reg / ridge / knn end up as the same fitted objects.
reg = LinearRegression()
reg.fit(train_X, train_y)

ridge = Ridge(alpha=1.0)
ridge.fit(train_X, train_y)

knn = KNeighborsRegressor(n_neighbors=15)
knn.fit(train_X, train_y)

In [45]:
# Evaluate the fitted regressors on the test split: sum of squared errors,
# mean squared error (SSE divided by the number of test rows) and R2.
n_test_rows = test_X.shape[0]
model_names = "Linear Regression, Ridge Regression, KNN"

# Predictions
test_y_reg = reg.predict(test_X)
test_y_ridge = ridge.predict(test_X)
test_y_knn = knn.predict(test_X)

# Sum of squared errors
reg_sse_err = np.sum((test_y_reg - test_y) ** 2)
ridge_sse_err = np.sum((test_y_ridge - test_y) ** 2)
knn_sse_err = np.sum((test_y_knn - test_y) ** 2)

# Mean squared errors
reg_mse_err = reg_sse_err / n_test_rows
ridge_mse_err = ridge_sse_err / n_test_rows
knn_mse_err = knn_sse_err / n_test_rows

# Coefficient of determination
reg_r2 = r2_score(test_y, test_y_reg)
ridge_r2 = r2_score(test_y, test_y_ridge)
knn_r2 = r2_score(test_y, test_y_knn)

# Report, one metric per paragraph, columns in the order given by model_names.
print("Sum of Squared Errors for:")
print(model_names)
print(reg_sse_err, ridge_sse_err, knn_sse_err)
print()
print("Mean Squared Errors for:")
print(model_names)
print(reg_mse_err, ridge_mse_err, knn_mse_err)
print()
print("R2 Scores for :")
print(model_names)
print(reg_r2, ridge_r2, knn_r2)

Sum of Squared Errors for:
Linear Regression, Ridge Regression, KNN
1.0844433859842815 1.084922497459192 1.4140649898952904

Mean Squared Errors for:
Linear Regression, Ridge Regression, KNN
0.004499765087071708 0.0045017531014904235 0.005867489584627761

R2 Scores for :
Linear Regression, Ridge Regression, KNN
0.6195368262569537 0.619368736087677 0.5038932775211846


### Using Pickled data

In [46]:
# Load the pre-built dataset from data.p (pickle is imported with the other
# modules in the first cell of the notebook).
# NOTE(security): pickle.load can execute arbitrary code embedded in the file,
# so only open a data.p that comes from a trusted source.
with open('data.p', 'rb') as f:
    data = pickle.load(f)

In [48]:
# Rebuild the train / test / val matrices from the pickled records.
# Each feature row is: layer-name counts, final train loss and error, then
# the first (up to) 10 entries of each recorded history listed below.
# The target is the model's test error.
train_X, train_y = [], []
test_X, test_y = [], []
val_X, val_y = [], []

# Histories appended to the feature row, in this order.
history_keys = (
    'train_accs',
    'train_errors',
    'train_losses',
    'val_accs',
    'val_errors',
    'val_losses',
)

# (substring looked for in the record's filename, feature rows, targets).
# The checks are independent: a filename containing more than one of these
# substrings is added to every matching split.
split_buckets = (
    ('train', train_X, train_y),
    ('test', test_X, test_y),
    ('val', val_X, val_y),
)

for data_item in data:
    meta = data_item['meta_data']
    feature_row = [meta['arch_and_hp'].count(vocab) for vocab in architecture_vocab]
    feature_row.extend([meta['train_loss'], meta['train_error']])

    histories = data_item['other_path']
    for history_key in history_keys:
        # Slicing keeps at most 10 values; a shorter history contributes
        # fewer columns to this row.
        feature_row.extend(histories[history_key][:10])

    # TODO: the per-model 'param_stats' (init/final params mu, std and l2)
    # are not used yet -- their length differs from model to model, so they
    # need to be brought to a fixed size before they can be appended.

    target = data_item['meta_data']['test_error']

    for split_name, feature_rows, targets in split_buckets:
        if split_name in data_item['filename']:
            feature_rows.append(feature_row)
            targets.append(target)

train_X = np.array(train_X, dtype=float)
train_y = np.array(train_y, dtype=float)

test_X = np.array(test_X, dtype=float)
test_y = np.array(test_y, dtype=float)

val_X = np.array(val_X, dtype=float)
val_y = np.array(val_y, dtype=float)

In [49]:
# Refit the three baseline regressors on the extended (pickled-data) features.
# Each estimator is built first and fitted in a second step; the original
# one-liners bound the result of .fit(), which scikit-learn documents as the
# estimator itself, so reg / ridge / knn end up as the same fitted objects.
reg = LinearRegression()
reg.fit(train_X, train_y)

ridge = Ridge(alpha=1.0)
ridge.fit(train_X, train_y)

knn = KNeighborsRegressor(n_neighbors=12)
knn.fit(train_X, train_y)

In [50]:
# Evaluate the refitted regressors on the test split: sum of squared errors,
# mean squared error (SSE divided by the number of test rows) and R2.
n_test_rows = test_X.shape[0]
model_names = "Linear Regression, Ridge Regression, KNN"

# Predictions
test_y_reg = reg.predict(test_X)
test_y_ridge = ridge.predict(test_X)
test_y_knn = knn.predict(test_X)

# Sum of squared errors
reg_sse_err = np.sum((test_y_reg - test_y) ** 2)
ridge_sse_err = np.sum((test_y_ridge - test_y) ** 2)
knn_sse_err = np.sum((test_y_knn - test_y) ** 2)

# Mean squared errors
reg_mse_err = reg_sse_err / n_test_rows
ridge_mse_err = ridge_sse_err / n_test_rows
knn_mse_err = knn_sse_err / n_test_rows

# Coefficient of determination
reg_r2 = r2_score(test_y, test_y_reg)
ridge_r2 = r2_score(test_y, test_y_ridge)
knn_r2 = r2_score(test_y, test_y_knn)

# Report, one metric per paragraph, columns in the order given by model_names.
print("Sum of Squared Errors for:")
print(model_names)
print(reg_sse_err, ridge_sse_err, knn_sse_err)
print()
print("Mean Squared Errors for:")
print(model_names)
print(reg_mse_err, ridge_mse_err, knn_mse_err)
print()
print("R2 Scores for :")
print(model_names)
print(reg_r2, ridge_r2, knn_r2)

Sum of Squared Errors for:
Linear Regression, Ridge Regression, KNN
0.45348196007311337 0.5018549249432513 0.9924573495652939

Mean Squared Errors for:
Linear Regression, Ridge Regression, KNN
0.0018816678841208025 0.0020823855806773916 0.004118080288652672

R2 Scores for :
Linear Regression, Ridge Regression, KNN
0.8409016201357192 0.8239305804523988 0.6518089575718093
