# Experiments

In [6]:
# IMPORT CODE 
import os 
import cv2
import numpy as np
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import *
from sklearn.model_selection import RandomizedSearchCV
from IPython import display
from PIL import Image


def cv2_imshow(a):
  """A replacement for cv2.imshow() for use in Jupyter notebooks.
  Args:
    a : np.ndarray. shape (N, M) or (N, M, 1) is an NxM grayscale image. shape
      (N, M, 3) is an NxM BGR color image. shape (N, M, 4) is an NxM BGRA color
      image.
  """
  a = a.clip(0, 255).astype('uint8')
  # cv2 stores colors as BGR; convert to RGB
  if a.ndim == 3:
    if a.shape[2] == 4:
      a = cv2.cvtColor(a, cv2.COLOR_BGRA2RGBA)
    else:
      a = cv2.cvtColor(a, cv2.COLOR_BGR2RGB)
  display.display(Image.fromarray(a))

def load_data(location, labels, metric):
    fv_list = os.listdir(location)
    fv_nb = len(fv_list)

    with open(location + fv_list[0], "rb") as handle: 
        fv0 = pickle.load(handle)

    with open(labels, "rb") as handle: 
        labels = pickle.load(handle)

    print("Number of vectors: " + str(fv_nb) + " of length " + str(len(fv0)))

    data_x = np.zeros(shape=(fv_nb, len(fv0)))
    data_y = np.zeros(shape=(fv_nb,))

    for i, fv_filename in enumerate(fv_list): 
        with open(location + fv_filename, "rb") as handle: 
            fv = pickle.load(handle)
        
        fid = fv_filename.split(".")[0]

        data_y[i] = 0
        if metric in labels[fid]: 
            data_y[i] = labels[fid][metric] 

        data_x[i] = fv

    return data_x, data_y, fv_list


In [9]:
DATA_FOLDER = "D:\\NIST datasets\\feature_vector_dataset\\"

LABELS = DATA_FOLDER + "qualities_v2.pkl"
metric = "lqm"

train_x, train_y, train_names = load_data(DATA_FOLDER + "train/", LABELS, metric)
test_x, test_y, test_names = load_data(DATA_FOLDER + "test/", LABELS, metric)

Number of vectors: 12000 of length 192
Number of vectors: 1200 of length 192


In [4]:

n_estimators = [int(x) for x in np.linspace(start=50, stop=1050, num = 11)]
# Number of features to consider at every split
max_features = ['auto']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 150, num = 15)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2]#, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1]#, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True ]#, False]# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'verbose': [True],
               'n_jobs': [2]}

print(random_grid)

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=100, cv=3, verbose=5, random_state=42, n_jobs = -1)# Fit the random search model
rf_random.fit(train_x, train_y)

print(rf_random.best_params_)


{'n_estimators': [50, 150, 250, 350, 450, 550, 650, 750, 850, 950, 1050], 'max_features': ['auto'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, None], 'min_samples_split': [2], 'min_samples_leaf': [1], 'bootstrap': [True], 'verbose': [True], 'n_jobs': [2]}
Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   11.5s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   48.7s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:  1.8min
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:  3.3min


{'verbose': True, 'n_jobs': 2, 'n_estimators': 1050, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 140, 'bootstrap': True}


[Parallel(n_jobs=2)]: Done 1050 out of 1050 | elapsed:  4.3min finished


In [10]:

regr = RandomForestRegressor(n_estimators=750, max_depth=150, min_samples_split=2, min_samples_leaf=1, random_state=0, verbose=True, n_jobs=8)

regr.fit(train_x, train_y)

train_predictions = np.clip(regr.predict(train_x), 0, 100)
predictions = np.clip(regr.predict(test_x), 0, 100)

print("train MSE", mean_squared_error(train_y, train_predictions))
print("test MSE", mean_squared_error(test_y, predictions))

print("train MAE", mean_absolute_error(train_y, train_predictions))
print("test MAE", mean_absolute_error(test_y, predictions))

print("train R2", r2_score(train_y, train_predictions))
print("test R2", r2_score(test_y, predictions))


import pickle
with open("rf_model_lqm.pkl", "wb") as handle: 
    pickle.dump(regr, handle)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    3.8s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   19.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:   44.3s
[Parallel(n_jobs=8)]: Done 750 out of 750 | elapsed:  1.3min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 750 out of 750 | elapsed:    0.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 750 out of 750 | elapsed:    0.0s finished


train MSE 13.782119585925926
test MSE 181.76113516296294
train MAE 2.366202111111111
test MAE 9.68543
train R2 0.9894983579184549
test R2 0.7219772113601342
