In [1]:
import pandas as pd
from nebula.data.yg_ar.setup_data_image_hard import read_data
from nebula.common import to_scale_one, write_pickle, read_pickle
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
import os
import os.path as osp
import numpy as np

In [2]:
def evaluate(model, test_x, test_y):
    res = model.predict(test_x)
    correct = res == test_y
    accuracy = correct.sum() / len(res)
    return res, accuracy


def create_dirs_to_file(path):
    dirs = "/".join(osp.join(path).split("/")[:-1])
    if not osp.exists(dirs):
        os.makedirs(dirs)


def load_or_train(train_x, train_y, test_x, test_y, train_func, label_map, path):
    
    if osp.exists(path):
        return read_pickle(path)
    
    create_dirs_to_file(path)
    
    trained_model = train_func(train_x, train_y)
    predictions, accuracy = evaluate(trained_model, test_x, test_y)
    
    df, df_incorrect, df_correct = format_results(predictions, test_y, label_map)
    
    write_pickle(path, (trained_model, predictions, accuracy, df, df_incorrect, df_correct, label_map)) 
    
    return trained_model, predictions, accuracy, df, df_incorrect, df_correct, label_map


def format_results(predictions, labels, label_map):
    df = pd.DataFrame(
        data={
            "prediction": predictions,
            "label": labels
        }
    )
    df["check"] = df["prediction"] == df["label"]

    label_map_reverse = {v:k for k, v in label_map.items()}

    df["prediction_name"] = df.prediction.map(label_map_reverse)
    df["label_name"] = df.label.map(label_map_reverse)

    df_incorrect = df[~df.check]
    df_correct = df[df.check]

    return df, df_incorrect, df_correct

In [3]:
def create_label_map(labels):
    label_set = set()
    for lt in labels:
        label_set.add(lt)
        
    label_set = list(label_set)
    label_set.sort()

    label_map = {}
    count = 0
    for l in label_set:
        label_map[l] = count
        count += 1
        
    return label_map

In [4]:
# create data sets
df_path_easy = '/home/ubuntu/data/yg_ar/image_easy2_ds_sift_amiya.pkl'
df_easy, train_df_easy, test_df_easy, valid_df_easy = read_pickle(df_path_easy)

df_path_medium = '/home/ubuntu/data/yg_ar/image_medium_ds_sift_amiya.pkl'
df_medium, train_df_medium, test_df_medium, valid_df_medium = read_pickle(df_path_medium)
                                                                  
df_path_hard = '/home/ubuntu/data/yg_ar/image_hard_ds_sift_amiya.pkl'
df_hard, train_df_hard, test_df_hard, valid_df_hard = read_pickle(df_path_hard)

In [5]:
# create labels
label_map_a_easy = create_label_map(df_easy["label_a"])
label_map_at_easy = create_label_map(df_easy["label_at"])

label_map_a_medium = create_label_map(df_medium["label_a"])
label_map_at_medium = create_label_map(df_medium["label_at"])

label_map_a_hard = create_label_map(df_hard["label_a"])
label_map_at_hard = create_label_map(df_hard["label_at"])

In [6]:
train_x_easy = train_df_easy["image"].to_list()
train_x_medium = train_df_medium["image"].to_list()
train_x_hard = train_df_hard["image"].to_list()

In [7]:
train_y_a_easy = train_df_easy["label_a"].map(label_map_a_easy).to_list()
train_y_at_easy = train_df_easy["label_at"].map(label_map_at_easy).to_list()

train_y_a_medium = train_df_medium["label_a"].map(label_map_a_medium).to_list()
train_y_at_medium = train_df_medium["label_at"].map(label_map_at_medium).to_list()

train_y_a_hard = train_df_hard["label_a"].map(label_map_a_medium).to_list()
train_y_at_hard = train_df_hard["label_at"].map(label_map_at_medium).to_list()

In [8]:
test_x_easy = test_df_easy["image"].to_list()
test_x_medium = test_df_medium["image"].to_list()
test_x_hard = test_df_hard["image"].to_list()

In [9]:
test_y_a_easy = test_df_easy["label_a"].map(label_map_a_easy).to_list()
test_y_at_easy = test_df_easy["label_at"].map(label_map_at_easy).to_list()

test_y_a_medium = test_df_medium["label_a"].map(label_map_a_medium).to_list()
test_y_at_medium = test_df_medium["label_at"].map(label_map_at_medium).to_list()

test_y_a_hard = test_df_hard["label_a"].map(label_map_a_hard).to_list()
test_y_at_hard = test_df_hard["label_at"].map(label_map_at_hard).to_list()

In [10]:
valid_x_easy = valid_df_easy["image"].to_list()
valid_x_medium = valid_df_medium["image"].to_list()
valid_x_hard = valid_df_hard["image"].to_list()

In [11]:
valid_y_a_easy = valid_df_easy["label_a"].map(label_map_a_easy).to_list()
valid_y_at_easy = valid_df_easy["label_at"].map(label_map_at_easy).to_list()

valid_y_a_medium = valid_df_medium["label_a"].map(label_map_a_medium).to_list()
valid_y_at_medium = valid_df_medium["label_at"].map(label_map_at_medium).to_list()

valid_y_a_hard = valid_df_hard["label_a"].map(label_map_a_hard).to_list()
valid_y_at_hard = valid_df_hard["label_at"].map(label_map_at_hard).to_list()

In [20]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "loss":["log_loss"],
    "learning_rate": [0.1],
    "min_samples_split": [0.1, 0.5],
    "min_samples_leaf": [0.1, 0.5], # 2^15 / 2000
    "max_depth":[3, 5], # up to 15
    "max_features":["log2","sqrt"],
    "criterion": ["friedman_mse"],
    "subsample":[0.5],
    "n_estimators":[10, 50] # 
}


grid_model = GridSearchCV(
    estimator=GradientBoostingClassifier(),
    param_grid=param_grid,
    refit=True,
    verbose=3,
    cv=2,
)

grid_model.fit(valid_x_hard, valid_y_at_hard)

print("best score: ", grid_model.best_score_)
print("best_params: ", grid_model.best_params_)

Fitting 2 folds for each of 32 candidates, totalling 64 fits
[CV 1/2] END criterion=friedman_mse, learning_rate=0.1, loss=log_loss, max_depth=3, max_features=log2, min_samples_leaf=0.1, min_samples_split=0.1, n_estimators=10, subsample=0.5;, score=0.058 total time=   0.5s
[CV 2/2] END criterion=friedman_mse, learning_rate=0.1, loss=log_loss, max_depth=3, max_features=log2, min_samples_leaf=0.1, min_samples_split=0.1, n_estimators=10, subsample=0.5;, score=0.043 total time=   0.5s
[CV 1/2] END criterion=friedman_mse, learning_rate=0.1, loss=log_loss, max_depth=3, max_features=log2, min_samples_leaf=0.1, min_samples_split=0.1, n_estimators=50, subsample=0.5;, score=0.066 total time=   2.4s
[CV 2/2] END criterion=friedman_mse, learning_rate=0.1, loss=log_loss, max_depth=3, max_features=log2, min_samples_leaf=0.1, min_samples_split=0.1, n_estimators=50, subsample=0.5;, score=0.058 total time=   2.4s
[CV 1/2] END criterion=friedman_mse, learning_rate=0.1, loss=log_loss, max_depth=3, max_fea

In [21]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "loss":["log_loss"],
    "learning_rate": [0.1, 0.3],
    "min_samples_split": [0.05, 0.1, 0.2],
    "min_samples_leaf": [0.05, 0.1, 0.2],
    "max_depth":[2, 3, 4],
    "max_features":["sqrt"],
    "criterion": ["friedman_mse"],
    "subsample":[0.5],
    "n_estimators":[50, 80]
}


grid_model = GridSearchCV(
    estimator=GradientBoostingClassifier(),
    param_grid=param_grid,
    refit=True,
    verbose=3,
    cv=2,
)

grid_model.fit(valid_x_hard, valid_y_at_hard)

print("best score: ", grid_model.best_score_)
print("best_params: ", grid_model.best_params_)

Fitting 2 folds for each of 108 candidates, totalling 216 fits
[CV 1/2] END criterion=friedman_mse, learning_rate=0.1, loss=log_loss, max_depth=2, max_features=sqrt, min_samples_leaf=0.05, min_samples_split=0.05, n_estimators=50, subsample=0.5;, score=0.064 total time=   2.4s
[CV 2/2] END criterion=friedman_mse, learning_rate=0.1, loss=log_loss, max_depth=2, max_features=sqrt, min_samples_leaf=0.05, min_samples_split=0.05, n_estimators=50, subsample=0.5;, score=0.077 total time=   2.4s
[CV 1/2] END criterion=friedman_mse, learning_rate=0.1, loss=log_loss, max_depth=2, max_features=sqrt, min_samples_leaf=0.05, min_samples_split=0.05, n_estimators=80, subsample=0.5;, score=0.067 total time=   4.0s
[CV 2/2] END criterion=friedman_mse, learning_rate=0.1, loss=log_loss, max_depth=2, max_features=sqrt, min_samples_leaf=0.05, min_samples_split=0.05, n_estimators=80, subsample=0.5;, score=0.068 total time=   3.9s
[CV 1/2] END criterion=friedman_mse, learning_rate=0.1, loss=log_loss, max_depth=

In [13]:
def train_gbt_best(data_x, data_y):
    clf = GradientBoostingClassifier(
        criterion="friedman_mse",
        loss="log_loss",
        max_features="sqrt",
        min_samples_leaf=0.1,
        min_samples_split=0.05,
        subsample=0.5,
        n_estimators=50,
        learning_rate=0.1,
        max_depth=2,
        random_state=0,
        verbose=1,
    )
    clf.fit(data_x, data_y)
    return clf

In [26]:
save_path = "/home/ubuntu/data/yg_ar/classic_models_hard_correctSIFT/gbt_best_at.pkl"
(
    trained_svm_at, 
    predictions_svm_at, 
    accuracy_svm_at, 
    df_svm_at,
    df_incorrect_svm_at, 
    df_correct_svm_at,
    label_map_svm_at
)= load_or_train(
    train_x_hard, 
    train_y_at_hard, 
    test_x_hard, 
    test_y_at_hard, 
    train_gbt_best, 
    label_map_at_hard, 
    save_path
)
print(accuracy_svm_at)
print(df_incorrect_svm_at.head())

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           3.6756           0.0101           27.69s
         2           3.6640           0.0083           27.16s
         3           3.6521           0.0106           26.89s
         4           3.6397           0.0096           26.36s
         5           3.6283           0.0092           25.93s
         6           3.6178           0.0089           25.19s
         7           3.6050           0.0073           24.58s
         8           3.5982           0.0069           23.98s
         9           3.5895           0.0070           23.34s
        10           3.5777           0.0080           22.70s
        20           3.4956           0.0054           17.06s
        30           3.4267           0.0040           11.39s
        40           3.3722           0.0027            5.68s
        50           3.3347           0.0018            0.00s
0.10185185185185185
   prediction  label  check prediction_name      

In [16]:
save_path = "/home/ubuntu/data/yg_ar/classic_models_hard_correctSIFT/gbt_best_a.pkl"
(
    trained_svm_a, 
    predictions_svm_a, 
    accuracy_svm_a, 
    df_svm_a, 
    df_incorrect_svm_a, 
    df_correct_svm_a,
    label_map_svm_a
)= load_or_train(
    train_x_hard, 
    train_y_a_hard, 
    test_x_hard, 
    test_y_a_hard, 
    train_gbt_best, 
    label_map_a_hard, 
    save_path
)
print(accuracy_svm_a)
print(df_incorrect_svm_a.head())

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           2.2911           0.0101            5.75s
         2           2.2826           0.0085            5.08s
         3           2.2758           0.0065            4.66s
         4           2.2701           0.0056            4.11s
         5           2.2634           0.0056            4.05s
         6           2.2556           0.0073            3.98s
         7           2.2477           0.0073            3.91s
         8           2.2401           0.0069            3.83s
         9           2.2351           0.0056            3.74s
        10           2.2283           0.0052            3.66s
        20           2.1714           0.0054            2.36s
        30           2.1279           0.0040            1.68s
        40           2.0940           0.0021            0.95s
        50           2.0578           0.0025            0.00s
0.28055555555555556
   prediction  label  check prediction_name      