In [1]:
import pandas as pd
from nebula.data.yg_ar.setup_data_image_hard import read_data
from nebula.common import to_scale_one, write_pickle, read_pickle
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
import os
import os.path as osp
import numpy as np

In [2]:
def evaluate(model, test_x, test_y):
    res = model.predict(test_x)
    correct = res == test_y
    accuracy = correct.sum() / len(res)
    return res, accuracy


def create_dirs_to_file(path):
    dirs = "/".join(osp.join(path).split("/")[:-1])
    if not osp.exists(dirs):
        os.makedirs(dirs)


def load_or_train(train_x, train_y, test_x, test_y, train_func, label_map, path):
    
    if osp.exists(path):
        return read_pickle(path)
    
    create_dirs_to_file(path)
    
    trained_model = train_func(train_x, train_y)
    predictions, accuracy = evaluate(trained_model, test_x, test_y)
    
    df, df_incorrect, df_correct = format_results(predictions, test_y, label_map)
    
    write_pickle(path, (trained_model, predictions, accuracy, df, df_incorrect, df_correct, label_map)) 
    
    return trained_model, predictions, accuracy, df, df_incorrect, df_correct, label_map


def format_results(predictions, labels, label_map):
    df = pd.DataFrame(
        data={
            "prediction": predictions,
            "label": labels
        }
    )
    df["check"] = df["prediction"] == df["label"]

    label_map_reverse = {v:k for k, v in label_map.items()}

    df["prediction_name"] = df.prediction.map(label_map_reverse)
    df["label_name"] = df.label.map(label_map_reverse)

    df_incorrect = df[~df.check]
    df_correct = df[df.check]

    return df, df_incorrect, df_correct

In [3]:
def create_label_map(labels):
    label_set = set()
    for lt in labels:
        label_set.add(lt)
        
    label_set = list(label_set)
    label_set.sort()

    label_map = {}
    count = 0
    for l in label_set:
        label_map[l] = count
        count += 1
        
    return label_map

In [4]:
# create data sets
df_path_easy = '/home/ubuntu/data/yg_ar/image_easy2_ds_sift_amiya.pkl'
df_easy, train_df_easy, test_df_easy, valid_df_easy = read_pickle(df_path_easy)

df_path_medium = '/home/ubuntu/data/yg_ar/image_medium_ds_sift_amiya.pkl'
df_medium, train_df_medium, test_df_medium, valid_df_medium = read_pickle(df_path_medium)
                                                                  
df_path_hard = '/home/ubuntu/data/yg_ar/image_hard_ds_sift_amiya.pkl'
df_hard, train_df_hard, test_df_hard, valid_df_hard = read_pickle(df_path_hard)

In [5]:
# create labels
label_map_a_easy = create_label_map(df_easy["label_a"])
label_map_at_easy = create_label_map(df_easy["label_at"])

label_map_a_medium = create_label_map(df_medium["label_a"])
label_map_at_medium = create_label_map(df_medium["label_at"])

label_map_a_hard = create_label_map(df_hard["label_a"])
label_map_at_hard = create_label_map(df_hard["label_at"])

In [6]:
train_x_easy = train_df_easy["image"].to_list()
train_x_medium = train_df_medium["image"].to_list()
train_x_hard = train_df_hard["image"].to_list()

In [7]:
train_y_a_easy = train_df_easy["label_a"].map(label_map_a_easy).to_list()
train_y_at_easy = train_df_easy["label_at"].map(label_map_at_easy).to_list()

train_y_a_medium = train_df_medium["label_a"].map(label_map_a_medium).to_list()
train_y_at_medium = train_df_medium["label_at"].map(label_map_at_medium).to_list()

train_y_a_hard = train_df_hard["label_a"].map(label_map_a_medium).to_list()
train_y_at_hard = train_df_hard["label_at"].map(label_map_at_medium).to_list()

In [8]:
test_x_easy = test_df_easy["image"].to_list()
test_x_medium = test_df_medium["image"].to_list()
test_x_hard = test_df_hard["image"].to_list()

In [9]:
test_y_a_easy = test_df_easy["label_a"].map(label_map_a_easy).to_list()
test_y_at_easy = test_df_easy["label_at"].map(label_map_at_easy).to_list()

test_y_a_medium = test_df_medium["label_a"].map(label_map_a_medium).to_list()
test_y_at_medium = test_df_medium["label_at"].map(label_map_at_medium).to_list()

test_y_a_hard = test_df_hard["label_a"].map(label_map_a_hard).to_list()
test_y_at_hard = test_df_hard["label_at"].map(label_map_at_hard).to_list()

In [10]:
valid_x_easy = valid_df_easy["image"].to_list()
valid_x_medium = valid_df_medium["image"].to_list()
valid_x_hard = valid_df_hard["image"].to_list()

In [11]:
valid_y_a_easy = valid_df_easy["label_a"].map(label_map_a_easy).to_list()
valid_y_at_easy = valid_df_easy["label_at"].map(label_map_at_easy).to_list()

valid_y_a_medium = valid_df_medium["label_a"].map(label_map_a_medium).to_list()
valid_y_at_medium = valid_df_medium["label_at"].map(label_map_at_medium).to_list()

valid_y_a_hard = valid_df_hard["label_a"].map(label_map_a_hard).to_list()
valid_y_at_hard = valid_df_hard["label_at"].map(label_map_at_hard).to_list()

In [13]:
from sklearn.model_selection import GridSearchCV

param_grid = dict(
    solver=['newton-cg', 'lbfgs', 'liblinear'],
    penalty=['l1', 'l2', 'elasticnet'],
    C=[100, 10, 1.0, 0.1, 0.01]
)


grid_model = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    refit=True,
    verbose=3,
    cv=2,
)

grid_model.fit(valid_x_hard, valid_y_at_hard)

print("best score: ", grid_model.best_score_)
print("best_params: ", grid_model.best_params_)

Fitting 2 folds for each of 45 candidates, totalling 90 fits
[CV 1/2] END .C=100, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 2/2] END .C=100, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 1/2] END .....C=100, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/2] END .....C=100, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s




[CV 1/2] END C=100, penalty=l1, solver=liblinear;, score=0.062 total time=   7.4s




[CV 2/2] END C=100, penalty=l1, solver=liblinear;, score=0.070 total time=   5.6s
[CV 1/2] END C=100, penalty=l2, solver=newton-cg;, score=0.069 total time=   7.5s
[CV 2/2] END C=100, penalty=l2, solver=newton-cg;, score=0.065 total time=   5.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/2] END ...C=100, penalty=l2, solver=lbfgs;, score=0.074 total time=   0.7s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/2] END ...C=100, penalty=l2, solver=lbfgs;, score=0.070 total time=   0.4s
[CV 1/2] END C=100, penalty=l2, solver=liblinear;, score=0.061 total time=   0.5s
[CV 2/2] END C=100, penalty=l2, solver=liblinear;, score=0.069 total time=   0.5s
[CV 1/2] END C=100, penalty=elasticnet, solver=newton-cg;, score=nan total time=   0.0s
[CV 2/2] END C=100, penalty=elasticnet, solver=newton-cg;, score=nan total time=   0.0s
[CV 1/2] END C=100, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/2] END C=100, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 1/2] END C=100, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 2/2] END C=100, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 1/2] END ..C=10, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 2/2] END ..C=10, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 1/2] END ......C=10, penalty=l1, solver=lbfgs;, score=nan total time=

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/2] END ....C=10, penalty=l2, solver=lbfgs;, score=0.080 total time=   0.3s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/2] END ....C=10, penalty=l2, solver=lbfgs;, score=0.072 total time=   0.5s
[CV 1/2] END C=10, penalty=l2, solver=liblinear;, score=0.070 total time=   0.4s
[CV 2/2] END C=10, penalty=l2, solver=liblinear;, score=0.075 total time=   0.4s
[CV 1/2] END C=10, penalty=elasticnet, solver=newton-cg;, score=nan total time=   0.0s
[CV 2/2] END C=10, penalty=elasticnet, solver=newton-cg;, score=nan total time=   0.0s
[CV 1/2] END C=10, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/2] END C=10, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 1/2] END C=10, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 2/2] END C=10, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 1/2] END .C=1.0, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 2/2] END .C=1.0, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 1/2] END .....C=1.0, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/2] END ...C=1.0, penalty=l2, solver=lbfgs;, score=0.080 total time=   0.7s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/2] END ...C=1.0, penalty=l2, solver=lbfgs;, score=0.083 total time=   0.6s
[CV 1/2] END C=1.0, penalty=l2, solver=liblinear;, score=0.080 total time=   0.3s
[CV 2/2] END C=1.0, penalty=l2, solver=liblinear;, score=0.078 total time=   0.3s
[CV 1/2] END C=1.0, penalty=elasticnet, solver=newton-cg;, score=nan total time=   0.0s
[CV 2/2] END C=1.0, penalty=elasticnet, solver=newton-cg;, score=nan total time=   0.0s
[CV 1/2] END C=1.0, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/2] END C=1.0, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 1/2] END C=1.0, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 2/2] END C=1.0, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 1/2] END .C=0.1, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 2/2] END .C=0.1, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 1/2] END .....C=0.1, penalty=l1, solver=lbfgs;, score=nan total time=

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/2] END ...C=0.1, penalty=l2, solver=lbfgs;, score=0.086 total time=   0.3s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/2] END ...C=0.1, penalty=l2, solver=lbfgs;, score=0.085 total time=   0.5s
[CV 1/2] END C=0.1, penalty=l2, solver=liblinear;, score=0.084 total time=   0.2s
[CV 2/2] END C=0.1, penalty=l2, solver=liblinear;, score=0.080 total time=   0.2s
[CV 1/2] END C=0.1, penalty=elasticnet, solver=newton-cg;, score=nan total time=   0.0s
[CV 2/2] END C=0.1, penalty=elasticnet, solver=newton-cg;, score=nan total time=   0.0s
[CV 1/2] END C=0.1, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/2] END C=0.1, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 1/2] END C=0.1, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 2/2] END C=0.1, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 1/2] END C=0.01, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 2/2] END C=0.01, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 1/2] END ....C=0.01, penalty=l1, solver=lbfgs;, score=nan total time=

50 fits failed out of a total of 90.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/ubuntu/.local/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/ubuntu/.local/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.

----------

best score:  0.08579545454545454
best_params:  {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}


In [15]:
from sklearn.model_selection import GridSearchCV

param_grid = dict(
    solver=['newton-cg'],
    penalty=['l2'],
    C=[0.3, 0.1, 0.05]
)


grid_model = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    refit=True,
    verbose=3,
    cv=4,
)

grid_model.fit(valid_x_hard, valid_y_at_hard)

print("best score: ", grid_model.best_score_)
print("best_params: ", grid_model.best_params_)

Fitting 4 folds for each of 3 candidates, totalling 12 fits
[CV 1/4] END C=0.3, penalty=l2, solver=newton-cg;, score=0.093 total time=   2.0s
[CV 2/4] END C=0.3, penalty=l2, solver=newton-cg;, score=0.111 total time=   2.1s
[CV 3/4] END C=0.3, penalty=l2, solver=newton-cg;, score=0.100 total time=   1.5s
[CV 4/4] END C=0.3, penalty=l2, solver=newton-cg;, score=0.086 total time=   2.1s
[CV 1/4] END C=0.1, penalty=l2, solver=newton-cg;, score=0.107 total time=   1.4s
[CV 2/4] END C=0.1, penalty=l2, solver=newton-cg;, score=0.114 total time=   1.4s
[CV 3/4] END C=0.1, penalty=l2, solver=newton-cg;, score=0.098 total time=   2.1s
[CV 4/4] END C=0.1, penalty=l2, solver=newton-cg;, score=0.091 total time=   1.0s
[CV 1/4] END C=0.05, penalty=l2, solver=newton-cg;, score=0.114 total time=   0.9s
[CV 2/4] END C=0.05, penalty=l2, solver=newton-cg;, score=0.125 total time=   1.0s
[CV 3/4] END C=0.05, penalty=l2, solver=newton-cg;, score=0.105 total time=   1.0s
[CV 4/4] END C=0.05, penalty=l2, so

In [16]:
from sklearn.model_selection import GridSearchCV

param_grid = dict(
    solver=['newton-cg'],
    penalty=['l2'],
    C=[0.07, 0.05, 0.03]
)


grid_model = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    refit=True,
    verbose=3,
    cv=4,
)

grid_model.fit(valid_x_hard, valid_y_at_hard)

print("best score: ", grid_model.best_score_)
print("best_params: ", grid_model.best_params_)

Fitting 4 folds for each of 3 candidates, totalling 12 fits
[CV 1/4] END C=0.07, penalty=l2, solver=newton-cg;, score=0.102 total time=   1.7s
[CV 2/4] END C=0.07, penalty=l2, solver=newton-cg;, score=0.120 total time=   1.8s
[CV 3/4] END C=0.07, penalty=l2, solver=newton-cg;, score=0.098 total time=   2.6s
[CV 4/4] END C=0.07, penalty=l2, solver=newton-cg;, score=0.091 total time=   1.0s
[CV 1/4] END C=0.05, penalty=l2, solver=newton-cg;, score=0.114 total time=   1.2s
[CV 2/4] END C=0.05, penalty=l2, solver=newton-cg;, score=0.125 total time=   2.0s
[CV 3/4] END C=0.05, penalty=l2, solver=newton-cg;, score=0.105 total time=   1.5s
[CV 4/4] END C=0.05, penalty=l2, solver=newton-cg;, score=0.098 total time=   1.3s
[CV 1/4] END C=0.03, penalty=l2, solver=newton-cg;, score=0.109 total time=   1.4s
[CV 2/4] END C=0.03, penalty=l2, solver=newton-cg;, score=0.134 total time=   0.8s
[CV 3/4] END C=0.03, penalty=l2, solver=newton-cg;, score=0.109 total time=   1.0s
[CV 4/4] END C=0.03, penalt

In [17]:
from sklearn.model_selection import GridSearchCV

param_grid = dict(
    solver=['newton-cg'],
    penalty=['l2'],
    C=[0.04, 0.03, 0.02, 0.01]
)


grid_model = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    refit=True,
    verbose=3,
    cv=4,
)

grid_model.fit(valid_x_hard, valid_y_at_hard)

print("best score: ", grid_model.best_score_)
print("best_params: ", grid_model.best_params_)

Fitting 4 folds for each of 4 candidates, totalling 16 fits
[CV 1/4] END C=0.04, penalty=l2, solver=newton-cg;, score=0.109 total time=   1.6s
[CV 2/4] END C=0.04, penalty=l2, solver=newton-cg;, score=0.127 total time=   1.7s
[CV 3/4] END C=0.04, penalty=l2, solver=newton-cg;, score=0.107 total time=   1.0s
[CV 4/4] END C=0.04, penalty=l2, solver=newton-cg;, score=0.095 total time=   0.7s
[CV 1/4] END C=0.03, penalty=l2, solver=newton-cg;, score=0.109 total time=   1.2s
[CV 2/4] END C=0.03, penalty=l2, solver=newton-cg;, score=0.134 total time=   1.1s
[CV 3/4] END C=0.03, penalty=l2, solver=newton-cg;, score=0.109 total time=   1.1s
[CV 4/4] END C=0.03, penalty=l2, solver=newton-cg;, score=0.091 total time=   1.1s
[CV 1/4] END C=0.02, penalty=l2, solver=newton-cg;, score=0.107 total time=   0.7s
[CV 2/4] END C=0.02, penalty=l2, solver=newton-cg;, score=0.118 total time=   1.6s
[CV 3/4] END C=0.02, penalty=l2, solver=newton-cg;, score=0.111 total time=   1.1s
[CV 4/4] END C=0.02, penalt

In [20]:
def train_logistic_best(data_x, data_y):
    clf = LogisticRegression(C=0.03, penalty="l2", solver="newton-cg")
    clf.fit(data_x, data_y)
    return clf

In [21]:
save_path = "/home/ubuntu/data/yg_ar/classic_models_hard_correctSIFT/logistic_best_at.pkl"
(
    trained_svm_at, 
    predictions_svm_at, 
    accuracy_svm_at, 
    df_svm_at,
    df_incorrect_svm_at, 
    df_correct_svm_at,
    label_map_svm_at
)= load_or_train(
    train_x_hard, 
    train_y_at_hard, 
    test_x_hard, 
    test_y_at_hard, 
    train_logistic_best, 
    label_map_at_hard, 
    save_path
)
print(accuracy_svm_at)
print(df_incorrect_svm_at.head())

0.13055555555555556
   prediction  label  check      prediction_name           label_name
0          25     13  False           triangle_2  lord_of_the_dance_2
1          20     13  False        thunderbolt_1  lord_of_the_dance_2
3          32     13  False        warrior_III_1  lord_of_the_dance_2
4          12     13  False  lord_of_the_dance_1  lord_of_the_dance_2
5          35     13  False        warrior_III_4  lord_of_the_dance_2


In [22]:
save_path = "/home/ubuntu/data/yg_ar/classic_models_hard_correctSIFT/logistic_best_a.pkl"
(
    trained_svm_a, 
    predictions_svm_a, 
    accuracy_svm_a, 
    df_svm_a, 
    df_incorrect_svm_a, 
    df_correct_svm_a,
    label_map_svm_a
)= load_or_train(
    train_x_hard, 
    train_y_a_hard, 
    test_x_hard, 
    test_y_a_hard, 
    train_logistic_best, 
    label_map_a_hard, 
    save_path
)
print(accuracy_svm_a)
print(df_incorrect_svm_a.head())

0.33425925925925926
   prediction  label  check prediction_name         label_name
0           6      3  False        triangle  lord_of_the_dance
1           5      3  False     thunderbolt  lord_of_the_dance
3           9      3  False     warrior_III  lord_of_the_dance
4           6      3  False        triangle  lord_of_the_dance
5           9      3  False     warrior_III  lord_of_the_dance
