In [11]:
import pandas as pd
from nebula.data.yg_ar.setup_data_image_hard import read_data
from nebula.common import to_scale_one, write_pickle, read_pickle
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
import os
import os.path as osp
import numpy as np

In [26]:
def evaluate(model, test_x, test_y):
    """Predict on ``test_x`` and report the fraction of predictions equal to ``test_y``.

    Args:
        model: Any object exposing ``predict(test_x)``.
        test_x: Inputs forwarded unchanged to ``model.predict``.
        test_y: Expected labels, one per prediction (list, array or Series).

    Returns:
        ``(predictions, accuracy)`` where ``predictions`` is the output of
        ``model.predict`` as a NumPy array and ``accuracy`` is the number of
        matches divided by the number of predictions.

    Raises:
        ValueError: If predictions and labels do not have the same shape.
    """
    # Coerce both sides so that `==` is always an elementwise comparison;
    # comparing two plain lists would yield a single bool with no `.sum()`.
    predictions = np.asarray(model.predict(test_x))
    expected = np.asarray(test_y)

    # Fail loudly instead of letting NumPy broadcast mismatched shapes into a
    # meaningless comparison.
    if predictions.shape != expected.shape:
        raise ValueError(
            "predictions and labels differ in shape: "
            f"{predictions.shape} vs {expected.shape}"
        )

    correct = predictions == expected
    accuracy = correct.sum() / len(predictions)
    return predictions, accuracy


def create_dirs_to_file(path):
    """Ensure the directory that will contain ``path`` exists.

    Only the parent directories are created; the file itself is not.

    Args:
        path: File path whose parent directory should exist afterwards. A
            path without a directory component (e.g. ``"model.pkl"``) refers
            to the current working directory, so nothing is created.
    """
    parent = osp.dirname(path)
    if not parent:
        # Bare filename: os.makedirs("") would raise, and there is nothing to create.
        return
    # exist_ok=True replaces the exists()/makedirs() pair, which could race
    # with another process creating the same directory.
    os.makedirs(parent, exist_ok=True)


def load_or_train(train_x, train_y, test_x, test_y, train_func, label_map, path):
    """Return cached training results from ``path``, or train, evaluate and cache them.

    If ``path`` already exists, whatever ``read_pickle(path)`` yields is
    returned unchanged and none of the other arguments are used. A stale
    cache is therefore never detected here; delete the file to force
    retraining.

    Otherwise ``train_func(train_x, train_y)`` is called, the resulting model
    is scored on ``(test_x, test_y)`` with ``evaluate``, the predictions are
    tabulated with ``format_results``, and the bundle is passed to
    ``write_pickle(path, ...)`` before being returned.

    Returns:
        On a cache miss, the 7-tuple ``(trained_model, predictions, accuracy,
        df, df_incorrect, df_correct, label_map)``. On a cache hit, the
        unpickled object (the same 7-tuple when the file was written by this
        function).
    """
    if not osp.exists(path):
        # Create the destination directory before training starts.
        create_dirs_to_file(path)

        fitted = train_func(train_x, train_y)
        preds, acc = evaluate(fitted, test_x, test_y)
        table, misses, hits = format_results(preds, test_y, label_map)

        bundle = (fitted, preds, acc, table, misses, hits, label_map)
        write_pickle(path, bundle)
        return bundle

    return read_pickle(path)


def format_results(predictions, labels, label_map):
    """Tabulate predictions against labels and split the rows by correctness.

    Args:
        predictions: Predicted label ids, one per sample.
        labels: Expected label ids, aligned with ``predictions``.
        label_map: Mapping from label name to label id; it is inverted here
            to translate ids back into names.

    Returns:
        ``(df, df_incorrect, df_correct)``. ``df`` has the columns
        ``prediction``, ``label``, ``check`` (True where they are equal),
        ``prediction_name`` and ``label_name``; the other two frames are the
        rows of ``df`` where ``check`` is False and True respectively.
    """
    # id -> name lookup (if two names share an id, the later one wins).
    id_to_name = dict(zip(label_map.values(), label_map.keys()))

    results = pd.DataFrame({"prediction": predictions, "label": labels})
    results["check"] = results["prediction"].eq(results["label"])
    results["prediction_name"] = results["prediction"].map(id_to_name)
    results["label_name"] = results["label"].map(id_to_name)

    is_hit = results["check"]
    return results, results[~is_hit], results[is_hit]

In [12]:
def create_label_map(labels):
    """Assign consecutive integer ids to the distinct values in ``labels``.

    The distinct values are sorted, and each receives its position in that
    order, starting at 0.

    Args:
        labels: Iterable of hashable, mutually comparable label values.

    Returns:
        dict mapping each distinct label to its integer id.
    """
    return {name: idx for idx, name in enumerate(sorted(set(labels)))}

In [13]:
# create data sets
# One pickle per difficulty tier (easy / medium / hard). Each file unpacks
# into four objects: df_* plus its train / test / valid frames.
# NOTE(review): read_pickle comes from nebula.common; if it wraps the stdlib
# pickle module, only load files from a trusted source.
df_path_easy = '/home/ubuntu/data/yg_ar/image_easy2_ds_sift_amiya.pkl'
df_path_medium = '/home/ubuntu/data/yg_ar/image_medium_ds_sift_amiya.pkl'
df_path_hard = '/home/ubuntu/data/yg_ar/image_hard_ds_sift_amiya.pkl'

df_easy, train_df_easy, test_df_easy, valid_df_easy = read_pickle(df_path_easy)
df_medium, train_df_medium, test_df_medium, valid_df_medium = read_pickle(df_path_medium)
df_hard, train_df_hard, test_df_hard, valid_df_hard = read_pickle(df_path_hard)

In [14]:
# create labels
# One name -> integer-id map per difficulty tier and per label column
# ("label_a" and "label_at"), built from df_easy / df_medium / df_hard rather
# than from a single split.
label_map_a_easy, label_map_at_easy = (
    create_label_map(df_easy[column]) for column in ("label_a", "label_at")
)

label_map_a_medium, label_map_at_medium = (
    create_label_map(df_medium[column]) for column in ("label_a", "label_at")
)

label_map_a_hard, label_map_at_hard = (
    create_label_map(df_hard[column]) for column in ("label_a", "label_at")
)

In [15]:
# Training inputs: the "image" column of each tier's train frame, as plain lists.
train_x_easy, train_x_medium, train_x_hard = (
    split["image"].to_list()
    for split in (train_df_easy, train_df_medium, train_df_hard)
)

In [16]:
# Training targets: each label column is encoded to integer ids with the
# label map of the SAME difficulty tier as the frame it comes from.
train_y_a_easy = train_df_easy["label_a"].map(label_map_a_easy).to_list()
train_y_at_easy = train_df_easy["label_at"].map(label_map_at_easy).to_list()

train_y_a_medium = train_df_medium["label_a"].map(label_map_a_medium).to_list()
train_y_at_medium = train_df_medium["label_at"].map(label_map_at_medium).to_list()

# Fixed: the hard split used to be encoded with the *medium* label maps while
# the hard test/validation targets use the hard maps. Whenever the two maps
# differ, training ids and evaluation ids stop agreeing (and labels missing
# from the medium map become NaN).
# NOTE: load_or_train returns an existing pickle without retraining, so models
# cached before this fix must be deleted to pick up the corrected targets.
train_y_a_hard = train_df_hard["label_a"].map(label_map_a_hard).to_list()
train_y_at_hard = train_df_hard["label_at"].map(label_map_at_hard).to_list()

In [17]:
# Test inputs: the "image" column of each tier's test frame, as plain lists.
test_x_easy, test_x_medium, test_x_hard = (
    split["image"].to_list()
    for split in (test_df_easy, test_df_medium, test_df_hard)
)

In [18]:
# Test targets, encoded with the label map of the matching tier.
# "label_a" column, one list per tier:
test_y_a_easy = test_df_easy["label_a"].map(label_map_a_easy).to_list()
test_y_a_medium = test_df_medium["label_a"].map(label_map_a_medium).to_list()
test_y_a_hard = test_df_hard["label_a"].map(label_map_a_hard).to_list()

# "label_at" column, one list per tier:
test_y_at_easy = test_df_easy["label_at"].map(label_map_at_easy).to_list()
test_y_at_medium = test_df_medium["label_at"].map(label_map_at_medium).to_list()
test_y_at_hard = test_df_hard["label_at"].map(label_map_at_hard).to_list()

In [35]:
# Validation inputs: the "image" column of each tier's valid frame, as plain lists.
valid_x_easy, valid_x_medium, valid_x_hard = (
    split["image"].to_list()
    for split in (valid_df_easy, valid_df_medium, valid_df_hard)
)

In [36]:
# Validation targets, encoded with the label map of the matching tier.
# "label_a" column, one list per tier:
valid_y_a_easy = valid_df_easy["label_a"].map(label_map_a_easy).to_list()
valid_y_a_medium = valid_df_medium["label_a"].map(label_map_a_medium).to_list()
valid_y_a_hard = valid_df_hard["label_a"].map(label_map_a_hard).to_list()

# "label_at" column, one list per tier:
valid_y_at_easy = valid_df_easy["label_at"].map(label_map_at_easy).to_list()
valid_y_at_medium = valid_df_medium["label_at"].map(label_map_at_medium).to_list()
valid_y_at_hard = valid_df_hard["label_at"].map(label_map_at_hard).to_list()

In [41]:
from sklearn.model_selection import GridSearchCV

# Coarse SVC hyper-parameter sweep, fit on the hard tier's validation split
# ("label_at" targets) with 2-fold cross-validation.
# NOTE(review): gamma is not a parameter of the linear kernel, so every linear
# candidate is refit once per gamma value with identical scores (see the log
# below). A list of per-kernel grids would avoid the redundant fits.
param_grid = dict(
    C=[0.1, 1, 10, 100],
    gamma=[1, 0.1, 0.01, 0.001],
    kernel=['linear', 'rbf', 'poly'],
)

grid_model = GridSearchCV(svm.SVC(), param_grid, refit=True, verbose=3, cv=2)
grid_model.fit(valid_x_hard, valid_y_at_hard)

print("best score: ", grid_model.best_score_)
print("best_params: ", grid_model.best_params_)

Fitting 2 folds for each of 48 candidates, totalling 96 fits
[CV 1/2] END .....C=0.1, gamma=1, kernel=linear;, score=0.081 total time=   0.2s
[CV 2/2] END .....C=0.1, gamma=1, kernel=linear;, score=0.075 total time=   0.3s
[CV 1/2] END ........C=0.1, gamma=1, kernel=rbf;, score=0.051 total time=   0.2s
[CV 2/2] END ........C=0.1, gamma=1, kernel=rbf;, score=0.049 total time=   0.3s
[CV 1/2] END .......C=0.1, gamma=1, kernel=poly;, score=0.068 total time=   0.2s
[CV 2/2] END .......C=0.1, gamma=1, kernel=poly;, score=0.058 total time=   0.2s
[CV 1/2] END ...C=0.1, gamma=0.1, kernel=linear;, score=0.081 total time=   0.2s
[CV 2/2] END ...C=0.1, gamma=0.1, kernel=linear;, score=0.075 total time=   0.2s
[CV 1/2] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.050 total time=   0.2s
[CV 2/2] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.044 total time=   0.3s
[CV 1/2] END .....C=0.1, gamma=0.1, kernel=poly;, score=0.067 total time=   0.2s
[CV 2/2] END .....C=0.1, gamma=0.1, kernel=poly;

In [39]:
from sklearn.model_selection import GridSearchCV

# Narrower SVC sweep over C in [0.5, 5] and gamma in [0.005, 0.05], again fit
# on the hard tier's validation split ("label_at" targets) with 2-fold CV.
# NOTE(review): gamma is not a parameter of the linear kernel, so the linear
# candidates are repeated once per gamma value.
param_grid = dict(
    C=[0.5, 1, 3, 5],
    gamma=[0.05, 0.01, 0.005],
    kernel=['linear', 'rbf', 'poly'],
)

grid_model = GridSearchCV(svm.SVC(), param_grid, refit=True, verbose=3, cv=2)
grid_model.fit(valid_x_hard, valid_y_at_hard)

print("best score: ", grid_model.best_score_)
print("best_params: ", grid_model.best_params_)

Fitting 2 folds for each of 36 candidates, totalling 72 fits
[CV 1/2] END ..C=0.5, gamma=0.05, kernel=linear;, score=0.080 total time=   0.3s
[CV 2/2] END ..C=0.5, gamma=0.05, kernel=linear;, score=0.074 total time=   0.3s
[CV 1/2] END .....C=0.5, gamma=0.05, kernel=rbf;, score=0.050 total time=   0.3s
[CV 2/2] END .....C=0.5, gamma=0.05, kernel=rbf;, score=0.050 total time=   0.3s
[CV 1/2] END ....C=0.5, gamma=0.05, kernel=poly;, score=0.065 total time=   0.2s
[CV 2/2] END ....C=0.5, gamma=0.05, kernel=poly;, score=0.060 total time=   0.2s
[CV 1/2] END ..C=0.5, gamma=0.01, kernel=linear;, score=0.080 total time=   0.2s
[CV 2/2] END ..C=0.5, gamma=0.01, kernel=linear;, score=0.074 total time=   0.3s
[CV 1/2] END .....C=0.5, gamma=0.01, kernel=rbf;, score=0.084 total time=   0.3s
[CV 2/2] END .....C=0.5, gamma=0.01, kernel=rbf;, score=0.085 total time=   0.3s
[CV 1/2] END ....C=0.5, gamma=0.01, kernel=poly;, score=0.069 total time=   0.2s
[CV 2/2] END ....C=0.5, gamma=0.01, kernel=poly;

In [37]:
from sklearn.model_selection import GridSearchCV

# Fine SVC sweep around C = 1 and gamma = 0.01, fit on the hard tier's
# validation split ("label_at" targets) with 2-fold CV.
# NOTE(review): gamma is not a parameter of the linear kernel, so the linear
# candidates are repeated once per gamma value.
param_grid = dict(
    C=[0.8, 1, 2],
    gamma=[0.03, 0.01, 0.007],
    kernel=['linear', 'rbf', 'poly'],
)

grid_model = GridSearchCV(svm.SVC(), param_grid, refit=True, verbose=3, cv=2)
grid_model.fit(valid_x_hard, valid_y_at_hard)

print("best score: ", grid_model.best_score_)
print("best_params: ", grid_model.best_params_)

Fitting 2 folds for each of 27 candidates, totalling 54 fits
[CV 1/2] END ..C=0.8, gamma=0.03, kernel=linear;, score=0.080 total time=   0.3s
[CV 2/2] END ..C=0.8, gamma=0.03, kernel=linear;, score=0.074 total time=   0.3s
[CV 1/2] END .....C=0.8, gamma=0.03, kernel=rbf;, score=0.059 total time=   0.3s
[CV 2/2] END .....C=0.8, gamma=0.03, kernel=rbf;, score=0.066 total time=   0.3s
[CV 1/2] END ....C=0.8, gamma=0.03, kernel=poly;, score=0.067 total time=   0.2s
[CV 2/2] END ....C=0.8, gamma=0.03, kernel=poly;, score=0.061 total time=   0.2s
[CV 1/2] END ..C=0.8, gamma=0.01, kernel=linear;, score=0.080 total time=   0.3s
[CV 2/2] END ..C=0.8, gamma=0.01, kernel=linear;, score=0.074 total time=   0.3s
[CV 1/2] END .....C=0.8, gamma=0.01, kernel=rbf;, score=0.089 total time=   0.3s
[CV 2/2] END .....C=0.8, gamma=0.01, kernel=rbf;, score=0.077 total time=   0.3s
[CV 1/2] END ....C=0.8, gamma=0.01, kernel=poly;, score=0.076 total time=   0.2s
[CV 2/2] END ....C=0.8, gamma=0.01, kernel=poly;

In [31]:
def train_svm_best(data_x, data_y):
    """Fit and return an RBF-kernel ``svm.SVC`` with ``C=1`` and ``gamma=0.01``.

    Args:
        data_x: Training inputs passed to ``SVC.fit``.
        data_y: Training targets passed to ``SVC.fit``.

    Returns:
        The fitted classifier.
    """
    # NOTE(review): these values are presumably the winner of the grid
    # searches in this notebook; the saved output does not show
    # best_params_, so confirm before relying on them.
    # SVC.fit returns the estimator itself, so the call can be chained.
    return svm.SVC(kernel="rbf", C=1, gamma=0.01).fit(data_x, data_y)

In [32]:
# Train (or load from cache) the SVM on the hard tier's "label_at" targets
# and score it on the hard test split.
# NOTE: load_or_train returns the pickle at save_path if it already exists,
# so the numbers printed below may come from an earlier run.
save_path = "/home/ubuntu/data/yg_ar/classic_models_hard_correctSIFT/svm_best_at.pkl"
(
    trained_svm_at,
    predictions_svm_at,
    accuracy_svm_at,
    df_svm_at,
    df_incorrect_svm_at,
    df_correct_svm_at,
    label_map_svm_at,
) = load_or_train(
    train_x=train_x_hard,
    train_y=train_y_at_hard,
    test_x=test_x_hard,
    test_y=test_y_at_hard,
    train_func=train_svm_best,
    label_map=label_map_at_hard,
    path=save_path,
)
print(accuracy_svm_at)
print(df_incorrect_svm_at.head())

0.1574074074074074
   prediction  label  check      prediction_name           label_name
0          14     13  False  lord_of_the_dance_3  lord_of_the_dance_2
1          39     13  False         warrior_II_4  lord_of_the_dance_2
3          32     13  False        warrior_III_1  lord_of_the_dance_2
5          34     13  False        warrior_III_3  lord_of_the_dance_2
6          27     13  False           triangle_4  lord_of_the_dance_2


In [33]:
# Train (or load from cache) the SVM on the hard tier's "label_a" targets
# and score it on the hard test split.
# NOTE: load_or_train returns the pickle at save_path if it already exists,
# so the numbers printed below may come from an earlier run.
save_path = "/home/ubuntu/data/yg_ar/classic_models_hard_correctSIFT/svm_best_a.pkl"
(
    trained_svm_a,
    predictions_svm_a,
    accuracy_svm_a,
    df_svm_a,
    df_incorrect_svm_a,
    df_correct_svm_a,
    label_map_svm_a,
) = load_or_train(
    train_x=train_x_hard,
    train_y=train_y_a_hard,
    test_x=test_x_hard,
    test_y=test_y_a_hard,
    train_func=train_svm_best,
    label_map=label_map_a_hard,
    path=save_path,
)
print(accuracy_svm_a)
print(df_incorrect_svm_a.head())

0.40046296296296297
   prediction  label  check prediction_name         label_name
0           6      3  False        triangle  lord_of_the_dance
1           8      3  False      warrior_II  lord_of_the_dance
3           9      3  False     warrior_III  lord_of_the_dance
5           9      3  False     warrior_III  lord_of_the_dance
6           6      3  False        triangle  lord_of_the_dance
