In [1]:
import pandas as pd
from nebula.data.yg_ar.setup_data_image_hard import read_data
from nebula.common import to_scale_one, write_pickle, read_pickle
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
import os
import os.path as osp
import numpy as np

In [2]:
def create_label_map(labels):
    """Assign a consecutive integer id to every distinct label.

    The distinct values found in ``labels`` are sorted in ascending order and
    numbered from 0 in that order, so the same set of labels always produces
    the same mapping.

    Args:
        labels: Iterable of hashable, mutually comparable label values.

    Returns:
        dict mapping each distinct label to its integer id.
    """
    ordered_labels = sorted(set(labels))
    return {label: index for index, label in enumerate(ordered_labels)}

In [3]:
# Pickle holding the corrected SIFT dataframes (per the original author's note).
# NOTE(review): absolute, machine-specific path -- consider a configurable data dir.
df_path = "/home/ubuntu/data/yg_ar/image_hard_ds_sift_amiya.pkl"

# The pickle unpacks into four frames; judging by the names these are the full
# dataset plus its train / test / validation splits -- TODO confirm.
(df, train_df, test_df, valid_df) = read_pickle(df_path)

In [4]:
# Peek at the record with index label 0 to check the column layout.
print(df.loc[0, :])

image        [1, 2, 1, 4, 1, 2, 1, 1, 0, 0, 3, 2, 0, 0, 2, ...
label_a                                                  camel
label_at                                               camel_1
file_name    camel_1_hair_0_cloth_0_pants_0_Z1031_XON17_YON...
Name: 0, dtype: object


In [5]:
# Build one label -> integer-id map per label column, from the full dataset so
# that every split shares the same ids.
label_map_a, label_map_at = (
    create_label_map(df[column]) for column in ("label_a", "label_at")
)

In [6]:
# Training features: one entry per row of the "image" column.
train_x = train_df.loc[:, "image"].tolist()

In [7]:
# Training targets: label columns translated to integer ids via the label maps.
train_y_a = train_df.loc[:, "label_a"].map(label_map_a).tolist()
train_y_at = train_df.loc[:, "label_at"].map(label_map_at).tolist()

In [8]:
# Test features: one entry per row of the "image" column.
test_x = test_df.loc[:, "image"].tolist()

In [9]:
# Test targets: label columns translated to integer ids via the label maps.
test_y_a = test_df.loc[:, "label_a"].map(label_map_a).tolist()
test_y_at = test_df.loc[:, "label_at"].map(label_map_at).tolist()

In [10]:
def train_svm(data_x, data_y, max_iter=50):
    """Fit a support-vector classifier with an explicit solver iteration cap.

    Args:
        data_x: Training feature vectors, one per sample.
        data_y: Training labels aligned with ``data_x``.
        max_iter: Hard limit on solver iterations, forwarded to ``svm.SVC``.
            The default of 50 reproduces the value that used to be hard-coded
            here, so existing two-argument calls behave exactly as before.
            NOTE(review): 50 is a very tight cap and the solver may stop
            before converging; scikit-learn documents ``-1`` as "no limit".

    Returns:
        The fitted ``svm.SVC`` instance.
    """
    clf = svm.SVC(max_iter=max_iter)
    clf.fit(data_x, data_y)
    return clf


def train_svm_poly8(data_x, data_y):
    """Fit an SVM with a degree-8 polynomial kernel and ``C=20``.

    Args:
        data_x: Training feature vectors, one per sample.
        data_y: Training labels aligned with ``data_x``.

    Returns:
        The fitted ``svm.SVC`` instance.
    """
    poly_svm = svm.SVC(C=20, degree=8, kernel="poly")
    # fit() returns the estimator itself, so it can be returned directly.
    return poly_svm.fit(data_x, data_y)


def train_logistic(data_x, data_y, max_iter=100):
    """Fit a logistic-regression classifier with a fixed random seed.

    Args:
        data_x: Training feature vectors, one per sample.
        data_y: Training labels aligned with ``data_x``.
        max_iter: Maximum number of solver iterations, forwarded to
            ``LogisticRegression``. The default of 100 matches scikit-learn's
            own default, so existing two-argument calls are unchanged.
            The outputs saved in this notebook show the solver stopping with
            "TOTAL NO. of ITERATIONS REACHED LIMIT"; raise this value (or
            scale the features) to let it converge.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    clf = LogisticRegression(random_state=0, max_iter=max_iter)
    clf.fit(data_x, data_y)
    return clf


def train_gbt(data_x, data_y):
    """Fit a gradient-boosted tree classifier with fixed hyper-parameters.

    Args:
        data_x: Training feature vectors, one per sample.
        data_y: Training labels aligned with ``data_x``.

    Returns:
        The fitted ``GradientBoostingClassifier`` instance.
    """
    gbt_params = {
        "n_estimators": 100,
        "learning_rate": 0.1,
        "max_depth": 3,
        "random_state": 0,
        # verbose=1 makes the estimator print its training-progress table.
        "verbose": 1,
        # Setting n_iter_no_change turns on scikit-learn's early stopping.
        "n_iter_no_change": 2,
    }
    booster = GradientBoostingClassifier(**gbt_params)
    booster.fit(data_x, data_y)
    return booster


def evaluate(model, test_x, test_y):
    """Predict on ``test_x`` and compute accuracy against ``test_y``.

    Args:
        model: Any object exposing ``predict(test_x)``.
        test_x: Test feature vectors, one per sample.
        test_y: Expected labels aligned with ``test_x`` (list, array or Series).

    Returns:
        Tuple ``(predictions, accuracy)`` where ``predictions`` is a NumPy
        array and ``accuracy`` is the fraction of predictions equal to the
        corresponding entry of ``test_y``.

    Raises:
        ValueError: If predictions and labels do not have the same shape.
    """
    # Coerce both sides to arrays so the comparison is always element-wise,
    # even when predict() or the caller hands back plain Python lists.
    res = np.asarray(model.predict(test_x))
    expected = np.asarray(test_y)

    # Mismatched shapes would otherwise broadcast (or collapse to one bool)
    # and yield a meaningless accuracy.
    if res.shape != expected.shape:
        raise ValueError(
            "predictions and labels differ in shape: "
            f"{res.shape} vs {expected.shape}"
        )

    correct = res == expected
    accuracy = correct.sum() / len(res)
    return res, accuracy


def create_dirs_to_file(path):
    """Make sure the directory that will contain ``path`` exists.

    Args:
        path: Path of a file that is about to be written. Only its parent
            directories are created, never the file itself.

    Raises:
        OSError: If the parent directory cannot be created (for example
            because a regular file already occupies that name).
    """
    parent = osp.dirname(path)
    # A bare file name has no directory part; os.makedirs("") would raise,
    # and the current working directory already exists anyway.
    if parent:
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(parent, exist_ok=True)


def load_or_train(train_x, train_y, test_x, test_y, train_func, label_map, path):
    """Return cached training results from ``path`` or train, evaluate and cache.

    If ``path`` already exists its pickled content is returned as-is; nothing
    checks that it was produced with the same data or ``train_func``, so
    delete the file to force a retrain.

    Args:
        train_x: Training features passed to ``train_func``.
        train_y: Training labels passed to ``train_func``.
        test_x: Test features used for evaluation.
        test_y: Test labels used for evaluation.
        train_func: Callable ``(train_x, train_y) -> fitted model``.
        label_map: Mapping label name -> integer id, used to name predictions.
        path: Pickle file used as the cache.

    Returns:
        Tuple ``(model, predictions, accuracy, df, df_incorrect, df_correct,
        label_map)`` when training runs; otherwise whatever ``path`` holds.
    """
    if not osp.exists(path):
        create_dirs_to_file(path)

        model = train_func(train_x, train_y)
        preds, acc = evaluate(model, test_x, test_y)
        results, misses, hits = format_results(preds, test_y, label_map)

        bundle = (model, preds, acc, results, misses, hits, label_map)
        write_pickle(path, bundle)
        return bundle

    # Cache hit: skip training entirely.
    return read_pickle(path)


def format_results(predictions, labels, label_map):
    """Tabulate predictions against true labels, with readable label names.

    Args:
        predictions: Predicted integer label ids, one per sample.
        labels: True integer label ids aligned with ``predictions``.
        label_map: Mapping label name -> integer id; it is inverted here to
            translate ids back to names.

    Returns:
        Tuple ``(df, df_incorrect, df_correct)``. ``df`` has the columns
        ``prediction``, ``label``, ``check`` (True where they match),
        ``prediction_name`` and ``label_name``; the other two frames are the
        rows of ``df`` where ``check`` is False / True respectively.
    """
    id_to_name = dict(zip(label_map.values(), label_map.keys()))

    results = pd.DataFrame({"prediction": predictions, "label": labels})
    results["check"] = results["prediction"].eq(results["label"])
    results["prediction_name"] = results["prediction"].map(id_to_name)
    results["label_name"] = results["label"].map(id_to_name)

    is_correct = results["check"]
    return results, results.loc[~is_correct], results.loc[is_correct]

In [11]:
# SVM on the coarse labels (label_a): reuse the cached run if the pickle
# exists, otherwise train, evaluate and cache it.
save_path = "/home/ubuntu/data/yg_ar/classic_models_hard_correctSIFT/svm_a.pkl"
(
    trained_svm_a,
    predictions_svm_a,
    accuracy_svm_a,
    df_svm_a,
    df_incorrect_svm_a,
    df_correct_svm_a,
    label_map_svm_a,
) = load_or_train(
    train_x=train_x,
    train_y=train_y_a,
    test_x=test_x,
    test_y=test_y_a,
    train_func=train_svm,
    label_map=label_map_a,
    path=save_path,
)
print(accuracy_svm_a)
print(df_incorrect_svm_a.head())







0.1625
   prediction  label  check prediction_name         label_name
0           6      3  False        triangle  lord_of_the_dance
1           0      3  False           camel  lord_of_the_dance
2           0      3  False           camel  lord_of_the_dance
3           9      3  False     warrior_III  lord_of_the_dance
4           0      3  False           camel  lord_of_the_dance


In [20]:
# NOTE(review): this import would normally live in the notebook's first
# (imports) cell; it is kept here so the cell still runs on its own.
from sklearn.model_selection import GridSearchCV

# Base estimator for the search. The C given here is only a placeholder:
# GridSearchCV overrides it with each candidate from `parameters`.
model_svm = svm.SVC(C=30, random_state=0)
parameters = [
    {"C": [20, 25, 30, 35, 40, 45]}
]
grid_model = GridSearchCV(
    estimator=model_svm,
    param_grid=parameters,
    cv=10,  # 10-fold cross-validation on the training split
)
grid_model.fit(train_x, train_y_a)

print("best score: ", grid_model.best_score_)
print("best_params: ", grid_model.best_params_)
# Held-out check of the refit best estimator. This replaces commented-out
# diagnostics that labelled a score "testing set" while scoring on
# train_x / train_y_a.
print("score on testing set: ", grid_model.score(test_x, test_y_a))

best score:  0.7692164179104478
best_params:  {'C': 25}


In [12]:
# SVM on the fine-grained labels (label_at, e.g. "camel_1"): reuse the cached
# run if the pickle exists, otherwise train, evaluate and cache it.
save_path = "/home/ubuntu/data/yg_ar/classic_models_hard_correctSIFT/svm_at.pkl"
(
    trained_svm_at,
    predictions_svm_at,
    accuracy_svm_at,
    df_svm_at,
    df_incorrect_svm_at,
    df_correct_svm_at,
    label_map_svm_at,
) = load_or_train(
    train_x=train_x,
    train_y=train_y_at,
    test_x=test_x,
    test_y=test_y_at,
    train_func=train_svm,
    label_map=label_map_at,
    path=save_path,
)
print(accuracy_svm_at)
print(df_incorrect_svm_at.head())



0.08703703703703704
   prediction  label  check      prediction_name           label_name
0          35     13  False        warrior_III_4  lord_of_the_dance_2
1           3     13  False              camel_4  lord_of_the_dance_2
2          12     13  False  lord_of_the_dance_1  lord_of_the_dance_2
3          32     13  False        warrior_III_1  lord_of_the_dance_2
4          12     13  False  lord_of_the_dance_1  lord_of_the_dance_2


In [13]:
# Logistic regression on the coarse labels (label_a): reuse the cached run if
# the pickle exists, otherwise train, evaluate and cache it.
save_path = "/home/ubuntu/data/yg_ar/classic_models_hard_correctSIFT/logistic_a.pkl"
(
    trained_logistic_a,
    predictions_logistic_a,
    accuracy_logistic_a,
    df_logistic_a,
    df_incorrect_logistic_a,
    df_correct_logistic_a,
    label_map_logistic_a,
) = load_or_train(
    train_x=train_x,
    train_y=train_y_a,
    test_x=test_x,
    test_y=test_y_a,
    train_func=train_logistic,
    label_map=label_map_a,
    path=save_path,
)
print(accuracy_logistic_a)
print(df_incorrect_logistic_a.head())

0.33101851851851855
   prediction  label  check prediction_name         label_name
0           6      3  False        triangle  lord_of_the_dance
1           5      3  False     thunderbolt  lord_of_the_dance
3           9      3  False     warrior_III  lord_of_the_dance
4           6      3  False        triangle  lord_of_the_dance
5           9      3  False     warrior_III  lord_of_the_dance


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
# Logistic regression on the fine-grained labels (label_at): reuse the cached
# run if the pickle exists, otherwise train, evaluate and cache it.
save_path = "/home/ubuntu/data/yg_ar/classic_models_hard_correctSIFT/logistic_at.pkl"
(
    trained_logistic_at,
    predictions_logistic_at,
    accuracy_logistic_at,
    df_logistic_at,
    df_incorrect_logistic_at,
    df_correct_logistic_at,
    label_map_logistic_at,
) = load_or_train(
    train_x=train_x,
    train_y=train_y_at,
    test_x=test_x,
    test_y=test_y_at,
    train_func=train_logistic,
    label_map=label_map_at,
    path=save_path,
)
print(accuracy_logistic_at)
print(df_incorrect_logistic_at.head())

0.13240740740740742
   prediction  label  check      prediction_name           label_name
0          25     13  False           triangle_2  lord_of_the_dance_2
1          20     13  False        thunderbolt_1  lord_of_the_dance_2
3          32     13  False        warrior_III_1  lord_of_the_dance_2
4          12     13  False  lord_of_the_dance_1  lord_of_the_dance_2
5          35     13  False        warrior_III_4  lord_of_the_dance_2


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
# Gradient-boosted trees on the coarse labels (label_a): reuse the cached run
# if the pickle exists, otherwise train, evaluate and cache it.
save_path = "/home/ubuntu/data/yg_ar/classic_models_hard_correctSIFT/gbt01_a.pkl"
(
    trained_gbt_a,
    predictions_gbt_a,
    accuracy_gbt_a,
    df_gbt_a,
    df_incorrect_gbt_a,
    df_correct_gbt_a,
    label_map_gbt_a,
) = load_or_train(
    train_x=train_x,
    train_y=train_y_a,
    test_x=test_x,
    test_y=test_y_a,
    train_func=train_gbt,
    label_map=label_map_a,
    path=save_path,
)
print(accuracy_gbt_a)
print(df_incorrect_gbt_a.head())

      Iter       Train Loss   Remaining Time 
         1           2.2685           57.54s
         2           2.2412           56.52s
         3           2.2175           57.36s
         4           2.1972           57.71s
         5           2.1782           57.45s
         6           2.1613           57.13s
         7           2.1449           56.74s
         8           2.1303           56.57s
         9           2.1161           56.37s
        10           2.1031           55.59s
        20           1.9956           49.36s
        30           1.9195           43.24s
        40           1.8584           37.04s
        50           1.8093           30.92s
        60           1.7666           24.77s
        70           1.7301           18.59s
        80           1.6968           12.42s
        90           1.6676            6.21s
       100           1.6416            0.00s
0.32314814814814813
   prediction  label  check prediction_name         label_name
0           9   

In [16]:
# Gradient-boosted trees on the fine-grained labels (label_at): reuse the
# cached run if the pickle exists, otherwise train, evaluate and cache it.
save_path = "/home/ubuntu/data/yg_ar/classic_models_hard_correctSIFT/gbt01_at.pkl"
(
    trained_gbt_at,
    predictions_gbt_at,
    accuracy_gbt_at,
    df_gbt_at,
    df_incorrect_gbt_at,
    df_correct_gbt_at,
    label_map_gbt_at,
) = load_or_train(
    train_x=train_x,
    train_y=train_y_at,
    test_x=test_x,
    test_y=test_y_at,
    train_func=train_gbt,
    label_map=label_map_at,
    path=save_path,
)
print(accuracy_gbt_at)
print(df_incorrect_gbt_at.head())

      Iter       Train Loss   Remaining Time 
         1           3.6171            4.34m
         2           3.5618            4.30m
         3           3.5142            4.23m
         4           3.4719            4.16m
         5           3.4380            4.12m
         6           3.4022            4.09m
         7           3.3711            4.06m
         8           3.3411            4.00m
         9           3.3134            3.95m
        10           3.2865            3.91m
        20           3.0674            3.46m
        30           2.8960            3.03m
        40           2.7538            2.60m
        50           2.6363            2.17m
        60           2.5291            1.74m
        70           2.4347            1.29m
        80           2.3514           51.56s
        90           2.2733           25.83s
0.1162037037037037
   prediction  label  check      prediction_name           label_name
0          14     13  False  lord_of_the_dance_3  lord_

In [15]:
# Show the first five test rows; their order is what the result frames follow.
test_df.head(n=5)

Unnamed: 0,image,label_a,label_at,file_name
504,"[-1132.5117214975426, -803.9562075164253, -8.1...",camel,camel_2,camel_2_hair_0_cloth_0_pants_2_Z1062_XOP13_YOP...
830,"[391.3672035585932, -389.4422474374182, 1857.5...",camel,camel_2,camel_2_hair_2_cloth_1_pants_1_Z853_XOP3_YON8_...
885,"[-1588.1975948477632, 493.1846922039985, 1061....",camel,camel_2,camel_2_hair_2_cloth_2_pants_1_Z875_XOP13_YON1...
824,"[563.4378149562966, -741.9392068701135, 1256.8...",camel,camel_2,camel_2_hair_2_cloth_1_pants_1_Z1063_XOP0_YON2...
665,"[50.84486329080433, -206.42929664656796, 1644....",camel,camel_2,camel_2_hair_1_cloth_1_pants_2_Z873_XOP3_YON7_...


In [16]:
# First five gradient-boosting results on label_at, for comparison with test_df.
df_gbt_at.head(n=5)

Unnamed: 0,prediction,label,check,prediction_name,label_name
0,1,1,True,camel_2,camel_2
1,11,1,False,childs_4,camel_2
2,1,1,True,camel_2,camel_2
3,3,1,False,camel_4,camel_2
4,38,1,False,warrior_II_3,camel_2
