In [1]:
import pandas as pd
from nebula.data.yg_ar.setup_data_image_hard import read_data
from nebula.common import to_scale_one, write_pickle, read_pickle
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
import os
import os.path as osp
import numpy as np

  warn(f"Failed to load image Python extension: {e}")


In [2]:
def create_label_map(labels):
    """Map every distinct label to a dense integer id.

    Ids are assigned in ascending sorted order of the distinct labels,
    starting at 0, so the mapping is deterministic for a given set of labels.

    Args:
        labels: Iterable of hashable, mutually orderable labels
            (duplicates are allowed and collapsed).

    Returns:
        dict: ``{label: id}`` where ``id`` is the label's position in the
        sorted list of distinct labels.
    """
    ordered_labels = sorted(set(labels))
    return {label: idx for idx, label in enumerate(ordered_labels)}

In [3]:
# Load the pre-built dataset pickle. The result is unpacked as a 4-tuple;
# presumably the full frame followed by its train / test / validation splits
# (inferred from the names only -- verify against whatever wrote this file).
# NOTE(review): absolute, machine-specific path; this cell fails on any other
# machine. Only load pickles from a trusted source.
df_path = "C:/Users/aphri/Documents/t0002/pycharm/data/yg_ar/image_easy2_ds_siftwkp_pca.pkl"
df, train_df, test_df, valid_df = read_pickle(df_path)

In [4]:
# Build the label -> integer-id maps from the full frame `df`, so the train and
# test targets below share one mapping per label column.
# In the recorded output, `label_a` values look like "camel" and `label_at`
# values look like "camel_2".
label_map_a = create_label_map(df["label_a"])
label_map_at = create_label_map(df["label_at"])

In [5]:
# Training features: one list entry per row of the "image" column.
# (The recorded `test_df.head()` output shows a numeric vector per row in that column.)
train_x = train_df["image"].to_list()

In [6]:
# Training targets as integer ids, one list per label column.
train_y_a = train_df["label_a"].map(label_map_a).to_list()
train_y_at = train_df["label_at"].map(label_map_at).to_list()

In [7]:
# Test features, taken from the same "image" column as the training features.
test_x = test_df["image"].to_list()

In [8]:
# Test targets as integer ids, using the same label maps as the training targets.
test_y_a = test_df["label_a"].map(label_map_a).to_list()
test_y_at = test_df["label_at"].map(label_map_at).to_list()

In [9]:
def train_svm(data_x, data_y, max_iter=50):
    """Fit a support-vector classifier with a capped number of solver iterations.

    All other ``SVC`` settings are left at the library defaults.

    Args:
        data_x: Training feature vectors, one per sample.
        data_y: Training targets, one per sample.
        max_iter: Hard limit on solver iterations passed to ``svm.SVC``.
            Defaults to 50, the value this notebook has always used; per the
            scikit-learn documentation ``-1`` means no limit. A low cap can stop
            the solver before convergence, so raise it when accuracy matters
            more than training time.

    Returns:
        The fitted ``svm.SVC`` instance.
    """
    clf = svm.SVC(max_iter=max_iter)
    clf.fit(data_x, data_y)
    return clf


def train_logistic(data_x, data_y, max_iter=100):
    """Fit a logistic-regression classifier with a fixed random seed.

    Args:
        data_x: Training feature vectors, one per sample.
        data_y: Training targets, one per sample.
        max_iter: Maximum number of solver iterations. Defaults to 100, which is
            scikit-learn's own default, so existing two-argument calls behave
            exactly as before. The recorded runs in this notebook hit the
            iteration limit ("TOTAL NO. of ITERATIONS REACHED LIMIT"); pass a
            larger value and/or scale the features to let the solver converge.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    clf = LogisticRegression(random_state=0, max_iter=max_iter)
    clf.fit(data_x, data_y)
    return clf


def train_gbt(data_x, data_y):
    """Fit a gradient-boosted tree classifier with this notebook's fixed settings.

    Args:
        data_x: Training feature vectors, one per sample.
        data_y: Training targets, one per sample.

    Returns:
        The fitted ``GradientBoostingClassifier`` instance.
    """
    hyper_params = {
        "n_estimators": 100,
        "learning_rate": 0.1,
        "max_depth": 3,
        "random_state": 0,
        # Print per-iteration training loss and remaining-time estimates.
        "verbose": 1,
        # Enables early stopping, so fewer than n_estimators stages may be fitted.
        "n_iter_no_change": 2,
    }
    booster = GradientBoostingClassifier(**hyper_params)
    booster.fit(data_x, data_y)
    return booster


def evaluate(model, test_x, test_y):
    """Predict on ``test_x`` and compute the fraction of correct predictions.

    Args:
        model: Any object exposing ``predict(test_x)``.
        test_x: Feature vectors to predict on.
        test_y: Expected targets, one per prediction (list, array or Series).

    Returns:
        tuple: ``(predictions, accuracy)`` where ``predictions`` is exactly what
        ``model.predict`` returned and ``accuracy`` is the share of predictions
        equal to the matching entry of ``test_y`` (NaN when there are no
        predictions).

    Raises:
        ValueError: If predictions and ``test_y`` do not have the same shape.
    """
    res = model.predict(test_x)

    # Compare array against array: comparing an ndarray with a plain list only
    # works element-wise by accident of equal lengths, and a mismatch otherwise
    # surfaces as an obscure error (or a scalar ``False``) further down.
    predicted = np.asarray(res)
    expected = np.asarray(test_y)
    if predicted.shape != expected.shape:
        raise ValueError(
            f"predictions have shape {predicted.shape} but labels have shape {expected.shape}"
        )

    if len(predicted) == 0:
        # Accuracy is undefined without samples; avoid a divide-by-zero warning.
        return res, float("nan")

    correct = predicted == expected
    accuracy = correct.sum() / len(predicted)
    return res, accuracy


def create_dirs_to_file(path):
    """Ensure the parent directory of the file at ``path`` exists.

    Only directories are created; the file itself is not touched.

    Args:
        path: Path of a file that is about to be written. May be a bare file
            name, in which case there is nothing to create.
    """
    # osp.dirname understands the platform's separators, unlike splitting on "/".
    parent = osp.dirname(path)
    if parent:
        # exist_ok avoids the check-then-create race and makes repeat calls no-ops.
        os.makedirs(parent, exist_ok=True)


def load_or_train(train_x, train_y, test_x, test_y, train_func, label_map, path):
    """Return cached results from ``path`` if present, else train, evaluate and cache.

    On a cache hit the pickled object is returned as-is and none of the other
    arguments are used, so a stale file silently wins over new data or a
    changed ``train_func``.

    Args:
        train_x: Training features handed to ``train_func``.
        train_y: Training targets handed to ``train_func``.
        test_x: Features used for evaluation.
        test_y: Expected targets used for evaluation.
        train_func: Callable ``(train_x, train_y) -> fitted model``.
        label_map: ``{label_name: id}`` map used to name predictions.
        path: Cache file location.

    Returns:
        tuple: ``(model, predictions, accuracy, df, df_incorrect, df_correct,
        label_map)`` on a cache miss; whatever ``read_pickle(path)`` yields on
        a cache hit.
    """
    # NOTE(review): presumably read_pickle wraps pickle -- only point this at trusted files.
    if osp.exists(path):
        return read_pickle(path)

    create_dirs_to_file(path)

    model = train_func(train_x, train_y)
    preds, acc = evaluate(model, test_x, test_y)
    frame_all, frame_wrong, frame_right = format_results(preds, test_y, label_map)

    # Build the result once so the cached payload and the return value cannot drift apart.
    results = (model, preds, acc, frame_all, frame_wrong, frame_right, label_map)
    write_pickle(path, results)
    return results


def format_results(predictions, labels, label_map):
    """Tabulate predictions against expected labels.

    Args:
        predictions: Predicted integer ids, one per sample.
        labels: Expected integer ids, aligned with ``predictions``.
        label_map: ``{label_name: id}`` map; inverted here to recover names.

    Returns:
        tuple: ``(df, df_incorrect, df_correct)``. ``df`` has the columns
        ``prediction``, ``label``, ``check`` (prediction equals label),
        ``prediction_name`` and ``label_name``; the other two frames are the
        rows of ``df`` where ``check`` is False / True respectively.
    """
    id_to_name = dict(zip(label_map.values(), label_map.keys()))

    df = pd.DataFrame({"prediction": predictions, "label": labels})
    df = df.assign(
        check=lambda frame: frame["prediction"] == frame["label"],
        prediction_name=lambda frame: frame["prediction"].map(id_to_name),
        label_name=lambda frame: frame["label"].map(id_to_name),
    )

    is_correct = df["check"]
    return df, df[~is_correct], df[is_correct]

In [11]:
# SVM on `label_a`: load the cached result tuple from save_path if it exists,
# otherwise train, evaluate on the test split and cache. Then print the test
# accuracy and the first misclassified rows.
# NOTE(review): absolute, machine-specific cache path.
save_path = "C:/Users/aphri/Documents/t0002/pycharm/data/yg_ar/classic_models_easy2_siftwkp_pca/svm_a.pkl"
(
    trained_svm_a, 
    predictions_svm_a, 
    accuracy_svm_a, 
    df_svm_a, 
    df_incorrect_svm_a, 
    df_correct_svm_a,
    label_map_svm_a
)= load_or_train(
    train_x, 
    train_y_a, 
    test_x, 
    test_y_a, 
    train_svm, 
    label_map_a, 
    save_path
)
print(accuracy_svm_a)
print(df_incorrect_svm_a.head())



0.37222222222222223
    prediction  label  check prediction_name label_name
0            2      0  False          childs      camel
2            8      0  False      warrior_II      camel
5            4      0  False           lotus      camel
7            7      0  False      upward_dog      camel
11           7      0  False      upward_dog      camel


In [12]:
# SVM on `label_at`: same load-or-train flow as for `label_a`, with its own
# cache file. Prints the test accuracy and the first misclassified rows.
# NOTE(review): absolute, machine-specific cache path.
save_path = "C:/Users/aphri/Documents/t0002/pycharm/data/yg_ar/classic_models_easy2_siftwkp_pca/svm_at.pkl"
(
    trained_svm_at, 
    predictions_svm_at, 
    accuracy_svm_at, 
    df_svm_at,
    df_incorrect_svm_at, 
    df_correct_svm_at,
    label_map_svm_at
)= load_or_train(
    train_x, 
    train_y_at, 
    test_x, 
    test_y_at, 
    train_svm, 
    label_map_at, 
    save_path
)
print(accuracy_svm_at)
print(df_incorrect_svm_at.head())



0.2671296296296296
    prediction  label  check prediction_name label_name
0           16      1  False         lotus_1    camel_2
1            0      1  False         camel_1    camel_2
5           31      1  False    upward_dog_4    camel_2
7           31      1  False    upward_dog_4    camel_2
11          19      1  False         lotus_4    camel_2


In [13]:
# Logistic regression on `label_a`: load the cached result tuple or train,
# evaluate and cache; then print the test accuracy and first misclassified rows.
# The recorded run below ends with a solver warning that the iteration limit
# was reached, so the reported accuracy comes from a non-converged fit.
# NOTE(review): absolute, machine-specific cache path.
save_path = "C:/Users/aphri/Documents/t0002/pycharm/data/yg_ar/classic_models_easy2_siftwkp_pca/logistic_a.pkl"
(
    trained_logistic_a, 
    predictions_logistic_a, 
    accuracy_logistic_a, 
    df_logistic_a,
    df_incorrect_logistic_a, 
    df_correct_logistic_a,
    label_map_logistic_a
)= load_or_train(
    train_x, 
    train_y_a, 
    test_x, 
    test_y_a, 
    train_logistic, 
    label_map_a, 
    save_path
)
print(accuracy_logistic_a)
print(df_incorrect_logistic_a.head())

0.425462962962963
   prediction  label  check prediction_name label_name
0           2      0  False          childs      camel
3           7      0  False      upward_dog      camel
4           6      0  False        triangle      camel
5           2      0  False          childs      camel
7           7      0  False      upward_dog      camel


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
# Logistic regression on `label_at`: same load-or-train flow with its own cache
# file; prints the test accuracy and the first misclassified rows.
# The recorded run below also ends with the iteration-limit solver warning.
# NOTE(review): absolute, machine-specific cache path.
save_path = "C:/Users/aphri/Documents/t0002/pycharm/data/yg_ar/classic_models_easy2_siftwkp_pca/logistic_at.pkl"
(
    trained_logistic_at, 
    predictions_logistic_at, 
    accuracy_logistic_at, 
    df_logistic_at,
    df_incorrect_logistic_at, 
    df_correct_logistic_at,
    label_map_logistic_at
)= load_or_train(
    train_x, 
    train_y_at, 
    test_x, 
    test_y_at, 
    train_logistic, 
    label_map_at, 
    save_path
)
print(accuracy_logistic_at)
print(df_incorrect_logistic_at.head())

0.12407407407407407
   prediction  label  check prediction_name label_name
0          11      1  False        childs_4    camel_2
1           0      1  False         camel_1    camel_2
3          25      1  False      triangle_2    camel_2
4          25      1  False      triangle_2    camel_2
5           9      1  False        childs_2    camel_2


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
# Gradient-boosted trees on `label_a`: load the cached result tuple or train,
# evaluate and cache; then print the test accuracy and first misclassified rows.
# Training is slow: the recorded log below starts at ~93 minutes remaining.
# NOTE(review): this cell's execution count is out of sequence with its position
# in the notebook, so the recorded output was not produced by a top-to-bottom run.
# NOTE(review): absolute, machine-specific cache path.
save_path = "C:/Users/aphri/Documents/t0002/pycharm/data/yg_ar/classic_models_easy2_siftwkp_pca/gbt01_a.pkl"
(
    trained_gbt_a, 
    predictions_gbt_a, 
    accuracy_gbt_a, 
    df_gbt_a, 
    df_incorrect_gbt_a, 
    df_correct_gbt_a,
    label_map_gbt_a
)= load_or_train(
    train_x, 
    train_y_a, 
    test_x, 
    test_y_a, 
    train_gbt, 
    label_map_a, 
    save_path
)
print(accuracy_gbt_a)
print(df_incorrect_gbt_a.head())

      Iter       Train Loss   Remaining Time 
         1           2.1900           92.97m
         2           2.1024           91.67m
         3           2.0323           89.88m
         4           1.9740           86.39m
         5           1.9239           83.81m
         6           1.8793           81.62m
         7           1.8386           80.22m
         8           1.8018           79.28m
         9           1.7671           80.91m
        10           1.7370           79.70m
        20           1.5243           67.66m
        30           1.3835           59.66m
        40           1.2842           50.07m
        50           1.2034           41.84m
        60           1.1391           33.35m
        70           1.0798           25.62m
        80           1.0296           16.57m
        90           0.9835            7.92m
       100           0.9432            0.00s
0.5916666666666667
   prediction  label  check    prediction_name label_name
0           2      0  

In [11]:
# Gradient-boosted trees on `label_at`: same load-or-train flow with its own
# cache file; prints the test accuracy and the first misclassified rows.
# The recorded training log below stops after iteration 40 of 100 -- presumably
# the early stopping configured in train_gbt (n_iter_no_change=2); confirm.
# NOTE(review): this cell's execution count is out of sequence with its position
# in the notebook, so the recorded output was not produced by a top-to-bottom run.
# NOTE(review): absolute, machine-specific cache path.
save_path = "C:/Users/aphri/Documents/t0002/pycharm/data/yg_ar/classic_models_easy2_siftwkp_pca/gbt01_at.pkl"
(
    trained_gbt_at, 
    predictions_gbt_at, 
    accuracy_gbt_at, 
    df_gbt_at, 
    df_incorrect_gbt_at, 
    df_correct_gbt_at,
    label_map_gbt_at
)= load_or_train(
    train_x, 
    train_y_at, 
    test_x, 
    test_y_at, 
    train_gbt, 
    label_map_at, 
    save_path
)
print(accuracy_gbt_at)
print(df_incorrect_gbt_at.head())

      Iter       Train Loss   Remaining Time 
         1           3.4985          195.96m
         2           3.3721          191.71m
         3           3.2722          189.01m
         4           3.1869          187.02m
         5           3.1105          184.67m
         6           3.0374          645.01m
         7           2.9728          566.29m
         8           2.9169          506.30m
         9           2.8693          459.78m
        10           2.8199          422.61m
        20           2.4611          262.56m
        30           2.2210          197.55m
        40           2.0287          153.63m
0.17962962962962964
   prediction  label  check      prediction_name label_name
1          11      1  False             childs_4    camel_2
3           3      1  False              camel_4    camel_2
4          38      1  False         warrior_II_3    camel_2
5          19      1  False              lotus_4    camel_2
6          13      1  False  lord_of_the_dance_2 

In [15]:
# Peek at the test split. Its index keeps the original row labels (not 0..n-1),
# which matters when lining rows up with the result frames.
test_df.head()

Unnamed: 0,image,label_a,label_at,file_name
504,"[-1132.5117214975426, -803.9562075164253, -8.1...",camel,camel_2,camel_2_hair_0_cloth_0_pants_2_Z1062_XOP13_YOP...
830,"[391.3672035585932, -389.4422474374182, 1857.5...",camel,camel_2,camel_2_hair_2_cloth_1_pants_1_Z853_XOP3_YON8_...
885,"[-1588.1975948477632, 493.1846922039985, 1061....",camel,camel_2,camel_2_hair_2_cloth_2_pants_1_Z875_XOP13_YON1...
824,"[563.4378149562966, -741.9392068701135, 1256.8...",camel,camel_2,camel_2_hair_2_cloth_1_pants_1_Z1063_XOP0_YON2...
665,"[50.84486329080433, -206.42929664656796, 1644....",camel,camel_2,camel_2_hair_1_cloth_1_pants_2_Z873_XOP3_YON7_...


In [16]:
# Peek at the GBT `label_at` results. format_results builds this frame from plain
# lists, so it is indexed 0..n-1 in test-split row order; match it to test_df by
# position, not by index label.
df_gbt_at.head()

Unnamed: 0,prediction,label,check,prediction_name,label_name
0,1,1,True,camel_2,camel_2
1,11,1,False,childs_4,camel_2
2,1,1,True,camel_2,camel_2
3,3,1,False,camel_4,camel_2
4,38,1,False,warrior_II_3,camel_2
