In [1]:
import pandas as pd
from nebula.data.yg_ar.setup_data_image_hard import read_data
from nebula.common import to_scale_one, write_pickle, read_pickle
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
import os
import os.path as osp
import numpy as np

  warn(f"Failed to load image Python extension: {e}")


In [2]:
def create_label_map(labels):
    label_set = set()
    for lt in labels:
        label_set.add(lt)
        
    label_set = list(label_set)
    label_set.sort()

    label_map = {}
    count = 0
    for l in label_set:
        label_map[l] = count
        count += 1
        
    return label_map

In [3]:
df_path = "C:/Users/aphri/Documents/t0002/pycharm/data/yg_ar/image_easy2_ds_siftwkp_pca.pkl"
df, train_df, test_df, valid_df = read_pickle(df_path)

In [4]:
label_map_a = create_label_map(df["label_a"])
label_map_at = create_label_map(df["label_at"])

In [5]:
train_x = train_df["image"].to_list()

In [6]:
train_y_a = train_df["label_a"].map(label_map_a).to_list()
train_y_at = train_df["label_at"].map(label_map_at).to_list()

In [7]:
test_x = test_df["image"].to_list()

In [8]:
test_y_a = test_df["label_a"].map(label_map_a).to_list()
test_y_at = test_df["label_at"].map(label_map_at).to_list()

In [9]:
def train_svm(data_x, data_y):
    clf = svm.SVC(max_iter=50)
    clf.fit(data_x, data_y)
    return clf


def train_logistic(data_x, data_y):
    clf = LogisticRegression(random_state=0)
    clf.fit(data_x, data_y)
    return clf


def train_gbt(data_x, data_y):
    clf = GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.01,
        max_depth=3,
        random_state=0,
        verbose=1,
        n_iter_no_change=2,
    )
    clf.fit(data_x, data_y)
    return clf


def evaluate(model, test_x, test_y):
    res = model.predict(test_x)
    correct = res == test_y
    accuracy = correct.sum() / len(res)
    return res, accuracy


def create_dirs_to_file(path):
    dirs = "/".join(osp.join(path).split("/")[:-1])
    if not osp.exists(dirs):
        os.makedirs(dirs)


def load_or_train(train_x, train_y, test_x, test_y, train_func, label_map, path):
    
    if osp.exists(path):
        return read_pickle(path)
    
    create_dirs_to_file(path)
    
    trained_model = train_func(train_x, train_y)
    predictions, accuracy = evaluate(trained_model, test_x, test_y)
    
    df, df_incorrect, df_correct = format_results(predictions, test_y, label_map)
    
    write_pickle(path, (trained_model, predictions, accuracy, df, df_incorrect, df_correct, label_map)) 
    
    return trained_model, predictions, accuracy, df, df_incorrect, df_correct, label_map


def format_results(predictions, labels, label_map):
    df = pd.DataFrame(
        data={
            "prediction": predictions,
            "label": labels
        }
    )
    df["check"] = df["prediction"] == df["label"]

    label_map_reverse = {v:k for k, v in label_map.items()}

    df["prediction_name"] = df.prediction.map(label_map_reverse)
    df["label_name"] = df.label.map(label_map_reverse)

    df_incorrect = df[~df.check]
    df_correct = df[df.check]

    return df, df_incorrect, df_correct

In [11]:
save_path = "C:/Users/aphri/Documents/t0002/pycharm/data/yg_ar/classic_models_easy2_siftwkp_pca/svm_a.pkl"
(
    trained_svm_a, 
    predictions_svm_a, 
    accuracy_svm_a, 
    df_svm_a, 
    df_incorrect_svm_a, 
    df_correct_svm_a,
    label_map_svm_a
)= load_or_train(
    train_x, 
    train_y_a, 
    test_x, 
    test_y_a, 
    train_svm, 
    label_map_a, 
    save_path
)
print(accuracy_svm_a)
print(df_incorrect_svm_a.head())



0.37222222222222223
    prediction  label  check prediction_name label_name
0            2      0  False          childs      camel
2            8      0  False      warrior_II      camel
5            4      0  False           lotus      camel
7            7      0  False      upward_dog      camel
11           7      0  False      upward_dog      camel


In [12]:
save_path = "C:/Users/aphri/Documents/t0002/pycharm/data/yg_ar/classic_models_easy2_siftwkp_pca/svm_at.pkl"
(
    trained_svm_at, 
    predictions_svm_at, 
    accuracy_svm_at, 
    df_svm_at,
    df_incorrect_svm_at, 
    df_correct_svm_at,
    label_map_svm_at
)= load_or_train(
    train_x, 
    train_y_at, 
    test_x, 
    test_y_at, 
    train_svm, 
    label_map_at, 
    save_path
)
print(accuracy_svm_at)
print(df_incorrect_svm_at.head())



0.2671296296296296
    prediction  label  check prediction_name label_name
0           16      1  False         lotus_1    camel_2
1            0      1  False         camel_1    camel_2
5           31      1  False    upward_dog_4    camel_2
7           31      1  False    upward_dog_4    camel_2
11          19      1  False         lotus_4    camel_2


In [13]:
save_path = "C:/Users/aphri/Documents/t0002/pycharm/data/yg_ar/classic_models_easy2_siftwkp_pca/logistic_a.pkl"
(
    trained_logistic_a, 
    predictions_logistic_a, 
    accuracy_logistic_a, 
    df_logistic_a,
    df_incorrect_logistic_a, 
    df_correct_logistic_a,
    label_map_logistic_a
)= load_or_train(
    train_x, 
    train_y_a, 
    test_x, 
    test_y_a, 
    train_logistic, 
    label_map_a, 
    save_path
)
print(accuracy_logistic_a)
print(df_incorrect_logistic_a.head())

0.425462962962963
   prediction  label  check prediction_name label_name
0           2      0  False          childs      camel
3           7      0  False      upward_dog      camel
4           6      0  False        triangle      camel
5           2      0  False          childs      camel
7           7      0  False      upward_dog      camel


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
save_path = "C:/Users/aphri/Documents/t0002/pycharm/data/yg_ar/classic_models_easy2_siftwkp_pca/logistic_at.pkl"
(
    trained_logistic_at, 
    predictions_logistic_at, 
    accuracy_logistic_at, 
    df_logistic_at,
    df_incorrect_logistic_at, 
    df_correct_logistic_at,
    label_map_logistic_at
)= load_or_train(
    train_x, 
    train_y_at, 
    test_x, 
    test_y_at, 
    train_logistic, 
    label_map_at, 
    save_path
)
print(accuracy_logistic_at)
print(df_incorrect_logistic_at.head())

0.12407407407407407
   prediction  label  check prediction_name label_name
0          11      1  False        childs_4    camel_2
1           0      1  False         camel_1    camel_2
3          25      1  False      triangle_2    camel_2
4          25      1  False      triangle_2    camel_2
5           9      1  False        childs_2    camel_2


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
save_path = "C:/Users/aphri/Documents/t0002/pycharm/data/yg_ar/classic_models_easy2_siftwkp_pca/gbt_a.pkl"
(
    trained_gbt_a, 
    predictions_gbt_a, 
    accuracy_gbt_a, 
    df_gbt_a, 
    df_incorrect_gbt_a, 
    df_correct_gbt_a,
    label_map_gbt_a
)= load_or_train(
    train_x, 
    train_y_a, 
    test_x, 
    test_y_a, 
    train_gbt, 
    label_map_a, 
    save_path
)
print(accuracy_gbt_a)
print(df_incorrect_gbt_a.head())

      Iter       Train Loss   Remaining Time 
         1           2.2908           59.52m
         2           2.2794           60.50m
         3           2.2684           58.70m
         4           2.2576           56.01m
         5           2.2472           54.27m
         6           2.2370           52.93m
         7           2.2270           51.82m
         8           2.2173           50.78m
         9           2.2078           49.71m
        10           2.1982           48.86m
        20           2.1158           41.68m
        30           2.0476           35.76m
        40           1.9895           30.25m
        50           1.9392           25.19m
        60           1.8946           20.03m
        70           1.8555           15.18m
        80           1.8201           11.62m
        90           1.7876            6.05m
       100           1.7582            0.00s
0.4152777777777778
   prediction  label  check prediction_name label_name
0           4      0  Fal

In [16]:
save_path = "C:/Users/aphri/Documents/t0002/pycharm/data/yg_ar/classic_models_easy2_siftwkp_pca/gbt_at.pkl"
(
    trained_gbt_at, 
    predictions_gbt_at, 
    accuracy_gbt_at, 
    df_gbt_at, 
    df_incorrect_gbt_at, 
    df_correct_gbt_at,
    label_map_gbt_at
)= load_or_train(
    train_x, 
    train_y_at, 
    test_x, 
    test_y_at, 
    train_gbt, 
    label_map_at, 
    save_path
)
print(accuracy_gbt_at)
print(df_incorrect_gbt_at.head())

      Iter       Train Loss   Remaining Time 
         1           3.6681          320.16m
         2           3.6496          316.43m
         3           3.6330          312.32m
         4           3.6172          308.91m
         5           3.6026          305.71m
         6           3.5886          305.27m
         7           3.5754          301.11m
         8           3.5629          297.33m
         9           3.5505          290.06m
        10           3.5385          277.11m
        20           3.4347          243.31m
        30           3.3490          190.57m
        40           3.2753          158.54m
        50           3.2112          126.90m
        60           3.1529          100.44m
        70           3.0997           72.27m
        80           3.0508           49.09m
        90           3.0085           23.96m
       100           2.9699            0.00s
0.1550925925925926
   prediction  label  check prediction_name label_name
1          11      1  Fal