In [42]:
import pandas as pd
from nebula.data.yg_ar.setup_data_image_hard import read_data
from nebula.common import to_scale_one, write_pickle, read_pickle
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
import os
import os.path as osp
import numpy as np

In [4]:
def create_label_map(labels):
    label_set = set()
    for lt in labels:
        label_set.add(lt)

    label_map = {}
    count = 0
    for l in label_set:
        label_map[l] = count
        count += 1
        
    return label_map

In [15]:
df_path = "C:/Users/aphri/Documents/t0002/pycharm/data/yg_ar/image_hard_df.pkl"
random_seed = 1
df, train_df, test_df, valid_df = read_data(df_path, random_seed)

In [16]:
len(valid_df)

1760

In [29]:
label_map_a = create_label_map(df["label_a"])
label_map_at = create_label_map(df["label_at"])

In [30]:
label_map_a

{'lord_of_the_dance': 0,
 'warrior_II': 1,
 'lotus': 2,
 'chair': 3,
 'upward_dog': 4,
 'warrior_III': 5,
 'thunderbolt': 6,
 'childs': 7,
 'triangle': 8,
 'camel': 9}

In [31]:
label_map_at

{'upward_dog_3': 0,
 'lotus_2': 1,
 'camel_2': 2,
 'thunderbolt_1': 3,
 'triangle_4': 4,
 'thunderbolt_4': 5,
 'chair_3': 6,
 'triangle_1': 7,
 'thunderbolt_3': 8,
 'childs_2': 9,
 'warrior_III_3': 10,
 'lotus_4': 11,
 'camel_4': 12,
 'triangle_3': 13,
 'warrior_II_2': 14,
 'chair_4': 15,
 'lord_of_the_dance_2': 16,
 'warrior_II_1': 17,
 'camel_3': 18,
 'chair_1': 19,
 'warrior_III_2': 20,
 'warrior_III_4': 21,
 'childs_1': 22,
 'lord_of_the_dance_1': 23,
 'childs_4': 24,
 'chair_2': 25,
 'triangle_2': 26,
 'lotus_3': 27,
 'lotus_1': 28,
 'upward_dog_2': 29,
 'thunderbolt_2': 30,
 'upward_dog_1': 31,
 'upward_dog_4': 32,
 'childs_3': 33,
 'camel_1': 34,
 'lord_of_the_dance_3': 35,
 'warrior_II_4': 36,
 'warrior_III_1': 37,
 'lord_of_the_dance_4': 38,
 'warrior_II_3': 39}

In [32]:
train_x = train_df["image"].apply(lambda x: to_scale_one(x, scale=0.2).flatten()).to_list()

In [33]:
train_y_a = train_df["label_a"].map(label_map_a).to_list()
train_y_at = train_df["label_at"].map(label_map_at).to_list()

In [34]:
test_x = test_df["image"].apply(lambda x: to_scale_one(x, scale=0.2).flatten()).to_list()

In [35]:
test_y_a = test_df["label_a"].map(label_map_a).to_list()
test_y_at = test_df["label_at"].map(label_map_at).to_list()

In [36]:
def train_svm(data_x, data_y):
    clf = svm.SVC()
    clf.fit(data_x, data_y)
    return clf


def train_logistic(data_x, data_y):
    clf = LogisticRegression(random_state=0)
    clf.fit(data_x, data_y)
    return clf


def train_gbt_n10_m1(data_x, data_y):
    clf = GradientBoostingClassifier(
        n_estimators=10,
        learning_rate=1,
        max_depth=1,
        random_state=0
    )
    clf.fit(data_x, data_y)
    return clf


def evaluate(model, test_x, test_y):
    res = model.predict(test_x)
    correct = res == test_y
    accuracy = correct.sum() / len(res)
    return res, accuracy


def load_or_train(train_x, train_y, test_x, test_y, train_func, label_map, path):
    
    if osp.exists(path):
        return read_pickle(path)
    
    trained_model = train_func(train_x, train_y)
    predictions, accuracy = evaluate(trained_model, test_x, test_y)
    
    df, df_incorrect, df_correct = format_results(predictions, test_y, label_map)
    
    write_pickle(path, (trained_model, predictions, accuracy, df, df_incorrect, df_correct, label_map)) 
    
    return trained_model, predictions, accuracy, df, df_incorrect, df_correct, label_map


def format_results(predictions, labels, label_map):
    df = pd.DataFrame(
        data={
            "prediction": predictions,
            "label": labels
        }
    )
    df["check"] = df["prediction"] == df["label"]

    label_map_reverse = {v:k for k, v in label_map.items()}

    df["prediction_name"] = df.prediction.map(label_map_reverse)
    df["label_name"] = df.label.map(label_map_reverse)

    df_incorrect = df[~df.check]
    df_correct = df[df.check]

    return df, df_incorrect, df_correct

In [43]:
save_path = "C:/Users/aphri/Documents/t0002/pycharm/data/yg_ar/classic_models_hard/gbt_a.pkl"
(
    trained_gbt_a, 
    predictions_gbt_a, 
    accuracy_gbt_a, 
    df_gbt_a, 
    df_incorrect_gbt_a, 
    df_correct_gbt_a,
    label_map_gbt_a
)= load_or_train(
    train_x, 
    train_y_a, 
    test_x, 
    test_y_a, 
    train_gbt_n10_m1, 
    label_map_a, 
    save_path
)
print(accuracy_gbt_a)
print(df_incorrect_gbt_a.head())

0.14305555555555555
   prediction  label  check prediction_name  label_name
0           7      4  False          childs  upward_dog
2           6      4  False     thunderbolt  upward_dog
3           5      4  False     warrior_III  upward_dog
4           7      4  False          childs  upward_dog
5           6      4  False     thunderbolt  upward_dog


In [44]:
trained_gbt_a.feature_importances_.argmax()

51

In [45]:
np.argpartition(trained_gbt_a.feature_importances_, -10)[-10:]

array([733, 644, 850, 570, 805, 597, 479, 749, 762,  51], dtype=int64)

In [46]:
trained_gbt_a.feature_importances_[np.argpartition(trained_gbt_a.feature_importances_, -10)[-10:],]

array([0.01987777, 0.0199064 , 0.02024128, 0.02312961, 0.02066384,
       0.02147911, 0.02502272, 0.02827686, 0.03089271, 0.03573809])

In [48]:
pd.Series(trained_gbt_a.feature_importances_).describe(list(np.arange(0.9,1,0.01)) + [0.998] )

count    961.000000
mean       0.001041
std        0.003803
min        0.000000
50%        0.000000
90%        0.000000
91%        0.000000
92%        0.005919
93%        0.007710
94%        0.008206
95%        0.008920
96%        0.010007
97%        0.010935
98%        0.015920
99%        0.019875
99.8%      0.028486
max        0.035738
dtype: float64

In [50]:
save_path = "C:/Users/aphri/Documents/t0002/pycharm/data/yg_ar/classic_models_hard/gbt_at.pkl"
(
    trained_gbt_at, 
    predictions_gbt_at, 
    accuracy_gbt_at, 
    df_gbt_at, 
    df_incorrect_gbt_at, 
    df_correct_gbt_at,
    label_map_gbt_at
)= load_or_train(
    train_x, 
    train_y_at, 
    test_x, 
    test_y_at, 
    train_gbt_n10_m1, 
    label_map_at, 
    save_path
)
print(accuracy_gbt_at)
print(df_incorrect_gbt_at.head())

0.03194444444444444
   prediction  label  check prediction_name    label_name
0          28      0  False         lotus_1  upward_dog_3
1          28      0  False         lotus_1  upward_dog_3
2          28      0  False         lotus_1  upward_dog_3
3          28      0  False         lotus_1  upward_dog_3
4          28      0  False         lotus_1  upward_dog_3


In [51]:
save_path = "C:/Users/aphri/Documents/t0002/pycharm/data/yg_ar/classic_models_hard/svm_a.pkl"
(
    trained_svm_a, 
    predictions_svm_a, 
    accuracy_svm_a, 
    df_svm_a, 
    df_incorrect_svm_a, 
    df_correct_svm_a,
    label_map_svm_a
)= load_or_train(
    train_x, 
    train_y_a, 
    test_x, 
    test_y_a, 
    train_svm, 
    label_map_a, 
    save_path
)
print(accuracy_svm_a)
print(df_incorrect_svm_a.head())

0.1625
   prediction  label  check prediction_name  label_name
0           7      4  False          childs  upward_dog
1           8      4  False        triangle  upward_dog
2           9      4  False           camel  upward_dog
3           9      4  False           camel  upward_dog
4           3      4  False           chair  upward_dog


In [52]:
save_path = "C:/Users/aphri/Documents/t0002/pycharm/data/yg_ar/classic_models/svm_at.pkl"
(
    trained_svm_at, 
    predictions_svm_at, 
    accuracy_svm_at, 
    df_svm_at,
    df_incorrect_svm_at, 
    df_correct_svm_at,
    label_map_svm_at
)= load_or_train(
    train_x, 
    train_y_at, 
    test_x, 
    test_y_at, 
    train_svm, 
    label_map_at, 
    save_path
)
print(accuracy_svm_at)
print(df_incorrect_svm_at.head())

0.041666666666666664
   prediction  label  check prediction_name    label_name
0           9      0  False        childs_2  upward_dog_3
1          34      0  False         camel_1  upward_dog_3
2          30      0  False   thunderbolt_2  upward_dog_3
3          34      0  False         camel_1  upward_dog_3
4          12      0  False         camel_4  upward_dog_3


In [99]:
save_path = "C:/Users/aphri/Documents/t0002/pycharm/data/yg_ar/classic_models/logistic.pkl"
(
    trained_logistic, 
    predictions_logistic, 
    accuracy_logistic, 
    df_logistic,
    df_incorrect_logistic, 
    df_correct_logistic,
    label_map_logistic
)= load_or_train(
    train_x, 
    train_y, 
    test_x, 
    test_y, 
    train_logistic, 
    label_map, 
    save_path
)
print(accuracy_logistic)
print(df_incorrect_logistic.head())

1.0
Empty DataFrame
Columns: [prediction, label, check, prediction_name, label_name]
Index: []


In [114]:
trained_logistic.coef_[0].argmax()

663

In [120]:
for i in range(len(trained_logistic.coef_)):
    print(trained_logistic.coef_[i].argmax())

663
667
789
760
663
789
605
572
664
543


In [115]:
trained_logistic.coef_[0][663]

0.04047882481582151

In [100]:
save_path = "C:/Users/aphri/Documents/t0002/pycharm/data/yg_ar/classic_models/logistic_t.pkl"
(
    trained_logistic_t, 
    predictions_logistic_t, 
    accuracy_logistic_t, 
    df_logistic_t,
    df_incorrect_logistic_t, 
    df_correct_logistic_t,
    label_map_logistic_t
)= load_or_train(
    train_x, 
    train_y_t, 
    test_x, 
    test_y_t, 
    train_logistic, 
    label_t_map, 
    save_path
)
print(accuracy_logistic_t)
print(df_incorrect_logistic_t.head())

0.9980851063829788
      prediction  label  check prediction_name     label_name
2625           0      7  False      triangle_1     triangle_3
2740           0      7  False      triangle_1     triangle_3
4149          21     24  False   thunderbolt_3  thunderbolt_4
4297           5     12  False        childs_3       childs_1
4452           5     12  False        childs_3       childs_1


In [101]:
save_path = "C:/Users/aphri/Documents/t0002/pycharm/data/yg_ar/classic_models/logistic_th.pkl"
(
    trained_logistic_th, 
    predictions_logistic_th, 
    accuracy_logistic_th, 
    df_logistic_th,
    df_incorrect_logistic_th, 
    df_correct_logistic_th,
    label_map_logistic_th
)= load_or_train(
    train_x, 
    train_y_th, 
    test_x, 
    test_y_th, 
    train_logistic, 
    label_th_map, 
    save_path
)
print(accuracy_logistic_th)
print(df_incorrect_logistic_th.head())

0.9631914893617022
      prediction  label  check prediction_name label_name
779           92    151  False       chair_1_2  chair_3_2
1414         156     61  False       lotus_3_2  lotus_4_0
1422         116    130  False       lotus_4_2  lotus_4_1
1428          43    135  False       lotus_2_1  lotus_3_1
1436         135    156  False       lotus_3_1  lotus_3_2


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [102]:
save_path = "C:/Users/aphri/Documents/t0002/pycharm/data/yg_ar/classic_models/logistic_thc.pkl"
(
    trained_logistic_thc, 
    predictions_logistic_thc, 
    accuracy_logistic_thc, 
    df_logistic_thc,
    df_incorrect_logistic_thc, 
    df_correct_logistic_thc,
    label_map_logistic_thc
)= load_or_train(
    train_x, 
    train_y_thc, 
    test_x, 
    test_y_thc, 
    train_logistic, 
    label_thc_map, 
    save_path
)
print(accuracy_logistic_thc)
print(df_incorrect_logistic_thc.head())

0.9131914893617021
    prediction  label  check    prediction_name         label_name
16         254    582  False  warrior_III_2_0_2  warrior_III_2_0_3
51         335    132  False  warrior_III_1_1_2  warrior_III_1_1_3
52         396    117  False  warrior_III_2_2_2  warrior_III_2_2_3
53         638    380  False  warrior_III_3_0_3  warrior_III_3_1_3
56         335    132  False  warrior_III_1_1_2  warrior_III_1_1_3


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [103]:
save_path = "C:/Users/aphri/Documents/t0002/pycharm/data/yg_ar/classic_models/logistic_thcp.pkl"
(
    trained_logistic_thcp, 
    predictions_logistic_thcp, 
    accuracy_logistic_thcp, 
    df_logistic_thcp,
    df_incorrect_logistic_thcp, 
    df_correct_logistic_thcp,
    label_map_logistic_thcp
)= load_or_train(
    train_x, 
    train_y_thcp, 
    test_x, 
    test_y_thcp, 
    train_logistic, 
    label_thcp_map, 
    save_path
)
print(accuracy_logistic_thcp)
print(df_incorrect_logistic_thcp.head())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.5974468085106382
    prediction  label  check      prediction_name           label_name
0          836   1188  False  warrior_III_1_1_3_1  warrior_III_1_0_3_1
5         2409   1978  False  warrior_III_3_1_1_2  warrior_III_3_2_1_2
6         1551   1998  False  warrior_III_2_0_2_3  warrior_III_2_0_3_3
9         1998   1551  False  warrior_III_2_0_3_3  warrior_III_2_0_2_3
13        1825   2221  False  warrior_III_3_1_1_3  warrior_III_3_2_1_0
