In [1]:
import pandas as pd
from nebula.data.yg_ar.setup_data_image_hard import read_data
from nebula.common import to_scale_one, write_pickle, read_pickle
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
import os
import os.path as osp
import numpy as np

  warn(f"Failed to load image Python extension: {e}")


In [2]:
def create_label_map(labels):
    label_set = set()
    for lt in labels:
        label_set.add(lt)

    label_map = {}
    count = 0
    for l in label_set:
        label_map[l] = count
        count += 1
        
    return label_map

In [3]:
df_path = "C:/Users/aphri/Documents/t0002/pycharm/data/yg_ar/image_medium_df.pkl"
random_seed = 1
df, train_df, test_df, valid_df = read_data(df_path, random_seed)

In [4]:
len(valid_df)

1760

In [5]:
label_map_a = create_label_map(df["label_a"])
label_map_at = create_label_map(df["label_at"])

In [6]:
label_map_a

{'chair': 0,
 'thunderbolt': 1,
 'lotus': 2,
 'upward_dog': 3,
 'childs': 4,
 'warrior_III': 5,
 'triangle': 6,
 'warrior_II': 7,
 'camel': 8,
 'lord_of_the_dance': 9}

In [7]:
label_map_at

{'childs_2': 0,
 'warrior_III_2': 1,
 'lord_of_the_dance_1': 2,
 'triangle_1': 3,
 'triangle_2': 4,
 'upward_dog_3': 5,
 'chair_4': 6,
 'triangle_4': 7,
 'childs_3': 8,
 'warrior_II_2': 9,
 'childs_4': 10,
 'lord_of_the_dance_3': 11,
 'childs_1': 12,
 'thunderbolt_3': 13,
 'thunderbolt_4': 14,
 'camel_2': 15,
 'warrior_III_4': 16,
 'camel_3': 17,
 'lotus_1': 18,
 'upward_dog_2': 19,
 'triangle_3': 20,
 'upward_dog_4': 21,
 'thunderbolt_2': 22,
 'chair_2': 23,
 'camel_4': 24,
 'warrior_II_4': 25,
 'warrior_III_3': 26,
 'warrior_II_1': 27,
 'lotus_3': 28,
 'lord_of_the_dance_2': 29,
 'chair_1': 30,
 'chair_3': 31,
 'lotus_4': 32,
 'upward_dog_1': 33,
 'warrior_III_1': 34,
 'warrior_II_3': 35,
 'camel_1': 36,
 'lotus_2': 37,
 'lord_of_the_dance_4': 38,
 'thunderbolt_1': 39}

In [8]:
train_x = train_df["image"].apply(lambda x: to_scale_one(x, scale=0.2).flatten()).to_list()

In [9]:
train_y_a = train_df["label_a"].map(label_map_a).to_list()
train_y_at = train_df["label_at"].map(label_map_at).to_list()

In [10]:
test_x = test_df["image"].apply(lambda x: to_scale_one(x, scale=0.2).flatten()).to_list()

In [11]:
test_y_a = test_df["label_a"].map(label_map_a).to_list()
test_y_at = test_df["label_at"].map(label_map_at).to_list()

In [12]:
def train_svm(data_x, data_y):
    clf = svm.SVC()
    clf.fit(data_x, data_y)
    return clf


def train_logistic(data_x, data_y):
    clf = LogisticRegression(random_state=0)
    clf.fit(data_x, data_y)
    return clf


def train_gbt_n10_m1(data_x, data_y):
    clf = GradientBoostingClassifier(
        n_estimators=10,
        learning_rate=1,
        max_depth=1,
        random_state=0
    )
    clf.fit(data_x, data_y)
    return clf


def evaluate(model, test_x, test_y):
    res = model.predict(test_x)
    correct = res == test_y
    accuracy = correct.sum() / len(res)
    return res, accuracy


def load_or_train(train_x, train_y, test_x, test_y, train_func, label_map, path):
    
    if osp.exists(path):
        return read_pickle(path)
    
    trained_model = train_func(train_x, train_y)
    predictions, accuracy = evaluate(trained_model, test_x, test_y)
    
    df, df_incorrect, df_correct = format_results(predictions, test_y, label_map)
    
    write_pickle(path, (trained_model, predictions, accuracy, df, df_incorrect, df_correct, label_map)) 
    
    return trained_model, predictions, accuracy, df, df_incorrect, df_correct, label_map


def format_results(predictions, labels, label_map):
    df = pd.DataFrame(
        data={
            "prediction": predictions,
            "label": labels
        }
    )
    df["check"] = df["prediction"] == df["label"]

    label_map_reverse = {v:k for k, v in label_map.items()}

    df["prediction_name"] = df.prediction.map(label_map_reverse)
    df["label_name"] = df.label.map(label_map_reverse)

    df_incorrect = df[~df.check]
    df_correct = df[df.check]

    return df, df_incorrect, df_correct

In [13]:
save_path = "C:/Users/aphri/Documents/t0002/pycharm/data/yg_ar/classic_models_medium/gbt_a.pkl"
(
    trained_gbt_a, 
    predictions_gbt_a, 
    accuracy_gbt_a, 
    df_gbt_a, 
    df_incorrect_gbt_a, 
    df_correct_gbt_a,
    label_map_gbt_a
)= load_or_train(
    train_x, 
    train_y_a, 
    test_x, 
    test_y_a, 
    train_gbt_n10_m1, 
    label_map_a, 
    save_path
)
print(accuracy_gbt_a)
print(df_incorrect_gbt_a.head())

0.21157407407407408
   prediction  label  check prediction_name label_name
0           5      4  False     warrior_III     childs
1           1      4  False     thunderbolt     childs
2           0      4  False           chair     childs
3           0      4  False           chair     childs
4           5      4  False     warrior_III     childs


In [14]:
trained_gbt_a.feature_importances_.argmax()

924

In [15]:
np.argpartition(trained_gbt_a.feature_importances_, -10)[-10:]

array([ 16, 814, 860, 783,  10, 926, 910, 795, 624, 924], dtype=int64)

In [16]:
trained_gbt_a.feature_importances_[np.argpartition(trained_gbt_a.feature_importances_, -10)[-10:],]

array([0.00350457, 0.00361017, 0.00386851, 0.00424821, 0.23092255,
       0.04050694, 0.00671047, 0.02636861, 0.00585017, 0.61658604])

In [17]:
pd.Series(trained_gbt_a.feature_importances_).describe(list(np.arange(0.9,1,0.01)) + [0.998] )

count    961.000000
mean       0.001041
std        0.021288
min        0.000000
50%        0.000000
90%        0.000000
91%        0.000000
92%        0.000000
93%        0.000000
94%        0.000000
95%        0.000000
96%        0.001147
97%        0.001584
98%        0.001782
99%        0.003395
99.8%      0.055740
max        0.616586
dtype: float64

In [18]:
save_path = "C:/Users/aphri/Documents/t0002/pycharm/data/yg_ar/classic_models_medium/gbt_at.pkl"
(
    trained_gbt_at, 
    predictions_gbt_at, 
    accuracy_gbt_at, 
    df_gbt_at, 
    df_incorrect_gbt_at, 
    df_correct_gbt_at,
    label_map_gbt_at
)= load_or_train(
    train_x, 
    train_y_at, 
    test_x, 
    test_y_at, 
    train_gbt_n10_m1, 
    label_map_at, 
    save_path
)
print(accuracy_gbt_at)
print(df_incorrect_gbt_at.head())

0.05277777777777778
   prediction  label  check prediction_name label_name
0          13      0  False   thunderbolt_3   childs_2
1          39      0  False   thunderbolt_1   childs_2
2          28      0  False         lotus_3   childs_2
3          24      0  False         camel_4   childs_2
4          23      0  False         chair_2   childs_2


In [19]:
save_path = "C:/Users/aphri/Documents/t0002/pycharm/data/yg_ar/classic_models_medium/svm_a.pkl"
(
    trained_svm_a, 
    predictions_svm_a, 
    accuracy_svm_a, 
    df_svm_a, 
    df_incorrect_svm_a, 
    df_correct_svm_a,
    label_map_svm_a
)= load_or_train(
    train_x, 
    train_y_a, 
    test_x, 
    test_y_a, 
    train_svm, 
    label_map_a, 
    save_path
)
print(accuracy_svm_a)
print(df_incorrect_svm_a.head())

0.3925925925925926
   prediction  label  check prediction_name label_name
2           6      4  False        triangle     childs
5           2      4  False           lotus     childs
7           6      4  False        triangle     childs
8           8      4  False           camel     childs
9           3      4  False      upward_dog     childs


In [20]:
save_path = "C:/Users/aphri/Documents/t0002/pycharm/data/yg_ar/classic_models_medium/svm_at.pkl"
(
    trained_svm_at, 
    predictions_svm_at, 
    accuracy_svm_at, 
    df_svm_at,
    df_incorrect_svm_at, 
    df_correct_svm_at,
    label_map_svm_at
)= load_or_train(
    train_x, 
    train_y_at, 
    test_x, 
    test_y_at, 
    train_svm, 
    label_map_at, 
    save_path
)
print(accuracy_svm_at)
print(df_incorrect_svm_at.head())

0.08842592592592592
   prediction  label  check prediction_name label_name
0          32      0  False         lotus_4   childs_2
1          28      0  False         lotus_3   childs_2
2           8      0  False        childs_3   childs_2
3           6      0  False         chair_4   childs_2
5          14      0  False   thunderbolt_4   childs_2


In [21]:
save_path = "C:/Users/aphri/Documents/t0002/pycharm/data/yg_ar/classic_models_medium/logistic_a.pkl"
(
    trained_logistic_a, 
    predictions_logistic_a, 
    accuracy_logistic_a, 
    df_logistic_a,
    df_incorrect_logistic_a, 
    df_correct_logistic_a,
    label_map_logistic_a
)= load_or_train(
    train_x, 
    train_y_a, 
    test_x, 
    test_y_a, 
    train_logistic, 
    label_map_a, 
    save_path
)
print(accuracy_logistic_a)
print(df_incorrect_logistic_a.head())

0.2111111111111111
   prediction  label  check prediction_name label_name
1           2      4  False           lotus     childs
2           6      4  False        triangle     childs
5           0      4  False           chair     childs
7           2      4  False           lotus     childs
8           5      4  False     warrior_III     childs


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [23]:
trained_logistic_a.coef_[0].argmax()

558

In [26]:
for i in range(len(trained_logistic_a.coef_)):
    print(trained_logistic_a.coef_[i].argmax())

558
924
749
403
341
558
785
589
309
10


In [27]:
trained_logistic_a.coef_[0][663]

0.00030255454799611145

In [28]:
save_path = "C:/Users/aphri/Documents/t0002/pycharm/data/yg_ar/classic_models_medium/logistic_at.pkl"
(
    trained_logistic_at, 
    predictions_logistic_at, 
    accuracy_logistic_at, 
    df_logistic_at,
    df_incorrect_logistic_at, 
    df_correct_logistic_at,
    label_map_logistic_at
)= load_or_train(
    train_x, 
    train_y_at, 
    test_x, 
    test_y_at, 
    train_logistic, 
    label_map_at, 
    save_path
)
print(accuracy_logistic_at)
print(df_incorrect_logistic_at.head())

0.059722222222222225
   prediction  label  check prediction_name label_name
0          12      0  False        childs_1   childs_2
1           5      0  False    upward_dog_3   childs_2
2          12      0  False        childs_1   childs_2
3          21      0  False    upward_dog_4   childs_2
5          31      0  False         chair_3   childs_2


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
