In [1]:
import pandas
import os
import numpy as np
import torch
import gates_models as gm
import pickle
import utils as ut
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import pairwise_distances
from sklearn.linear_model import LogisticRegression

In [16]:
# ---- Run configuration ----------------------------------------------------
# round_suffix picks both the top-level 'models_<suffix>' folder and the
# dataset directory inside it; any value other than 'r6' falls back to the
# round-5 training set.
round_suffix = 'r5'
subset = 'all'  # passed on as the 'subset' entry of the CDRP parameter dicts
path = f'models_{round_suffix}'
dataset_dir = 'round6-train-dataset' if round_suffix == 'r6' else 'round5-train-dataset'
main_path = os.path.join(path, dataset_dir)
models_path = os.path.join(main_path, 'models')
metadata_file = 'METADATA.csv'

# ---- Hardware / metadata --------------------------------------------------
# Probe CUDA once and reuse the answer for both the device and the AMP flag.
cuda_available = torch.cuda.is_available()
device = torch.device('cuda' if cuda_available else 'cpu')
df = pandas.read_csv(os.path.join(main_path, metadata_file))
torch.backends.cudnn.enabled = False
use_amp = cuda_available  # attempt to use mixed precision to accelerate embedding conversion process
# ut.write_embeddings_on_file(df, main_path, models_path, round_suffix=round_suffix, input_type='clean')

In [None]:
reg_types = ['l1']
threshold = (0.95, 0.7) # (1, 0.7)
gg = 'all' # 'per_sample'


def _cdrp_params(gate_type, reg_type):
    """Return the CDRP parameter dict for one gate type / regulariser pair.

    Only 'gate_type' and 'reg_type' vary between calls; every other entry is
    either a fixed literal or read from the notebook-level `threshold`, `gg`
    and `subset` at call time.
    """
    return {'threshold':threshold, 'start':0.05, 'iter':50, 'lr':0.1, 'eps':1e-2, 'gate_type':gate_type, 'reg_type':reg_type, 'gate_granularity':gg, 'subset':subset}


for reg_type in reg_types:
    print(f'Reg Type: {reg_type}')

    # Hidden gates: this run also supplies the class indices, model labels
    # and trigger targets that are stored alongside the gate results.
    cdrp_hgates_all_params = _cdrp_params('hidden', reg_type)
    hidden_results = gm.apply_cdrp_on_all_models(df, main_path, models_path, cdrp_hgates_all_params, round_suffix, use_amp, device)
    hgates_all, hgates_all_accs, hgates_all_gammas, class_indices, model_labels, trigger_targets = hidden_results

    # Input gates: only the first three returned values are kept.
    cdrp_igates_all_params = _cdrp_params('input', reg_type)
    input_results = gm.apply_cdrp_on_all_models(df, main_path, models_path, cdrp_igates_all_params, round_suffix, use_amp, device)
    igates_all, igates_all_accs, igates_all_gammas, _, _, _ = input_results

    # Tuple layout written to disk (positions matter to readers of the file):
    #   0-2: hidden gates, accs, gammas   3-5: input gates, accs, gammas
    #   6: class_indices   7: model_labels   8: trigger_targets
    payload = (
        hgates_all, hgates_all_accs, hgates_all_gammas,
        igates_all, igates_all_accs, igates_all_gammas,
        class_indices, model_labels, trigger_targets,
    )
    out_name = f'{round_suffix}_{gg}_{reg_type}_data.pickle'
    with open(out_name, 'wb') as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [3]:
# Reload the results tuple that the CDRP cell pickled for the 'l1'
# regulariser. The file name depends on `round_suffix` from the configuration
# cell, so that cell must have been run first.
gg = 'all'

l1_pickle_name = f'{round_suffix}_{gg}_l1_data.pickle'
with open(l1_pickle_name, 'rb') as handle:
    all_l1_data = pickle.load(handle)

In [11]:
def _stack_entry1_of_first_three(per_model_accs, n_models):
    """Return an (n_models, 3) array built from `per_model_accs`.

    Row `idx` holds element [1] of each of the first three entries of
    `per_model_accs[idx]`.
    NOTE(review): the variable names suggest these are confidence values, but
    the meaning of the nested structure is defined by
    gm.apply_cdrp_on_all_models - confirm there.
    """
    return np.array([[per_model_accs[idx][k][1] for k in range(3)] for idx in range(n_models)])


# Positions in the pickled tuple: 1 = hidden-gate accs, 4 = input-gate accs,
# 7 = model labels.
model_labels = all_l1_data[7]
hconfs_l1_data = _stack_entry1_of_first_three(all_l1_data[1], len(model_labels))
iconfs_l1_data = _stack_entry1_of_first_three(all_l1_data[4], len(model_labels))

# One row per model: three hidden-gate columns followed by three input-gate columns.
data = np.hstack((hconfs_l1_data, iconfs_l1_data))
print(data.shape)

(48, 6)


In [12]:
# Disabled experiment, kept for reference: held-out split + grid-searched
# gradient boosting.
# X_train, X_test, y_train, y_test = train_test_split(data, model_labels, stratify=model_labels)
# scoring = make_scorer(accuracy_score)
# parameters = {'learning_rate': [0.15,0.1,0.05,0.01,0.005,0.001],  'n_estimators': [100,250,500,750,1000,1250,1500], 'max_depth': [3,5,7]}
# clf = GridSearchCV(GradientBoostingClassifier(), parameters, scoring=scoring, refit=True, cv=2, n_jobs=-1).fit(X_train, y_train)
# print(f'Acc: {clf.score(X_test, y_test):.2f} - AUC: {roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]):.2f}')

# Active model: L2-penalised logistic regression fitted on every row of
# `data` (no hold-out), then written to disk.
clf = LogisticRegression(penalty='l2', C=100)
clf.fit(data, model_labels)

with open('clf.pickle', 'wb') as clf_file:
    pickle.dump(clf, clf_file, protocol=pickle.HIGHEST_PROTOCOL)

In [13]:
# Round-trip check: reload the persisted classifier and score it.
with open('clf.pickle', 'rb') as clf_file:
    clf = pickle.load(clf_file)

# NOTE: `data` / `model_labels` are the same arrays the classifier was fitted
# on earlier in this notebook, so these numbers are not held-out estimates.
train_acc = clf.score(data, model_labels)
class1_proba = clf.predict_proba(data)[:, 1]
train_auc = roc_auc_score(model_labels, class1_proba)
print(f'Acc: {train_acc:.2f} - AUC: {train_auc:.2f}')

Acc: 0.73 - AUC: 0.80
