In [1]:
import numpy as np
from scipy.io import arff
import pandas as pd
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn import metrics
import scipy as sp
import logging
from sklearn.neighbors import NearestNeighbors
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
    
def run_lof(X, y, k=60):
    clf = LocalOutlierFactor(n_neighbors=k)
    clf.fit(X)
    lof_scores = -clf.negative_outlier_factor_
    return lof_scores

def get_predictions(scores, num_outliers = 400, method_name = 'LOF'):
    threshold = np.sort(scores)[::-1][num_outliers]
    # threshold, max_f1 = get_best_f1_score(y, lof_scores)
    predictions = np.array(scores > threshold)
    predictions = np.array([int(i) for i in predictions])
#     print('F1 for {} : {}'.format(method_name, metrics.f1_score(y, predictions)))
    return predictions, scores, metrics.f1_score(y, predictions)

def get_best_F1(scores):
    best_f1 = 0
    for i in range(np.shape(scores)[0]):
        threshold = np.sort(scores)[::-1][i]
        predictions = np.array(scores > threshold)
        predictions = np.array([int(i) for i in predictions])
        cur_f1 = metrics.f1_score(y, predictions)
        best_f1 = max(cur_f1, best_f1)
    return best_f1

def run_knn(X, y, k=60):
    neigh = NearestNeighbors(n_neighbors=k)
    neigh.fit(X)
    knn_dists = neigh.kneighbors(X)[0][:,-1]
    return knn_dists

def run_isolation_forest(X, y, max_features = 1.0):
    # training the model
    clf = IsolationForest(random_state=42,max_features=max_features)
    clf.fit(X)
    # predictions
    sklearn_score_anomalies = clf.decision_function(X)
    if_scores = [-1*s + 0.5 for s in sklearn_score_anomalies]
    return if_scores

def mahalanobis(x):
    """Compute the Mahalanobis Distance between each row of x and the data
    """
    x_minus_mu = x - np.mean(x)
    cov = np.cov(x.T)
    inv_covmat = sp.linalg.inv(cov)
    results = []
    x_minus_mu = np.array(x_minus_mu)
    for i in range(np.shape(x)[0]):
        cur_data = x_minus_mu[i,:]
        results.append(np.dot(np.dot(x_minus_mu[i,:], inv_covmat), x_minus_mu[i,:].T))
    return np.array(results)
#     left_term = np.dot(x_minus_mu, inv_covmat)
#     mahal = np.dot(left_term, x_minus_mu.T)
#     print(mahal.diagonal())
#     return mahal.diagonal()

def run_mahalanobis(X, y):
    # training the model
    dist = mahalanobis(x=X)
    return dist

def load_dataset(filename):
    with open(filename, 'r') as f:
        data, meta = arff.loadarff(f)
    data = pd.DataFrame(data)
    X = data.drop(columns=['id', 'outlier'])
    # Map dataframe to encode values and put values into a numpy array
    y = data["outlier"].map(lambda x: 1 if x == b'yes' else 0).values
    return X, y

### Load SpamBase dataset

In [2]:
filename = './SpamBase_withoutdupl_norm_40.arff'
X, y = load_dataset(filename=filename)
print(np.shape(X), np.shape(y))
K = 9
N = 1679
class_balance = [1- N/4207.0, N/4207.0]
lof_krange = list(range(10,110,10)) * 6
knn_krange = list(range(10,110,10)) * 6
if_range = [0.5, 0.6, 0.7, 0.8, 0.9] * 6
# mahalanobis_N_range=[N]
mahalanobis_N_range = [1400, 1500, 1600, 1700, 1800, 1900]
if_N_range = np.sort(mahalanobis_N_range * 5)
N_range = np.sort(mahalanobis_N_range * 10) 
print(N/len(y))

(4207, 57) (4207,)
0.39909674352270025


In [3]:
all_results = []
all_scores = []
f1s = []

temp_lof_results = dict()
unique_lof_ks = list(set(lof_krange)) 
for k in unique_lof_ks:
#     print(k)
    lof_scores = run_lof(X, y, k=k)
    temp_lof_results[k] = lof_scores
for i in range(len(lof_krange)):
    lof_predictions, lof_scores, f1 = get_predictions(temp_lof_results[lof_krange[i]], num_outliers=N_range[i], method_name='LOF')
    all_results.append(lof_predictions)
    all_scores.append(lof_scores)
    f1s.append(f1)
best_lof_f1 = 0
for i in np.sort(unique_lof_ks):
    temp_f1 = max(np.array(f1s[0:60])[np.where(np.array(lof_krange) == i)[0]])
    print('LOF k = {}, best F-1 = {}'.format(i, temp_f1))
    best_lof_f1 = max(best_lof_f1, temp_f1)
print('Best LOF F-1 = {}'.format(best_lof_f1))

temp_knn_results = dict()
unique_knn_ks = list(set(knn_krange)) 
for k in unique_knn_ks:
    print(k)
    knn_scores = run_knn(X, y, k=k)
    temp_knn_results[k] = knn_scores
for i in range(len(knn_krange)):
    knn_predictions, knn_scores,f1 = get_predictions(temp_knn_results[knn_krange[i]], num_outliers=N_range[i], method_name='KNN')
    all_results.append(knn_predictions)
    all_scores.append(knn_scores)
    f1s.append(f1)
best_knn_f1 = 0
for i in np.sort(unique_knn_ks):
    temp_f1 = max(np.array(f1s[60:120])[np.where(np.array(knn_krange) == i)[0]])
    print('KNN k = {}, best F-1 = {}'.format(i, temp_f1))
    best_knn_f1 = max(best_knn_f1, temp_f1)
print('Best KNN F-1 = {}'.format(best_knn_f1))
    
temp_if_results = dict()
unique_if_features = list(set(if_range)) 
for k in unique_if_features:
    print(k)
    if_scores = run_isolation_forest(X, y, max_features=k)
    temp_if_results[k] = if_scores
for i in range(len(if_range)):
    if_predictions, if_scores,f1 = get_predictions(temp_if_results[if_range[i]], num_outliers=N_range[i], method_name='IF')
    all_results.append(if_predictions)
    all_scores.append(if_scores)
    f1s.append(f1)
best_if_f1 = 0
for i in np.sort(unique_if_features):
    temp_f1 = max(np.array(f1s[120:150])[np.where(np.array(if_range) == i)[0]])
    print('IF = {}, best F-1 = {}'.format(i, temp_f1))
    best_if_f1 = max(best_if_f1, temp_f1)
print('Best IF F-1 = {}'.format(best_if_f1))
    
mahalanobis_scores = run_mahalanobis(X, y)
best_mahala_f1 = 0
for i in range(len(mahalanobis_N_range)):
    mahalanobis_predictions,mahalanobis_scores,f1 = get_predictions(mahalanobis_scores, num_outliers=mahalanobis_N_range[i], method_name='mahala')
    all_results.append(mahalanobis_predictions)
    all_scores.append(mahalanobis_scores)
    best_mahala_f1 = max(best_mahala_f1, f1)
    f1s.append(f1)
print('mahalanobis = {}'.format(max(np.array(f1s[150:]))))
print('Best Mahala F-1 = {}'.format(best_mahala_f1))
L = np.stack(all_results).T
scores = np.stack(all_scores).T

LOF k = 10, best F-1 = 0.3956412405699916
LOF k = 20, best F-1 = 0.38502374965074043
LOF k = 30, best F-1 = 0.3576417993853031
LOF k = 40, best F-1 = 0.339759709416038
LOF k = 50, best F-1 = 0.34143615535065663
LOF k = 60, best F-1 = 0.34367141659681477
LOF k = 70, best F-1 = 0.34199497066219614
LOF k = 80, best F-1 = 0.34478904721989384
LOF k = 90, best F-1 = 0.3526124615814474
LOF k = 100, best F-1 = 0.3615535065660799
Best LOF F-1 = 0.3956412405699916
100
70
40
10
80
50
20
90
60
30
KNN k = 10, best F-1 = 0.4582285554624197
KNN k = 20, best F-1 = 0.4723309111235327
KNN k = 30, best F-1 = 0.47219893825090814
KNN k = 40, best F-1 = 0.46940486169321044
KNN k = 50, best F-1 = 0.4699636770047499
KNN k = 60, best F-1 = 0.4716401229393685
KNN k = 70, best F-1 = 0.4738753841855267
KNN k = 80, best F-1 = 0.47499301480860584
KNN k = 90, best F-1 = 0.4738753841855267
KNN k = 100, best F-1 = 0.4744341994970662
Best KNN F-1 = 0.47499301480860584
0.5
0.6
0.8
0.9
0.7
IF = 0.5, best F-1 = 0.49222323

In [4]:
print(np.shape(L))
print(np.shape(scores))

(4207, 156)
(4207, 156)


### Majority Vote

In [5]:
mid = np.shape(L)[1]/2
predictions = np.full((len(y)), 0)
predictions[np.sum(L, axis = 1) > mid] = 1
print('F1 for MV:', metrics.f1_score(y, predictions))

F1 for MV: 0.40124223602484466


### Save and load pickles

In [6]:
L_prev = L
scores_prev = scores

In [7]:
L = L_prev
scores = scores_prev

In [8]:
# print(max(f1s)) 
print(np.shape(L))

(4207, 156)


In [9]:
prediction_result_list = []
classifier_result_list = []
prediction_list = []
cur_f1_scores = []
prediction_high_conf_outliers = np.array([])
prediction_high_conf_inliers = np.array([])
prediction_classifier_disagree = np.array([])

In [10]:
index_range = np.array([[0, 60], [60, 120], [120, 150], [150, 156]])
coef_index_range = np.array([[0, 10], [10, 20], [20, 25], [25, 26]])
coef_remain_index = range(156)

In [11]:
scores_for_training_indexes = []
for i in range(len(index_range)):
    start=index_range[i][0]
    temp_range = coef_index_range[i][1]-coef_index_range[i][0]
    scores_for_training_indexes  = scores_for_training_indexes + list(range(start, start+temp_range))
print(scores_for_training_indexes) 
scores_for_training = scores[:, np.array(scores_for_training_indexes)]
print(np.shape(scores_for_training))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 120, 121, 122, 123, 124, 150]
(4207, 26)


### Iterative train LR and classifier(SVM)

In [12]:
# stable version
high_confidence_threshold = 0.9
low_confidence_threshold = 0.1
max_iter = 200
remain_params_tracking = np.array(range(0,np.max(coef_index_range)))
training_data_F1 = []
two_prediction_corr = []

min_max_diff = []
N_size = 6

last_training_data_indexes = []
counter = 0

for i_range in range(0, 50):
    print("##################################################################")
    print('Iteration = {}, L shape = {}'.format(i_range, np.shape(L)))
    num_methods = np.shape(L)[1]

    agree_outlier_indexes = np.sum(L,axis=1)==np.shape(L)[1]
    print('All agree, Number of outliers = {}'.format(sum(agree_outlier_indexes)))
    agree_inlier_indexes = np.sum(L,axis=1)==0
    print('All agree, Number of inliers = {}'.format(sum(agree_inlier_indexes)))

    disagree_indexes = np.where(np.logical_or(np.sum(L,axis = 1)==0, np.sum(L,axis = 1)==num_methods)==0)[0]
    # print('Number of disagreed points = {}'.format(len(disagree_indexes)))
    # print('Number of disagreed points (true outliers) = {}'.format(sum(y[disagree_indexes] == 1)))
    # print('Number of disagreed points (true inliers) = {}'.format(sum(y[disagree_indexes] == 0)))

#     all_inlier_indexes = np.where(agree_inlier_indexes)[0]
    all_inlier_indexes = np.setdiff1d(np.where(agree_inlier_indexes)[0], prediction_high_conf_outliers)
    if len(prediction_high_conf_inliers) >0:
        all_inlier_indexes = np.intersect1d(np.setdiff1d(np.where(agree_inlier_indexes)[0], prediction_high_conf_outliers), prediction_high_conf_inliers)
    print('num of inliers = {}'.format(np.shape(all_inlier_indexes)[0]))
#     all_outlier_indexes = np.union1d(np.where(agree_outlier_indexes)[0], self_agree_index_list)

#     if(len(np.intersect1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)) > 0 and
#       (len(np.union1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)) > 2000)):
#         all_outlier_indexes = np.intersect1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)
#     else:
    all_outlier_indexes = np.union1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)
#     if(len(all_outlier_indexes) > 1000):
#         all_outlier_indexes = np.random.RandomState(1).permutation(all_outlier_indexes)[:1000]
        
#     all_outlier_indexes = np.union1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)
    print('num of outliers = {}'.format(np.shape(all_outlier_indexes)[0]))
    all_inlier_indexes = np.setdiff1d(all_inlier_indexes, prediction_classifier_disagree)
    
    self_agree_index_list = []
    if((len(all_outlier_indexes) == 0) or (len(all_inlier_indexes)/ len(all_outlier_indexes) > 1000)):
        for i in range(0, len(index_range)):
            if(index_range[i,1]-index_range[i,0] <= 6):
                continue
            temp_index = disagree_indexes[np.where(np.sum(L[disagree_indexes][:,index_range[i,0]: index_range[i,1]], axis = 1)==(index_range[i,1]-index_range[i,0]))[0]]
            self_agree_index_list = np.union1d(self_agree_index_list, temp_index)
        self_agree_index_list = [int(i) for i in self_agree_index_list]
#     self_agree_index_list = np.random.RandomState(1).permutation(self_agree_index_list)[:500]
    all_outlier_indexes = np.union1d(all_outlier_indexes, self_agree_index_list)
    all_outlier_indexes = np.setdiff1d(all_outlier_indexes, prediction_classifier_disagree)
    print('num of outliers = {}'.format(np.shape(all_outlier_indexes)[0]))
    
    
    from sklearn.preprocessing import RobustScaler
    from sklearn.preprocessing import StandardScaler
    data_indexes = np.concatenate((all_inlier_indexes, all_outlier_indexes), axis = 0)
    data_indexes = np.array([int(i) for i in data_indexes])
    labels = np.concatenate((np.zeros(len(all_inlier_indexes)), np.ones(len(all_outlier_indexes))), axis = 0)
    transformer = RobustScaler().fit(scores_for_training)
    scores_transformed = transformer.transform(scores_for_training)
    training_data = scores_transformed[data_indexes]
    print('Training data shape: ', np.shape(training_data))
    training_data_F1.append(metrics.f1_score(y[data_indexes], labels))
    print('Training data F-1', metrics.f1_score(y[data_indexes], labels))

    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC
#     clf = SVC(gamma='auto', probability=True, random_state=0)
#     clf.fit(training_data, labels)
    clf = LogisticRegression(random_state=0, penalty='l2', max_iter=max_iter).fit(training_data, labels) 
    clf_predictions = clf.predict(scores_transformed)
    clf_predict_proba = clf.predict_proba(scores_transformed)[:,1]
    print("F-1 score from LR:",metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba > 0.5])))
    
    
    transformer = RobustScaler().fit(X)
    X_transformed = transformer.transform(X)
    X_training_data = X_transformed[data_indexes]
    print(np.shape(X_training_data))
    print(np.shape(labels))

    clf_X = SVC(gamma='auto', probability=True, random_state=0)
    clf_X.fit(X_training_data, labels)
    clf_predictions_X = clf_X.predict(X_transformed)
    clf_predict_proba_X = clf_X.predict_proba(X_transformed)[:,1]
    print("F-1 score from SVM:",metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba_X > 0.5])))
#     cur_f1_scores.append(metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba_X > 0.5])))
    
    agreed_outlier_indexes = np.where(np.sum(L,axis=1)==np.shape(L)[1])[0]
    agreed_inlier_indexes = np.where(np.sum(L,axis=1)==0)[0]
        
    prediction_result_list.append(clf_predict_proba)
    classifier_result_list.append(clf_predict_proba_X)
    
    prediction_list.append(np.array([int(i) for i in clf_predictions]))
    
    prediction_high_conf_outliers = np.intersect1d(np.where(prediction_result_list[-1] > high_confidence_threshold)[0],
                                                   np.where(classifier_result_list[-1] > high_confidence_threshold)[0])
    print('length of prediction_high_conf_outliers:' , len(prediction_high_conf_outliers))
    prediction_high_conf_inliers = np.intersect1d(np.where(prediction_result_list[-1] < low_confidence_threshold)[0],
                                                   np.where(classifier_result_list[-1] < low_confidence_threshold)[0])
    print('length of prediction high conf inliers: ', len(prediction_high_conf_inliers))
    
    temp_prediction = np.array([int(i) for i in prediction_result_list[-1] > 0.5])
    temp_classifier = np.array([int(i) for i in classifier_result_list[-1] > 0.5])
    prediction_classifier_disagree = np.where(temp_prediction != temp_classifier)[0]
#     print('length of prediction-classifier disagree: {}'.format(len(prediction_classifier_disagree)))
#     print('length of prediction-classifier disagree in training: {}'.format(len(np.where(temp_prediction[data_indexes] != temp_classifier[data_indexes])[0])))
#     print(np.corrcoef(clf_predict_proba,clf_predict_proba_X))
    two_prediction_corr.append(np.corrcoef(clf_predict_proba,clf_predict_proba_X)[0,1])

    if np.max(coef_index_range) >= 2:
        if(len(prediction_high_conf_outliers) > 0 and len(prediction_high_conf_inliers) > 0):
            new_data_indexes = np.concatenate((prediction_high_conf_outliers, prediction_high_conf_inliers), axis = 0)
            new_data_indexes = np.array([int(i) for i in new_data_indexes])
            new_labels = np.concatenate((np.ones(len(prediction_high_conf_outliers)), np.zeros(len(prediction_high_conf_inliers))), axis = 0)
            clf_prune_2 = LogisticRegression(random_state=0, penalty='l2', max_iter=max_iter).fit(scores_transformed[new_data_indexes], new_labels) 
            combined_coef = clf_prune_2.coef_[0]  
        else:
            print('Coef from normal training: ', clf.coef_[0])
            combined_coef = clf.coef_[0]
            print('Combined Coef: ',  combined_coef)

        if(np.max(coef_index_range) > 2 or 
           ((np.max(combined_coef)/np.min(combined_coef) >= 1.1) and np.max(coef_index_range) >= 2)):
            if(len(set(combined_coef)) > 1):
                cur_clf_coef = combined_coef 
                cutoff = max(max(0, np.mean(combined_coef)-np.std(combined_coef)),min(combined_coef))
                print(cutoff)

                remain_indexes_after_cond = (cur_clf_coef > cutoff) #np.logical_and(cur_clf_coef > cutoff, abs(cur_clf_coef) > 0.01) # # 
                remain_params_tracking = remain_params_tracking[remain_indexes_after_cond]
                print(remain_params_tracking)
                remain_indexes_after_cond_expanded = []
                for i in range(0, len(coef_index_range)): #
                    s_e_range = coef_index_range[i,1]-coef_index_range[i,0]
                    s1, e1 = coef_index_range[i,0], coef_index_range[i,1]
                    s2, e2 = index_range[i,0], index_range[i,1]
                    saved_indexes = np.where(cur_clf_coef[s1:e1] > cutoff)[0]
                    for j in range(N_size):
                        remain_indexes_after_cond_expanded.extend(np.array(saved_indexes) + j * s_e_range + s2)

                new_coef_index_range_seq = []
                for i in range(0, len(coef_index_range)): #
                    s, e = coef_index_range[i,0], coef_index_range[i,1]
                    new_coef_index_range_seq.append(sum((remain_indexes_after_cond)[s:e]))

                coef_index_range = []
                index_range = []
                cur_sum = 0
                for i in range(0, len(new_coef_index_range_seq)):
                    coef_index_range.append([cur_sum, cur_sum + new_coef_index_range_seq[i]])
                    index_range.append([cur_sum * 6, 6 * (cur_sum + new_coef_index_range_seq[i])])
                    cur_sum += new_coef_index_range_seq[i]

                coef_index_range = np.array(coef_index_range)
                index_range = np.array(index_range)
                print(coef_index_range)
                print(index_range)

                L=L[:,remain_indexes_after_cond_expanded]
                scores_for_training = scores_for_training[:, remain_indexes_after_cond]
    if((len(last_training_data_indexes) == len(data_indexes)) and 
       (sum(last_training_data_indexes == data_indexes) == len(data_indexes)) and 
       (np.max(coef_index_range) < 2)):
        counter =  counter + 1
    else:
        counter = 0
    if(counter > 3):
        break
    last_training_data_indexes = data_indexes

##################################################################
Iteration = 0, L shape = (4207, 156)
All agree, Number of outliers = 235
All agree, Number of inliers = 879
num of inliers = 879
num of outliers = 235
num of outliers = 235
Training data shape:  (1114, 26)
Training data F-1 0.2697674418604651
F-1 score from LR: 0.38482549317147197
(1114, 57)
(1114,)
F-1 score from SVM: 0.4480801335559266
length of prediction_high_conf_outliers: 474
length of prediction high conf inliers:  1415
0.5459031258085079
[ 0  1  2  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25]
[[ 0  8]
 [ 8 18]
 [18 23]
 [23 24]]
[[  0  48]
 [ 48 108]
 [108 138]
 [138 144]]
##################################################################
Iteration = 1, L shape = (4207, 144)
All agree, Number of outliers = 235
All agree, Number of inliers = 880
num of inliers = 801
num of outliers = 569
num of outliers = 511
Training data shape:  (1312, 24)
Training data F-1 0.44764649375600385
F-1 score from L

F-1 score from SVM: 0.4952435860478524
length of prediction_high_conf_outliers: 1288
length of prediction high conf inliers:  1293
3.4494725592306255
[ 9 24 25]
[[0 1]
 [1 1]
 [1 2]
 [2 3]]
[[ 0  6]
 [ 6  6]
 [ 6 12]
 [12 18]]
##################################################################
Iteration = 14, L shape = (4207, 18)
All agree, Number of outliers = 659
All agree, Number of inliers = 1649
num of inliers = 1238
num of outliers = 1471
num of outliers = 1296
Training data shape:  (2534, 3)
Training data F-1 0.5317426501917341
F-1 score from LR: 0.4919839679358717
(2534, 57)
(2534,)
F-1 score from SVM: 0.4991394148020655
length of prediction_high_conf_outliers: 1281
length of prediction high conf inliers:  1283
4.350024506341005
[24 25]
[[0 0]
 [0 0]
 [0 1]
 [1 2]]
[[ 0  0]
 [ 0  0]
 [ 0  6]
 [ 6 12]]
##################################################################
Iteration = 15, L shape = (4207, 12)
All agree, Number of outliers = 949
All agree, Number of inliers = 2006
num 

In [15]:
index_range = np.array([[0, 60], [60, 120], [120, 150], [150, 156]])
coef_index_range = np.array([[0, 10], [10, 20], [20, 25], [25, 26]])
coef_remain_index = range(156)

scores_for_training_indexes = []
for i in range(len(index_range)):
    start=index_range[i][0]
    temp_range = coef_index_range[i][1]-coef_index_range[i][0]
    scores_for_training_indexes  = scores_for_training_indexes + list(range(start, start+temp_range))
print(scores_for_training_indexes) 
scores_for_training = scores[:, np.array(scores_for_training_indexes)]
print(np.shape(scores_for_training))

transformer = RobustScaler().fit(scores_for_training)
scores_transformed = transformer.transform(scores_for_training)
training_data = scores_transformed[data_indexes]
print('Training data shape: ', np.shape(training_data))
training_data_F1.append(metrics.f1_score(y[data_indexes], labels))
print('Training data F-1', metrics.f1_score(y[data_indexes], labels))

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
clf = LogisticRegression(random_state=0, penalty='l2', max_iter=max_iter).fit(training_data, labels) 
clf_predictions = clf.predict(scores_transformed)
clf_predict_proba = clf.predict_proba(scores_transformed)[:,1]
print("F-1 score from LR:",metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba > 0.5])))

transformer = RobustScaler().fit(X)
X_transformed = transformer.transform(X)
X_training_data = X_transformed[data_indexes]
print(np.shape(X_training_data))
print(np.shape(labels))

clf_X = SVC(gamma='auto', probability=True, random_state=0)
clf_X.fit(X_training_data, labels)
clf_predictions_X = clf_X.predict(X_transformed)
clf_predict_proba_X = clf_X.predict_proba(X_transformed)[:,1]
print("F-1 score from SVM:",metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba_X > 0.5])))
cur_f1_scores.append(metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba_X > 0.5])))

agreed_outlier_indexes = np.where(np.sum(L,axis=1)==np.shape(L)[1])[0]
agreed_inlier_indexes = np.where(np.sum(L,axis=1)==0)[0]

prediction_result_list.append(clf_predict_proba)
classifier_result_list.append(clf_predict_proba_X)

prediction_list.append(np.array([int(i) for i in clf_predictions]))

prediction_high_conf_outliers = np.intersect1d(np.where(prediction_result_list[-1] > high_confidence_threshold)[0],
                                               np.where(classifier_result_list[-1] > high_confidence_threshold)[0])
print('length of prediction_high_conf_outliers:' , len(prediction_high_conf_outliers))
prediction_high_conf_inliers = np.intersect1d(np.where(prediction_result_list[-1] < low_confidence_threshold)[0],
                                               np.where(classifier_result_list[-1] < low_confidence_threshold)[0])
print('length of prediction high conf inliers: ', len(prediction_high_conf_inliers))

temp_prediction = np.array([int(i) for i in prediction_result_list[-1] > 0.5])
temp_classifier = np.array([int(i) for i in classifier_result_list[-1] > 0.5])
prediction_classifier_disagree = np.where(temp_prediction != temp_classifier)[0]

L = L_prev
remain_params_tracking = np.array(range(0,np.max(coef_index_range)))

if np.max(coef_index_range) >= 2:
    if(len(prediction_high_conf_outliers) > 0 and len(prediction_high_conf_inliers) > 0):
        new_data_indexes = np.concatenate((prediction_high_conf_outliers, prediction_high_conf_inliers), axis = 0)
        new_data_indexes = np.array([int(i) for i in new_data_indexes])
        new_labels = np.concatenate((np.ones(len(prediction_high_conf_outliers)), np.zeros(len(prediction_high_conf_inliers))), axis = 0)
        clf_prune_2 = LogisticRegression(random_state=0, penalty='l2', max_iter=max_iter).fit(scores_transformed[new_data_indexes], new_labels) 
        print("F-1 score from both LR and SVM:",metrics.f1_score(y, np.array([int(i) for i in clf_prune_2.predict_proba(scores_transformed)[:,1] > 0.5])))
        print('Coef from both LR and SVM: ', clf_prune_2.coef_[0])
        combined_coef = clf_prune_2.coef_[0]  
    else:
        print('Coef from normal training: ', clf.coef_[0])
        combined_coef = clf.coef_[0]
        print('Combined Coef: ',  combined_coef)

    if(np.max(coef_index_range) > 2 or 
       ((np.max(combined_coef)/np.min(combined_coef) >= 1.1) and np.max(coef_index_range) >= 2)):
        if(len(set(combined_coef)) > 1):
            cur_clf_coef = combined_coef 
            cutoff = max(max(0, np.mean(combined_coef)-np.std(combined_coef)),min(combined_coef))
            print(cutoff)

            remain_indexes_after_cond = (cur_clf_coef > cutoff) #np.logical_and(cur_clf_coef > cutoff, abs(cur_clf_coef) > 0.01) # # 
            remain_params_tracking = remain_params_tracking[remain_indexes_after_cond]
            print(remain_params_tracking)
            remain_indexes_after_cond_expanded = []
            for i in range(0, len(coef_index_range)): #
                s_e_range = coef_index_range[i,1]-coef_index_range[i,0]
                s1, e1 = coef_index_range[i,0], coef_index_range[i,1]
                s2, e2 = index_range[i,0], index_range[i,1]
                saved_indexes = np.where(cur_clf_coef[s1:e1] > cutoff)[0]
                for j in range(N_size):
                    remain_indexes_after_cond_expanded.extend(np.array(saved_indexes) + j * s_e_range + s2)

            new_coef_index_range_seq = []
            for i in range(0, len(coef_index_range)): #
                s, e = coef_index_range[i,0], coef_index_range[i,1]
                new_coef_index_range_seq.append(sum((remain_indexes_after_cond)[s:e]))

            coef_index_range = []
            index_range = []
            cur_sum = 0
            for i in range(0, len(new_coef_index_range_seq)):
                coef_index_range.append([cur_sum, cur_sum + new_coef_index_range_seq[i]])
                index_range.append([cur_sum * 6, 6 * (cur_sum + new_coef_index_range_seq[i])])
                cur_sum += new_coef_index_range_seq[i]

            coef_index_range = np.array(coef_index_range)
            index_range = np.array(index_range)
            print(coef_index_range)
            print(index_range)

            L=L[:,remain_indexes_after_cond_expanded]
            scores_for_training = scores_for_training[:, remain_indexes_after_cond]



[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 120, 121, 122, 123, 124, 150]
(4207, 26)
Training data shape:  (2527, 26)
Training data F-1 0.5485347985347986
F-1 score from LR: 0.5043885313048566
(2527, 57)
(2527,)
F-1 score from SVM: 0.5124740124740125
length of prediction_high_conf_outliers: 1177
length of prediction high conf inliers:  1450
F-1 score from both LR and SVM: 0.5025095955122527
Coef from both LR and SVM:  [ 1.25247392e-04  6.20358138e-02 -1.02552274e-01  3.24148089e-02
  4.41343399e-02 -8.10614220e-02 -1.86904616e-01 -2.75726030e-01
 -2.22704066e-01 -3.00581615e-01  9.62157601e-01  4.09626187e-01
  2.79714141e-01  1.52021512e-01  1.10364014e-01  7.73699181e-02
  6.20495478e-02  3.74998755e-02  1.65053646e-02 -8.32090855e-03
  1.94272660e+00  2.39671788e+00  2.54120052e+00  2.18141370e+00
  4.64793010e+00 -1.89232326e-01]
0
[ 0  1  3  4 10 11 12 13 14 15 16 17 18 20 21 22 23 24]
[[ 0  4]
 [ 4 13]
 [13 18]
 [18 18]]
[[  0  24]
 [ 24  78]
 [ 78 108]

In [16]:
last_training_data_indexes = []
counter = 0

for i_range in range(0, 50):
    print("##################################################################")
    print('Iteration = {}, L shape = {}'.format(i_range, np.shape(L)))
    num_methods = np.shape(L)[1]

#     agree_outlier_indexes = (np.sum(L,axis=1)==np.shape(L)[1])
#     print('All agree, Number of outliers = {}'.format(sum(agree_outlier_indexes)))
#     agree_inlier_indexes = (np.sum(L,axis=1)==0)
#     print('All agree, Number of inliers = {}'.format(sum(agree_inlier_indexes)))

#     all_inlier_indexes = np.union1d(np.setdiff1d(np.where(agree_inlier_indexes)[0], prediction_high_conf_outliers), prediction_high_conf_inliers)
#     print('num of inliers = {}'.format(np.shape(all_inlier_indexes)[0]))
#     all_outlier_indexes = np.union1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)
#     print('num of outliers = {}'.format(np.shape(all_outlier_indexes)[0]))

#     disagree_indexes = np.where(np.logical_or(np.sum(L,axis = 1)==0, np.sum(L,axis = 1)==num_methods)==0)[0]

    ########################################################################

    agree_outlier_indexes = np.sum(L,axis=1)==np.shape(L)[1]
    print('All agree, Number of outliers = {}'.format(sum(agree_outlier_indexes)))
    agree_inlier_indexes = np.sum(L,axis=1)==0
    print('All agree, Number of inliers = {}'.format(sum(agree_inlier_indexes)))

    disagree_indexes = np.where(np.logical_or(np.sum(L,axis = 1)==0, np.sum(L,axis = 1)==num_methods)==0)[0]
    # print('Number of disagreed points = {}'.format(len(disagree_indexes)))
    # print('Number of disagreed points (true outliers) = {}'.format(sum(y[disagree_indexes] == 1)))
    # print('Number of disagreed points (true inliers) = {}'.format(sum(y[disagree_indexes] == 0)))

#     all_inlier_indexes = np.where(agree_inlier_indexes)[0]
    all_inlier_indexes = np.setdiff1d(np.where(agree_inlier_indexes)[0], prediction_high_conf_outliers)
    if len(prediction_high_conf_inliers) >0:
        all_inlier_indexes = np.intersect1d(np.setdiff1d(np.where(agree_inlier_indexes)[0], prediction_high_conf_outliers), prediction_high_conf_inliers)
    print('num of inliers = {}'.format(np.shape(all_inlier_indexes)[0]))
#     all_outlier_indexes = np.union1d(np.where(agree_outlier_indexes)[0], self_agree_index_list)

#     if(len(np.intersect1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)) > 0 and
#       (len(np.union1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)) > 2000)):
#         all_outlier_indexes = np.intersect1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)
#     else:
    all_outlier_indexes = np.union1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)
#     if(len(all_outlier_indexes) > 1000):
#         all_outlier_indexes = np.random.RandomState(1).permutation(all_outlier_indexes)[:1000]
        
#     all_outlier_indexes = np.union1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)
    print('num of outliers = {}'.format(np.shape(all_outlier_indexes)[0]))
    all_inlier_indexes = np.setdiff1d(all_inlier_indexes, prediction_classifier_disagree)
    
    self_agree_index_list = []
    if((len(all_outlier_indexes) == 0) or (len(all_inlier_indexes)/ len(all_outlier_indexes) > 1000)):
        for i in range(0, len(index_range)):
            if(index_range[i,1]-index_range[i,0] <= 6):
                continue
            temp_index = disagree_indexes[np.where(np.sum(L[disagree_indexes][:,index_range[i,0]: index_range[i,1]], axis = 1)==(index_range[i,1]-index_range[i,0]))[0]]
            self_agree_index_list = np.union1d(self_agree_index_list, temp_index)
        self_agree_index_list = [int(i) for i in self_agree_index_list]
#     self_agree_index_list = np.random.RandomState(1).permutation(self_agree_index_list)[:500]
    all_outlier_indexes = np.union1d(all_outlier_indexes, self_agree_index_list)
    all_outlier_indexes = np.setdiff1d(all_outlier_indexes, prediction_classifier_disagree)
    print('num of outliers = {}'.format(np.shape(all_outlier_indexes)[0]))
    
    
    from sklearn.preprocessing import RobustScaler
    from sklearn.preprocessing import StandardScaler
    data_indexes = np.concatenate((all_inlier_indexes, all_outlier_indexes), axis = 0)
    data_indexes = np.array([int(i) for i in data_indexes])
    labels = np.concatenate((np.zeros(len(all_inlier_indexes)), np.ones(len(all_outlier_indexes))), axis = 0)
    transformer = RobustScaler().fit(scores_for_training)
    scores_transformed = transformer.transform(scores_for_training)
    training_data = scores_transformed[data_indexes]
    print('Training data shape: ', np.shape(training_data))
    training_data_F1.append(metrics.f1_score(y[data_indexes], labels))
    print('Training data F-1', metrics.f1_score(y[data_indexes], labels))

    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC
#     clf = SVC(gamma='auto', probability=True, random_state=0)
#     clf.fit(training_data, labels)
    clf = LogisticRegression(random_state=0, penalty='l2', max_iter=max_iter).fit(training_data, labels) 
    clf_predictions = clf.predict(scores_transformed)
    clf_predict_proba = clf.predict_proba(scores_transformed)[:,1]
    print("F-1 score from LR:",metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba > 0.5])))
    
    
    transformer = RobustScaler().fit(X)
    X_transformed = transformer.transform(X)
    X_training_data = X_transformed[data_indexes]
    print(np.shape(X_training_data))
    print(np.shape(labels))

    clf_X = SVC(gamma='auto', probability=True, random_state=0)
    clf_X.fit(X_training_data, labels)
    clf_predictions_X = clf_X.predict(X_transformed)
    clf_predict_proba_X = clf_X.predict_proba(X_transformed)[:,1]
    print("F-1 score from SVM:",metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba_X > 0.5])))
    cur_f1_scores.append(metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba_X > 0.5])))
        
    agreed_outlier_indexes = np.where(np.sum(L,axis=1)==np.shape(L)[1])[0]
    agreed_inlier_indexes = np.where(np.sum(L,axis=1)==0)[0]
        
    prediction_result_list.append(clf_predict_proba)
    classifier_result_list.append(clf_predict_proba_X)
    
    prediction_list.append(np.array([int(i) for i in clf_predictions]))
    
    prediction_high_conf_outliers = np.intersect1d(np.where(prediction_result_list[-1] > high_confidence_threshold)[0],
                                                   np.where(classifier_result_list[-1] > high_confidence_threshold)[0])
    print('length of prediction_high_conf_outliers:' , len(prediction_high_conf_outliers))
    prediction_high_conf_inliers = np.intersect1d(np.where(prediction_result_list[-1] < low_confidence_threshold)[0],
                                                   np.where(classifier_result_list[-1] < low_confidence_threshold)[0])
    print('length of prediction high conf inliers: ', len(prediction_high_conf_inliers))
    
    temp_prediction = np.array([int(i) for i in prediction_result_list[-1] > 0.5])
    temp_classifier = np.array([int(i) for i in classifier_result_list[-1] > 0.5])
    prediction_classifier_disagree = np.where(temp_prediction != temp_classifier)[0]
    
    if np.max(coef_index_range) >= 2:
        if(len(prediction_high_conf_outliers) > 0 and len(prediction_high_conf_inliers) > 0):
            new_data_indexes = np.concatenate((prediction_high_conf_outliers, prediction_high_conf_inliers), axis = 0)
            new_data_indexes = np.array([int(i) for i in new_data_indexes])
            new_labels = np.concatenate((np.ones(len(prediction_high_conf_outliers)), np.zeros(len(prediction_high_conf_inliers))), axis = 0)
            clf_prune_2 = LogisticRegression(random_state=0, penalty='l2', max_iter=max_iter).fit(scores_transformed[new_data_indexes], new_labels) 
            print("F-1 score from both LR and SVM:",metrics.f1_score(y, np.array([int(i) for i in clf_prune_2.predict_proba(scores_transformed)[:,1] > 0.5])))
            print('Coef from both LR and SVM: ', clf_prune_2.coef_[0])
            combined_coef = clf_prune_2.coef_[0]  
        else:
            print('Coef from normal training: ', clf.coef_[0])
            combined_coef = clf.coef_[0]
            print('Combined Coef: ',  combined_coef)

        if(np.max(coef_index_range) > 2 or 
           ((np.max(combined_coef)/np.min(combined_coef) >= 1.1) and np.max(coef_index_range) >= 2)):
            if(len(set(combined_coef)) > 1):
                cur_clf_coef = combined_coef 
                cutoff = max(max(0, np.mean(combined_coef)-np.std(combined_coef)),min(combined_coef))
                print(cutoff)

                remain_indexes_after_cond = (cur_clf_coef > cutoff) #np.logical_and(cur_clf_coef > cutoff, abs(cur_clf_coef) > 0.01) # # 
                remain_params_tracking = remain_params_tracking[remain_indexes_after_cond]
                print(remain_params_tracking)
                remain_indexes_after_cond_expanded = []
                for i in range(0, len(coef_index_range)): #
                    s_e_range = coef_index_range[i,1]-coef_index_range[i,0]
                    s1, e1 = coef_index_range[i,0], coef_index_range[i,1]
                    s2, e2 = index_range[i,0], index_range[i,1]
                    saved_indexes = np.where(cur_clf_coef[s1:e1] > cutoff)[0]
                    for j in range(N_size):
                        remain_indexes_after_cond_expanded.extend(np.array(saved_indexes) + j * s_e_range + s2)

                new_coef_index_range_seq = []
                for i in range(0, len(coef_index_range)): #
                    s, e = coef_index_range[i,0], coef_index_range[i,1]
                    new_coef_index_range_seq.append(sum((remain_indexes_after_cond)[s:e]))

                coef_index_range = []
                index_range = []
                cur_sum = 0
                for i in range(0, len(new_coef_index_range_seq)):
                    coef_index_range.append([cur_sum, cur_sum + new_coef_index_range_seq[i]])
                    index_range.append([cur_sum * 6, 6 * (cur_sum + new_coef_index_range_seq[i])])
                    cur_sum += new_coef_index_range_seq[i]

                coef_index_range = np.array(coef_index_range)
                index_range = np.array(index_range)
                print(coef_index_range)
                print(index_range)

                L=L[:,remain_indexes_after_cond_expanded]
                scores_for_training = scores_for_training[:, remain_indexes_after_cond]
    if((len(last_training_data_indexes) == len(data_indexes)) and 
       (sum(last_training_data_indexes == data_indexes) == len(data_indexes)) and 
       (np.max(coef_index_range) < 2)):
        counter =  counter + 1
    else:
        counter = 0
    if(counter > 3):
        break
    last_training_data_indexes = data_indexes

##################################################################
Iteration = 0, L shape = (4207, 108)
All agree, Number of outliers = 254
All agree, Number of inliers = 938
num of inliers = 677
num of outliers = 1220
num of outliers = 1181
Training data shape:  (1858, 18)
Training data F-1 0.6001955034213099
F-1 score from LR: 0.531039640987285
(1858, 57)
(1858,)
F-1 score from SVM: 0.5139896373056995
length of prediction_high_conf_outliers: 1394
length of prediction high conf inliers:  1071
F-1 score from both LR and SVM: 0.5286035473394954
Coef from both LR and SVM:  [0.60492785 0.65605714 0.34466662 0.45955233 0.74888096 0.84434315
 0.69856631 0.60681423 0.58292854 0.56326411 0.54844167 0.54748759
 0.53929945 2.25872224 2.50299141 2.41378856 2.19282821 2.57765753]
0.34466662098609196
[ 0  1  4 10 11 12 13 14 15 16 17 18 20 21 22 23 24]
[[ 0  3]
 [ 3 12]
 [12 17]
 [17 17]]
[[  0  18]
 [ 18  72]
 [ 72 102]
 [102 102]]
#################################################################

F-1 score from SVM: 0.51908011119535
length of prediction_high_conf_outliers: 1688
length of prediction high conf inliers:  839
F-1 score from both LR and SVM: 0.5489661440581687
Coef from both LR and SVM:  [2.99549128 3.19235211 2.96718852 4.17462971 4.06413237]
2.9671885190153837
[ 1 11 22 24]
[[0 1]
 [1 2]
 [2 4]
 [4 4]]
[[ 0  6]
 [ 6 12]
 [12 24]
 [24 24]]
##################################################################
Iteration = 11, L shape = (4207, 24)
All agree, Number of outliers = 480
All agree, Number of inliers = 1391
num of inliers = 762
num of outliers = 1772
num of outliers = 1694
Training data shape:  (2456, 4)
Training data F-1 0.5974025974025975
F-1 score from LR: 0.5525661315848971
(2456, 57)
(2456,)
F-1 score from SVM: 0.5201816347124117
length of prediction_high_conf_outliers: 1670
length of prediction high conf inliers:  825
F-1 score from both LR and SVM: 0.5501485035412383
Coef from both LR and SVM:  [2.87203729 4.61564003 4.21275329 4.29946807]
3.33171992584

F-1 score from SVM: 0.5709555345316935
length of prediction_high_conf_outliers: 1688
length of prediction high conf inliers:  837
F-1 score from both LR and SVM: 0.5611374407582939
Coef from both LR and SVM:  [6.55286096 6.05629591]
##################################################################
Iteration = 25, L shape = (4207, 12)
All agree, Number of outliers = 957
All agree, Number of inliers = 2011
num of inliers = 836
num of outliers = 1818
num of outliers = 1705
Training data shape:  (2541, 2)
Training data F-1 0.6246418338108883
F-1 score from LR: 0.5610045013030088
(2541, 57)
(2541,)
F-1 score from SVM: 0.5737859500235738
length of prediction_high_conf_outliers: 1694
length of prediction high conf inliers:  835
F-1 score from both LR and SVM: 0.5611374407582939
Coef from both LR and SVM:  [6.55368113 6.06029882]
##################################################################
Iteration = 26, L shape = (4207, 12)
All agree, Number of outliers = 957
All agree, Number of inli

F-1 score from SVM: 0.5870246085011186
length of prediction_high_conf_outliers: 1868
length of prediction high conf inliers:  573
##################################################################
Iteration = 39, L shape = (4207, 6)
All agree, Number of outliers = 1400
All agree, Number of inliers = 2308
num of inliers = 573
num of outliers = 2101
num of outliers = 1884
Training data shape:  (2457, 1)
Training data F-1 0.6263736263736264
F-1 score from LR: 0.5740064446831364
(2457, 57)
(2457,)
F-1 score from SVM: 0.5892778023925566
length of prediction_high_conf_outliers: 1921
length of prediction high conf inliers:  546
##################################################################
Iteration = 40, L shape = (4207, 6)
All agree, Number of outliers = 1400
All agree, Number of inliers = 2308
num of inliers = 546
num of outliers = 2144
num of outliers = 1936
Training data shape:  (2482, 1)
Training data F-1 0.6280160857908846
F-1 score from LR: 0.5738612175393785
(2482, 57)
(2482,)
F-

### Use GT label to train a classifier

In [17]:
clf_X = SVC(gamma='auto', probability=True, random_state=0)
transformer = RobustScaler().fit(X)
X_transformed = transformer.transform(X)
X_training_data = X_transformed[data_indexes]
clf_X.fit(X_training_data, y[data_indexes])
clf_predictions_X = clf_X.predict(X_transformed)
clf_predict_proba_X = clf_X.predict_proba(X_transformed)[:,1]
print("F-1 score from SVM:",metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba_X > 0.5])))

F-1 score from SVM: 0.8993691799339141
