In [1]:
import numpy as np
from scipy.io import arff
import pandas as pd
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn import metrics
import scipy as sp
import logging
from sklearn.neighbors import NearestNeighbors
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
    
def run_lof(X, y, k=60):
    clf = LocalOutlierFactor(n_neighbors=k)
    clf.fit(X)
    lof_scores = -clf.negative_outlier_factor_
    return lof_scores

def get_predictions(scores, num_outliers = 400, method_name = 'LOF'):
    threshold = np.sort(scores)[::-1][num_outliers]
    # threshold, max_f1 = get_best_f1_score(y, lof_scores)
    predictions = np.array(scores > threshold)
    predictions = np.array([int(i) for i in predictions])
#     print('F1 for {} : {}'.format(method_name, metrics.f1_score(y, predictions)))
    return predictions, scores, metrics.f1_score(y, predictions)

def get_precision_recall(scores, num_outliers = 400, method_name = 'LOF'):
    threshold = np.sort(scores)[::-1][num_outliers]
    # threshold, max_f1 = get_best_f1_score(y, lof_scores)
    predictions = np.array(scores > threshold)
    predictions = np.array([int(i) for i in predictions])
#     print('F1 for {} : {}'.format(method_name, metrics.f1_score(y, predictions)))
    return predictions, scores, metrics.f1_score(y, predictions), metrics.precision_score(y, predictions), metrics.recall_score(y, predictions)

def get_best_F1(scores):
    best_f1 = 0
    for i in range(np.shape(scores)[0]):
        threshold = np.sort(scores)[::-1][i]
        predictions = np.array(scores > threshold)
        predictions = np.array([int(i) for i in predictions])
        cur_f1 = metrics.f1_score(y, predictions)
        best_f1 = max(cur_f1, best_f1)
    return best_f1

def run_knn(X, y, k=60):
    neigh = NearestNeighbors(n_neighbors=k)
    neigh.fit(X)
    knn_dists = neigh.kneighbors(X)[0][:,-1]
    return knn_dists

def run_isolation_forest(X, y, max_features = 1.0):
    # training the model
    clf = IsolationForest(random_state=42,max_features=max_features)
    clf.fit(X)
    # predictions
    sklearn_score_anomalies = clf.decision_function(X)
    if_scores = [-1*s + 0.5 for s in sklearn_score_anomalies]
    return if_scores

def mahalanobis(x):
    """Compute the Mahalanobis Distance between each row of x and the data
    """
    x_minus_mu = x - np.mean(x)
    cov = np.cov(x.T)
    inv_covmat = sp.linalg.inv(cov)
    results = []
    x_minus_mu = np.array(x_minus_mu)
    for i in range(np.shape(x)[0]):
        cur_data = x_minus_mu[i,:]
        results.append(np.dot(np.dot(x_minus_mu[i,:], inv_covmat), x_minus_mu[i,:].T))
    return np.array(results)
#     left_term = np.dot(x_minus_mu, inv_covmat)
#     mahal = np.dot(left_term, x_minus_mu.T)
#     print(mahal.diagonal())
#     return mahal.diagonal()

def run_mahalanobis(X, y):
    # training the model
    dist = mahalanobis(x=X)
    return dist

def load_dataset(filename):
    with open(filename, 'r') as f:
        data, meta = arff.loadarff(f)
    data = pd.DataFrame(data)
    X = data.drop(columns=['id', 'outlier'])
    # Map dataframe to encode values and put values into a numpy array
    y = data["outlier"].map(lambda x: 1 if x == b'yes' else 0).values
    return X, y

### Load pageblock dataset

In [2]:
filename = './PageBlocks_norm_10.arff'
X, y = load_dataset(filename=filename)
print(np.shape(X), np.shape(y))
K = 80
N = 560
# num_outliers = [N, N, N, N]
class_balance = [0.9, 0.1]
# lof_krange = [55, 60, 65, 70, 75] 
# knn_krange = [55, 60, 65, 70, 75] 
# if_range = [0.5, 0.6,0.7, 0.8,0.9]
lof_krange = list(range(10,110,10)) * 6
knn_krange = list(range(10,110,10)) * 6
# lof_krange = range(70,90,4) 
# knn_krange = [60, 70, 80, 90, 100] 
if_range = [0.5, 0.6, 0.7, 0.8, 0.9] * 6
# mahalanobis_N_range=[560]
mahalanobis_N_range = [300, 400, 500, 600, 700, 800]
# mahalanobis_N_range = [550, 560, 570, 580, 590, 600]
N_size = 6
if_N_range = np.sort(mahalanobis_N_range * 5)
N_range = np.sort(mahalanobis_N_range * 10)

(5473, 10) (5473,)


## Isolation Forest results

In [3]:
f1s = []
precisions = []
recalls = []

temp_if_results = dict()
unique_if_features = list(set(if_range)) 
best_f1 = 0
best_precision = 0
best_recall = 0
for k in unique_if_features:
    print(k)
    if_scores = run_isolation_forest(X, y, max_features=k)
    temp_if_results[k] = if_scores
for i in range(len(if_range)):
    if_predictions, if_scores,f1, precision, recall = get_precision_recall(temp_if_results[if_range[i]], num_outliers=if_N_range[i], method_name='IF')
    f1s.append(f1)
    if f1>best_f1:
        best_f1 = f1
        best_precision = precision
        best_recall = recall
    precisions.append(precision)
    recalls.append(recall)
print(f"Average F1s = {sum(f1s)/len(f1s)}, Average Precision = {sum(precisions)/len(precisions)}, Average recall = {sum(recalls)/len(recalls)}")
print(f"Average F1s = {sum(f1s)/len(f1s)}, Max F1s = {max(f1s)}, Precision at best F1 = {best_precision}, recall at best F1 = {best_recall}")

0.5
0.6
0.8
0.9
0.7
Average F1s = 0.40867557318580394, Average Precision = 0.4297496031746032, Average recall = 0.4104761904761904
Average F1s = 0.40867557318580394, Max F1s = 0.4970588235294117, Precision at best F1 = 0.4225, recall at best F1 = 0.6035714285714285


## Get Best Unsupervised Result

In [7]:
all_results = []
all_scores = []
f1s = []

method_to_bestf1 = {}
best_f1 = 0
best_precision=0
best_recall=0

temp_lof_results = dict()
unique_lof_ks = list(set(lof_krange)) 

best_lof_f1 = 0
best_lof_precision = 0
best_lof_recall = 0
for k in unique_lof_ks:
#     print(k)
    lof_scores = run_lof(X, y, k=k)
    temp_lof_results[k] = lof_scores
for i in range(len(lof_krange)):
    lof_predictions, lof_scores, f1, precision,recall = get_precision_recall(temp_lof_results[lof_krange[i]], num_outliers=N_range[i], method_name='LOF')
    all_results.append(lof_predictions)
    all_scores.append(lof_scores)
    f1s.append(f1)
    if f1 > best_f1:
        best_f1 = f1
        best_precision = precision
        best_recall = recall

best_lof_f1 = 0
for i in np.sort(unique_lof_ks):
    temp_f1 = max(np.array(f1s[0:60])[np.where(np.array(lof_krange) == i)[0]])
    best_lof_f1 = max(best_lof_f1, temp_f1)

method_to_bestf1["LOF"] = best_lof_f1

temp_knn_results = dict()
unique_knn_ks = list(set(knn_krange)) 
for k in unique_knn_ks:
    knn_scores = run_knn(X, y, k=k)
    temp_knn_results[k] = knn_scores
for i in range(len(knn_krange)):
    knn_predictions, knn_scores,f1,precision,recall = get_precision_recall(temp_knn_results[knn_krange[i]], num_outliers=N_range[i], method_name='KNN')
    all_results.append(knn_predictions)
    all_scores.append(knn_scores)
    f1s.append(f1)
    if f1 > best_f1:
        best_f1 = f1
        best_precision = precision
        best_recall = recall
best_knn_f1 = 0
for i in np.sort(unique_knn_ks):
    temp_f1 = max(np.array(f1s[60:120])[np.where(np.array(knn_krange) == i)[0]])
    best_knn_f1 = max(best_knn_f1, temp_f1)
method_to_bestf1["KNN"] = best_knn_f1

temp_if_results = dict()
unique_if_features = list(set(if_range)) 
for k in unique_if_features:
    if_scores = run_isolation_forest(X, y, max_features=k)
    temp_if_results[k] = if_scores
for i in range(len(if_range)):
    if_predictions, if_scores,f1,precision,recall = get_precision_recall(temp_if_results[if_range[i]], num_outliers=N_range[i], method_name='IF')
    all_results.append(if_predictions)
    all_scores.append(if_scores)
    f1s.append(f1)
    if f1 > best_f1:
        best_f1 = f1
        best_precision = precision
        best_recall = recall
best_if_f1 = 0
for i in np.sort(unique_if_features):
    temp_f1 = max(np.array(f1s[120:150])[np.where(np.array(if_range) == i)[0]])
    best_if_f1 = max(best_if_f1, temp_f1)
method_to_bestf1["IF"] = best_if_f1
   
mahalanobis_scores = run_mahalanobis(X, y)
best_mahala_f1 = 0
for i in range(len(mahalanobis_N_range)):
    mahalanobis_predictions,mahalanobis_scores,f1,precision,recall = get_precision_recall(mahalanobis_scores, num_outliers=mahalanobis_N_range[i], method_name='mahala')
    all_results.append(mahalanobis_predictions)
    all_scores.append(mahalanobis_scores)
    best_mahala_f1 = max(best_mahala_f1, f1)
    f1s.append(f1)
    if f1 > best_f1:
        best_f1 = f1
        best_precision = precision
        best_recall = recall
method_to_bestf1["Mahala"] = best_mahala_f1
best_method = ""
best_f1 =0
for method, f1 in method_to_bestf1.items():
    if f1 > best_f1:
        best_method = method
        best_f1 = f1

print(f"Best Method = {best_method}, Best F1 = {best_f1}")
L = np.stack(all_results).T
scores = np.stack(all_scores).T
print(f"Best F1 = {best_f1}, best_precision = {best_precision}, best_recall={best_recall}")

Best LOF F-1 = 0.5120689655172415
Best Method = Mahala, Best F1 = 0.5411764705882354
Best F1 = 0.5411764705882354, best_precision = 0.46, best_recall=0.6571428571428571


In [18]:
print(np.shape(L))
print(np.shape(scores))

(5473, 156)
(5473, 156)


### Majority Vote

In [19]:
mid = np.shape(L)[1]/2
predictions = np.full((len(y)), 0)
predictions[np.sum(L, axis = 1) > mid] = 1
print('F1 for MV:', metrics.f1_score(y, predictions))

F1 for MV: 0.43640124095139604


### Save and load pickles

In [20]:
L_prev = L
scores_prev = scores

In [21]:
L = L_prev
scores = scores_prev

In [22]:
# print(max(f1s)) 
print(np.shape(L))

(5473, 156)


In [23]:
prediction_result_list = []
classifier_result_list = []
prediction_list = []
cur_f1_scores = []
prediction_high_conf_outliers = np.array([])
prediction_high_conf_inliers = np.array([])
prediction_classifier_disagree = np.array([])

In [24]:
index_range = np.array([[0, 60], [60, 120], [120, 150], [150, 156]])
coef_index_range = np.array([[0, 10], [10, 20], [20, 25], [25, 26]])
coef_remain_index = range(156)

In [25]:
scores_for_training_indexes = []
for i in range(len(index_range)):
    start=index_range[i][0]
    temp_range = coef_index_range[i][1]-coef_index_range[i][0]
    scores_for_training_indexes  = scores_for_training_indexes + list(range(start, start+temp_range))
print(scores_for_training_indexes) 
scores_for_training = scores[:, np.array(scores_for_training_indexes)]
print(np.shape(scores_for_training))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 120, 121, 122, 123, 124, 150]
(5473, 26)


### Iterative train LR and classifier(SVM)

In [26]:
# stable version
high_confidence_threshold = 0.99
low_confidence_threshold = 0.01
max_iter = 200
remain_params_tracking = np.array(range(0,np.max(coef_index_range)))
training_data_F1 = []
two_prediction_corr = []

min_max_diff = []
N_size = 6

last_training_data_indexes = []
counter = 0

for i_range in range(0, 50):
    print("##################################################################")
    print('Iteration = {}, L shape = {}'.format(i_range, np.shape(L)))
    num_methods = np.shape(L)[1]

    agree_outlier_indexes = np.sum(L,axis=1)==np.shape(L)[1]
    print('All agree, Number of outliers = {}'.format(sum(agree_outlier_indexes)))
    agree_inlier_indexes = np.sum(L,axis=1)==0
    print('All agree, Number of inliers = {}'.format(sum(agree_inlier_indexes)))

    disagree_indexes = np.where(np.logical_or(np.sum(L,axis = 1)==0, np.sum(L,axis = 1)==num_methods)==0)[0]

    all_inlier_indexes = np.setdiff1d(np.where(agree_inlier_indexes)[0], prediction_high_conf_outliers)
    if len(prediction_high_conf_inliers) > 0:
        all_inlier_indexes = np.intersect1d(np.setdiff1d(np.where(agree_inlier_indexes)[0], prediction_high_conf_outliers), prediction_high_conf_inliers)
    print('num of inliers = {}'.format(np.shape(all_inlier_indexes)[0]))

    all_outlier_indexes = np.union1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)
    print('num of outliers = {}'.format(np.shape(all_outlier_indexes)[0]))
    all_inlier_indexes = np.setdiff1d(all_inlier_indexes, prediction_classifier_disagree)
    
    self_agree_index_list = []
    if((len(all_outlier_indexes) == 0) or (len(all_inlier_indexes)/ len(all_outlier_indexes) > 1000)):
        for i in range(0, len(index_range)):
            if(index_range[i,1]-index_range[i,0] <= 6):
                continue
            temp_index = disagree_indexes[np.where(np.sum(L[disagree_indexes][:,index_range[i,0]: index_range[i,1]], axis = 1)==(index_range[i,1]-index_range[i,0]))[0]]
            self_agree_index_list = np.union1d(self_agree_index_list, temp_index)
        self_agree_index_list = [int(i) for i in self_agree_index_list]
#     self_agree_index_list = np.random.RandomState(1).permutation(self_agree_index_list)[:500]
    all_outlier_indexes = np.union1d(all_outlier_indexes, self_agree_index_list)
    all_outlier_indexes = np.setdiff1d(all_outlier_indexes, prediction_classifier_disagree)
    print('num of outliers = {}'.format(np.shape(all_outlier_indexes)[0]))
    
    
    from sklearn.preprocessing import RobustScaler
    from sklearn.preprocessing import StandardScaler
    data_indexes = np.concatenate((all_inlier_indexes, all_outlier_indexes), axis = 0)
    data_indexes = np.array([int(i) for i in data_indexes])
    labels = np.concatenate((np.zeros(len(all_inlier_indexes)), np.ones(len(all_outlier_indexes))), axis = 0)
    transformer = RobustScaler().fit(scores_for_training)
    scores_transformed = transformer.transform(scores_for_training)
    training_data = scores_transformed[data_indexes]
    print('Training data shape: ', np.shape(training_data))
    training_data_F1.append(metrics.f1_score(y[data_indexes], labels))
    print('Training data F-1', metrics.f1_score(y[data_indexes], labels))

    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC
#     clf = SVC(gamma='auto', probability=True, random_state=0)
#     clf.fit(training_data, labels)
    clf = LogisticRegression(random_state=0, penalty='l2', max_iter=max_iter).fit(training_data, labels) 
    clf_predictions = clf.predict(scores_transformed)
    clf_predict_proba = clf.predict_proba(scores_transformed)[:,1]
    print("F-1 score from LR:",metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba > 0.5])))
    
    
    transformer = RobustScaler().fit(X)
    X_transformed = transformer.transform(X)
    X_training_data = X_transformed[data_indexes]
    print(np.shape(X_training_data))
    print(np.shape(labels))

    clf_X = SVC(gamma='auto', probability=True, random_state=0)
    clf_X.fit(X_training_data, labels)
    clf_predictions_X = clf_X.predict(X_transformed)
    clf_predict_proba_X = clf_X.predict_proba(X_transformed)[:,1]
    print("F-1 score from SVM:",metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba_X > 0.5])))
    
    agreed_outlier_indexes = np.where(np.sum(L,axis=1)==np.shape(L)[1])[0]
    agreed_inlier_indexes = np.where(np.sum(L,axis=1)==0)[0]
        
    prediction_result_list.append(clf_predict_proba)
    classifier_result_list.append(clf_predict_proba_X)
    
    prediction_list.append(np.array([int(i) for i in clf_predictions]))
    
    prediction_high_conf_outliers = np.intersect1d(np.where(prediction_result_list[-1] > high_confidence_threshold)[0],
                                                   np.where(classifier_result_list[-1] > high_confidence_threshold)[0])
    print('length of prediction_high_conf_outliers:' , len(prediction_high_conf_outliers))
    prediction_high_conf_inliers = np.intersect1d(np.where(prediction_result_list[-1] < low_confidence_threshold)[0],
                                                   np.where(classifier_result_list[-1] < low_confidence_threshold)[0])
    print('length of prediction high conf inliers: ', len(prediction_high_conf_inliers))
    
    temp_prediction = np.array([int(i) for i in prediction_result_list[-1] > 0.5])
    temp_classifier = np.array([int(i) for i in classifier_result_list[-1] > 0.5])
    prediction_classifier_disagree = np.where(temp_prediction != temp_classifier)[0]
#     print('length of prediction-classifier disagree: {}'.format(len(prediction_classifier_disagree)))
#     print('length of prediction-classifier disagree in training: {}'.format(len(np.where(temp_prediction[data_indexes] != temp_classifier[data_indexes])[0])))
#     print(np.corrcoef(clf_predict_proba,clf_predict_proba_X))
    two_prediction_corr.append(np.corrcoef(clf_predict_proba,clf_predict_proba_X)[0,1])

    if np.max(coef_index_range) >= 2:
        if(len(prediction_high_conf_outliers) > 0 and len(prediction_high_conf_inliers) > 0):
            new_data_indexes = np.concatenate((prediction_high_conf_outliers, prediction_high_conf_inliers), axis = 0)
            new_data_indexes = np.array([int(i) for i in new_data_indexes])
            new_labels = np.concatenate((np.ones(len(prediction_high_conf_outliers)), np.zeros(len(prediction_high_conf_inliers))), axis = 0)
            clf_prune_2 = LogisticRegression(random_state=0, penalty='l2', max_iter=max_iter).fit(scores_transformed[new_data_indexes], new_labels) 
            combined_coef = clf_prune_2.coef_[0]  
        else:
            print('Coef from normal training: ', clf.coef_[0])
            combined_coef = clf.coef_[0]
            print('Combined Coef: ',  combined_coef)

        if(np.max(coef_index_range) >= 2):
            if(len(set(combined_coef)) > 1):
                cur_clf_coef = combined_coef 
                cutoff = max(max(0, np.mean(combined_coef)-np.std(combined_coef)),min(combined_coef))
                print(cutoff)

                remain_indexes_after_cond = (cur_clf_coef > cutoff) #np.logical_and(cur_clf_coef > cutoff, abs(cur_clf_coef) > 0.01) # # 
                remain_params_tracking = remain_params_tracking[remain_indexes_after_cond]
                print(remain_params_tracking)
                remain_indexes_after_cond_expanded = []
                for i in range(0, len(coef_index_range)): #
                    s_e_range = coef_index_range[i,1]-coef_index_range[i,0]
                    s1, e1 = coef_index_range[i,0], coef_index_range[i,1]
                    s2, e2 = index_range[i,0], index_range[i,1]
                    saved_indexes = np.where(cur_clf_coef[s1:e1] > cutoff)[0]
                    for j in range(N_size):
                        remain_indexes_after_cond_expanded.extend(np.array(saved_indexes) + j * s_e_range + s2)

                new_coef_index_range_seq = []
                for i in range(0, len(coef_index_range)): #
                    s, e = coef_index_range[i,0], coef_index_range[i,1]
                    new_coef_index_range_seq.append(sum((remain_indexes_after_cond)[s:e]))

                coef_index_range = []
                index_range = []
                cur_sum = 0
                for i in range(0, len(new_coef_index_range_seq)):
                    coef_index_range.append([cur_sum, cur_sum + new_coef_index_range_seq[i]])
                    index_range.append([cur_sum * 6, 6 * (cur_sum + new_coef_index_range_seq[i])])
                    cur_sum += new_coef_index_range_seq[i]

                coef_index_range = np.array(coef_index_range)
                index_range = np.array(index_range)
                print(coef_index_range)
                print(index_range)

                L=L[:,remain_indexes_after_cond_expanded]
                scores_for_training = scores_for_training[:, remain_indexes_after_cond]
    if((len(last_training_data_indexes) == len(data_indexes)) and 
       (sum(last_training_data_indexes == data_indexes) == len(data_indexes)) and 
       (np.max(coef_index_range) < 2)):
        counter =  counter + 1
    else:
        counter = 0
    if(counter > 3):
        break
    last_training_data_indexes = data_indexes

##################################################################
Iteration = 0, L shape = (5473, 156)
All agree, Number of outliers = 55
All agree, Number of inliers = 3788
num of inliers = 3788
num of outliers = 55
num of outliers = 55
Training data shape:  (3843, 26)
Training data F-1 0.48
F-1 score from LR: 0.38451612903225807
(3843, 10)
(3843,)
F-1 score from SVM: 0.492436974789916
length of prediction_high_conf_outliers: 57
length of prediction high conf inliers:  4587
0.09554116067589043
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 21 22 23 24
 25]
[[ 0 10]
 [10 20]
 [20 24]
 [24 25]]
[[  0  60]
 [ 60 120]
 [120 144]
 [144 150]]
##################################################################
Iteration = 1, L shape = (5473, 150)
All agree, Number of outliers = 55
All agree, Number of inliers = 3788
num of inliers = 3787
num of outliers = 57
num of outliers = 57
Training data shape:  (3844, 25)
Training data F-1 0.48684210526315785
F-1 score from LR: 0.38501291

F-1 score from SVM: 0.5046888320545609
length of prediction_high_conf_outliers: 119
length of prediction high conf inliers:  4567
0.27412521188708133
[ 3  4  5  6  7  8  9 25]
[[0 7]
 [7 7]
 [7 7]
 [7 8]]
[[ 0 42]
 [42 42]
 [42 42]
 [42 48]]
##################################################################
Iteration = 14, L shape = (5473, 48)
All agree, Number of outliers = 116
All agree, Number of inliers = 4202
num of inliers = 4178
num of outliers = 132
num of outliers = 132
Training data shape:  (4310, 8)
Training data F-1 0.48684210526315785
F-1 score from LR: 0.47110141766630315
(4310, 10)
(4310,)
F-1 score from SVM: 0.5184549356223176
length of prediction_high_conf_outliers: 133
length of prediction high conf inliers:  4571
0.34669607631575117
[ 4  5  6  7  8  9 25]
[[0 6]
 [6 6]
 [6 6]
 [6 7]]
[[ 0 36]
 [36 36]
 [36 36]
 [36 42]]
##################################################################
Iteration = 15, L shape = (5473, 42)
All agree, Number of outliers = 143
All agree

F-1 score from SVM: 0.5945512820512819
length of prediction_high_conf_outliers: 253
length of prediction high conf inliers:  4456
##################################################################
Iteration = 29, L shape = (5473, 6)
All agree, Number of outliers = 300
All agree, Number of inliers = 4673
num of inliers = 4373
num of outliers = 300
num of outliers = 281
Training data shape:  (4654, 1)
Training data F-1 0.6427350427350428
F-1 score from LR: 0.5015608740894901
(4654, 10)
(4654,)
F-1 score from SVM: 0.5945512820512819
length of prediction_high_conf_outliers: 253
length of prediction high conf inliers:  4459
##################################################################
Iteration = 30, L shape = (5473, 6)
All agree, Number of outliers = 300
All agree, Number of inliers = 4673
num of inliers = 4376
num of outliers = 300
num of outliers = 281
Training data shape:  (4657, 1)
Training data F-1 0.6427350427350428
F-1 score from LR: 0.5015608740894901
(4657, 10)
(4657,)
F-1 sc

In [27]:
index_range = np.array([[0, 60], [60, 120], [120, 150], [150, 156]])
coef_index_range = np.array([[0, 10], [10, 20], [20, 25], [25, 26]])
coef_remain_index = range(156)

scores_for_training_indexes = []
for i in range(len(index_range)):
    start=index_range[i][0]
    temp_range = coef_index_range[i][1]-coef_index_range[i][0]
    scores_for_training_indexes  = scores_for_training_indexes + list(range(start, start+temp_range))
print(scores_for_training_indexes) 
scores_for_training = scores[:, np.array(scores_for_training_indexes)]
print(np.shape(scores_for_training))

transformer = RobustScaler().fit(scores_for_training)
scores_transformed = transformer.transform(scores_for_training)
training_data = scores_transformed[data_indexes]
print('Training data shape: ', np.shape(training_data))
training_data_F1.append(metrics.f1_score(y[data_indexes], labels))
print('Training data F-1', metrics.f1_score(y[data_indexes], labels))

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
clf = LogisticRegression(random_state=0, penalty='l2', max_iter=max_iter).fit(training_data, labels) 
clf_predictions = clf.predict(scores_transformed)
clf_predict_proba = clf.predict_proba(scores_transformed)[:,1]
print("F-1 score from LR:",metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba > 0.5])))

transformer = RobustScaler().fit(X)
X_transformed = transformer.transform(X)
X_training_data = X_transformed[data_indexes]
print(np.shape(X_training_data))
print(np.shape(labels))

clf_X = SVC(gamma='auto', probability=True, random_state=0)
clf_X.fit(X_training_data, labels)
clf_predictions_X = clf_X.predict(X_transformed)
clf_predict_proba_X = clf_X.predict_proba(X_transformed)[:,1]
print("F-1 score from SVM:",metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba_X > 0.5])))
cur_f1_scores.append(metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba_X > 0.5])))

agreed_outlier_indexes = np.where(np.sum(L,axis=1)==np.shape(L)[1])[0]
agreed_inlier_indexes = np.where(np.sum(L,axis=1)==0)[0]

prediction_result_list.append(clf_predict_proba)
classifier_result_list.append(clf_predict_proba_X)

prediction_list.append(np.array([int(i) for i in clf_predictions]))

prediction_high_conf_outliers = np.intersect1d(np.where(prediction_result_list[-1] > high_confidence_threshold)[0],
                                               np.where(classifier_result_list[-1] > high_confidence_threshold)[0])
print('length of prediction_high_conf_outliers:' , len(prediction_high_conf_outliers))
prediction_high_conf_inliers = np.intersect1d(np.where(prediction_result_list[-1] < low_confidence_threshold)[0],
                                               np.where(classifier_result_list[-1] < low_confidence_threshold)[0])
print('length of prediction high conf inliers: ', len(prediction_high_conf_inliers))

temp_prediction = np.array([int(i) for i in prediction_result_list[-1] > 0.5])
temp_classifier = np.array([int(i) for i in classifier_result_list[-1] > 0.5])
prediction_classifier_disagree = np.where(temp_prediction != temp_classifier)[0]

L = L_prev
remain_params_tracking = np.array(range(0,np.max(coef_index_range)))

if np.max(coef_index_range) >= 2:
    if(len(prediction_high_conf_outliers) > 0 and len(prediction_high_conf_inliers) > 0):
        new_data_indexes = np.concatenate((prediction_high_conf_outliers, prediction_high_conf_inliers), axis = 0)
        new_data_indexes = np.array([int(i) for i in new_data_indexes])
        new_labels = np.concatenate((np.ones(len(prediction_high_conf_outliers)), np.zeros(len(prediction_high_conf_inliers))), axis = 0)
        clf_prune_2 = LogisticRegression(random_state=0, penalty='l2', max_iter=max_iter).fit(scores_transformed[new_data_indexes], new_labels) 
        print("F-1 score from both LR and SVM:",metrics.f1_score(y, np.array([int(i) for i in clf_prune_2.predict_proba(scores_transformed)[:,1] > 0.5])))
        print('Coef from both LR and SVM: ', clf_prune_2.coef_[0])
        combined_coef = clf_prune_2.coef_[0]  
    else:
        print('Coef from normal training: ', clf.coef_[0])
        combined_coef = clf.coef_[0]
        print('Combined Coef: ',  combined_coef)

    if(np.max(coef_index_range) > 2 or 
       ((np.max(combined_coef)/np.min(combined_coef) >= 1.1) and np.max(coef_index_range) >= 2)):
        if(len(set(combined_coef)) > 1):
            cur_clf_coef = combined_coef 
            cutoff = max(max(0, np.mean(combined_coef)-np.std(combined_coef)),min(combined_coef))
            print(cutoff)

            remain_indexes_after_cond = (cur_clf_coef > cutoff) #np.logical_and(cur_clf_coef > cutoff, abs(cur_clf_coef) > 0.01) # # 
            remain_params_tracking = remain_params_tracking[remain_indexes_after_cond]
            print(remain_params_tracking)
            remain_indexes_after_cond_expanded = []
            for i in range(0, len(coef_index_range)): #
                s_e_range = coef_index_range[i,1]-coef_index_range[i,0]
                s1, e1 = coef_index_range[i,0], coef_index_range[i,1]
                s2, e2 = index_range[i,0], index_range[i,1]
                saved_indexes = np.where(cur_clf_coef[s1:e1] > cutoff)[0]
                for j in range(N_size):
                    remain_indexes_after_cond_expanded.extend(np.array(saved_indexes) + j * s_e_range + s2)

            new_coef_index_range_seq = []
            for i in range(0, len(coef_index_range)): #
                s, e = coef_index_range[i,0], coef_index_range[i,1]
                new_coef_index_range_seq.append(sum((remain_indexes_after_cond)[s:e]))

            coef_index_range = []
            index_range = []
            cur_sum = 0
            for i in range(0, len(new_coef_index_range_seq)):
                coef_index_range.append([cur_sum, cur_sum + new_coef_index_range_seq[i]])
                index_range.append([cur_sum * 6, 6 * (cur_sum + new_coef_index_range_seq[i])])
                cur_sum += new_coef_index_range_seq[i]

            coef_index_range = np.array(coef_index_range)
            index_range = np.array(index_range)
            print(coef_index_range)
            print(index_range)

            L=L[:,remain_indexes_after_cond_expanded]
            scores_for_training = scores_for_training[:, remain_indexes_after_cond]



[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 120, 121, 122, 123, 124, 150]
(5473, 26)
Training data shape:  (4705, 26)
Training data F-1 0.6427350427350428
F-1 score from LR: 0.5048923679060664
(4705, 10)
(4705,)
F-1 score from SVM: 0.6059113300492611
length of prediction_high_conf_outliers: 302
length of prediction high conf inliers:  4528
F-1 score from both LR and SVM: 0.5128712871287128
Coef from both LR and SVM:  [-0.00808495  0.15602076  0.30290388  0.44906138  0.51603012  0.57574151
  0.58377619  0.53913469  0.42653064  0.36604707 -0.06788897  0.01035462
  0.04232272  0.03985595  0.03900475  0.0346761   0.02963501  0.02012668
  0.01433287  0.01370137  0.16081188  0.17225025  0.16494419  0.16661363
  0.15867251  0.31411491]
0
[ 1  2  3  4  5  6  7  8  9 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25]
[[ 0  9]
 [ 9 18]
 [18 23]
 [23 24]]
[[  0  54]
 [ 54 108]
 [108 138]
 [138 144]]


In [28]:
last_training_data_indexes = []
counter = 0

for i_range in range(0, 50):
    print("##################################################################")
    print('Iteration = {}, L shape = {}'.format(i_range, np.shape(L)))
    num_methods = np.shape(L)[1]

#     agree_outlier_indexes = (np.sum(L,axis=1)==np.shape(L)[1])
#     print('All agree, Number of outliers = {}'.format(sum(agree_outlier_indexes)))
#     agree_inlier_indexes = (np.sum(L,axis=1)==0)
#     print('All agree, Number of inliers = {}'.format(sum(agree_inlier_indexes)))

#     all_inlier_indexes = np.union1d(np.setdiff1d(np.where(agree_inlier_indexes)[0], prediction_high_conf_outliers), prediction_high_conf_inliers)
#     print('num of inliers = {}'.format(np.shape(all_inlier_indexes)[0]))
#     all_outlier_indexes = np.union1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)
#     print('num of outliers = {}'.format(np.shape(all_outlier_indexes)[0]))

#     disagree_indexes = np.where(np.logical_or(np.sum(L,axis = 1)==0, np.sum(L,axis = 1)==num_methods)==0)[0]

    ########################################################################

    agree_outlier_indexes = np.sum(L,axis=1)==np.shape(L)[1]
    print('All agree, Number of outliers = {}'.format(sum(agree_outlier_indexes)))
    agree_inlier_indexes = np.sum(L,axis=1)==0
    print('All agree, Number of inliers = {}'.format(sum(agree_inlier_indexes)))

    disagree_indexes = np.where(np.logical_or(np.sum(L,axis = 1)==0, np.sum(L,axis = 1)==num_methods)==0)[0]
    # print('Number of disagreed points = {}'.format(len(disagree_indexes)))
    # print('Number of disagreed points (true outliers) = {}'.format(sum(y[disagree_indexes] == 1)))
    # print('Number of disagreed points (true inliers) = {}'.format(sum(y[disagree_indexes] == 0)))

#     all_inlier_indexes = np.where(agree_inlier_indexes)[0]
    all_inlier_indexes = np.setdiff1d(np.where(agree_inlier_indexes)[0], prediction_high_conf_outliers)
    if len(prediction_high_conf_inliers) >0:
        all_inlier_indexes = np.intersect1d(np.setdiff1d(np.where(agree_inlier_indexes)[0], prediction_high_conf_outliers), prediction_high_conf_inliers)
    print('num of inliers = {}'.format(np.shape(all_inlier_indexes)[0]))
#     all_outlier_indexes = np.union1d(np.where(agree_outlier_indexes)[0], self_agree_index_list)

#     if(len(np.intersect1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)) > 0 and
#       (len(np.union1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)) > 2000)):
#         all_outlier_indexes = np.intersect1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)
#     else:
    all_outlier_indexes = np.union1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)
#     if(len(all_outlier_indexes) > 1000):
#         all_outlier_indexes = np.random.RandomState(1).permutation(all_outlier_indexes)[:1000]
        
#     all_outlier_indexes = np.union1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)
    print('num of outliers = {}'.format(np.shape(all_outlier_indexes)[0]))
    all_inlier_indexes = np.setdiff1d(all_inlier_indexes, prediction_classifier_disagree)
    
    self_agree_index_list = []
    if((len(all_outlier_indexes) == 0) or (len(all_inlier_indexes)/ len(all_outlier_indexes) > 1000)):
        for i in range(0, len(index_range)):
            if(index_range[i,1]-index_range[i,0] <= 6):
                continue
            temp_index = disagree_indexes[np.where(np.sum(L[disagree_indexes][:,index_range[i,0]: index_range[i,1]], axis = 1)==(index_range[i,1]-index_range[i,0]))[0]]
            self_agree_index_list = np.union1d(self_agree_index_list, temp_index)
        self_agree_index_list = [int(i) for i in self_agree_index_list]
#     self_agree_index_list = np.random.RandomState(1).permutation(self_agree_index_list)[:500]
    all_outlier_indexes = np.union1d(all_outlier_indexes, self_agree_index_list)
    all_outlier_indexes = np.setdiff1d(all_outlier_indexes, prediction_classifier_disagree)
    print('num of outliers = {}'.format(np.shape(all_outlier_indexes)[0]))
    
    
    from sklearn.preprocessing import RobustScaler
    from sklearn.preprocessing import StandardScaler
    data_indexes = np.concatenate((all_inlier_indexes, all_outlier_indexes), axis = 0)
    data_indexes = np.array([int(i) for i in data_indexes])
    labels = np.concatenate((np.zeros(len(all_inlier_indexes)), np.ones(len(all_outlier_indexes))), axis = 0)
    transformer = RobustScaler().fit(scores_for_training)
    scores_transformed = transformer.transform(scores_for_training)
    training_data = scores_transformed[data_indexes]
    print('Training data shape: ', np.shape(training_data))
    training_data_F1.append(metrics.f1_score(y[data_indexes], labels))
    print('Training data F-1', metrics.f1_score(y[data_indexes], labels))

    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC
#     clf = SVC(gamma='auto', probability=True, random_state=0)
#     clf.fit(training_data, labels)
    clf = LogisticRegression(random_state=0, penalty='l2', max_iter=max_iter).fit(training_data, labels) 
    clf_predictions = clf.predict(scores_transformed)
    clf_predict_proba = clf.predict_proba(scores_transformed)[:,1]
#     print("F-1 score from LR:",metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba > 0.5])))
    
    
    transformer = RobustScaler().fit(X)
    X_transformed = transformer.transform(X)
    X_training_data = X_transformed[data_indexes]
    print(np.shape(X_training_data))
    print(np.shape(labels))

    clf_X = SVC(gamma='auto', probability=True, random_state=0)
    clf_X.fit(X_training_data, labels)
    clf_predictions_X = clf_X.predict(X_transformed)
    clf_predict_proba_X = clf_X.predict_proba(X_transformed)[:,1]
    print("F-1 score from SVM:",metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba_X > 0.5])))
    print("precision score from SVM:",metrics.precision_score(y, np.array([int(i) for i in clf_predict_proba_X > 0.5])))
    print("recall score from SVM:",metrics.recall_score(y, np.array([int(i) for i in clf_predict_proba_X > 0.5])))
    cur_f1_scores.append(metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba_X > 0.5])))
        
    agreed_outlier_indexes = np.where(np.sum(L,axis=1)==np.shape(L)[1])[0]
    agreed_inlier_indexes = np.where(np.sum(L,axis=1)==0)[0]
        
    prediction_result_list.append(clf_predict_proba)
    classifier_result_list.append(clf_predict_proba_X)
    
    prediction_list.append(np.array([int(i) for i in clf_predictions]))
    
    prediction_high_conf_outliers = np.intersect1d(np.where(prediction_result_list[-1] > high_confidence_threshold)[0],
                                                   np.where(classifier_result_list[-1] > high_confidence_threshold)[0])
    print('length of prediction_high_conf_outliers:' , len(prediction_high_conf_outliers))
    prediction_high_conf_inliers = np.intersect1d(np.where(prediction_result_list[-1] < low_confidence_threshold)[0],
                                                   np.where(classifier_result_list[-1] < low_confidence_threshold)[0])
    print('length of prediction high conf inliers: ', len(prediction_high_conf_inliers))
    
    temp_prediction = np.array([int(i) for i in prediction_result_list[-1] > 0.5])
    temp_classifier = np.array([int(i) for i in classifier_result_list[-1] > 0.5])
    prediction_classifier_disagree = np.where(temp_prediction != temp_classifier)[0]
    
    if np.max(coef_index_range) >= 2:
        if(len(prediction_high_conf_outliers) > 0 and len(prediction_high_conf_inliers) > 0):
            new_data_indexes = np.concatenate((prediction_high_conf_outliers, prediction_high_conf_inliers), axis = 0)
            new_data_indexes = np.array([int(i) for i in new_data_indexes])
            new_labels = np.concatenate((np.ones(len(prediction_high_conf_outliers)), np.zeros(len(prediction_high_conf_inliers))), axis = 0)
            clf_prune_2 = LogisticRegression(random_state=0, penalty='l2', max_iter=max_iter).fit(scores_transformed[new_data_indexes], new_labels) 
            print("F-1 score from both LR and SVM:",metrics.f1_score(y, np.array([int(i) for i in clf_prune_2.predict_proba(scores_transformed)[:,1] > 0.5])))
            print('Coef from both LR and SVM: ', clf_prune_2.coef_[0])
            combined_coef = clf_prune_2.coef_[0]  
        else:
            print('Coef from normal training: ', clf.coef_[0])
            combined_coef = clf.coef_[0]
            print('Combined Coef: ',  combined_coef)

        if(np.max(coef_index_range) > 2 or 
           ((np.max(combined_coef)/np.min(combined_coef) >= 1.1) and np.max(coef_index_range) >= 2)):
            if(len(set(combined_coef)) > 1):
                cur_clf_coef = combined_coef 
                cutoff = max(max(0, np.mean(combined_coef)-np.std(combined_coef)),min(combined_coef))
                print(cutoff)

                remain_indexes_after_cond = (cur_clf_coef > cutoff) #np.logical_and(cur_clf_coef > cutoff, abs(cur_clf_coef) > 0.01) # # 
                remain_params_tracking = remain_params_tracking[remain_indexes_after_cond]
                print(remain_params_tracking)
                remain_indexes_after_cond_expanded = []
                for i in range(0, len(coef_index_range)): #
                    s_e_range = coef_index_range[i,1]-coef_index_range[i,0]
                    s1, e1 = coef_index_range[i,0], coef_index_range[i,1]
                    s2, e2 = index_range[i,0], index_range[i,1]
                    saved_indexes = np.where(cur_clf_coef[s1:e1] > cutoff)[0]
                    for j in range(N_size):
                        remain_indexes_after_cond_expanded.extend(np.array(saved_indexes) + j * s_e_range + s2)

                new_coef_index_range_seq = []
                for i in range(0, len(coef_index_range)): #
                    s, e = coef_index_range[i,0], coef_index_range[i,1]
                    new_coef_index_range_seq.append(sum((remain_indexes_after_cond)[s:e]))

                coef_index_range = []
                index_range = []
                cur_sum = 0
                for i in range(0, len(new_coef_index_range_seq)):
                    coef_index_range.append([cur_sum, cur_sum + new_coef_index_range_seq[i]])
                    index_range.append([cur_sum * 6, 6 * (cur_sum + new_coef_index_range_seq[i])])
                    cur_sum += new_coef_index_range_seq[i]

                coef_index_range = np.array(coef_index_range)
                index_range = np.array(index_range)
                print(coef_index_range)
                print(index_range)

                L=L[:,remain_indexes_after_cond_expanded]
                scores_for_training = scores_for_training[:, remain_indexes_after_cond]
    if((len(last_training_data_indexes) == len(data_indexes)) and 
       (sum(last_training_data_indexes == data_indexes) == len(data_indexes)) and 
       (np.max(coef_index_range) < 2)):
        counter =  counter + 1
    else:
        counter = 0
    if(counter > 3):
        break
    last_training_data_indexes = data_indexes

##################################################################
Iteration = 0, L shape = (5473, 144)
All agree, Number of outliers = 72
All agree, Number of inliers = 3973
num of inliers = 3959
num of outliers = 302
num of outliers = 302
Training data shape:  (4261, 24)
Training data F-1 0.6987522281639929
(4261, 10)
(4261,)
F-1 score from SVM: 0.5812220566318926
precision score from SVM: 0.49872122762148335
recall score from SVM: 0.6964285714285714
length of prediction_high_conf_outliers: 350
length of prediction high conf inliers:  4336
F-1 score from both LR and SVM: 0.5107577174929842
Coef from both LR and SVM:  [0.2047815  0.3553212  0.43243614 0.46878782 0.4900089  0.47701726
 0.43964934 0.36566207 0.33075212 0.10925449 0.14140703 0.13820321
 0.1284651  0.12403254 0.1188301  0.11247623 0.10959282 0.1067406
 0.18769317 0.19227322 0.19074616 0.19698304 0.18560817 0.39287225]
0.11328322192536236
[ 1  2  3  4  5  6  7  8  9 12 13 14 15 16 20 21 22 23 24 25]
[[ 0  9]
 [ 9 14]
 [14 

F-1 score from SVM: 0.5785498489425982
precision score from SVM: 0.5013089005235603
recall score from SVM: 0.6839285714285714
length of prediction_high_conf_outliers: 374
length of prediction high conf inliers:  4363
F-1 score from both LR and SVM: 0.5257249766136577
Coef from both LR and SVM:  [0.92041274 0.82644441 0.79600326 0.82519406 1.13191367]
0.7960032639634408
[ 3  5  7 25]
[[0 3]
 [3 3]
 [3 3]
 [3 4]]
[[ 0 18]
 [18 18]
 [18 18]
 [18 24]]
##################################################################
Iteration = 10, L shape = (5473, 24)
All agree, Number of outliers = 120
All agree, Number of inliers = 4247
num of inliers = 4182
num of outliers = 374
num of outliers = 374
Training data shape:  (4556, 4)
Training data F-1 0.6705202312138728
(4556, 10)
(4556,)
F-1 score from SVM: 0.5800604229607251
precision score from SVM: 0.5026178010471204
recall score from SVM: 0.6857142857142857
length of prediction_high_conf_outliers: 363
length of prediction high conf inliers:  4365
F

F-1 score from SVM: 0.6335504885993486
precision score from SVM: 0.5823353293413174
recall score from SVM: 0.6946428571428571
length of prediction_high_conf_outliers: 251
length of prediction high conf inliers:  4496
##################################################################
Iteration = 24, L shape = (5473, 6)
All agree, Number of outliers = 300
All agree, Number of inliers = 4673
num of inliers = 4413
num of outliers = 300
num of outliers = 281
Training data shape:  (4694, 1)
Training data F-1 0.6585365853658537
(4694, 10)
(4694,)
F-1 score from SVM: 0.6351791530944625
precision score from SVM: 0.5838323353293413
recall score from SVM: 0.6964285714285714
length of prediction_high_conf_outliers: 251
length of prediction high conf inliers:  4498
##################################################################
Iteration = 25, L shape = (5473, 6)
All agree, Number of outliers = 300
All agree, Number of inliers = 4673
num of inliers = 4414
num of outliers = 300
num of outliers = 

### Use GT label to train a classifier

In [15]:
# random select the same number of labels
# data_indexes = np.random.permutation(len(y))[:len(data_indexes)]
# train a SVM classifier
clf_X = SVC(gamma='auto', probability=True, random_state=0)
transformer = RobustScaler().fit(X)
X_transformed = transformer.transform(X)
X_training_data = X_transformed[data_indexes]
clf_X.fit(X_training_data, y[data_indexes])
clf_predictions_X = clf_X.predict(X_transformed)
clf_predict_proba_X = clf_X.predict_proba(X_transformed)[:,1]
print("F-1 score from SVM:",metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba_X > 0.5])))

F-1 score from SVM: 0.7996389891696751
