In [1]:
import numpy as np
from scipy.io import arff
import pandas as pd
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn import metrics
import scipy as sp
import logging
from sklearn.neighbors import NearestNeighbors
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
    
def run_lof(X, y, k=60):
    clf = LocalOutlierFactor(n_neighbors=k)
    clf.fit(X)
    lof_scores = -clf.negative_outlier_factor_
    return lof_scores

def get_predictions(scores, num_outliers = 400, method_name = 'LOF'):
    threshold = np.sort(scores)[::-1][num_outliers]
    # threshold, max_f1 = get_best_f1_score(y, lof_scores)
    predictions = np.array(scores > threshold)
    predictions = np.array([int(i) for i in predictions])
#     print('F1 for {} : {}'.format(method_name, metrics.f1_score(y, predictions)))
    return predictions, scores, metrics.f1_score(y, predictions)

def get_best_F1(scores):
    best_f1 = 0
    for i in range(np.shape(scores)[0]):
        threshold = np.sort(scores)[::-1][i]
        predictions = np.array(scores > threshold)
        predictions = np.array([int(i) for i in predictions])
        cur_f1 = metrics.f1_score(y, predictions)
        best_f1 = max(cur_f1, best_f1)
    return best_f1

def run_knn(X, y, k=60):
    neigh = NearestNeighbors(n_neighbors=k)
    neigh.fit(X)
    knn_dists = neigh.kneighbors(X)[0][:,-1]
    return knn_dists

def run_isolation_forest(X, y, max_features = 1.0):
    # training the model
    clf = IsolationForest(random_state=42,max_features=max_features)
    clf.fit(X)
    # predictions
    sklearn_score_anomalies = clf.decision_function(X)
    if_scores = [-1*s + 0.5 for s in sklearn_score_anomalies]
    return if_scores

def mahalanobis(x):
    """Compute the Mahalanobis Distance between each row of x and the data
    """
    x_minus_mu = x - np.mean(x)
    cov = np.cov(x.T)
    det = np.linalg.det(cov)
    if det != 0:
        inv_covmat = sp.linalg.inv(cov)
    else:
        inv_covmat = sp.linalg.pinv(cov)
    results = []
    x_minus_mu = np.array(x_minus_mu)
    for i in range(np.shape(x)[0]):
        cur_data = x_minus_mu[i,:]
        results.append(np.dot(np.dot(x_minus_mu[i,:], inv_covmat), x_minus_mu[i,:].T))
    return np.array(results)
#     left_term = np.dot(x_minus_mu, inv_covmat)
#     mahal = np.dot(left_term, x_minus_mu.T)
#     print(mahal.diagonal())
#     return mahal.diagonal()

def run_mahalanobis(X, y):
    # training the model
    dist = mahalanobis(x=X)
    return dist

def load_dataset(filename):
    with open(filename, 'r') as f:
        data, meta = arff.loadarff(f)
    data = pd.DataFrame(data)
    X = data.drop(columns=['id', 'outlier'])
    # Map dataframe to encode values and put values into a numpy array
    y = data["outlier"].map(lambda x: 1 if x == b'yes' else 0).values
    return X, y

### Load SpamBase dataset

In [2]:
filename = './SpamBase_withoutdupl_norm_40.arff'
X, y = load_dataset(filename=filename)
print(np.shape(X), np.shape(y))
K = 9
N = 1679
class_balance = [1- N/4207.0, N/4207.0]
lof_krange = list(range(10,110,10)) * 6
knn_krange = list(range(10,110,10)) * 6
if_range = [0.5, 0.6, 0.7, 0.8, 0.9] * 6
# mahalanobis_N_range=[N]
mahalanobis_N_range = [1400, 1500, 1600, 1700, 1800, 1900]
if_N_range = np.sort(mahalanobis_N_range * 5)
N_range = np.sort(mahalanobis_N_range * 10) 
print(N/len(y))

(4207, 57) (4207,)
0.39909674352270025


### Load pageblock dataset

In [2]:
filename = './PageBlocks/PageBlocks_norm_10.arff'
X, y = load_dataset(filename=filename)
print(np.shape(X), np.shape(y))
K = 80
N = 560
# num_outliers = [N, N, N, N]
class_balance = [0.9, 0.1]
# lof_krange = [55, 60, 65, 70, 75] 
# knn_krange = [55, 60, 65, 70, 75] 
# if_range = [0.5, 0.6,0.7, 0.8,0.9]
lof_krange = list(range(10,110,10)) * 6
knn_krange = list(range(10,110,10)) * 6
# lof_krange = range(70,90,4) 
# knn_krange = [60, 70, 80, 90, 100] 
if_range = [0.5, 0.6, 0.7, 0.8, 0.9] * 6
# mahalanobis_N_range=[560]
mahalanobis_N_range = [300, 400, 500, 600, 700, 800]
# mahalanobis_N_range = [550, 560, 570, 580, 590, 600]
N_size = 6
if_N_range = np.sort(mahalanobis_N_range * 5)
N_range = np.sort(mahalanobis_N_range * 10)

(5473, 10) (5473,)


### Load Pima dataset

In [220]:
filename = './Pima_withoutdupl_norm_35.arff'
X, y = load_dataset(filename=filename)
print(np.shape(X), np.shape(y))
K = 100
N = 268
print(N/len(y))
num_outliers = [N, N, N, N]
class_balance = [1- N/768.0, N/768.0]
lof_krange = list(range(10,210,10)) * 6
knn_krange = list(range(10,210,10)) * 6
if_range = [0.5, 0.6, 0.7, 0.8, 0.9] * 6
mahalanobis_N_range=[N]
mahalanobis_N_range = [220,230,240,250,260,270]

if_N_range = np.sort(mahalanobis_N_range * 5)
N_range = np.sort(mahalanobis_N_range * 20)

(768, 8) (768,)
0.3489583333333333


### ALOI dataset

In [34]:
filename = './ALOI_withoutdupl_norm.arff'
X, y = load_dataset(filename=filename)
print(np.shape(X), np.shape(y))
print(sum(y))
N = 1508
num_outliers = [N, N, N, N]
class_balance = [1- N/49534.0, N/49534.0]
lof_krange = list(range(10,110,10)) * 6
knn_krange = list(range(10,110,10)) * 6
if_range = [0.5, 0.6, 0.7, 0.8, 0.9] * 6
# mahalanobis_N_range=[N]
mahalanobis_N_range=[1500, 2000, 2500, 3000, 3500, 4000]
if_N_range = np.sort(mahalanobis_N_range * 5)
N_range = np.sort(mahalanobis_N_range *10)

(49534, 27) (49534,)
1508


### Load InternetAds

In [1256]:
filename = './InternetAds_withoutdupl_norm_19.arff'
X, y = load_dataset(filename=filename)
print(np.shape(X), np.shape(y))
print(sum(y))
N = 368
num_outliers = [N, N, N, N]
class_balance = [1- N/1966.0, N/1966.0]
lof_krange = list(range(5,55,5)) * 6
knn_krange = list(range(5,55,5)) * 6
if_range = [0.5, 0.6, 0.7, 0.8, 0.9] * 6
# mahalanobis_N_range=[N]
mahalanobis_N_range = [300, 350, 400, 450, 500, 550]
if_N_range = np.sort(mahalanobis_N_range * 5)
N_range = np.sort(mahalanobis_N_range *10)

(1966, 1555) (1966,)
368


### Load KDDCup 99 dataset

In [4]:
filename = './KDDCup99_withoutdupl_norm_catremoved.arff'
X, y = load_dataset(filename=filename)
print(np.shape(X), np.shape(y))
print(sum(y))
N = 200
num_outliers = [N, N, N, N]
class_balance = [1- N/48113.0, N/48113.0]
lof_krange = list(range(10,110,10)) * 6
knn_krange = list(range(10,110,10)) * 6
if_range = [0.5, 0.6, 0.7, 0.8, 0.9] * 6
mahalanobis_N_range=[500,1000,1500,2000,2500,3000]
# mahalanobis_N_range = [20, 40, 60,80, 100,120]
if_N_range = np.sort(mahalanobis_N_range * 5)
N_range = np.sort(mahalanobis_N_range *10)

(48113, 40) (48113,)
200


In [2]:
filename='kdd99-unsupervised-ad.csv'
import pandas as pd
data = pd.read_csv(filename, header=None)
X = data.drop(columns=[29])
print(np.shape(np.array(X)))
# Map dataframe to encode values and put values into a numpy array
y = data[29].map(lambda x: 1 if x == 'o' else 0).values
print(sum(y))

lof_krange = list(range(10,110,10)) * 6
knn_krange = list(range(10,110,10)) * 6
if_range = [0.5, 0.6, 0.7, 0.8, 0.9] * 6
mahalanobis_N_range=[1000,1500,2000,2500,3000,3500]
if_N_range = np.sort(mahalanobis_N_range * 5)
N_range = np.sort(mahalanobis_N_range *10)

(620098, 29)
1052


### Load shuttle dataset

In [37]:
import hdf5storage
mat = hdf5storage.loadmat('shuttle.mat')
X = mat['X']
y = mat['y']
print(len(y))
print(np.sum(y)/len(y))
lof_krange = list(range(10,110,10)) * 6
knn_krange = list(range(10,110,10)) * 6
if_range = [0.5, 0.6, 0.7, 0.8, 0.9] * 6
mahalanobis_N_range=[1000,1500,2000, 2500,3000, 3500]
# mahalanobis_N_range = [20, 40, 60,80, 100,120]
if_N_range = np.sort(mahalanobis_N_range * 5)
N_range = np.sort(mahalanobis_N_range *10)
print(np.shape(X))

# normalize
# from sklearn.preprocessing import Normalizer
# transformer = Normalizer().fit(X) 
# X = transformer.transform(X)

49097
0.0715114976475141
(49097, 9)


### Load mulcross dataset

In [184]:
filename = './mulcross.arff'

with open(filename, 'r') as f:
    data, meta = arff.loadarff(f)
data = pd.DataFrame(data)
X = data.drop(columns=['Target'])
y = data["Target"].map(lambda x: 1 if x == b'Anomaly' else 0).values
# X, y = load_dataset(filename=filename)
print(np.shape(X), np.shape(y))
print(sum(y))
print(sum(y)/len(y))
lof_krange = list(range(10,110,10)) * 6
knn_krange = list(range(10,110,10)) * 6
if_range = [0.5, 0.6, 0.7, 0.8, 0.9] * 6
mahalanobis_N_range=[20000, 22000, 24000, 26000, 28000, 30000]
# mahalanobis_N_range = [20, 40, 60,80, 100,120]
if_N_range = np.sort(mahalanobis_N_range * 5)
N_range = np.sort(mahalanobis_N_range *10)

(262144, 4) (262144,)
26214
0.09999847412109375


### Load HTTP dataset

In [3]:
import hdf5storage
mat = hdf5storage.loadmat('http.mat')
X = mat['X']
y = mat['y']
print(len(y))
print(np.sum(y))
print(np.sum(y)/len(y))
lof_krange = list(range(10,110,10)) * 6
knn_krange = list(range(10,110,10)) * 6
if_range = [0.5, 0.6, 0.7, 0.8, 0.9] * 6
# mahalanobis_N_range=[1500, 2000, 2500, 3000, 3500, 4000]
mahalanobis_N_range=[5000, 10000, 15000,20000, 25000, 30000]
# mahalanobis_N_range=[10000, 15000, 20000, 25000, 30000, 35000]
# mahalanobis_N_range = [20, 40, 60,80, 100,120]
if_N_range = np.sort(mahalanobis_N_range * 5)
N_range = np.sort(mahalanobis_N_range *10)

# # remove duplicates
# newdata = pd.DataFrame(np.concatenate((X,y), axis = 1)).drop_duplicates()
# X = newdata[[0,1,2]].values
# y = np.array([1 if i==1.0 else 0 for i in newdata[[3]].values])
# print('Remove duplicates: ', len(y))

# # normalize
# from sklearn.preprocessing import Normalizer
# transformer = Normalizer().fit(X) 
# X = transformer.transform(X)

567498
2211.0
0.003896048972859816


### Load ForestCover Dataset

In [3]:
import hdf5storage
import pickle
dataset = pickle.load(open("cover_dataset.pickle", "rb"))
X = dataset['X']
y = dataset['y']
print(np.sum(y))
print(len(y))
lof_krange = list(range(10,110,10)) * 6
knn_krange = list(range(10,110,10)) * 6
if_range = [0.5, 0.6, 0.7, 0.8, 0.9] * 6
mahalanobis_N_range=[5000, 8000, 10000, 12000, 15000, 18000]
# mahalanobis_N_range = [20, 40, 60,80, 100,120]
if_N_range = np.sort(mahalanobis_N_range * 5)
N_range = np.sort(mahalanobis_N_range *10)

from sklearn.preprocessing import RobustScaler
transformer = RobustScaler().fit(X)
X_transformed = transformer.transform(X)

2747
286048


### Load Annthyroid dataset

In [185]:
filename = './Annthyroid_withoutdupl_norm_07.arff'
X, y = load_dataset(filename=filename)
print(np.shape(X), np.shape(y))
print(sum(y)/len(y))
N = 534
num_outliers = [N, N, N, N]
class_balance = [1- N/7129.0, N/7129.0]
lof_krange = list(range(10,110,10)) * 6
knn_krange = list(range(10,110,10)) * 6
if_range = [0.5, 0.6, 0.7, 0.8, 0.9] * 6
# mahalanobis_N_range=[N]
mahalanobis_N_range=[300, 400,500,600,700,800]
if_N_range = np.sort(mahalanobis_N_range * 5)
N_range = np.sort(mahalanobis_N_range *10)

(7129, 21) (7129,)
0.07490531631364848


### Load Musk dataset

In [49]:
import hdf5storage
mat = hdf5storage.loadmat('musk.mat')
X = mat['X']
y = mat['y']
print(len(y))
print(np.sum(y))
print(np.sum(y)/len(y))
lof_krange = list(range(10,110,10)) * 6
knn_krange = list(range(10,110,10)) * 6
if_range = [0.5, 0.6, 0.7, 0.8, 0.9] * 6
mahalanobis_N_range=[100, 120, 140, 160, 180, 200]
# mahalanobis_N_range = [20, 40, 60,80, 100,120]
if_N_range = np.sort(mahalanobis_N_range * 5)
N_range = np.sort(mahalanobis_N_range *10)
print(np.shape(X))

3062
97.0
0.03167864141084259
(3062, 166)


### Load Satimage-2 dataset

In [67]:
import hdf5storage
mat = hdf5storage.loadmat('satimage-2.mat')
X = mat['X']
y = mat['y']
print(len(y))
print(np.sum(y))
print(np.sum(y)/len(y))
lof_krange = list(range(10,110,10)) * 6
knn_krange = list(range(10,110,10)) * 6
if_range = [0.5, 0.6, 0.7, 0.8, 0.9] * 6
mahalanobis_N_range=[60, 80, 100, 120, 140, 160]
# mahalanobis_N_range = [20, 40, 60,80, 100,120]
if_N_range = np.sort(mahalanobis_N_range * 5)
N_range = np.sort(mahalanobis_N_range *10)
print(np.shape(X))

5803
71.0
0.0122350508357746
(5803, 36)


### Load Pendigits Dataset

In [None]:
import hdf5storage
mat = hdf5storage.loadmat('pendigits.mat')
X = mat['X']
y = mat['y']
print(len(y))
print(np.sum(y))
print(np.sum(y)/len(y))
lof_krange = list(range(10,110,10)) * 6
knn_krange = list(range(10,110,10)) * 6
if_range = [0.5, 0.6, 0.7, 0.8, 0.9] * 6
mahalanobis_N_range=[150,200,250,300,350,400]
# mahalanobis_N_range = [20, 40, 60,80, 100,120]
if_N_range = np.sort(mahalanobis_N_range * 5)
N_range = np.sort(mahalanobis_N_range *10)
print(np.shape(X))

### Compute Outlier Scores

In [3]:
all_results = []
all_scores = []
f1s = []

temp_lof_results = dict()
unique_lof_ks = list(set(lof_krange)) 
for k in unique_lof_ks:
#     print(k)
    lof_scores = run_lof(X, y, k=k)
    temp_lof_results[k] = lof_scores
for i in range(len(lof_krange)):
    lof_predictions, lof_scores, f1 = get_predictions(temp_lof_results[lof_krange[i]], num_outliers=N_range[i], method_name='LOF')
    all_results.append(lof_predictions)
    all_scores.append(lof_scores)
    f1s.append(f1)
best_lof_f1 = 0
for i in np.sort(unique_lof_ks):
    temp_f1 = max(np.array(f1s[0:60])[np.where(np.array(lof_krange) == i)[0]])
    print('LOF k = {}, best F-1 = {}'.format(i, temp_f1))
    best_lof_f1 = max(best_lof_f1, temp_f1)
print('Best LOF F-1 = {}'.format(best_lof_f1))

temp_knn_results = dict()
unique_knn_ks = list(set(knn_krange)) 
for k in unique_knn_ks:
    print(k)
    knn_scores = run_knn(X, y, k=k)
    temp_knn_results[k] = knn_scores
for i in range(len(knn_krange)):
    knn_predictions, knn_scores,f1 = get_predictions(temp_knn_results[knn_krange[i]], num_outliers=N_range[i], method_name='KNN')
    all_results.append(knn_predictions)
    all_scores.append(knn_scores)
    f1s.append(f1)
best_knn_f1 = 0
for i in np.sort(unique_knn_ks):
    temp_f1 = max(np.array(f1s[60:120])[np.where(np.array(knn_krange) == i)[0]])
    print('KNN k = {}, best F-1 = {}'.format(i, temp_f1))
    best_knn_f1 = max(best_knn_f1, temp_f1)
print('Best KNN F-1 = {}'.format(best_knn_f1))
    
temp_if_results = dict()
unique_if_features = list(set(if_range)) 
for k in unique_if_features:
    print(k)
    if_scores = run_isolation_forest(X, y, max_features=k)
    temp_if_results[k] = if_scores
for i in range(len(if_range)):
    if_predictions, if_scores,f1 = get_predictions(temp_if_results[if_range[i]], num_outliers=N_range[i], method_name='IF')
    all_results.append(if_predictions)
    all_scores.append(if_scores)
    f1s.append(f1)
best_if_f1 = 0
for i in np.sort(unique_if_features):
    temp_f1 = max(np.array(f1s[120:150])[np.where(np.array(if_range) == i)[0]])
    print('IF = {}, best F-1 = {}'.format(i, temp_f1))
    best_if_f1 = max(best_if_f1, temp_f1)
print('Best IF F-1 = {}'.format(best_if_f1))
    
mahalanobis_scores = run_mahalanobis(X, y)
best_mahala_f1 = 0
for i in range(len(mahalanobis_N_range)):
    mahalanobis_predictions,mahalanobis_scores,f1 = get_predictions(mahalanobis_scores, num_outliers=mahalanobis_N_range[i], method_name='mahala')
    all_results.append(mahalanobis_predictions)
    all_scores.append(mahalanobis_scores)
    best_mahala_f1 = max(best_mahala_f1, f1)
    f1s.append(f1)
print('mahalanobis = {}'.format(max(np.array(f1s[150:]))))
print('Best Mahala F-1 = {}'.format(best_mahala_f1))
L = np.stack(all_results).T
scores = np.stack(all_scores).T

LOF k = 10, best F-1 = 0.39999999999999997
LOF k = 20, best F-1 = 0.4520833333333334
LOF k = 30, best F-1 = 0.42499999999999993
LOF k = 40, best F-1 = 0.4547169811320755
LOF k = 50, best F-1 = 0.49056603773584906
LOF k = 60, best F-1 = 0.5020833333333333
LOF k = 70, best F-1 = 0.5018867924528301
LOF k = 80, best F-1 = 0.5120689655172415
LOF k = 90, best F-1 = 0.4862068965517241
LOF k = 100, best F-1 = 0.4773584905660378
Best LOF F-1 = 0.5120689655172415
100
70
40
10
80
50
20
90
60
30
KNN k = 10, best F-1 = 0.39841269841269844
KNN k = 20, best F-1 = 0.42698412698412697
KNN k = 30, best F-1 = 0.42698412698412697
KNN k = 40, best F-1 = 0.43333333333333335
KNN k = 50, best F-1 = 0.4317460317460317
KNN k = 60, best F-1 = 0.42758620689655175
KNN k = 70, best F-1 = 0.4241379310344828
KNN k = 80, best F-1 = 0.4190476190476191
KNN k = 90, best F-1 = 0.4174603174603174
KNN k = 100, best F-1 = 0.41764705882352937
Best KNN F-1 = 0.43333333333333335
0.5
0.6
0.8
0.9
0.7
IF = 0.5, best F-1 = 0.409433

In [4]:
print(np.shape(L))
print(np.shape(scores))

(5473, 156)
(5473, 156)


### Majority Vote

In [37]:
mid = np.shape(L)[1]/2
predictions = np.full((len(y)), 0)
predictions[np.sum(L, axis = 1) > mid] = 1
print('F1 for MV:', metrics.f1_score(y, predictions))

F1 for MV: 0.10857538035961271


### Save and load outlier scores

In [151]:
import pickle
dataset_results = {'L': L, 'scores': scores, 'f1s': f1s}
pickle.dump(dataset_results, open("smtp.pickle", "wb" ))

In [3]:
# if the outlier scores have already been computed, load it here
import pickle
dataset_results = pickle.load(open("kdd_large.pickle", "rb"))
L = dataset_results['L']
scores = dataset_results['scores']
f1s = dataset_results['f1s']

In [5]:
L_prev = L
scores_prev = scores

In [6]:
L = L_prev
scores = scores_prev

In [7]:
print(max(f1s)) 
print(np.shape(L))

0.5411764705882354
(5473, 156)


In [8]:
prediction_result_list = []
classifier_result_list = []
prediction_list = []
cur_f1_scores = []
prediction_high_conf_outliers = np.array([])
prediction_high_conf_inliers = np.array([])
prediction_classifier_disagree = np.array([])

In [9]:
index_range = np.array([[0, 60], [60, 120], [120, 150], [150, 156]])
coef_index_range = np.array([[0, 10], [10, 20], [20, 25], [25, 26]])
coef_remain_index = range(156)

In [10]:
# for i in range(60):
#     scores[scores[:,i] > 100,i] = 100

In [11]:
# index_range = np.array([[0, 120], [120, 240], [240, 270], [270, 276]])
# coef_index_range = np.array([[0, 20], [20, 40], [40, 45], [45, 46]])
# coef_remain_index = range(276)

In [12]:
# new_scores = []
# for i in range(np.shape(scores)[1]):
#     new_scores.append(np.argsort(np.argsort(scores[:,i]))/len(scores[:,i]))
# scores = np.stack(new_scores).T

In [13]:
# For knn, we run it on 10 different k values and 6 different Ns. However, we don't need to use all the 60 values 
# to train the two models, we just need to run the training on 10 columns with different k values. 
# (Because if k is fixed, the knn score is fixed. N is just used to determine whether the point is an outlier or not.)
scores_for_training_indexes = []
for i in range(len(index_range)):
    start=index_range[i][0]
    temp_range = coef_index_range[i][1]-coef_index_range[i][0]
    scores_for_training_indexes  = scores_for_training_indexes + list(range(start, start+temp_range))
print(scores_for_training_indexes) 
scores_for_training = scores[:, np.array(scores_for_training_indexes)]
print(np.shape(scores_for_training))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 120, 121, 122, 123, 124, 150]
(5473, 26)


In [11]:
def generate_coef_proba(coef):
    # weights --> ranking
    pos_ranking = np.argsort(np.argsort(coef)) + 1
#     print(pos_ranking)

    # ranking --> u_k
    num_coefs = len(coef)
    normalize_factor = np.sum([max(0, np.log(num_coefs + 1) - np.log(j)) for j in range(1, num_coefs + 1)])
    u_k = [max(0, np.log(num_coefs + 1) - np.log(j))/normalize_factor - 1/num_coefs for j in pos_ranking]
    
    # normalize u_k
    normalized_u_k = (u_k-min(u_k))/(max(u_k)-min(u_k))
#     print(normalized_u_k)

    # u_k to probabilities
    import scipy.stats
    pdfs = np.array([scipy.stats.norm(0, np.std(normalized_u_k)).pdf(j) for j in normalized_u_k])
#     preservation_rate = 0.8
#     probabilities = pdfs * preservation_rate * num_coefs/np.sum(pdfs)
    probabilities = (pdfs-min(pdfs))/(max(pdfs)-min(pdfs))
#     probabilities = pdfs
    print(probabilities)
    return probabilities

def filter_update_list(R, value_list):
    P = 2
    prediction = 0
    for new_value in value_list:
        K = P / (P + R)
        prediction = prediction + K * (new_value - prediction)
        P = (1 - K) * P
    return prediction

def get_kf_results(proba_list):
    results = []
    for i in range(np.shape(proba_list)[1]):
        results.append(filter_update_list(0.1, proba_list[:, i]))
    return np.array(results)

def generate_decision_on_proba(probabilities):
    return np.array([np.random.binomial(n=1, p = min(1, proba)) for proba in probabilities])
    

### Iterative train LR and classifier(SVM)

In [None]:
# stable version
# Please note, to determine whether the point is outliers/inliers, we use all the 60 values, to actually train the 
# models, we use 10 values with different ks. 
high_confidence_threshold = 0.99
low_confidence_threshold = 0.01
LR_threshold = 0.5
max_iter = 500
union_inliers = False
remain_params_tracking = np.array(range(0,np.max(coef_index_range)))
training_data_F1 = []
two_prediction_corr = []

min_max_diff = []
N_size = 6

last_training_data_indexes = []
counter = 0

for i_range in range(0, 50):
    print("##################################################################")
    print('Iteration = {}, L shape = {}'.format(i_range, np.shape(L)))
    num_methods = np.shape(L)[1]
    
    agree_outlier_indexes = np.sum(L,axis=1)==np.shape(L)[1]
#     print('All agree, Number of outliers = {}'.format(sum(agree_outlier_indexes)))
    agree_inlier_indexes = np.sum(L,axis=1)==0
#     print('All agree, Number of inliers = {}'.format(sum(agree_inlier_indexes)))

    disagree_indexes = np.where(np.logical_or(np.sum(L,axis = 1)==0, np.sum(L,axis = 1)==num_methods)==0)[0]
    
#     all_inlier_indexes = np.where(agree_inlier_indexes)[0]
    all_inlier_indexes = np.setdiff1d(np.where(agree_inlier_indexes)[0], prediction_high_conf_outliers)
    if len(prediction_high_conf_inliers) >0:
        if union_inliers:
            all_inlier_indexes = np.union1d(np.setdiff1d(np.where(agree_inlier_indexes)[0], prediction_high_conf_outliers), prediction_high_conf_inliers)
        else:
            all_inlier_indexes = np.intersect1d(np.setdiff1d(np.where(agree_inlier_indexes)[0], prediction_high_conf_outliers), prediction_high_conf_inliers)
    print('num of inliers = {}'.format(np.shape(all_inlier_indexes)[0]))
#     all_outlier_indexes = np.union1d(np.where(agree_outlier_indexes)[0], self_agree_index_list)

#     if(len(np.intersect1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)) > 0 and
#       (len(np.union1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)) > 2000)):
#         all_outlier_indexes = np.intersect1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)
#     else:
    all_outlier_indexes = np.union1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)
#     if(len(all_outlier_indexes) > 1000):
#         all_outlier_indexes = np.random.RandomState(1).permutation(all_outlier_indexes)[:1000]
        
#     all_outlier_indexes = np.union1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)
    print('num of outliers = {}'.format(np.shape(all_outlier_indexes)[0]))
    all_inlier_indexes = np.setdiff1d(all_inlier_indexes, prediction_classifier_disagree)
    
    self_agree_index_list = []
    if((len(all_outlier_indexes) == 0) or (len(all_inlier_indexes)/ len(all_outlier_indexes) > 1000)):
        for i in range(0, len(index_range)):
            if(index_range[i,1]-index_range[i,0] <= 6):
                continue
            temp_index = disagree_indexes[np.where(np.sum(L[disagree_indexes][:,index_range[i,0]: index_range[i,1]], axis = 1)==(index_range[i,1]-index_range[i,0]))[0]]
            self_agree_index_list = np.union1d(self_agree_index_list, temp_index)
        self_agree_index_list = [int(i) for i in self_agree_index_list]
#     self_agree_index_list = np.random.RandomState(1).permutation(self_agree_index_list)[:500]
    all_outlier_indexes = np.union1d(all_outlier_indexes, self_agree_index_list)
    all_outlier_indexes = np.setdiff1d(all_outlier_indexes, prediction_classifier_disagree)
    print('num of outliers = {}'.format(np.shape(all_outlier_indexes)[0]))
    
    
    from sklearn.preprocessing import RobustScaler
    from sklearn.preprocessing import StandardScaler
    data_indexes = np.concatenate((all_inlier_indexes, all_outlier_indexes), axis = 0)
    data_indexes = np.array([int(i) for i in data_indexes])
    labels = np.concatenate((np.zeros(len(all_inlier_indexes)), np.ones(len(all_outlier_indexes))), axis = 0)
    transformer = RobustScaler().fit(scores_for_training)
    scores_transformed = transformer.transform(scores_for_training)
    training_data = scores_transformed[data_indexes]
    print('Training data shape: ', np.shape(training_data))
    training_data_F1.append(metrics.f1_score(y[data_indexes], labels))
    print('Training data F-1', metrics.f1_score(y[data_indexes], labels))
    accurate_training = (np.array(y[data_indexes]) == np.array(labels))
    print('Training data accuracy:', sum(accurate_training)/len(labels))
    print('Training data outlier accuracy:', sum(y[data_indexes][-int(sum(labels)):])/sum(labels))
    
    transformer = RobustScaler().fit(X)
    X_transformed = transformer.transform(X)
    X_training_data = X_transformed[data_indexes]
    print(np.shape(X_training_data))
    print(np.shape(labels))
    
    from sklearn.svm import SVC
    clf_X = SVC(gamma='auto', probability=True, random_state=0)
    clf_X.fit(X_training_data, labels)
    clf_predictions_X = clf_X.predict(X_transformed)
    clf_predict_proba_X = clf_X.predict_proba(X_transformed)[:,1]
    SVM_threshold = 0.5
    print("F-1 score from SVM:",metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba_X > SVM_threshold])))
    cur_f1_scores.append(metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba_X > SVM_threshold])))
    print('Number of outliers by SVM:', sum(np.array([int(i) for i in clf_predict_proba_X > SVM_threshold])))
    
    from sklearn.linear_model import LogisticRegression
    clf = LogisticRegression(random_state=0, penalty='l2', max_iter=max_iter).fit(training_data, labels) 
    clf_predictions = clf.predict(scores_transformed)
    clf_predict_proba = clf.predict_proba(scores_transformed)[:,1]
#     LR_threshold = np.array(np.sort(clf_predict_proba)[::-1])[int(sum(clf_predictions_X))]
    print("F-1 score from LR:",metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba > LR_threshold])))
    print('Number of outliers by LR: ', sum(np.array([int(i) for i in clf_predict_proba > LR_threshold])))
    
    agreed_outlier_indexes = np.where(np.sum(L,axis=1)==np.shape(L)[1])[0]
    agreed_inlier_indexes = np.where(np.sum(L,axis=1)==0)[0]
        
    prediction_result_list.append(clf_predict_proba)
    classifier_result_list.append(clf_predict_proba_X)
    
    prediction_list.append(np.array([int(i) for i in clf_predictions]))
    
    prediction_high_conf_outliers = np.intersect1d(np.where(prediction_result_list[-1] > high_confidence_threshold)[0],
                                                   np.where(classifier_result_list[-1] > high_confidence_threshold)[0])
#     print('length of prediction_high_conf_outliers:' , len(prediction_high_conf_outliers))
    prediction_high_conf_inliers = np.intersect1d(np.where(prediction_result_list[-1] < low_confidence_threshold)[0],
                                                   np.where(classifier_result_list[-1] < low_confidence_threshold)[0])
#     print('length of prediction high conf inliers: ', len(prediction_high_conf_inliers))
    
    temp_prediction = np.array([int(i) for i in prediction_result_list[-1] > LR_threshold])
    temp_classifier = np.array([int(i) for i in classifier_result_list[-1] > SVM_threshold])
    prediction_classifier_disagree = np.where(temp_prediction != temp_classifier)[0]
#     print('length of prediction-classifier disagree: {}'.format(len(prediction_classifier_disagree)))
#     print('length of prediction-classifier disagree in training: {}'.format(len(np.where(temp_prediction[data_indexes] != temp_classifier[data_indexes])[0])))
    print(np.corrcoef(clf_predict_proba,clf_predict_proba_X))
    two_prediction_corr.append(np.corrcoef(clf_predict_proba,clf_predict_proba_X)[0,1])

    if np.max(coef_index_range) >= 2:
        if(len(prediction_high_conf_outliers) > 0 and len(prediction_high_conf_inliers) > 0):
            new_data_indexes = np.concatenate((prediction_high_conf_outliers, prediction_high_conf_inliers), axis = 0)
            new_data_indexes = np.array([int(i) for i in new_data_indexes])
            new_labels = np.concatenate((np.ones(len(prediction_high_conf_outliers)), np.zeros(len(prediction_high_conf_inliers))), axis = 0)
            clf_prune_2 = LogisticRegression(random_state=0, penalty='l2', max_iter=max_iter).fit(scores_transformed[new_data_indexes], new_labels) 
#             print("F-1 score from both LR and SVM:",metrics.f1_score(y, np.array([int(i) for i in clf_prune_2.predict_proba(scores_transformed)[:,1] > 0.5])))
#             print('Coef from both LR and SVM: ', clf_prune_2.coef_[0])
            combined_coef = clf_prune_2.coef_[0]  
        else:
#             print('Coef from normal training: ', clf.coef_[0])
            combined_coef = clf.coef_[0]
#             print('Combined Coef: ',  combined_coef)

        if(np.max(coef_index_range) >= 2 or 
           ((np.max(combined_coef)/np.min(combined_coef) >= 1.1) and np.max(coef_index_range) >= 2)):
            if(len(set(combined_coef)) > 1):
                cur_clf_coef = combined_coef 
                cutoff = max(max(0, np.mean(combined_coef)-np.std(combined_coef)),min(combined_coef))
#                 print(cutoff)

                remain_indexes_after_cond = (cur_clf_coef > cutoff) #np.logical_and(cur_clf_coef > cutoff, abs(cur_clf_coef) > 0.01) # # 
                remain_params_tracking = remain_params_tracking[remain_indexes_after_cond]
                print(remain_params_tracking)
                remain_indexes_after_cond_expanded = []
                for i in range(0, len(coef_index_range)): #
                    s_e_range = coef_index_range[i,1]-coef_index_range[i,0]
                    s1, e1 = coef_index_range[i,0], coef_index_range[i,1]
                    s2, e2 = index_range[i,0], index_range[i,1]
                    saved_indexes = np.where(cur_clf_coef[s1:e1] > cutoff)[0]
                    for j in range(N_size):
                        remain_indexes_after_cond_expanded.extend(np.array(saved_indexes) + j * s_e_range + s2)

                new_coef_index_range_seq = []
                for i in range(0, len(coef_index_range)): #
                    s, e = coef_index_range[i,0], coef_index_range[i,1]
                    new_coef_index_range_seq.append(sum((remain_indexes_after_cond)[s:e]))

                coef_index_range = []
                index_range = []
                cur_sum = 0
                for i in range(0, len(new_coef_index_range_seq)):
                    coef_index_range.append([cur_sum, cur_sum + new_coef_index_range_seq[i]])
                    index_range.append([cur_sum * 6, 6 * (cur_sum + new_coef_index_range_seq[i])])
                    cur_sum += new_coef_index_range_seq[i]

                coef_index_range = np.array(coef_index_range)
                index_range = np.array(index_range)
                print(coef_index_range)
#                 print(index_range)

                L=L[:,remain_indexes_after_cond_expanded]
                scores_for_training = scores_for_training[:, remain_indexes_after_cond]
    if((len(last_training_data_indexes) == len(data_indexes)) and 
       (sum(last_training_data_indexes == data_indexes) == len(data_indexes)) and 
       (np.max(coef_index_range) < 2)):
        counter =  counter + 1
    else:
        counter = 0
    if(counter > 3):
        break
    last_training_data_indexes = data_indexes

##################################################################
Iteration = 0, L shape = (620098, 156)
num of inliers = 600280
num of outliers = 0
num of outliers = 1378
Training data shape:  (601658, 26)
Training data F-1 0.5953109072375128
Training data accuracy: 0.9986803134006362
Training data outlier accuracy: 0.42380261248185774
(601658, 29)
(601658,)


### The following code is used to run the previous code for multiple times, but the results do not vary that much.

In [237]:
index_range = np.array([[0, 60], [60, 120], [120, 150], [150, 156]])
coef_index_range = np.array([[0, 10], [10, 20], [20, 25], [25, 26]])
coef_remain_index = range(156)

scores_for_training_indexes = []
for i in range(len(index_range)):
    start=index_range[i][0]
    temp_range = coef_index_range[i][1]-coef_index_range[i][0]
    scores_for_training_indexes  = scores_for_training_indexes + list(range(start, start+temp_range))
print(scores_for_training_indexes) 
scores_for_training = scores[:, np.array(scores_for_training_indexes)]
print(np.shape(scores_for_training))

transformer = RobustScaler().fit(scores_for_training)
scores_transformed = transformer.transform(scores_for_training)
training_data = scores_transformed[data_indexes]
print('Training data shape: ', np.shape(training_data))
training_data_F1.append(metrics.f1_score(y[data_indexes], labels))
print('Training data F-1', metrics.f1_score(y[data_indexes], labels))

transformer = RobustScaler().fit(X)
X_transformed = transformer.transform(X)
X_training_data = X_transformed[data_indexes]
print(np.shape(X_training_data))
print(np.shape(labels))

clf_X = SVC(gamma='auto', probability=True, random_state=0)
clf_X.fit(X_training_data, labels)
clf_predictions_X = clf_X.predict(X_transformed)
clf_predict_proba_X = clf_X.predict_proba(X_transformed)[:,1]
print("F-1 score from SVM:",metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba_X > 0.5])))
cur_f1_scores.append(metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba_X > 0.5])))

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
clf = LogisticRegression(random_state=0, penalty='l2', max_iter=max_iter).fit(training_data, labels) 
clf_predictions = clf.predict(scores_transformed)
clf_predict_proba = clf.predict_proba(scores_transformed)[:,1]
# LR_threshold = np.array(np.sort(clf_predict_proba)[::-1])[int(sum(clf_predictions_X))]
print("F-1 score from LR:",metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba > LR_threshold])))
print('Number of outliers by LR: ', sum(np.array([int(i) for i in clf_predict_proba > LR_threshold])))


agreed_outlier_indexes = np.where(np.sum(L,axis=1)==np.shape(L)[1])[0]
agreed_inlier_indexes = np.where(np.sum(L,axis=1)==0)[0]

prediction_result_list.append(clf_predict_proba)
classifier_result_list.append(clf_predict_proba_X)

prediction_list.append(np.array([int(i) for i in clf_predictions]))

prediction_high_conf_outliers = np.intersect1d(np.where(prediction_result_list[-1] > high_confidence_threshold)[0],
                                               np.where(classifier_result_list[-1] > high_confidence_threshold)[0])
print('length of prediction_high_conf_outliers:' , len(prediction_high_conf_outliers))
prediction_high_conf_inliers = np.intersect1d(np.where(prediction_result_list[-1] < low_confidence_threshold)[0],
                                               np.where(classifier_result_list[-1] < low_confidence_threshold)[0])
print('length of prediction high conf inliers: ', len(prediction_high_conf_inliers))

temp_prediction = np.array([int(i) for i in prediction_result_list[-1] > LR_threshold])
temp_classifier = np.array([int(i) for i in classifier_result_list[-1] > 0.5])
prediction_classifier_disagree = np.where(temp_prediction != temp_classifier)[0]
print(np.corrcoef(clf_predict_proba,clf_predict_proba_X))

L = L_prev
remain_params_tracking = np.array(range(0,np.max(coef_index_range)))

if np.max(coef_index_range) >= 2:
    if(len(prediction_high_conf_outliers) > 0 and len(prediction_high_conf_inliers) > 0):
        new_data_indexes = np.concatenate((prediction_high_conf_outliers, prediction_high_conf_inliers), axis = 0)
        new_data_indexes = np.array([int(i) for i in new_data_indexes])
        new_labels = np.concatenate((np.ones(len(prediction_high_conf_outliers)), np.zeros(len(prediction_high_conf_inliers))), axis = 0)
        clf_prune_2 = LogisticRegression(random_state=0, penalty='l2', max_iter=max_iter).fit(scores_transformed[new_data_indexes], new_labels) 
        print("F-1 score from both LR and SVM:",metrics.f1_score(y, np.array([int(i) for i in clf_prune_2.predict_proba(scores_transformed)[:,1] > 0.5])))
        print('Coef from both LR and SVM: ', clf_prune_2.coef_[0])
        combined_coef = clf_prune_2.coef_[0]  
    else:
        print('Coef from normal training: ', clf.coef_[0])
        combined_coef = clf.coef_[0]
        print('Combined Coef: ',  combined_coef)

    if(np.max(coef_index_range) > 2 or 
       ((np.max(combined_coef)/np.min(combined_coef) >= 1.1) and np.max(coef_index_range) >= 2)):
        if(len(set(combined_coef)) > 1):
            cur_clf_coef = combined_coef 
            cutoff = max(max(0, np.mean(combined_coef)-np.std(combined_coef)),min(combined_coef))
            print(cutoff)

            remain_indexes_after_cond = (cur_clf_coef > cutoff) #np.logical_and(cur_clf_coef > cutoff, abs(cur_clf_coef) > 0.01) # # 
            remain_params_tracking = remain_params_tracking[remain_indexes_after_cond]
            print(remain_params_tracking)
            remain_indexes_after_cond_expanded = []
            for i in range(0, len(coef_index_range)): #
                s_e_range = coef_index_range[i,1]-coef_index_range[i,0]
                s1, e1 = coef_index_range[i,0], coef_index_range[i,1]
                s2, e2 = index_range[i,0], index_range[i,1]
                saved_indexes = np.where(cur_clf_coef[s1:e1] > cutoff)[0]
                for j in range(N_size):
                    remain_indexes_after_cond_expanded.extend(np.array(saved_indexes) + j * s_e_range + s2)

            new_coef_index_range_seq = []
            for i in range(0, len(coef_index_range)): #
                s, e = coef_index_range[i,0], coef_index_range[i,1]
                new_coef_index_range_seq.append(sum((remain_indexes_after_cond)[s:e]))

            coef_index_range = []
            index_range = []
            cur_sum = 0
            for i in range(0, len(new_coef_index_range_seq)):
                coef_index_range.append([cur_sum, cur_sum + new_coef_index_range_seq[i]])
                index_range.append([cur_sum * 6, 6 * (cur_sum + new_coef_index_range_seq[i])])
                cur_sum += new_coef_index_range_seq[i]

            coef_index_range = np.array(coef_index_range)
            index_range = np.array(index_range)
            print(coef_index_range)
            print(index_range)

            L=L[:,remain_indexes_after_cond_expanded]
            scores_for_training = scores_for_training[:, remain_indexes_after_cond]



[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 120, 121, 122, 123, 124, 150]
(6435, 26)
Training data shape:  (4972, 26)
Training data F-1 0.6486090775988287
(4972, 36)
(4972,)
F-1 score from SVM: 0.6798226164079822
F-1 score from LR: 0.5684425184807546
Number of outliers by LR:  1887
length of prediction_high_conf_outliers: 1524
length of prediction high conf inliers:  3645
[[1.        0.8054605]
 [0.8054605 1.       ]]
F-1 score from both LR and SVM: 0.5758263941458491
Coef from both LR and SVM:  [-2.83100263e-01 -3.99856073e-01 -2.68811602e-01 -1.49840618e-01
 -1.24860456e-01 -4.86986570e-02  1.51851196e-03  1.12958221e-01
  1.98108632e-01  2.77749017e-01  1.60904209e+00  1.72673143e+00
  1.82370081e+00  1.81498134e+00  1.80672879e+00  1.81666636e+00
  1.80077497e+00  1.82487648e+00  1.81047103e+00  1.79117451e+00
  6.18081682e-01  4.08150387e-01  6.69212781e-02  8.25602491e-01
  5.15766889e-01  2.28698657e-01]
0
[ 6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 

In [None]:
last_training_data_indexes = []
counter = 0

for i_range in range(0, 50):
    print("##################################################################")
    print('Iteration = {}, L shape = {}'.format(i_range, np.shape(L)))
    num_methods = np.shape(L)[1]

#     agree_outlier_indexes = (np.sum(L,axis=1)==np.shape(L)[1])
#     print('All agree, Number of outliers = {}'.format(sum(agree_outlier_indexes)))
#     agree_inlier_indexes = (np.sum(L,axis=1)==0)
#     print('All agree, Number of inliers = {}'.format(sum(agree_inlier_indexes)))

#     all_inlier_indexes = np.union1d(np.setdiff1d(np.where(agree_inlier_indexes)[0], prediction_high_conf_outliers), prediction_high_conf_inliers)
#     print('num of inliers = {}'.format(np.shape(all_inlier_indexes)[0]))
#     all_outlier_indexes = np.union1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)
#     print('num of outliers = {}'.format(np.shape(all_outlier_indexes)[0]))

#     disagree_indexes = np.where(np.logical_or(np.sum(L,axis = 1)==0, np.sum(L,axis = 1)==num_methods)==0)[0]

    ########################################################################

    agree_outlier_indexes = np.sum(L,axis=1)==np.shape(L)[1]
#     print('All agree, Number of outliers = {}'.format(sum(agree_outlier_indexes)))
    agree_inlier_indexes = np.sum(L,axis=1)==0
#     print('All agree, Number of inliers = {}'.format(sum(agree_inlier_indexes)))

    disagree_indexes = np.where(np.logical_or(np.sum(L,axis = 1)==0, np.sum(L,axis = 1)==num_methods)==0)[0]
    # print('Number of disagreed points = {}'.format(len(disagree_indexes)))
    # print('Number of disagreed points (true outliers) = {}'.format(sum(y[disagree_indexes] == 1)))
    # print('Number of disagreed points (true inliers) = {}'.format(sum(y[disagree_indexes] == 0)))

#     all_inlier_indexes = np.where(agree_inlier_indexes)[0]
    all_inlier_indexes = np.setdiff1d(np.where(agree_inlier_indexes)[0], prediction_high_conf_outliers)
    if len(prediction_high_conf_inliers) >0:
        all_inlier_indexes = np.intersect1d(np.setdiff1d(np.where(agree_inlier_indexes)[0], prediction_high_conf_outliers), prediction_high_conf_inliers)
#     print('num of inliers = {}'.format(np.shape(all_inlier_indexes)[0]))
#     all_outlier_indexes = np.union1d(np.where(agree_outlier_indexes)[0], self_agree_index_list)

#     if(len(np.intersect1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)) > 0 and
#       (len(np.union1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)) > 2000)):
#         all_outlier_indexes = np.intersect1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)
#     else:
    all_outlier_indexes = np.union1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)
#     if(len(all_outlier_indexes) > 1000):
#         all_outlier_indexes = np.random.RandomState(1).permutation(all_outlier_indexes)[:1000]
        
#     all_outlier_indexes = np.union1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)
#     print('num of outliers = {}'.format(np.shape(all_outlier_indexes)[0]))
    all_inlier_indexes = np.setdiff1d(all_inlier_indexes, prediction_classifier_disagree)
    
    self_agree_index_list = []
    if((len(all_outlier_indexes) == 0) or (len(all_inlier_indexes)/ len(all_outlier_indexes) > 1000)):
        for i in range(0, len(index_range)):
            if(index_range[i,1]-index_range[i,0] <= 6):
                continue
            temp_index = disagree_indexes[np.where(np.sum(L[disagree_indexes][:,index_range[i,0]: index_range[i,1]], axis = 1)==(index_range[i,1]-index_range[i,0]))[0]]
            self_agree_index_list = np.union1d(self_agree_index_list, temp_index)
        self_agree_index_list = [int(i) for i in self_agree_index_list]
#     self_agree_index_list = np.random.RandomState(1).permutation(self_agree_index_list)[:500]
    all_outlier_indexes = np.union1d(all_outlier_indexes, self_agree_index_list)
    all_outlier_indexes = np.setdiff1d(all_outlier_indexes, prediction_classifier_disagree)
#     print('num of outliers = {}'.format(np.shape(all_outlier_indexes)[0]))
    
    
    from sklearn.preprocessing import RobustScaler
    from sklearn.preprocessing import StandardScaler
    data_indexes = np.concatenate((all_inlier_indexes, all_outlier_indexes), axis = 0)
    data_indexes = np.array([int(i) for i in data_indexes])
    labels = np.concatenate((np.zeros(len(all_inlier_indexes)), np.ones(len(all_outlier_indexes))), axis = 0)
    transformer = RobustScaler().fit(scores_for_training)
    scores_transformed = transformer.transform(scores_for_training)
    training_data = scores_transformed[data_indexes]
#     print('Training data shape: ', np.shape(training_data))
    training_data_F1.append(metrics.f1_score(y[data_indexes], labels))
#     print('Training data F-1', metrics.f1_score(y[data_indexes], labels))
    
    transformer = RobustScaler().fit(X)
    X_transformed = transformer.transform(X)
    X_training_data = X_transformed[data_indexes]
    print(np.shape(X_training_data))
    print(np.shape(labels))

    clf_X = SVC(gamma='auto', probability=True, random_state=0)
    clf_X.fit(X_training_data, labels)
    clf_predictions_X = clf_X.predict(X_transformed)
    clf_predict_proba_X = clf_X.predict_proba(X_transformed)[:,1]
    print("F-1 score from SVM:",metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba_X > 0.5])))
    cur_f1_scores.append(metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba_X > 0.5])))
    
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC
    clf = LogisticRegression(random_state=0, penalty='l2', max_iter=max_iter).fit(training_data, labels) 
    clf_predictions = clf.predict(scores_transformed)
    clf_predict_proba = clf.predict_proba(scores_transformed)[:,1]
    LR_threshold = np.array(np.sort(clf_predict_proba)[::-1])[int(sum(clf_predictions_X))]
    print("F-1 score from LR:",metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba > LR_threshold])))
    print('Number of outliers by LR: ', sum(np.array([int(i) for i in clf_predict_proba > LR_threshold])))
    
    
    agreed_outlier_indexes = np.where(np.sum(L,axis=1)==np.shape(L)[1])[0]
    agreed_inlier_indexes = np.where(np.sum(L,axis=1)==0)[0]
        
    prediction_result_list.append(clf_predict_proba)
    classifier_result_list.append(clf_predict_proba_X)
    
    prediction_list.append(np.array([int(i) for i in clf_predictions]))
    print(np.corrcoef(clf_predict_proba,clf_predict_proba_X))
    
    prediction_high_conf_outliers = np.intersect1d(np.where(prediction_result_list[-1] > high_confidence_threshold)[0],
                                                   np.where(classifier_result_list[-1] > high_confidence_threshold)[0])
#     print('length of prediction_high_conf_outliers:' , len(prediction_high_conf_outliers))
    prediction_high_conf_inliers = np.intersect1d(np.where(prediction_result_list[-1] < low_confidence_threshold)[0],
                                                   np.where(classifier_result_list[-1] < low_confidence_threshold)[0])
#     print('length of prediction high conf inliers: ', len(prediction_high_conf_inliers))
    
    temp_prediction = np.array([int(i) for i in prediction_result_list[-1] > LR_threshold])
    temp_classifier = np.array([int(i) for i in classifier_result_list[-1] > 0.5])
    prediction_classifier_disagree = np.where(temp_prediction != temp_classifier)[0]
    
    if np.max(coef_index_range) >= 2:
        if(len(prediction_high_conf_outliers) > 0 and len(prediction_high_conf_inliers) > 0):
            new_data_indexes = np.concatenate((prediction_high_conf_outliers, prediction_high_conf_inliers), axis = 0)
            new_data_indexes = np.array([int(i) for i in new_data_indexes])
            new_labels = np.concatenate((np.ones(len(prediction_high_conf_outliers)), np.zeros(len(prediction_high_conf_inliers))), axis = 0)
            clf_prune_2 = LogisticRegression(random_state=0, penalty='l2', max_iter=max_iter).fit(scores_transformed[new_data_indexes], new_labels) 
#             print("F-1 score from both LR and SVM:",metrics.f1_score(y, np.array([int(i) for i in clf_prune_2.predict_proba(scores_transformed)[:,1] > 0.5])))
#             print('Coef from both LR and SVM: ', clf_prune_2.coef_[0])
            combined_coef = clf_prune_2.coef_[0]  
        else:
#             print('Coef from normal training: ', clf.coef_[0])
            combined_coef = clf.coef_[0]
#             print('Combined Coef: ',  combined_coef)

        if(np.max(coef_index_range) >= 2 or 
           ((np.max(combined_coef)/np.min(combined_coef) >= 1.1) and np.max(coef_index_range) >= 2)):
            if(len(set(combined_coef)) > 1):
                cur_clf_coef = combined_coef 
                cutoff = max(max(0, np.mean(combined_coef)-np.std(combined_coef)),min(combined_coef))
#                 print(cutoff)

                remain_indexes_after_cond = (cur_clf_coef > cutoff) #np.logical_and(cur_clf_coef > cutoff, abs(cur_clf_coef) > 0.01) # # 
                remain_params_tracking = remain_params_tracking[remain_indexes_after_cond]
                print(remain_params_tracking)
                remain_indexes_after_cond_expanded = []
                for i in range(0, len(coef_index_range)): #
                    s_e_range = coef_index_range[i,1]-coef_index_range[i,0]
                    s1, e1 = coef_index_range[i,0], coef_index_range[i,1]
                    s2, e2 = index_range[i,0], index_range[i,1]
                    saved_indexes = np.where(cur_clf_coef[s1:e1] > cutoff)[0]
                    for j in range(N_size):
                        remain_indexes_after_cond_expanded.extend(np.array(saved_indexes) + j * s_e_range + s2)

                new_coef_index_range_seq = []
                for i in range(0, len(coef_index_range)): #
                    s, e = coef_index_range[i,0], coef_index_range[i,1]
                    new_coef_index_range_seq.append(sum((remain_indexes_after_cond)[s:e]))

                coef_index_range = []
                index_range = []
                cur_sum = 0
                for i in range(0, len(new_coef_index_range_seq)):
                    coef_index_range.append([cur_sum, cur_sum + new_coef_index_range_seq[i]])
                    index_range.append([cur_sum * 6, 6 * (cur_sum + new_coef_index_range_seq[i])])
                    cur_sum += new_coef_index_range_seq[i]

                coef_index_range = np.array(coef_index_range)
                index_range = np.array(index_range)
                print(coef_index_range)
#                 print(index_range)

                L=L[:,remain_indexes_after_cond_expanded]
                scores_for_training = scores_for_training[:, remain_indexes_after_cond]
    if((len(last_training_data_indexes) == len(data_indexes)) and 
       (sum(last_training_data_indexes == data_indexes) == len(data_indexes)) and 
       (np.max(coef_index_range) < 2)):
        counter =  counter + 1
    else:
        counter = 0
    if(counter > 3):
        break
    last_training_data_indexes = data_indexes

##################################################################
Iteration = 0, L shape = (6435, 120)
(3868, 36)
(3868,)
F-1 score from SVM: 0.6676802780191139
F-1 score from LR: 0.5887546468401488
Number of outliers by LR:  2268
[[1.         0.83877918]
 [0.83877918 1.        ]]
[ 8  9 10 11 12 13 14 15 16 17 18 19 20 21 23 24]
[[ 0  2]
 [ 2 12]
 [12 16]
 [16 16]]
##################################################################
Iteration = 1, L shape = (6435, 96)
(4505, 36)
(4505,)
F-1 score from SVM: 0.6648032681143841
F-1 score from LR: 0.6057273768613973
Number of outliers by LR:  2329
[[1.         0.84494523]
 [0.84494523 1.        ]]
[ 8  9 10 11 12 13 14 15 16 17 18 19]
[[ 0  2]
 [ 2 12]
 [12 12]
 [12 12]]
##################################################################
Iteration = 2, L shape = (6435, 72)
(4939, 36)
(4939,)
F-1 score from SVM: 0.6626865671641791
F-1 score from LR: 0.5794010889292197
Number of outliers by LR:  2372
[[1.         0.82674177]
 [0.82674177 1.  