In [1]:
import numpy as np
from scipy.io import arff
import pandas as pd
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn import metrics
import scipy as sp
import logging
from sklearn.neighbors import NearestNeighbors
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
    
def run_lof(X, y, k=60):
    clf = LocalOutlierFactor(n_neighbors=k)
    clf.fit(X)
    lof_scores = -clf.negative_outlier_factor_
    return lof_scores

def get_predictions(scores, num_outliers = 400, method_name = 'LOF'):
    threshold = np.sort(scores)[::-1][num_outliers]
    # threshold, max_f1 = get_best_f1_score(y, lof_scores)
    predictions = np.array(scores > threshold)
    predictions = np.array([int(i) for i in predictions])
#     print('F1 for {} : {}'.format(method_name, metrics.f1_score(y, predictions)))
    return predictions, scores, metrics.f1_score(y, predictions)

def get_precision_recall(scores, num_outliers = 400, method_name = 'LOF'):
    threshold = np.sort(scores)[::-1][num_outliers]
    # threshold, max_f1 = get_best_f1_score(y, lof_scores)
    predictions = np.array(scores > threshold)
    predictions = np.array([int(i) for i in predictions])
#     print('F1 for {} : {}'.format(method_name, metrics.f1_score(y, predictions)))
    return predictions, scores, metrics.f1_score(y, predictions), metrics.precision_score(y, predictions), metrics.recall_score(y, predictions)

def get_best_F1(scores):
    best_f1 = 0
    for i in range(np.shape(scores)[0]):
        threshold = np.sort(scores)[::-1][i]
        predictions = np.array(scores > threshold)
        predictions = np.array([int(i) for i in predictions])
        cur_f1 = metrics.f1_score(y, predictions)
        best_f1 = max(cur_f1, best_f1)
    return best_f1

def run_knn(X, y, k=60):
    neigh = NearestNeighbors(n_neighbors=k)
    neigh.fit(X)
    knn_dists = neigh.kneighbors(X)[0][:,-1]
    return knn_dists

def run_isolation_forest(X, y, max_features = 1.0):
    # training the model
    clf = IsolationForest(random_state=42,max_features=max_features)
    clf.fit(X)
    # predictions
    sklearn_score_anomalies = clf.decision_function(X)
    if_scores = [-1*s + 0.5 for s in sklearn_score_anomalies]
    return if_scores

def mahalanobis(x):
    """Compute the Mahalanobis Distance between each row of x and the data
    """
    x_minus_mu = x - np.mean(x)
    cov = np.cov(x.T)
    inv_covmat = sp.linalg.inv(cov)
    results = []
    x_minus_mu = np.array(x_minus_mu)
    for i in range(np.shape(x)[0]):
        cur_data = x_minus_mu[i,:]
        results.append(np.dot(np.dot(x_minus_mu[i,:], inv_covmat), x_minus_mu[i,:].T))
    return np.array(results)
#     left_term = np.dot(x_minus_mu, inv_covmat)
#     mahal = np.dot(left_term, x_minus_mu.T)
#     print(mahal.diagonal())
#     return mahal.diagonal()

def run_mahalanobis(X, y):
    # training the model
    dist = mahalanobis(x=X)
    return dist

def load_dataset(filename):
    with open(filename, 'r') as f:
        data, meta = arff.loadarff(f)
    data = pd.DataFrame(data)
    X = data.drop(columns=['id', 'outlier'])
    # Map dataframe to encode values and put values into a numpy array
    y = data["outlier"].map(lambda x: 1 if x == b'yes' else 0).values
    return X, y

### Load shuttle dataset

In [2]:
import hdf5storage
mat = hdf5storage.loadmat('shuttle.mat')
X = mat['X']
y = mat['y']
print(len(y))
print(np.sum(y)/len(y))
lof_krange = list(range(10,110,10)) * 6
knn_krange = list(range(10,110,10)) * 6
if_range = [0.5, 0.6, 0.7, 0.8, 0.9] * 6
mahalanobis_N_range=[1000,1500,2000, 2500,3000, 3500]
# mahalanobis_N_range = [20, 40, 60,80, 100,120]
if_N_range = np.sort(mahalanobis_N_range * 5)
N_range = np.sort(mahalanobis_N_range *10)
print(np.shape(X))

# normalize
# from sklearn.preprocessing import Normalizer
# transformer = Normalizer().fit(X) 
# X = transformer.transform(X)

49097
0.0715114976475141
(49097, 9)


In [3]:
f1s = []
precisions = []
recalls = []

temp_if_results = dict()
unique_if_features = list(set(if_range)) 
best_f1 = 0
best_precision = 0
best_recall = 0
for k in unique_if_features:
    print(k)
    if_scores = run_isolation_forest(X, y, max_features=k)
    temp_if_results[k] = if_scores
for i in range(len(if_range)):
    if_predictions, if_scores,f1, precision, recall = get_precision_recall(temp_if_results[if_range[i]], num_outliers=if_N_range[i], method_name='IF')
    f1s.append(f1)
    if f1>best_f1:
        best_f1 = f1
        best_precision = precision
        best_recall = recall
    precisions.append(precision)
    recalls.append(recall)
    
print(f"Average F1s = {sum(f1s)/len(f1s)}, Average Precision = {sum(precisions)/len(precisions)}, Average recall = {sum(recalls)/len(recalls)}")
print(f"Max F1s = {max(f1s)}, Precision at best F1 = {best_precision}, recall at best F1 = {best_recall}")

0.5
0.6
0.8
0.9
0.7
Average F1s = 0.7351099356690932, Average Precision = 0.9801370537567927, Average recall = 0.6236779644925473
Max F1s = 0.9533590072742834, Precision at best F1 = 0.9548571428571428, recall at best F1 = 0.9518655653659926


In [None]:
all_results = []
all_scores = []
f1s = []

method_to_bestf1 = {}
best_f1 = 0
best_precision=0
best_recall=0

temp_lof_results = dict()
unique_lof_ks = list(set(lof_krange)) 

best_lof_f1 = 0
best_lof_precision = 0
best_lof_recall = 0
for k in unique_lof_ks:
#     print(k)
    lof_scores = run_lof(X, y, k=k)
    temp_lof_results[k] = lof_scores
for i in range(len(lof_krange)):
    lof_predictions, lof_scores, f1, precision,recall = get_precision_recall(temp_lof_results[lof_krange[i]], num_outliers=N_range[i], method_name='LOF')
    all_results.append(lof_predictions)
    all_scores.append(lof_scores)
    f1s.append(f1)
    if f1 > best_f1:
        best_f1 = f1
        best_precision = precision
        best_recall = recall

best_lof_f1 = 0
for i in np.sort(unique_lof_ks):
    temp_f1 = max(np.array(f1s[0:60])[np.where(np.array(lof_krange) == i)[0]])
    best_lof_f1 = max(best_lof_f1, temp_f1)

method_to_bestf1["LOF"] = best_lof_f1

temp_knn_results = dict()
unique_knn_ks = list(set(knn_krange)) 
for k in unique_knn_ks:
    knn_scores = run_knn(X, y, k=k)
    temp_knn_results[k] = knn_scores
for i in range(len(knn_krange)):
    knn_predictions, knn_scores,f1,precision,recall = get_precision_recall(temp_knn_results[knn_krange[i]], num_outliers=N_range[i], method_name='KNN')
    all_results.append(knn_predictions)
    all_scores.append(knn_scores)
    f1s.append(f1)
    if f1 > best_f1:
        best_f1 = f1
        best_precision = precision
        best_recall = recall
best_knn_f1 = 0
for i in np.sort(unique_knn_ks):
    temp_f1 = max(np.array(f1s[60:120])[np.where(np.array(knn_krange) == i)[0]])
    best_knn_f1 = max(best_knn_f1, temp_f1)
method_to_bestf1["KNN"] = best_knn_f1

temp_if_results = dict()
unique_if_features = list(set(if_range)) 
for k in unique_if_features:
    if_scores = run_isolation_forest(X, y, max_features=k)
    temp_if_results[k] = if_scores
for i in range(len(if_range)):
    if_predictions, if_scores,f1,precision,recall = get_precision_recall(temp_if_results[if_range[i]], num_outliers=if_N_range[i], method_name='IF')
    all_results.append(if_predictions)
    all_scores.append(if_scores)
    f1s.append(f1)
    if f1 > best_f1:
        best_f1 = f1
        best_precision = precision
        best_recall = recall
best_if_f1 = 0
for i in np.sort(unique_if_features):
    temp_f1 = max(np.array(f1s[120:150])[np.where(np.array(if_range) == i)[0]])
    best_if_f1 = max(best_if_f1, temp_f1)
method_to_bestf1["IF"] = best_if_f1
   
mahalanobis_scores = run_mahalanobis(X, y)
best_mahala_f1 = 0
for i in range(len(mahalanobis_N_range)):
    mahalanobis_predictions,mahalanobis_scores,f1,precision,recall = get_precision_recall(mahalanobis_scores, num_outliers=mahalanobis_N_range[i], method_name='mahala')
    all_results.append(mahalanobis_predictions)
    all_scores.append(mahalanobis_scores)
    best_mahala_f1 = max(best_mahala_f1, f1)
    f1s.append(f1)
    if f1 > best_f1:
        best_f1 = f1
        best_precision = precision
        best_recall = recall
method_to_bestf1["Mahala"] = best_mahala_f1
best_method = ""
best_f1 =0
for method, f1 in method_to_bestf1.items():
    if f1 > best_f1:
        best_method = method
        best_f1 = f1

print(f"Best Method = {best_method}, Best F1 = {best_f1}")
L = np.stack(all_results).T
scores = np.stack(all_scores).T
print(f"Best F1 = {best_f1}, best_precision = {best_precision}, best_recall={best_recall}")

In [8]:
print(np.shape(L))
print(np.shape(scores))

(49097, 156)
(49097, 156)


### Majority Vote

In [9]:
mid = np.shape(L)[1]/2
predictions = np.full((len(y)), 0)
predictions[np.sum(L, axis = 1) > mid] = 1
print('F1 for MV:', metrics.f1_score(y, predictions))

F1 for MV: 0.234249837627192


### Save and load pickles

In [10]:
L_prev = L
scores_prev = scores

In [11]:
L = L_prev
scores = scores_prev

In [12]:
# print(max(f1s)) 
print(np.shape(L))

(49097, 156)


In [13]:
prediction_result_list = []
classifier_result_list = []
prediction_list = []
cur_f1_scores = []
prediction_high_conf_outliers = np.array([])
prediction_high_conf_inliers = np.array([])
prediction_classifier_disagree = np.array([])

In [14]:
index_range = np.array([[0, 60], [60, 120], [120, 150], [150, 156]])
coef_index_range = np.array([[0, 10], [10, 20], [20, 25], [25, 26]])
coef_remain_index = range(156)

In [15]:
scores_for_training_indexes = []
for i in range(len(index_range)):
    start=index_range[i][0]
    temp_range = coef_index_range[i][1]-coef_index_range[i][0]
    scores_for_training_indexes  = scores_for_training_indexes + list(range(start, start+temp_range))
print(scores_for_training_indexes) 
scores_for_training = scores[:, np.array(scores_for_training_indexes)]
print(np.shape(scores_for_training))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 120, 121, 122, 123, 124, 150]
(49097, 26)


### Iterative train LR and classifier(SVM)

In [16]:
# stable version
high_confidence_threshold = 0.99
low_confidence_threshold = 0.01
max_iter = 200
remain_params_tracking = np.array(range(0,np.max(coef_index_range)))
training_data_F1 = []
two_prediction_corr = []

min_max_diff = []
N_size = 6

last_training_data_indexes = []
counter = 0

for i_range in range(0, 50):
    print("##################################################################")
    print('Iteration = {}, L shape = {}'.format(i_range, np.shape(L)))
    num_methods = np.shape(L)[1]

    agree_outlier_indexes = np.sum(L,axis=1)==np.shape(L)[1]
    print('All agree, Number of outliers = {}'.format(sum(agree_outlier_indexes)))
    agree_inlier_indexes = np.sum(L,axis=1)==0
    print('All agree, Number of inliers = {}'.format(sum(agree_inlier_indexes)))

    disagree_indexes = np.where(np.logical_or(np.sum(L,axis = 1)==0, np.sum(L,axis = 1)==num_methods)==0)[0]

    all_inlier_indexes = np.setdiff1d(np.where(agree_inlier_indexes)[0], prediction_high_conf_outliers)
    if len(prediction_high_conf_inliers) > 0:
        all_inlier_indexes = np.intersect1d(np.setdiff1d(np.where(agree_inlier_indexes)[0], prediction_high_conf_outliers), prediction_high_conf_inliers)
    print('num of inliers = {}'.format(np.shape(all_inlier_indexes)[0]))

    all_outlier_indexes = np.union1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)
    print('num of outliers = {}'.format(np.shape(all_outlier_indexes)[0]))
    all_inlier_indexes = np.setdiff1d(all_inlier_indexes, prediction_classifier_disagree)
    
    self_agree_index_list = []
    if((len(all_outlier_indexes) == 0) or (len(all_inlier_indexes)/ len(all_outlier_indexes) > 1000)):
        for i in range(0, len(index_range)):
            if(index_range[i,1]-index_range[i,0] <= 6):
                continue
            temp_index = disagree_indexes[np.where(np.sum(L[disagree_indexes][:,index_range[i,0]: index_range[i,1]], axis = 1)==(index_range[i,1]-index_range[i,0]))[0]]
            self_agree_index_list = np.union1d(self_agree_index_list, temp_index)
        self_agree_index_list = [int(i) for i in self_agree_index_list]
#     self_agree_index_list = np.random.RandomState(1).permutation(self_agree_index_list)[:500]
    all_outlier_indexes = np.union1d(all_outlier_indexes, self_agree_index_list)
    all_outlier_indexes = np.setdiff1d(all_outlier_indexes, prediction_classifier_disagree)
    print('num of outliers = {}'.format(np.shape(all_outlier_indexes)[0]))
    
    
    from sklearn.preprocessing import RobustScaler
    from sklearn.preprocessing import StandardScaler
    data_indexes = np.concatenate((all_inlier_indexes, all_outlier_indexes), axis = 0)
    data_indexes = np.array([int(i) for i in data_indexes])
    labels = np.concatenate((np.zeros(len(all_inlier_indexes)), np.ones(len(all_outlier_indexes))), axis = 0)
    transformer = RobustScaler().fit(scores_for_training)
    scores_transformed = transformer.transform(scores_for_training)
    training_data = scores_transformed[data_indexes]
    print('Training data shape: ', np.shape(training_data))
    training_data_F1.append(metrics.f1_score(y[data_indexes], labels))
    print('Training data F-1', metrics.f1_score(y[data_indexes], labels))

    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC
#     clf = SVC(gamma='auto', probability=True, random_state=0)
#     clf.fit(training_data, labels)
    clf = LogisticRegression(random_state=0, penalty='l2', max_iter=max_iter).fit(training_data, labels) 
    clf_predictions = clf.predict(scores_transformed)
    clf_predict_proba = clf.predict_proba(scores_transformed)[:,1]
    print("F-1 score from LR:",metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba > 0.5])))
    
    
    transformer = RobustScaler().fit(X)
    X_transformed = transformer.transform(X)
    X_training_data = X_transformed[data_indexes]
    print(np.shape(X_training_data))
    print(np.shape(labels))

    clf_X = SVC(gamma='auto', probability=True, random_state=0)
    clf_X.fit(X_training_data, labels)
    clf_predictions_X = clf_X.predict(X_transformed)
    clf_predict_proba_X = clf_X.predict_proba(X_transformed)[:,1]
    print("F-1 score from SVM:",metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba_X > 0.5])))
    
    agreed_outlier_indexes = np.where(np.sum(L,axis=1)==np.shape(L)[1])[0]
    agreed_inlier_indexes = np.where(np.sum(L,axis=1)==0)[0]
        
    prediction_result_list.append(clf_predict_proba)
    classifier_result_list.append(clf_predict_proba_X)
    
    prediction_list.append(np.array([int(i) for i in clf_predictions]))
    
    prediction_high_conf_outliers = np.intersect1d(np.where(prediction_result_list[-1] > high_confidence_threshold)[0],
                                                   np.where(classifier_result_list[-1] > high_confidence_threshold)[0])
    print('length of prediction_high_conf_outliers:' , len(prediction_high_conf_outliers))
    prediction_high_conf_inliers = np.intersect1d(np.where(prediction_result_list[-1] < low_confidence_threshold)[0],
                                                   np.where(classifier_result_list[-1] < low_confidence_threshold)[0])
    print('length of prediction high conf inliers: ', len(prediction_high_conf_inliers))
    
    temp_prediction = np.array([int(i) for i in prediction_result_list[-1] > 0.5])
    temp_classifier = np.array([int(i) for i in classifier_result_list[-1] > 0.5])
    prediction_classifier_disagree = np.where(temp_prediction != temp_classifier)[0]
#     print('length of prediction-classifier disagree: {}'.format(len(prediction_classifier_disagree)))
#     print('length of prediction-classifier disagree in training: {}'.format(len(np.where(temp_prediction[data_indexes] != temp_classifier[data_indexes])[0])))
#     print(np.corrcoef(clf_predict_proba,clf_predict_proba_X))
    two_prediction_corr.append(np.corrcoef(clf_predict_proba,clf_predict_proba_X)[0,1])

    if np.max(coef_index_range) >= 2:
        if(len(prediction_high_conf_outliers) > 0 and len(prediction_high_conf_inliers) > 0):
            new_data_indexes = np.concatenate((prediction_high_conf_outliers, prediction_high_conf_inliers), axis = 0)
            new_data_indexes = np.array([int(i) for i in new_data_indexes])
            new_labels = np.concatenate((np.ones(len(prediction_high_conf_outliers)), np.zeros(len(prediction_high_conf_inliers))), axis = 0)
            clf_prune_2 = LogisticRegression(random_state=0, penalty='l2', max_iter=max_iter).fit(scores_transformed[new_data_indexes], new_labels) 
            combined_coef = clf_prune_2.coef_[0]  
        else:
            print('Coef from normal training: ', clf.coef_[0])
            combined_coef = clf.coef_[0]
            print('Combined Coef: ',  combined_coef)

        if(np.max(coef_index_range) >= 2):
            if(len(set(combined_coef)) > 1):
                cur_clf_coef = combined_coef 
                cutoff = max(max(0, np.mean(combined_coef)-np.std(combined_coef)),min(combined_coef))
                print(cutoff)

                remain_indexes_after_cond = (cur_clf_coef > cutoff) #np.logical_and(cur_clf_coef > cutoff, abs(cur_clf_coef) > 0.01) # # 
                remain_params_tracking = remain_params_tracking[remain_indexes_after_cond]
                print(remain_params_tracking)
                remain_indexes_after_cond_expanded = []
                for i in range(0, len(coef_index_range)): #
                    s_e_range = coef_index_range[i,1]-coef_index_range[i,0]
                    s1, e1 = coef_index_range[i,0], coef_index_range[i,1]
                    s2, e2 = index_range[i,0], index_range[i,1]
                    saved_indexes = np.where(cur_clf_coef[s1:e1] > cutoff)[0]
                    for j in range(N_size):
                        remain_indexes_after_cond_expanded.extend(np.array(saved_indexes) + j * s_e_range + s2)

                new_coef_index_range_seq = []
                for i in range(0, len(coef_index_range)): #
                    s, e = coef_index_range[i,0], coef_index_range[i,1]
                    new_coef_index_range_seq.append(sum((remain_indexes_after_cond)[s:e]))

                coef_index_range = []
                index_range = []
                cur_sum = 0
                for i in range(0, len(new_coef_index_range_seq)):
                    coef_index_range.append([cur_sum, cur_sum + new_coef_index_range_seq[i]])
                    index_range.append([cur_sum * 6, 6 * (cur_sum + new_coef_index_range_seq[i])])
                    cur_sum += new_coef_index_range_seq[i]

                coef_index_range = np.array(coef_index_range)
                index_range = np.array(index_range)
                print(coef_index_range)
                print(index_range)

                L=L[:,remain_indexes_after_cond_expanded]
                scores_for_training = scores_for_training[:, remain_indexes_after_cond]
    if((len(last_training_data_indexes) == len(data_indexes)) and 
       (sum(last_training_data_indexes == data_indexes) == len(data_indexes)) and 
       (np.max(coef_index_range) < 2)):
        counter =  counter + 1
    else:
        counter = 0
    if(counter > 3):
        break
    last_training_data_indexes = data_indexes

##################################################################
Iteration = 0, L shape = (49097, 156)
All agree, Number of outliers = 5
All agree, Number of inliers = 36049
num of inliers = 36049
num of outliers = 5
num of outliers = 1360
Training data shape:  (37409, 26)
Training data F-1 0.7915381225209344
F-1 score from LR: 0.6690008079719904
(37409, 9)
(37409,)
F-1 score from SVM: 0.7784699725890855
length of prediction_high_conf_outliers: 2262
length of prediction high conf inliers:  42276
0.010494407022842775
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
[[ 0 10]
 [10 20]
 [20 25]
 [25 25]]
[[  0  60]
 [ 60 120]
 [120 150]
 [150 150]]
##################################################################
Iteration = 1, L shape = (49097, 150)
All agree, Number of outliers = 17
All agree, Number of inliers = 38236
num of inliers = 38150
num of outliers = 2262
num of outliers = 2262
Training data shape:  (40412, 25)
Training data F-1 0.8292557301055885

F-1 score from LR: 0.7809254013220019
(41843, 9)
(41843,)
F-1 score from SVM: 0.8267985399740962
length of prediction_high_conf_outliers: 3519
length of prediction high conf inliers:  42252
0.4726806636905592
[ 0  1  5  6  7  8  9 19 20 21 22 23 24]
[[ 0  7]
 [ 7  8]
 [ 8 13]
 [13 13]]
[[ 0 42]
 [42 48]
 [48 78]
 [78 78]]
##################################################################
Iteration = 13, L shape = (49097, 78)
All agree, Number of outliers = 17
All agree, Number of inliers = 38697
num of inliers = 38378
num of outliers = 3519
num of outliers = 3519
Training data shape:  (41897, 13)
Training data F-1 0.8767954037663581
F-1 score from LR: 0.784267545925577
(41897, 9)
(41897,)
F-1 score from SVM: 0.8258261789956486
length of prediction_high_conf_outliers: 3572
length of prediction high conf inliers:  42233
0.4502114116153831
[ 0  1  6  7  8  9 19 20 21 22 23 24]
[[ 0  6]
 [ 6  7]
 [ 7 12]
 [12 12]]
[[ 0 36]
 [36 42]
 [42 72]
 [72 72]]
#######################################

F-1 score from SVM: 0.9220063025210085
length of prediction_high_conf_outliers: 2275
length of prediction high conf inliers:  44023
##################################################################
Iteration = 27, L shape = (49097, 6)
All agree, Number of outliers = 1000
All agree, Number of inliers = 45597
num of inliers = 44023
num of outliers = 2275
num of outliers = 2275
Training data shape:  (46298, 1)
Training data F-1 0.9958066651953211
F-1 score from LR: 0.9495786316240536
(46298, 9)
(46298,)
F-1 score from SVM: 0.9256525177959399
length of prediction_high_conf_outliers: 2156
length of prediction high conf inliers:  44199
##################################################################
Iteration = 28, L shape = (49097, 6)
All agree, Number of outliers = 1000
All agree, Number of inliers = 45597
num of inliers = 44199
num of outliers = 2156
num of outliers = 2156
Training data shape:  (46355, 1)
Training data F-1 0.9965091924598557
F-1 score from LR: 0.9444765760555235
(46355

All agree, Number of outliers = 1000
All agree, Number of inliers = 45597
num of inliers = 44599
num of outliers = 1791
num of outliers = 1791
Training data shape:  (46390, 1)
Training data F-1 0.9977616116396195
F-1 score from LR: 0.9101651264959855
(46390, 9)
(46390,)
F-1 score from SVM: 0.9468716289104638
length of prediction_high_conf_outliers: 1787
length of prediction high conf inliers:  44600
##################################################################
Iteration = 44, L shape = (49097, 6)
All agree, Number of outliers = 1000
All agree, Number of inliers = 45597
num of inliers = 44600
num of outliers = 1787
num of outliers = 1787
Training data shape:  (46387, 1)
Training data F-1 0.9977565900168255
F-1 score from LR: 0.9101378996817701
(46387, 9)
(46387,)
F-1 score from SVM: 0.9472548226089302
length of prediction_high_conf_outliers: 1787
length of prediction high conf inliers:  44603
##################################################################
Iteration = 45, L shape

In [17]:
index_range = np.array([[0, 60], [60, 120], [120, 150], [150, 156]])
coef_index_range = np.array([[0, 10], [10, 20], [20, 25], [25, 26]])
coef_remain_index = range(156)

scores_for_training_indexes = []
for i in range(len(index_range)):
    start=index_range[i][0]
    temp_range = coef_index_range[i][1]-coef_index_range[i][0]
    scores_for_training_indexes  = scores_for_training_indexes + list(range(start, start+temp_range))
print(scores_for_training_indexes) 
scores_for_training = scores[:, np.array(scores_for_training_indexes)]
print(np.shape(scores_for_training))

transformer = RobustScaler().fit(scores_for_training)
scores_transformed = transformer.transform(scores_for_training)
training_data = scores_transformed[data_indexes]
print('Training data shape: ', np.shape(training_data))
training_data_F1.append(metrics.f1_score(y[data_indexes], labels))
print('Training data F-1', metrics.f1_score(y[data_indexes], labels))

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
clf = LogisticRegression(random_state=0, penalty='l2', max_iter=max_iter).fit(training_data, labels) 
clf_predictions = clf.predict(scores_transformed)
clf_predict_proba = clf.predict_proba(scores_transformed)[:,1]
print("F-1 score from LR:",metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba > 0.5])))

transformer = RobustScaler().fit(X)
X_transformed = transformer.transform(X)
X_training_data = X_transformed[data_indexes]
print(np.shape(X_training_data))
print(np.shape(labels))

clf_X = SVC(gamma='auto', probability=True, random_state=0)
clf_X.fit(X_training_data, labels)
clf_predictions_X = clf_X.predict(X_transformed)
clf_predict_proba_X = clf_X.predict_proba(X_transformed)[:,1]
print("F-1 score from SVM:",metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba_X > 0.5])))
cur_f1_scores.append(metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba_X > 0.5])))

agreed_outlier_indexes = np.where(np.sum(L,axis=1)==np.shape(L)[1])[0]
agreed_inlier_indexes = np.where(np.sum(L,axis=1)==0)[0]

prediction_result_list.append(clf_predict_proba)
classifier_result_list.append(clf_predict_proba_X)

prediction_list.append(np.array([int(i) for i in clf_predictions]))

prediction_high_conf_outliers = np.intersect1d(np.where(prediction_result_list[-1] > high_confidence_threshold)[0],
                                               np.where(classifier_result_list[-1] > high_confidence_threshold)[0])
print('length of prediction_high_conf_outliers:' , len(prediction_high_conf_outliers))
prediction_high_conf_inliers = np.intersect1d(np.where(prediction_result_list[-1] < low_confidence_threshold)[0],
                                               np.where(classifier_result_list[-1] < low_confidence_threshold)[0])
print('length of prediction high conf inliers: ', len(prediction_high_conf_inliers))

temp_prediction = np.array([int(i) for i in prediction_result_list[-1] > 0.5])
temp_classifier = np.array([int(i) for i in classifier_result_list[-1] > 0.5])
prediction_classifier_disagree = np.where(temp_prediction != temp_classifier)[0]

L = L_prev
remain_params_tracking = np.array(range(0,np.max(coef_index_range)))

if np.max(coef_index_range) >= 2:
    if(len(prediction_high_conf_outliers) > 0 and len(prediction_high_conf_inliers) > 0):
        new_data_indexes = np.concatenate((prediction_high_conf_outliers, prediction_high_conf_inliers), axis = 0)
        new_data_indexes = np.array([int(i) for i in new_data_indexes])
        new_labels = np.concatenate((np.ones(len(prediction_high_conf_outliers)), np.zeros(len(prediction_high_conf_inliers))), axis = 0)
        clf_prune_2 = LogisticRegression(random_state=0, penalty='l2', max_iter=max_iter).fit(scores_transformed[new_data_indexes], new_labels) 
        print("F-1 score from both LR and SVM:",metrics.f1_score(y, np.array([int(i) for i in clf_prune_2.predict_proba(scores_transformed)[:,1] > 0.5])))
        print('Coef from both LR and SVM: ', clf_prune_2.coef_[0])
        combined_coef = clf_prune_2.coef_[0]  
    else:
        print('Coef from normal training: ', clf.coef_[0])
        combined_coef = clf.coef_[0]
        print('Combined Coef: ',  combined_coef)

    if(np.max(coef_index_range) > 2 or 
       ((np.max(combined_coef)/np.min(combined_coef) >= 1.1) and np.max(coef_index_range) >= 2)):
        if(len(set(combined_coef)) > 1):
            cur_clf_coef = combined_coef 
            cutoff = max(max(0, np.mean(combined_coef)-np.std(combined_coef)),min(combined_coef))
            print(cutoff)

            remain_indexes_after_cond = (cur_clf_coef > cutoff) #np.logical_and(cur_clf_coef > cutoff, abs(cur_clf_coef) > 0.01) # # 
            remain_params_tracking = remain_params_tracking[remain_indexes_after_cond]
            print(remain_params_tracking)
            remain_indexes_after_cond_expanded = []
            for i in range(0, len(coef_index_range)): #
                s_e_range = coef_index_range[i,1]-coef_index_range[i,0]
                s1, e1 = coef_index_range[i,0], coef_index_range[i,1]
                s2, e2 = index_range[i,0], index_range[i,1]
                saved_indexes = np.where(cur_clf_coef[s1:e1] > cutoff)[0]
                for j in range(N_size):
                    remain_indexes_after_cond_expanded.extend(np.array(saved_indexes) + j * s_e_range + s2)

            new_coef_index_range_seq = []
            for i in range(0, len(coef_index_range)): #
                s, e = coef_index_range[i,0], coef_index_range[i,1]
                new_coef_index_range_seq.append(sum((remain_indexes_after_cond)[s:e]))

            coef_index_range = []
            index_range = []
            cur_sum = 0
            for i in range(0, len(new_coef_index_range_seq)):
                coef_index_range.append([cur_sum, cur_sum + new_coef_index_range_seq[i]])
                index_range.append([cur_sum * 6, 6 * (cur_sum + new_coef_index_range_seq[i])])
                cur_sum += new_coef_index_range_seq[i]

            coef_index_range = np.array(coef_index_range)
            index_range = np.array(index_range)
            print(coef_index_range)
            print(index_range)

            L=L[:,remain_indexes_after_cond_expanded]
            scores_for_training = scores_for_training[:, remain_indexes_after_cond]



[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 120, 121, 122, 123, 124, 150]
(49097, 26)
Training data shape:  (46390, 26)
Training data F-1 0.9977565900168255
F-1 score from LR: 0.9208955223880598
(46390, 9)
(46390,)
F-1 score from SVM: 0.9471270569193417
length of prediction_high_conf_outliers: 2024
length of prediction high conf inliers:  44803
F-1 score from both LR and SVM: 0.9237401516277687
Coef from both LR and SVM:  [-7.06126720e-02  2.19492912e-02 -1.64198319e-02 -3.69292509e-02
 -3.50017585e-02 -2.67916206e-03  1.32185094e-02  1.89266999e-02
  3.76373666e-02  5.32963329e-02 -9.12209318e-04  2.82103373e-02
  2.02577893e-02  4.47686081e-02  3.49129748e-02  1.94145062e-02
  6.37570238e-03 -3.02117404e-02 -5.37050191e-02 -7.57582826e-02
  2.07856145e+00  2.03633806e+00  1.79809238e+00  1.69064302e+00
  1.66221237e+00  1.31230674e-01]
0
[ 1  6  7  8  9 11 12 13 14 15 16 20 21 22 23 24 25]
[[ 0  5]
 [ 5 11]
 [11 16]
 [16 17]]
[[  0  30]
 [ 30  66]
 [ 66  96

In [18]:
last_training_data_indexes = []
counter = 0

for i_range in range(0, 50):
    print("##################################################################")
    print('Iteration = {}, L shape = {}'.format(i_range, np.shape(L)))
    num_methods = np.shape(L)[1]

#     agree_outlier_indexes = (np.sum(L,axis=1)==np.shape(L)[1])
#     print('All agree, Number of outliers = {}'.format(sum(agree_outlier_indexes)))
#     agree_inlier_indexes = (np.sum(L,axis=1)==0)
#     print('All agree, Number of inliers = {}'.format(sum(agree_inlier_indexes)))

#     all_inlier_indexes = np.union1d(np.setdiff1d(np.where(agree_inlier_indexes)[0], prediction_high_conf_outliers), prediction_high_conf_inliers)
#     print('num of inliers = {}'.format(np.shape(all_inlier_indexes)[0]))
#     all_outlier_indexes = np.union1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)
#     print('num of outliers = {}'.format(np.shape(all_outlier_indexes)[0]))

#     disagree_indexes = np.where(np.logical_or(np.sum(L,axis = 1)==0, np.sum(L,axis = 1)==num_methods)==0)[0]

    ########################################################################

    agree_outlier_indexes = np.sum(L,axis=1)==np.shape(L)[1]
    print('All agree, Number of outliers = {}'.format(sum(agree_outlier_indexes)))
    agree_inlier_indexes = np.sum(L,axis=1)==0
    print('All agree, Number of inliers = {}'.format(sum(agree_inlier_indexes)))

    disagree_indexes = np.where(np.logical_or(np.sum(L,axis = 1)==0, np.sum(L,axis = 1)==num_methods)==0)[0]
    # print('Number of disagreed points = {}'.format(len(disagree_indexes)))
    # print('Number of disagreed points (true outliers) = {}'.format(sum(y[disagree_indexes] == 1)))
    # print('Number of disagreed points (true inliers) = {}'.format(sum(y[disagree_indexes] == 0)))

#     all_inlier_indexes = np.where(agree_inlier_indexes)[0]
    all_inlier_indexes = np.setdiff1d(np.where(agree_inlier_indexes)[0], prediction_high_conf_outliers)
    if len(prediction_high_conf_inliers) >0:
        all_inlier_indexes = np.intersect1d(np.setdiff1d(np.where(agree_inlier_indexes)[0], prediction_high_conf_outliers), prediction_high_conf_inliers)
    print('num of inliers = {}'.format(np.shape(all_inlier_indexes)[0]))
#     all_outlier_indexes = np.union1d(np.where(agree_outlier_indexes)[0], self_agree_index_list)

#     if(len(np.intersect1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)) > 0 and
#       (len(np.union1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)) > 2000)):
#         all_outlier_indexes = np.intersect1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)
#     else:
    all_outlier_indexes = np.union1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)
#     if(len(all_outlier_indexes) > 1000):
#         all_outlier_indexes = np.random.RandomState(1).permutation(all_outlier_indexes)[:1000]
        
#     all_outlier_indexes = np.union1d(np.where(agree_outlier_indexes)[0], prediction_high_conf_outliers)
    print('num of outliers = {}'.format(np.shape(all_outlier_indexes)[0]))
    all_inlier_indexes = np.setdiff1d(all_inlier_indexes, prediction_classifier_disagree)
    
    self_agree_index_list = []
    if((len(all_outlier_indexes) == 0) or (len(all_inlier_indexes)/ len(all_outlier_indexes) > 1000)):
        for i in range(0, len(index_range)):
            if(index_range[i,1]-index_range[i,0] <= 6):
                continue
            temp_index = disagree_indexes[np.where(np.sum(L[disagree_indexes][:,index_range[i,0]: index_range[i,1]], axis = 1)==(index_range[i,1]-index_range[i,0]))[0]]
            self_agree_index_list = np.union1d(self_agree_index_list, temp_index)
        self_agree_index_list = [int(i) for i in self_agree_index_list]
#     self_agree_index_list = np.random.RandomState(1).permutation(self_agree_index_list)[:500]
    all_outlier_indexes = np.union1d(all_outlier_indexes, self_agree_index_list)
    all_outlier_indexes = np.setdiff1d(all_outlier_indexes, prediction_classifier_disagree)
    print('num of outliers = {}'.format(np.shape(all_outlier_indexes)[0]))
    
    
    from sklearn.preprocessing import RobustScaler
    from sklearn.preprocessing import StandardScaler
    data_indexes = np.concatenate((all_inlier_indexes, all_outlier_indexes), axis = 0)
    data_indexes = np.array([int(i) for i in data_indexes])
    labels = np.concatenate((np.zeros(len(all_inlier_indexes)), np.ones(len(all_outlier_indexes))), axis = 0)
    transformer = RobustScaler().fit(scores_for_training)
    scores_transformed = transformer.transform(scores_for_training)
    training_data = scores_transformed[data_indexes]
    print('Training data shape: ', np.shape(training_data))
    training_data_F1.append(metrics.f1_score(y[data_indexes], labels))
    print('Training data F-1', metrics.f1_score(y[data_indexes], labels))

    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC
#     clf = SVC(gamma='auto', probability=True, random_state=0)
#     clf.fit(training_data, labels)
    clf = LogisticRegression(random_state=0, penalty='l2', max_iter=max_iter).fit(training_data, labels) 
    clf_predictions = clf.predict(scores_transformed)
    clf_predict_proba = clf.predict_proba(scores_transformed)[:,1]
    print("F-1 score from LR:",metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba > 0.5])))
    
    
    transformer = RobustScaler().fit(X)
    X_transformed = transformer.transform(X)
    X_training_data = X_transformed[data_indexes]
    print(np.shape(X_training_data))
    print(np.shape(labels))

    clf_X = SVC(gamma='auto', probability=True, random_state=0)
    clf_X.fit(X_training_data, labels)
    clf_predictions_X = clf_X.predict(X_transformed)
    clf_predict_proba_X = clf_X.predict_proba(X_transformed)[:,1]
    print("F-1 score from SVM:",metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba_X > 0.5])))
    print("precision score from SVM:",metrics.precision_score(y, np.array([int(i) for i in clf_predict_proba_X > 0.5])))
    print("recall score from SVM:",metrics.recall_score(y, np.array([int(i) for i in clf_predict_proba_X > 0.5])))
    cur_f1_scores.append(metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba_X > 0.5])))
        
    agreed_outlier_indexes = np.where(np.sum(L,axis=1)==np.shape(L)[1])[0]
    agreed_inlier_indexes = np.where(np.sum(L,axis=1)==0)[0]
        
    prediction_result_list.append(clf_predict_proba)
    classifier_result_list.append(clf_predict_proba_X)
    
    prediction_list.append(np.array([int(i) for i in clf_predictions]))
    
    prediction_high_conf_outliers = np.intersect1d(np.where(prediction_result_list[-1] > high_confidence_threshold)[0],
                                                   np.where(classifier_result_list[-1] > high_confidence_threshold)[0])
    print('length of prediction_high_conf_outliers:' , len(prediction_high_conf_outliers))
    prediction_high_conf_inliers = np.intersect1d(np.where(prediction_result_list[-1] < low_confidence_threshold)[0],
                                                   np.where(classifier_result_list[-1] < low_confidence_threshold)[0])
    print('length of prediction high conf inliers: ', len(prediction_high_conf_inliers))
    
    temp_prediction = np.array([int(i) for i in prediction_result_list[-1] > 0.5])
    temp_classifier = np.array([int(i) for i in classifier_result_list[-1] > 0.5])
    prediction_classifier_disagree = np.where(temp_prediction != temp_classifier)[0]
    
    if np.max(coef_index_range) >= 2:
        if(len(prediction_high_conf_outliers) > 0 and len(prediction_high_conf_inliers) > 0):
            new_data_indexes = np.concatenate((prediction_high_conf_outliers, prediction_high_conf_inliers), axis = 0)
            new_data_indexes = np.array([int(i) for i in new_data_indexes])
            new_labels = np.concatenate((np.ones(len(prediction_high_conf_outliers)), np.zeros(len(prediction_high_conf_inliers))), axis = 0)
            clf_prune_2 = LogisticRegression(random_state=0, penalty='l2', max_iter=max_iter).fit(scores_transformed[new_data_indexes], new_labels) 
            print("F-1 score from both LR and SVM:",metrics.f1_score(y, np.array([int(i) for i in clf_prune_2.predict_proba(scores_transformed)[:,1] > 0.5])))
            print('Coef from both LR and SVM: ', clf_prune_2.coef_[0])
            combined_coef = clf_prune_2.coef_[0]  
        else:
            print('Coef from normal training: ', clf.coef_[0])
            combined_coef = clf.coef_[0]
            print('Combined Coef: ',  combined_coef)

        if(np.max(coef_index_range) > 2 or 
           ((np.max(combined_coef)/np.min(combined_coef) >= 1.1) and np.max(coef_index_range) >= 2)):
            if(len(set(combined_coef)) > 1):
                cur_clf_coef = combined_coef 
                cutoff = max(max(0, np.mean(combined_coef)-np.std(combined_coef)),min(combined_coef))
                print(cutoff)

                remain_indexes_after_cond = (cur_clf_coef > cutoff) #np.logical_and(cur_clf_coef > cutoff, abs(cur_clf_coef) > 0.01) # # 
                remain_params_tracking = remain_params_tracking[remain_indexes_after_cond]
                print(remain_params_tracking)
                remain_indexes_after_cond_expanded = []
                for i in range(0, len(coef_index_range)): #
                    s_e_range = coef_index_range[i,1]-coef_index_range[i,0]
                    s1, e1 = coef_index_range[i,0], coef_index_range[i,1]
                    s2, e2 = index_range[i,0], index_range[i,1]
                    saved_indexes = np.where(cur_clf_coef[s1:e1] > cutoff)[0]
                    for j in range(N_size):
                        remain_indexes_after_cond_expanded.extend(np.array(saved_indexes) + j * s_e_range + s2)

                new_coef_index_range_seq = []
                for i in range(0, len(coef_index_range)): #
                    s, e = coef_index_range[i,0], coef_index_range[i,1]
                    new_coef_index_range_seq.append(sum((remain_indexes_after_cond)[s:e]))

                coef_index_range = []
                index_range = []
                cur_sum = 0
                for i in range(0, len(new_coef_index_range_seq)):
                    coef_index_range.append([cur_sum, cur_sum + new_coef_index_range_seq[i]])
                    index_range.append([cur_sum * 6, 6 * (cur_sum + new_coef_index_range_seq[i])])
                    cur_sum += new_coef_index_range_seq[i]

                coef_index_range = np.array(coef_index_range)
                index_range = np.array(index_range)
                print(coef_index_range)
                print(index_range)

                L=L[:,remain_indexes_after_cond_expanded]
                scores_for_training = scores_for_training[:, remain_indexes_after_cond]
    if((len(last_training_data_indexes) == len(data_indexes)) and 
       (sum(last_training_data_indexes == data_indexes) == len(data_indexes)) and 
       (np.max(coef_index_range) < 2)):
        counter =  counter + 1
    else:
        counter = 0
    if(counter > 3):
        break
    last_training_data_indexes = data_indexes

##################################################################
Iteration = 0, L shape = (49097, 102)
All agree, Number of outliers = 13
All agree, Number of inliers = 37071
num of inliers = 37050
num of outliers = 2024
num of outliers = 2024
Training data shape:  (39074, 17)
Training data F-1 0.9985155863433943
F-1 score from LR: 0.9328540618260244
(39074, 9)
(39074,)
F-1 score from SVM: 0.9192302657415892
precision score from SVM: 0.8505329457364341
recall score from SVM: 1.0
length of prediction_high_conf_outliers: 2313
length of prediction high conf inliers:  44306
F-1 score from both LR and SVM: 0.9287863590772317
Coef from both LR and SVM:  [-9.57661083e-02 -1.33495148e-02  1.63926974e-02  7.03286059e-02
  1.22016657e-01  3.39524855e-02  5.54537879e-02  2.24735281e-02
 -7.28882058e-03 -1.69303134e-03 -1.63817826e-03  2.06945909e+00
  1.99816052e+00  1.83438496e+00  1.67084526e+00  1.68019850e+00
  2.06080548e-01]
0
[ 7  8  9 11 12 13 20 21 22 23 24 25]
[[ 0  3]
 [ 3  6]
 [ 6 1

F-1 score from both LR and SVM: 0.9439442346790589
Coef from both LR and SVM:  [3.97576726 3.99406839]
##################################################################
Iteration = 10, L shape = (49097, 12)
All agree, Number of outliers = 732
All agree, Number of inliers = 45467
num of inliers = 44475
num of outliers = 2311
num of outliers = 2311
Training data shape:  (46786, 2)
Training data F-1 0.9967440850879097
F-1 score from LR: 0.9439442346790589
(46786, 9)
(46786,)
F-1 score from SVM: 0.9411607023187241
precision score from SVM: 0.8888607594936709
recall score from SVM: 1.0
length of prediction_high_conf_outliers: 2248
length of prediction high conf inliers:  44555
F-1 score from both LR and SVM: 0.9378845590389687
Coef from both LR and SVM:  [3.9681839 3.946146 ]
##################################################################
Iteration = 11, L shape = (49097, 12)
All agree, Number of outliers = 732
All agree, Number of inliers = 45467
num of inliers = 44555
num of outliers 

All agree, Number of inliers = 45467
num of inliers = 44740
num of outliers = 2059
num of outliers = 2059
Training data shape:  (46799, 2)
Training data F-1 0.9980535279805353
F-1 score from LR: 0.9197475202885482
(46799, 9)
(46799,)
F-1 score from SVM: 0.9491754528250879
precision score from SVM: 0.9032673012606123
recall score from SVM: 1.0
length of prediction_high_conf_outliers: 2058
length of prediction high conf inliers:  44740
F-1 score from both LR and SVM: 0.9197475202885482
Coef from both LR and SVM:  [3.85518643 3.89343152]
##################################################################
Iteration = 23, L shape = (49097, 12)
All agree, Number of outliers = 732
All agree, Number of inliers = 45467
num of inliers = 44740
num of outliers = 2058
num of outliers = 2058
Training data shape:  (46798, 2)
Training data F-1 0.9980525803310614
F-1 score from LR: 0.9197475202885482
(46798, 9)
(46798,)
F-1 score from SVM: 0.9489189189189189
precision score from SVM: 0.9028027770635124


F-1 score from LR: 0.9197475202885482
(46798, 9)
(46798,)
F-1 score from SVM: 0.9489189189189189
precision score from SVM: 0.9028027770635124
recall score from SVM: 1.0
length of prediction_high_conf_outliers: 2058
length of prediction high conf inliers:  44740
F-1 score from both LR and SVM: 0.9197475202885482
Coef from both LR and SVM:  [3.85518643 3.89343152]
##################################################################
Iteration = 35, L shape = (49097, 12)
All agree, Number of outliers = 732
All agree, Number of inliers = 45467
num of inliers = 44740
num of outliers = 2058
num of outliers = 2058
Training data shape:  (46798, 2)
Training data F-1 0.9980525803310614
F-1 score from LR: 0.9197475202885482
(46798, 9)
(46798,)
F-1 score from SVM: 0.9489189189189189
precision score from SVM: 0.9028027770635124
recall score from SVM: 1.0
length of prediction_high_conf_outliers: 2058
length of prediction high conf inliers:  44740
F-1 score from both LR and SVM: 0.9197475202885482
Coef 

F-1 score from SVM: 0.9489189189189189
precision score from SVM: 0.9028027770635124
recall score from SVM: 1.0
length of prediction_high_conf_outliers: 2058
length of prediction high conf inliers:  44740
F-1 score from both LR and SVM: 0.9197475202885482
Coef from both LR and SVM:  [3.85518643 3.89343152]
##################################################################
Iteration = 47, L shape = (49097, 12)
All agree, Number of outliers = 732
All agree, Number of inliers = 45467
num of inliers = 44740
num of outliers = 2058
num of outliers = 2058
Training data shape:  (46798, 2)
Training data F-1 0.9980525803310614
F-1 score from LR: 0.9197475202885482
(46798, 9)
(46798,)
F-1 score from SVM: 0.9489189189189189
precision score from SVM: 0.9028027770635124
recall score from SVM: 1.0
length of prediction_high_conf_outliers: 2058
length of prediction high conf inliers:  44740
F-1 score from both LR and SVM: 0.9197475202885482
Coef from both LR and SVM:  [3.85518643 3.89343152]
###########

### Use GT label to train a classifier

In [15]:
# random select the same number of labels
# data_indexes = np.random.permutation(len(y))[:len(data_indexes)]
# train a SVM classifier
clf_X = SVC(gamma='auto', probability=True, random_state=0)
transformer = RobustScaler().fit(X)
X_transformed = transformer.transform(X)
X_training_data = X_transformed[data_indexes]
clf_X.fit(X_training_data, y[data_indexes])
clf_predictions_X = clf_X.predict(X_transformed)
clf_predict_proba_X = clf_X.predict_proba(X_transformed)[:,1]
print("F-1 score from SVM:",metrics.f1_score(y, np.array([int(i) for i in clf_predict_proba_X > 0.5])))

F-1 score from SVM: 0.7996389891696751
