In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.base import BaseEstimator
from numba import jit

In [None]:
# INFLO class
class INFLO(BaseEstimator):
    """Influenced Outlierness (INFLO) outlier detector.

    Every observation is scored by the ratio between the mean density of its
    influence space (its k nearest neighbours together with its reverse
    nearest neighbours) and its own density, where density is the inverse of
    the distance to its k-th nearest neighbour. The observations with the
    highest ratios are labelled as outliers.
    """

    def __init__(self, contamination=None):
        """
        Args:
            - contamination: expected fraction of the dataset which are outliers.
              Required; must satisfy 0 < contamination <= 0.5.

        Raises:
            AssertionError: if contamination is missing or out of range.
        """
        # Validate first so that a missing value (the None default) fails with
        # the intended message rather than a TypeError from comparing None.
        assert contamination is not None and 0 < contamination <= 0.5, "Contamination must be between 0 and 0.5"
        # BaseEstimator.get_params() (and therefore repr/clone) looks the value
        # up under the constructor argument's own name.
        self.contamination = contamination
        # Kept for existing code that reads the original attribute name.
        self.contamination_ = contamination

    def fit_transform(self, data=None, k=None):
        """Compute INFLO scores for `data` and label the top-scoring rows.

        Args:
            - data: array-like with one observation per row.
            - k: number of nearest neighbours to use, not counting the
              observation itself. Must satisfy 0 < k < len(data).

        Returns:
            1-D float array with one entry per observation: 1.0 for the
            int(contamination * n) observations with the highest INFLO score,
            0.0 for all others. The raw scores are stored in
            `self.INFLO_results_`.

        Raises:
            AssertionError: if k is not an integer or is out of range.

        Note:
            Exact duplicates that make a k-distance zero yield an infinite
            density (NumPy emits a divide-by-zero warning).
        """
        assert isinstance(k, (int, np.integer)), "k must be an integer"
        n_row = len(data)
        assert 0 < k < n_row, "k must be smaller than the number of observations AND greater than 0."
        self.k_ = k

        # K-Nearest Neighbors search. Querying without passing the data again
        # makes scikit-learn exclude each point from its own neighbour list,
        # so every row holds k genuine neighbours (and k=1 no longer produces
        # a zero k-distance from the self-match).
        knn_model = NearestNeighbors(n_neighbors=int(k))
        knn_model.fit(data)
        knn_dist, knn_idx = knn_model.kneighbors()
        knn_lists = knn_idx.tolist()

        # Density of an observation = 1 / distance to its k-th neighbour.
        obj_density = 1.0 / knn_dist.max(axis=1)

        # Reverse Nearest Neighbors: reverse_nn[j] holds every observation
        # that lists j among its k nearest neighbours. Built in one pass over
        # the neighbour table instead of rescanning it for every observation.
        reverse_nn = [set() for _ in range(n_row)]
        for source, neighbours in enumerate(knn_lists):
            for target in neighbours:
                reverse_nn[target].add(source)

        # INFLO = mean density of the influence space / own density.
        inflo_scores = np.empty(n_row, dtype=float)
        for i, neighbours in enumerate(knn_lists):
            # Influence space = kNN union RNN; an observation is never part of
            # its own influence space.
            influ_space = sorted((reverse_nn[i] | set(neighbours)) - {i})
            inflo_scores[i] = obj_density[influ_space].mean() / obj_density[i]
        self.INFLO_results_ = inflo_scores

        # Picking outliers - selecting top-n observations with highest INFLO based on contamination
        top_n = int(self.contamination_ * n_row)
        anomaly_detection_results = np.zeros(n_row)
        # Guard top_n == 0: a `[-0:]` slice would select every observation.
        if top_n > 0:
            top_n_indices = np.argsort(inflo_scores)[-top_n:]
            anomaly_detection_results[top_n_indices] = 1
        return anomaly_detection_results

In [None]:
# Reproducible synthetic dataset: 1000 observations x 5 features, uniform on [0, 1)
np.random.seed(42)
random_data = np.random.random_sample(size=(1000, 5))
print(random_data)

In [None]:
# Number of observations (rows) in the synthetic dataset
random_data.shape[0]

In [None]:
# Run the detector with a 0.5% contamination rate and k=3 for the neighbour search
inflo = INFLO(contamination=0.005)
results = inflo.fit_transform(data=random_data, k=3)

In [None]:
# Count how many observations were labelled 0.0 versus 1.0 (1.0 = flagged as outlier).
# pandas is already imported in the notebook's first cell, so it is not re-imported here.
pd.Series(results).value_counts()