In [55]:
import numpy as np
import math
from sklearn.ensemble import IsolationForest
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import NearestNeighbors

In [7]:
data = np.arange(1, 5)
data

array([1, 2, 3, 4])

In [4]:
def index_of_dispersion(data):
    return np.var(data)/np.average(data)

avg = 0
for n in data:
    avg += n
avg /= len(data)
var = 0
for n in data:
    var += (n-avg)**2
var /= len(data)

index_of_dispersion(data), var/avg

(0.5, 0.5)

In [6]:
def coefficient_of_var(data):
    return math.sqrt(np.var(data))/np.average(data)

avg = 0
for n in data:
    avg += n
avg /= len(data)
var = 0
for n in data:
    var += (n-avg)**2
var /= len(data)

coefficient_of_var(data), math.sqrt(var)/avg

(0.447213595499958, 0.447213595499958)

In [11]:
def quartile_coefficient_of_dispersion(data):
    data = np.sort(data)
    q1 = data[int(len(data)*0.25)]
    q3 = data[int(len(data)*0.75)]
    print(q1, q3)
    return (q3-q1)/(q3+q1)
    
quartile_coefficient_of_dispersion(data)

2 4


0.3333333333333333

In [13]:
def entropy(data, bin_size):
    minn = min(data)
    
    dist = {}
    for n in data:
        bin_num = int((n-minn)/bin_size)
        if bin_num in dist:
            dist[bin_num] += 1
        else:
            dist[bin_num] = 1
    
    ret = 0
    for bin_num in dist:
        p = dist[bin_num]/len(data)
        ret -= p*math.log2(p)
    return ret

entropy(data, 1), math.log2(4)

(2.0, 2.0)

In [14]:
def gini(data):
    ret = 0
    for i in range(len(data)-1):
        for j in range(i+1, len(data)):
            ret += abs(data[i]-data[j])
    ret *= 2
    ret /= (2*len(data)**2*np.average(data))
    return ret

gini(data)

0.25

In [25]:
def gini_coef(wealths):
    cum_wealths = np.cumsum(sorted(np.append(wealths, 0)))
    sum_wealths = cum_wealths[-1]
    xarray = np.array(range(0, len(cum_wealths))) / np.float(len(cum_wealths)-1)
    yarray = cum_wealths / float(sum_wealths)
    B = np.trapz(yarray, x=xarray)
    A = 0.5 - B
    return A / (A+B)

gini_coef(data)

0.25

In [26]:
gini(np.array([0, 1])), gini_coef(np.array([0, 1]))

(0.5, 0.5)

In [29]:
gini(np.array([-1, 3]))

1.0

## Anomaly Detection

In [41]:
X = [[0.3], [0.5], [1], [1.1]]
clf = IsolationForest(random_state=0).fit(X)

data = [[-1.11], [0.1], [0], [90], [100], [10000000]]
clf.predict(data), clf.decision_function(data)



(array([ 1,  1,  1, -1, -1, -1]),
 array([0.1381395 , 0.1381395 , 0.1381395 , 0.13574136, 0.13574136,
        0.13574136]))

In [49]:
X = np.array([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]])
gm = GaussianMixture(n_components=2, random_state=0).fit(X)
print(gm.means_)
gm.score_samples([[0, 0], [12, 3], [10, 2], [10000, -10000]])

[[10.  2.]
 [ 1.  2.]]


array([-4.99996864e+05, -1.99999630e+06,  3.88631622e+00, -4.99000688e+13])

In [53]:
x = np.array([[0, 0], [12, 3], [10, 2], [10000, -10000]])
scores = gm.score_samples(x)

thresh = np.quantile(scores, .03)
print(thresh)
 
index = np.where(scores <= thresh)
values = x[index]
values

-45409062749315.414


array([[ 10000, -10000]])

In [63]:
x = np.array([[0, 0], [12, 3], [10, 2], [10000, -10000]])
nbrs = NearestNeighbors(n_neighbors = 3)
nbrs.fit(x)
distances, indexes = nbrs.kneighbors(x)
print(distances.mean(axis =1))
thresh = np.quantile(distances.mean(axis =1), .5)
print(thresh)
outlier_index = np.where(distances.mean(axis = 1) > thresh)
outlier_index

[7.52245197e+00 4.86846162e+00 4.14470233e+00 9.42408565e+03]
6.19545679306522


(array([0, 3]),)