In [1]:
%matplotlib notebook
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
import scipy.io
import scipy.stats as stats
import matplotlib.pyplot as plt
import matplotlib 
import pandas as pd
import numpy as np
import pickle

In [2]:
data = pd.read_csv("dataset3.csv")
data = data[["x1","x2"]]

In [3]:
outlier_frac = 0.1
ell = IsolationForest(contamination=outlier_frac)
ell.fit(data)

IsolationForest(contamination=0.1)

In [4]:
pred = ell.predict(data)
dataPred = data.copy()
dataPred["pred"] = pred
data

Unnamed: 0,x1,x2
0,12.41,17.95
1,13.19,17.61
2,13.44,18.27
3,11.89,16.71
4,13.09,18.96
...,...,...
995,12.25,18.35
996,13.09,18.91
997,13.38,17.66
998,12.98,17.69


In [5]:
# Continuous output of the decision_function
decision = ell.decision_function(data)
decision.min(), decision.max()

(-0.11739495435794012, 0.14056885226710197)

In [6]:
# First make a meshgrid for the (x1, x2) feature space
x1s = np.linspace(np.min(data["x1"])-5, np.max(data["x1"])+5, 15)
x2s = np.linspace(np.min(data["x2"])-5, np.max(data["x2"])+5, 15)
x1grid, x2grid = np.meshgrid(x1s, x2s) 

In [7]:
# Now make predictions for each point on the grid 
Xgrid = np.column_stack((x1grid.ravel(), x2grid.ravel()))  # Feature matrix containing all grid points
dens = ell.decision_function(Xgrid)
densgrid = dens.reshape(x1grid.shape)  # Reshape the vector of densities back onto the "grid"

In [8]:
# Get the "thresholding" value from the decision function
threshold = stats.scoreatpercentile(ell.decision_function(data), 100*outlier_frac)
threshold

2.7701365507004638e-17

In [9]:
data

Unnamed: 0,x1,x2
0,12.41,17.95
1,13.19,17.61
2,13.44,18.27
3,11.89,16.71
4,13.09,18.96
...,...,...
995,12.25,18.35
996,13.09,18.91
997,13.38,17.66
998,12.98,17.69


In [10]:
# Use the densites as the "z" values in a contour plot on the grid
fig, ax = plt.subplots()
ax.contourf(x1grid, x2grid, densgrid, cmap=plt.cm.Blues_r, levels=np.linspace(dens.min(), threshold, 7))
ax.scatter(data["x1"], data["x2"], s=4, color="g")

outliers = np.where(dataPred["pred"] == -1)
print(outliers)
# Pot circles around the predicted outliers
ax.scatter(data["x1"].loc[outliers], data["x2"].loc[outliers])
ax.legend(loc="lower right")

<IPython.core.display.Javascript object>

No handles with labels found to put in legend.


(array([  3,   7,  12,  28,  53,  57,  60,  76,  79, 101, 153, 175, 177,
       181, 187, 189, 193, 198, 205, 240, 263, 278, 287, 300, 302, 322,
       327, 332, 334, 339, 379, 387, 389, 394, 401, 404, 468, 478, 481,
       483, 489, 491, 508, 520, 533, 562, 577, 585, 593, 602, 614, 618,
       646, 652, 659, 671, 687, 691, 697, 717, 719, 720, 724, 737, 739,
       741, 744, 745, 752, 753, 766, 773, 782, 794, 801, 802, 803, 808,
       809, 810, 837, 844, 847, 855, 882, 884, 894, 906, 907, 908, 918,
       923, 947, 951, 956, 973, 974, 982, 984, 985], dtype=int64),)


<matplotlib.legend.Legend at 0x1ff804b9640>