### Model-based anomaly detection.

#### Given a learned Bayes Net structure, the lowest-likelihood data records are considered most anomalous.

In [None]:
import pandas as pd
import numpy as np

# Load the taxi sample from the working directory.
data = pd.read_csv("NYC_taxi_sample.csv")

# Shift every column in the label slice 'tip'..'pass' down by one so the codes
# run 0, 1, ..., cardinality-1 (presumably they are 1-based in the CSV).
data.loc[:, 'tip':'pass'] = data.loc[:, 'tip':'pass'] - 1

Step 1: Learn structure. 

In [None]:
from pgmpy.estimators import HillClimbSearch
from pgmpy.estimators import BicScore
# Hill-climbing structure search over the columns of `data`, scored with BicScore.
# NOTE(review): newer pgmpy releases appear to take the scoring method in
# estimate() rather than in the constructor — confirm against the installed version.
hc = HillClimbSearch(data, scoring_method=BicScore(data))
best_model = hc.estimate()
# Show the edges of the structure the search returned.
print(best_model.edges())

Step 2: Learn parameters.

In [None]:
from pgmpy.models import BayesianModel
from pgmpy.estimators import MaximumLikelihoodEstimator

# Rebuild a model from the learned edges and fit its conditional probability
# tables to the data by maximum likelihood.
# NOTE(review): the model is built from edges only, so a column that has no
# edge would presumably not be part of it — confirm.
model = BayesianModel(best_model.edges())
model.fit(data, estimator=MaximumLikelihoodEstimator)

# Print every fitted table under a one-line header naming its variable.
for fitted_cpd in model.get_cpds():
    header = "CPD of {variable}:".format(variable=fitted_cpd.variable)
    print(header)
    print(fitted_cpd)

Step 3: Compute the log-likelihood of each data record given the model, and report the lowest-likelihood (most anomalous) records.

In [None]:
# Log-likelihood of data record x given model
def LL(x,model,verbose=False):
    """Return the log-likelihood of one data record under a fitted Bayes net.

    For every CPD in the model, the table is reduced to the record's parent
    values, the probability of the record's own value is read off, and the
    logs of these probabilities are summed.

    Parameters
    ----------
    x : mapping or pandas Series
        Indexed by variable name; values are integer state codes
        (0, 1, ..., cardinality-1).  Float-typed codes such as 2.0 are accepted.
    model : fitted model exposing ``get_cpds()`` and ``predecessors(name)``.
    verbose : bool
        If true, print ``variable parents probability`` for each CPD.

    Returns
    -------
    The summed log-probability.  A zero probability contributes ``-inf``.
    Only variables that have a CPD in ``model`` contribute.
    """
    loglike = 0
    for cpd in model.get_cpds():
        # work on a copy: reduce() modifies the CPD it is called on
        temp_cpd = cpd.copy()
        thevariable = temp_cpd.variable
        # predecessors() may return a one-shot iterator; materialise it so the
        # parents can be both looped over and printed
        theparents = list(model.predecessors(thevariable))
        for parent in theparents:
            temp_cpd.reduce([(parent, x[parent])])
        # after reduction one column is left; int() keeps the row index valid
        # when the record's values were upcast to float
        theprob = temp_cpd.get_values()[int(x[thevariable]),0]
        if verbose:
            # formatted string prints identically on Python 2 and Python 3
            print("{} {} {}".format(thevariable, theparents, theprob))
        loglike += np.log(theprob)
    return loglike

In [None]:
# Score the first 500 records only: each row is passed to LL, giving one
# log-likelihood per record (lower = more anomalous).
exmp = data.head(500).apply(lambda record: LL(record, model), axis=1)

In [None]:
import matplotlib.pylab as plt
# Log-likelihood of each scored record against its position: points plus a
# dashed red trace.  The x-limits are hard-coded for the 500 records scored above.
fig, ax = plt.subplots(figsize=(20, 4))
ax.scatter(range(len(exmp)), exmp)
ax.set_xlim(-10, 520)
ax.plot(exmp, "r--")
plt.show()

In [None]:
# 10 most anomalous data records (lowest log-likelihood first)
# print is called as a function so the cell runs on Python 3 as well as Python 2
print(exmp.sort_values().head(10))

In [None]:
# look at the most anomalous record
# Take the row label of the lowest log-likelihood from exmp itself instead of
# hard-coding a row number, so the cell stays correct if the learned model changes.
most_anomalous = exmp.idxmin()
print(data.loc[most_anomalous,:])
print("")  # blank separator line; works on Python 2 and Python 3
LL(data.loc[most_anomalous,:],model,verbose=True)

In [None]:
# look at the second most anomalous record
# Second-lowest log-likelihood, looked up from exmp rather than hard-coded.
second_most_anomalous = exmp.sort_values().index[1]
print(data.loc[second_most_anomalous,:])
print("")  # blank separator line; works on Python 2 and Python 3
LL(data.loc[second_most_anomalous,:],model,verbose=True)

### Cluster-based anomaly detection

#### 1. Cluster with Gaussian Mixture.  Look for records with low log-likelihood as well as any tiny clusters.

In [None]:
# Simple example with Fisher's iris dataset
from sklearn import datasets
iris = datasets.load_iris()
# Keep only the first 100 records.  The labels are sliced the same way for
# reference, but are not used: the clustering below is unsupervised.
X, y = iris.data[:100], iris.target[:100]

In [None]:
from sklearn.mixture import GaussianMixture
# Gaussian mixture with three components; random_state is fixed so repeated
# runs give the same fit.
GM=GaussianMixture(n_components=3,random_state=999)
# Fit on X; as the last expression, the fitted estimator is the cell's output.
GM.fit(X)

In [None]:
# score_samples gives, for each data record, the log of its probability density
# under the fitted mixture as a whole (all components, weighted), not only under
# the record's own cluster.
# Note that probability densities can exceed 1 (unlike probabilities of discrete
# data), so positive log values are possible.
import matplotlib.pylab as plt
gm_loglik = GM.score_samples(X)
fig, ax = plt.subplots(figsize=(20, 4))
ax.scatter(range(len(X)), gm_loglik)
ax.plot(gm_loglik, "r--")
plt.show()

In [None]:
# The five records with the lowest log-likelihood (log probability density)
# under the mixture, i.e. the five most anomalous points.
pd.DataFrame(GM.score_samples(X)).sort_values(by=0).head(5)

In [None]:
# Also check for any very tiny clusters: number of records assigned to each component
# print is called as a function so the cell runs on Python 3 as well as Python 2
print(pd.Series(GM.predict(X)).value_counts())

#### 2. Cluster with k-means.  Look for records with large distance to the nearest cluster center as well as any tiny clusters.

In [None]:
from sklearn.cluster import KMeans
# k-means with three clusters and a fixed seed; keep each record's cluster label.
KM = KMeans(random_state=999, n_clusters=3)
scor = KM.fit(X).labels_

# distance from every record to each cluster center (one column per center)
res = pd.DataFrame(KM.transform(X))
res.head()

In [None]:
# add cluster and distance information for each data record
# Distances and labels both come from the model that is already fitted:
# predict() assigns each record to its nearest center without refitting,
# so the labels always match the distances from transform().
km_dists = KM.transform(X)
km_labels = KM.predict(X)
res = pd.DataFrame(km_dists)          # columns 0..n_clusters-1
res["cluster"] = km_labels
# score = distance from each record to the center of its own cluster
res["score"] = km_dists[np.arange(len(km_labels)), km_labels]

# find data records farthest from cluster centers
res.sort_values("score",ascending=False)[:5]

In [None]:
# check for tiny clusters: number of records per k-means cluster
# print is called as a function so the cell runs on Python 3 as well as Python 2
print(res['cluster'].value_counts())

## Practice Question

The data we will use here are the hourly bicycle counts on Seattle's Fremont Bridge. These data come from an automated bicycle counter, installed in late 2012, which has inductive sensors under the sidewalks on either side of the bridge.  Our goal is to detect days with abnormal counts.

In [None]:
import pandas as pd
# Read the bridge counts, using the "Date" column as a parsed datetime index.
# NOTE(review): this rebinds `data`, which the Bayes-net cells above use for
# the taxi records; re-running those cells after this one needs a reload.
data=pd.read_csv("Bridge.csv",index_col="Date",parse_dates=True)
data.head()

In [None]:
# Visualize the data: weekly totals for each column of the frame.
ax = data.resample('W').sum().plot(figsize=(20,5))
ax.set_ylabel('weekly trips')
plt.show()

In [None]:
# Counts for each (direction, hour) pair on each day: one row per calendar date,
# one column per direction and hour of day, aggregated with pivot_table's
# default and with missing cells filled with 0.
# The value columns are listed as ['East', 'West'], so the East block comes
# first (columns 0-23) and the West block second (columns 24-47) —
# check pivoted.columns to confirm.
pivoted = data.pivot_table(['East', 'West'],
                           index=data.index.date,
                           columns=data.index.hour,
                           fill_value=0)
days=pivoted.index
X=pivoted.values
# print is called as a function so the cell runs on Python 3 as well as Python 2
print(pivoted.head())
print(X.shape)

#### Part 1. Detect abnormal days using Gaussian mixture clustering.

In [None]:
# Normalize the data so that the 48 columns for a given day sum to 1.  You can interpret each value as
# the proportion of that day's trips that are in a given direction (west or east) on a given hour.
# Cast to float first: on Python 2 an integer array divided by an integer array
# floor-divides, which would turn every proportion into 0.
X = pivoted.values.astype(float)
X = X / X.sum(axis=1, keepdims=True)
# first column is the date, the remaining columns keep their integer labels
data2=pd.concat((pd.DataFrame(days),pd.DataFrame(X)),axis=1)
data2.columns=["date"]+list(data2.columns)[1:]
data2.head()

#### Now cluster the data using 5 Gaussian mixture components, and identify the 5 most anomalous days.

#### Part 2. Detect abnormal days using k-means clustering.

In [None]:
# Normalize the data so that the 48 columns for a given day sum to 1.  You can interpret each value as
# the proportion of that day's trips that are in a given direction (west or east) on a given hour.
# Cast to float first: on Python 2 an integer array divided by an integer array
# floor-divides, which would turn every proportion into 0.
X = pivoted.values.astype(float)
X = X / X.sum(axis=1, keepdims=True)
# first column is the date, the remaining columns keep their integer labels
data3=pd.concat((pd.DataFrame(days),pd.DataFrame(X)),axis=1)
data3.columns=["date"]+list(data3.columns)[1:]

#### Now cluster the data using k-means clustering with k=5, and identify the 5 most anomalous days.

#### Part 3.  Detect anomalous days using Bayesian networks.

In [None]:
# Let's reduce from 48 variables to 10: proportions for midnight-1am, 11am-noon, 3-4pm, 6-7pm, 9-10pm going in each direction
# Cast to float first: on Python 2 an integer array divided by an integer array
# floor-divides, which would turn every proportion into 0.
X = pivoted.values.astype(float)
X = X / X.sum(axis=1, keepdims=True)
# hours 0, 11, 15, 18, 21 in the first direction block, then the same hours
# in the second block (offset by 24)
hour_cols = [0,11,15,18,21,24,35,39,42,45]
XX = X[:,hour_cols]
data4=pd.concat((pd.DataFrame(days),pd.DataFrame(XX)),axis=1)
# Build the names from the pivot's own (direction, hour) column labels so each
# name ("E0", "W18", ...) is guaranteed to match the data in its column.  The
# previous hard-coded list put the W* names on the first block, which is East.
data4.columns=["date"]+["{}{}".format(direction[0],hour)
                        for direction,hour in pivoted.columns[hour_cols]]

# discretize each variable to four equal-frequency bins (quartiles), coded 0-3
for col in data4.columns[1:]:
    data4[col] = pd.qcut(data4[col],q=4,labels=False)
# print is called as a function so the cell runs on Python 3 as well as Python 2
print(data4.head())

## Other anomaly detection methods in sklearn (time permitting)

### LOF

Local outlier factor (LOF).

In [None]:
from sklearn.neighbors import LocalOutlierFactor

# Generate train data: 100 draws from a 2-D Gaussian scaled by 0.3.
# The global NumPy seed is set here, so the order of the draws below matters.
np.random.seed(42)
X = 0.3 * np.random.randn(100, 2)
# Generate some abnormal novel observations: 20 points uniform on [-4, 4]^2
X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))
# Stack two shifted copies of the Gaussian cloud (centred at +2 and -2) and the
# outliers: rows 0-199 are the regular points, rows 200-219 the outliers.
X = np.r_[X + 2, X - 2, X_outliers]

# fit the model and label every row of X
clf = LocalOutlierFactor(n_neighbors=20)
y_pred = clf.fit_predict(X)
# labels of the 20 outlier rows (not used again in this cell)
y_pred_outliers = y_pred[200:]

# plot the level sets of the decision function on a 50x50 grid over [-5, 5]^2
xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))
# NOTE(review): _decision_function is a private method and may not exist in
# every scikit-learn release — confirm against the installed version.
Z = clf._decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.figure(figsize=(10,10))


plt.title("Local Outlier Factor (LOF)")
plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)

# regular points in white, injected outliers in red
a = plt.scatter(X[:200, 0], X[:200, 1], c='white')
b = plt.scatter(X[200:, 0], X[200:, 1], c='red')
plt.axis('tight')
plt.xlim((-5, 5))
plt.ylim((-5, 5))
plt.legend([a, b],
           ["normal observations",
            "abnormal observations"],
           loc="upper left")
plt.show()

### One Class SVM and Isolation Forest

In [None]:
import matplotlib.font_manager
from sklearn import svm

# 500x500 evaluation grid over [-5, 5]^2 for the decision-function contours
xx, yy = np.meshgrid(np.linspace(-5, 5, 500), np.linspace(-5, 5, 500))
# Generate train data: two Gaussian clouds centred at (+2, +2) and (-2, -2).
# No seed is set in this cell, so the draws continue from the global NumPy state.
X = 0.3 * np.random.randn(100, 2)
X_train = np.r_[X + 2, X - 2]
# Generate some regular novel observations
X = 0.3 * np.random.randn(20, 2)
X_test = np.r_[X + 2, X - 2]
# Generate some abnormal novel observations
X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))

# fit the model on the clean training data only
clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
clf.fit(X_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
y_pred_outliers = clf.predict(X_outliers)
# an error is a regular point labelled -1, or an abnormal point labelled +1
n_error_train = y_pred_train[y_pred_train == -1].size
n_error_test = y_pred_test[y_pred_test == -1].size
n_error_outliers = y_pred_outliers[y_pred_outliers == 1].size

# evaluate the decision function on the grid; the zero level is the learned frontier
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.figure(figsize=(10,10))
plt.title("Novelty Detection")
# shaded bands for negative values, a dark red line at 0, a solid fill for positive values
plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 7), cmap=plt.cm.PuBu)
a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='darkred')
plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors='palevioletred')

s = 40
b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=s)
b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='blueviolet', s=s)
c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='gold', s=s)
plt.axis('tight')
plt.xlim((-5, 5))
plt.ylim((-5, 5))
# NOTE(review): ContourSet.collections is deprecated in recent Matplotlib
# releases — confirm against the installed version.
plt.legend([a.collections[0], b1, b2, c],
           ["learned frontier", "training observations",
            "new regular observations", "new abnormal observations"],
           loc="upper left",
           prop=matplotlib.font_manager.FontProperties(size=11))
# Denominators come from the arrays themselves; the previous hard-coded text
# reported the outlier errors out of 40 although only 20 outliers are generated.
plt.xlabel(
    "error train: %d/%d ; errors novel regular: %d/%d ; "
    "errors novel abnormal: %d/%d"
    % (n_error_train, len(X_train),
       n_error_test, len(X_test),
       n_error_outliers, len(X_outliers)))
plt.show()

In [None]:
from sklearn.ensemble import IsolationForest

# Private random stream for this cell; it is also passed to the estimator below,
# so the order in which it is consumed affects the results.
rng = np.random.RandomState(42)

# Generate train data: two Gaussian clouds centred at (+2, +2) and (-2, -2)
X = 0.3 * rng.randn(100, 2)
X_train = np.r_[X + 2, X - 2]
# Generate some regular novel observations
X = 0.3 * rng.randn(20, 2)
X_test = np.r_[X + 2, X - 2]
# Generate some abnormal novel observations: 20 points uniform on [-4, 4]^2
X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))

# fit the model on the training data only
clf = IsolationForest(max_samples=100, random_state=rng)
clf.fit(X_train)
# predicted labels for the three sets (not used again in this cell)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
y_pred_outliers = clf.predict(X_outliers)

# evaluate the decision function on a 50x50 grid over [-5, 5]^2 for the filled contours
xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.figure(figsize=(10,10))

plt.title("IsolationForest")
plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)

# training points in white, new regular points in green, new abnormal points in red
b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white')
b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='green')
c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='red')
plt.axis('tight')
plt.xlim((-5, 5))
plt.ylim((-5, 5))
plt.legend([b1, b2, c],
           ["training observations",
            "new regular observations", "new abnormal observations"],
           loc="upper left")
plt.show()

Sklearn makes a distinction between "novelty detection", where the training data is clean, and "outlier detection", where the training data may be corrupted by a small proportion of anomalies.  This is _not_ standard usage: anomaly, outlier, and novelty detection are generally used interchangeably in the literature, and whether or not the training data is assumed to be clean should be specified separately.