# **Machine Learning - Unsupervised Learning - 2**


## Anomaly Detection

In [None]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.neighbors import LocalOutlierFactor

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
#%matplotlib inline

In [None]:
# Load the iris dataset from the local data directory.
iris = pd.read_csv(filepath_or_buffer="./data/iris.csv")

### Inspect the data

In [None]:
# Preview the first five rows of the raw data.
iris.head(n=5)

In [None]:
# Summarize every column, including non-numeric ones.
iris.describe(include="all")

### Pre-processing

* Only **Numeric** data is extracted. 
* **Normalization**: Density-based outlier detection relies on distances between observations, so the features must be normalized to a common scale before the outlier scores are computed.

In [None]:
# Remove the "Type" column so that only the remaining feature columns are kept.
iris_num = iris.drop(columns=["Type"])
iris_num.head(n=5)

In [None]:
# Summary statistics of the feature columns before normalization.
iris_num.describe()

In [None]:
# Scale the features with sklearn's `preprocessing.scale` and wrap the result
# back into a DataFrame that keeps the original column names.
iris_norm = pd.DataFrame(
    data=preprocessing.scale(iris_num),
    columns=iris_num.columns,
)
iris_norm.head(n=5)

In [None]:
# Summary statistics after scaling, to check the result of the normalization step.
iris_norm.describe()

## Outlier Detection using *Local Outlier Factor*

In [None]:
# Fit LOF on the feature columns only. Later cells append `lof_scores` and
# `outlier_ind` to `iris_norm`; selecting `iris_num.columns` keeps this cell
# safe to re-run instead of feeding those derived columns back into the model.
lof = LocalOutlierFactor(n_neighbors=5)
lof_predictions = lof.fit_predict(iris_norm[iris_num.columns])
# The fitted attribute holds the negated factor, so flip the sign to get
# positive outlier scores.
lof_scores = -lof.negative_outlier_factor_
lof_scores

### Visualizing outlier scores

In [None]:
# Store the scores next to the features, then plot their histogram through
# the Axes object returned by pandas.
iris_norm['lof_scores'] = lof_scores
score_hist_ax = iris_norm['lof_scores'].hist()
score_hist_ax.set_title("Outlier score distribution")
score_hist_ax.set_xlabel("Outlier Score")

In [None]:
# Kernel density estimate of the outlier scores, labelled through the
# Axes object returned by seaborn.
score_kde_ax = sns.kdeplot(iris_norm['lof_scores'])
score_kde_ax.set_title("Outlier score distribution")
score_kde_ax.set_xlabel("Outlier Score")

### Select outliers

In [None]:
# Preview the normalized frame, which now carries the `lof_scores` column.
iris_norm.head(n=5)

#### Select top outliers (in this case, say 5)


In [None]:
# Rank rows by outlier score, highest first, and keep the top five.
iris_norm.sort_values(by='lof_scores', ascending=False).head(5)

#### Select outliers based on threshold (in this case, >2)

In [None]:
# Rows whose outlier score exceeds the threshold of 2.
iris_norm.loc[iris_norm['lof_scores'] > 2]

### Visualizing outliers

In [None]:
# Flag each row "Yes" when its outlier score exceeds 2, otherwise "No".
iris_norm['outlier_ind'] = np.where(iris_norm['lof_scores'] > 2, "Yes", "No")

In [None]:
# Drop the raw score so the pair plot shows only the features and the flag.
iris_pair = iris_norm.drop(columns=['lof_scores'])

In [None]:
# Pairwise scatter plots coloured by the outlier flag, with KDEs on the diagonal.
sns.pairplot(
    data=iris_pair,
    hue='outlier_ind',
    diag_kind="kde",
)

## Outlier Detection Visualization with PCA

In [None]:
# Keep only the standardized features: the LOF score and the Yes/No flag are
# derived columns and should not take part in the PCA projection.
# (PCA itself is imported in the notebook's top import cell.)
iris_pca = iris_norm.drop(columns=["lof_scores", "outlier_ind"])
iris_pca.head()

In [None]:
# Project the features onto two principal components and label the
# resulting columns for plotting.
pca = PCA(n_components=2)
prComponents = pca.fit_transform(iris_pca)
pca_df = pd.DataFrame(
    data=prComponents,
    columns=["PC1", "PC2"],
)
pca_df.head(n=5)

In [None]:
# Scatter of the observations in the two-component PCA space.
plt.scatter(x=pca_df['PC1'], y=pca_df['PC2'])

### Assign the outlier indicators based on LOF scores from previous analysis

In [None]:
# Boolean flag per row: True when the LOF score from the earlier analysis exceeds 2.
pca_df['outlier'] = iris_norm['lof_scores'].gt(2)

In [None]:
# Preview the PCA frame together with the new `outlier` column.
pca_df.head(n=5)

In [None]:
# PCA scatter coloured by the outlier flag; the regression fit is turned off.
sns.lmplot(
    data=pca_df,
    x="PC1",
    y="PC2",
    hue='outlier',
    fit_reg=False,
)

## Activity on Outlier Detection

We will first investigate a density-based outlier detection approach known as Local Outlier Factor (LOF), which we discussed in class. We are going to use a dataset of 400 call center employees, and our goal is to **identify outliers among this group given their job performance data**. After loading the data, investigate its structure and conduct some simple explorations.

In [None]:
# Load the call-center dataset and preview its first rows.
call_center = pd.read_csv(filepath_or_buffer="./data/call_center.csv")
call_center.head(n=5)