<a href="https://colab.research.google.com/github/veeruamma/AutoML-Projects/blob/main/Anomaly_Detection_with_PyCaret.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**PyCaret** is an open-source, low-code machine learning library and end-to-end model management tool built in Python for automating machine learning workflows. All the operations performed in PyCaret are stored in a **Pipeline** that is fully automated for deployment.

In [1]:
!pip install pycaret

Collecting pycaret
[?25l  Downloading https://files.pythonhosted.org/packages/30/4b/c2b856b18c0553238908f34d53e6c211f3cc4bfa13a8e8d522567a00b3d7/pycaret-2.3.0-py3-none-any.whl (261kB)
[K     |█▎                              | 10kB 14.0MB/s eta 0:00:01[K     |██▌                             | 20kB 19.3MB/s eta 0:00:01[K     |███▊                            | 30kB 22.7MB/s eta 0:00:01[K     |█████                           | 40kB 25.1MB/s eta 0:00:01[K     |██████▎                         | 51kB 27.4MB/s eta 0:00:01[K     |███████▌                        | 61kB 22.1MB/s eta 0:00:01[K     |████████▊                       | 71kB 22.1MB/s eta 0:00:01[K     |██████████                      | 81kB 19.2MB/s eta 0:00:01[K     |███████████▎                    | 92kB 17.4MB/s eta 0:00:01[K     |████████████▌                   | 102kB 18.0MB/s eta 0:00:01[K     |█████████████▊                  | 112kB 18.0MB/s eta 0:00:01[K     |███████████████                 | 122kB 18.0

In [4]:
cd /content/drive/MyDrive/LEARNING/AutoML

/content/drive/MyDrive/LEARNING/AutoML


In [51]:
import pandas as pd
import plotly.express as px

### NYC taxi passengers data is used for this example; the data set can be found at https://github.com/veeruamma/AutoML-Projects/blob/main/data/nyc_taxi.csv

In [53]:
# Load the raw taxi counts and convert the 'timestamp' strings to datetimes
# in a single chained expression.
data = (
    pd.read_csv('data/nyc_taxi.csv', index_col=False)
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
)

# Preview the first rows to confirm the load and the datetime conversion.
data.head()

Unnamed: 0,timestamp,value
0,2014-07-01 00:00:00,10844
1,2014-07-01 00:30:00,8127
2,2014-07-01 01:00:00,6210
3,2014-07-01 01:30:00,4656
4,2014-07-01 02:00:00,3820


In [54]:
# Promote the 'timestamp' column to the index (it is dropped from the columns),
# rebinding `data` to the re-indexed frame.
data = data.set_index('timestamp', drop=True)
data.head()

Unnamed: 0_level_0,value
timestamp,Unnamed: 1_level_1
2014-07-01 00:00:00,10844
2014-07-01 00:30:00,8127
2014-07-01 01:00:00,6210
2014-07-01 01:30:00,4656
2014-07-01 02:00:00,3820


In [55]:
# Resample the time series to hourly buckets. Each hour's value is the sum of
# the readings that fall inside it (e.g. the half-hourly counts 10844 and 8127
# for 2014-07-01 00:00 and 00:30 become 18971 for the 00:00 hour).
data = data.resample('H').sum()
data.head()

Unnamed: 0_level_0,value
timestamp,Unnamed: 1_level_1
2014-07-01 00:00:00,18971
2014-07-01 01:00:00,10866
2014-07-01 02:00:00,6693
2014-07-01 03:00:00,4433
2014-07-01 04:00:00,4379


In [56]:
# Derive calendar features from the timestamp index. All columns are added in
# one assign() call, in the same order as before.
data = data.assign(
    day=[ts.day for ts in data.index],
    day_name=[ts.day_name() for ts in data.index],
    day_of_year=[ts.dayofyear for ts in data.index],
    week_of_year=[ts.weekofyear for ts in data.index],
    hour=[ts.hour for ts in data.index],
    # NOTE: despite its name this holds the ISO weekday number
    # (1 = Monday ... 7 = Sunday), not a boolean weekday/weekend flag.
    is_weekday=[ts.isoweekday() for ts in data.index],
)
data.head()

Unnamed: 0_level_0,value,day,day_name,day_of_year,week_of_year,hour,is_weekday
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014-07-01 00:00:00,18971,1,Tuesday,182,27,0,2
2014-07-01 01:00:00,10866,1,Tuesday,182,27,1,2
2014-07-01 02:00:00,6693,1,Tuesday,182,27,2,2
2014-07-01 03:00:00,4433,1,Tuesday,182,27,3,2
2014-07-01 04:00:00,4379,1,Tuesday,182,27,4,2


### Initial setup with PyCaret to start the machine learning experiment

In [57]:
# Initialise the PyCaret anomaly-detection experiment on the feature frame.
# The star import brings in setup() as well as models(), create_model() and
# assign_model() used by the cells that follow.
# NOTE(review): session_id is presumably the experiment's random seed — confirm
# against the PyCaret docs.
from pycaret.anomaly import *
s = setup(data, session_id=123)

Unnamed: 0,Description,Value
0,session_id,123
1,Original Data,"(5160, 7)"
2,Missing Values,False
3,Numeric Features,5
4,Categorical Features,2
5,Ordinal Features,False
6,High Cardinality Features,False
7,High Cardinality Method,
8,Transformed Data,"(5160, 19)"
9,CPU Jobs,-1


In [59]:
# List all available anomaly-detection algorithms. The ID column holds the
# identifiers passed to create_model (e.g. 'iforest').
models()

Unnamed: 0_level_0,Name,Reference
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
abod,Angle-base Outlier Detection,pyod.models.abod.ABOD
cluster,Clustering-Based Local Outlier,pyod.models.cblof.CBLOF
cof,Connectivity-Based Local Outlier,pyod.models.cof.COF
iforest,Isolation Forest,pyod.models.iforest.IForest
histogram,Histogram-based Outlier Detection,pyod.models.hbos.HBOS
knn,K-Nearest Neighbors Detector,pyod.models.knn.KNN
lof,Local Outlier Factor,pyod.models.lof.LOF
svm,One-class SVM detector,pyod.models.ocsvm.OCSVM
pca,Principal Component Analysis,pyod.models.pca.PCA
mcd,Minimum Covariance Determinant,pyod.models.mcd.MCD


In [60]:
# Train an Isolation Forest. Replacing the ID 'iforest' with any other model ID
# from the models() table switches to that algorithm.
# NOTE(review): fraction=0.1 presumably sets the expected share of outliers —
# confirm against the PyCaret docs.
iforest = create_model('iforest', fraction=0.1)

# Attach the model's Anomaly label and Anomaly_Score to every row.
iforest_results = assign_model(iforest)
iforest_results.head()

Unnamed: 0_level_0,value,day,day_name,day_of_year,week_of_year,hour,is_weekday,Anomaly,Anomaly_Score
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2014-07-01 00:00:00,18971,1,Tuesday,182,27,0,2,0,-0.01545
2014-07-01 01:00:00,10866,1,Tuesday,182,27,1,2,0,-0.006367
2014-07-01 02:00:00,6693,1,Tuesday,182,27,2,2,0,-0.010988
2014-07-01 03:00:00,4433,1,Tuesday,182,27,3,2,0,-0.017091
2014-07-01 04:00:00,4379,1,Tuesday,182,27,4,2,0,-0.017006


### The results contain two new columns: `Anomaly`, which is 1 for an outlier and 0 for an inlier, and `Anomaly_Score`, a continuous value a.k.a. the decision function (internally, the algorithm calculates this score, based on which the anomaly is determined).

In [61]:
# Show the first few rows that the model flagged as anomalies (Anomaly == 1).
iforest_results.loc[iforest_results['Anomaly'].eq(1)].head()

Unnamed: 0_level_0,value,day,day_name,day_of_year,week_of_year,hour,is_weekday,Anomaly,Anomaly_Score
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2014-07-13,50825,13,Sunday,194,28,0,7,1,0.002663
2014-07-27,50407,27,Sunday,208,30,0,7,1,0.009264
2014-08-03,48081,3,Sunday,215,31,0,7,1,0.003045
2014-09-28,53589,28,Sunday,271,39,0,7,1,0.00444
2014-10-05,48472,5,Sunday,278,40,0,7,1,0.000325


In [67]:
# Let's plot the anomalies on a graph to visualize them.
import plotly.graph_objects as go

In [69]:
# Line chart of the hourly trip counts: timestamp on the x-axis, 'value' on the y-axis.
fig = px.line(
    iforest_results,
    x=iforest_results.index,
    y="value",
    title='NYC TAXI TRIPS - UNSUPERVISED ANOMALY DETECTION',
    template='plotly_dark',
)

# Select the rows flagged as outliers once, then take both the dates and the
# y values from that same frame. This replaces a per-timestamp
# `.loc[i]['value']` lookup loop and keeps the two sequences aligned.
anomalies = iforest_results[iforest_results['Anomaly'] == 1]

# create list of outlier_dates
outlier_dates = anomalies.index

# obtain y value of anomalies to plot (kept as a plain list, as before)
y_values = anomalies['value'].tolist()

# Overlay the anomalies as red markers on top of the line.
fig.add_trace(
    go.Scatter(
        x=outlier_dates,
        y=y_values,
        mode='markers',
        name='Anomaly',
        marker=dict(color='red', size=10),
    )
)

fig.show()