> https://www.kaggle.com/gauravduttakiit/isolation-forest-unsupervised-anomaly-detection

> https://www.kaggle.com/gauravduttakiit/clustering-based-local-outlier-anomaly-detection

In [1]:
import pandas as pd
# Download the NYC taxi CSV from the NAB GitHub repository and parse its
# 'timestamp' column with pd.to_datetime, then preview the first rows.
data = pd.read_csv(
    'https://raw.githubusercontent.com/numenta/NAB/master/data/realKnownCause/nyc_taxi.csv'
).assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
data.head()

Unnamed: 0,timestamp,value
0,2014-07-01 00:00:00,10844
1,2014-07-01 00:30:00,8127
2,2014-07-01 01:00:00,6210
3,2014-07-01 01:30:00,4656
4,2014-07-01 02:00:00,3820


In [2]:
# Preview the first 20 rows of the loaded frame (timestamp and value columns).
data.head(20)

Unnamed: 0,timestamp,value
0,2014-07-01 00:00:00,10844
1,2014-07-01 00:30:00,8127
2,2014-07-01 01:00:00,6210
3,2014-07-01 01:30:00,4656
4,2014-07-01 02:00:00,3820
5,2014-07-01 02:30:00,2873
6,2014-07-01 03:00:00,2369
7,2014-07-01 03:30:00,2064
8,2014-07-01 04:00:00,2221
9,2014-07-01 04:30:00,2158


In [3]:
# Show the first three rows transposed, so each column becomes a row.
data.head(3).T

Unnamed: 0,0,1,2
timestamp,2014-07-01 00:00:00,2014-07-01 00:30:00,2014-07-01 01:00:00
value,10844,8127,6210


In [4]:
#

In [5]:
# Add rolling-mean columns of the 'value' series.
# Window sizes are counted in rows; the first (window - 1) rows of each
# column are NaN because the window is not yet full.
rolling_windows = {'MA60': 60, 'MA365': 365}
for column, window in rolling_windows.items():
    data[column] = data['value'].rolling(window=window).mean()
data.tail()

Unnamed: 0,timestamp,value,MA60,MA365
10315,2015-01-31 21:30:00,24670,19638.816667,13308.419178
10316,2015-01-31 22:00:00,25721,19807.8,13361.350685
10317,2015-01-31 22:30:00,27309,20017.633333,13415.520548
10318,2015-01-31 23:00:00,26591,20168.0,13460.742466
10319,2015-01-31 23:30:00,26288,20255.916667,13501.473973


In [6]:
# 

In [7]:

# Line chart of the raw series together with both moving averages.
import plotly.express as px

series_to_plot = ['value', 'MA60', 'MA365']
fig = px.line(
    data,
    x="timestamp",
    y=series_to_plot,
    title='NYC Taxi Trips',
    template='plotly_dark',
)
fig.show()


In [8]:
# Remove the moving-average helper columns in place, leaving only the
# original columns, then preview the result.
data.drop(columns=['MA60', 'MA365'], inplace=True)
data.head()

Unnamed: 0,timestamp,value
0,2014-07-01 00:00:00,10844
1,2014-07-01 00:30:00,8127
2,2014-07-01 01:00:00,6210
3,2014-07-01 01:30:00,4656
4,2014-07-01 02:00:00,3820


In [9]:
# Promote 'timestamp' to the index in place; the column itself is removed
# from the frame (set_index drops it by default).
data.set_index('timestamp', inplace=True)
data.head()

Unnamed: 0_level_0,value
timestamp,Unnamed: 1_level_1
2014-07-01 00:00:00,10844
2014-07-01 00:30:00,8127
2014-07-01 01:00:00,6210
2014-07-01 01:30:00,4656
2014-07-01 02:00:00,3820


In [10]:
# Resample the time series to hourly buckets, summing the readings that fall
# into each hour (the source rows are 30 minutes apart, so two per bucket).
# The lowercase 'h' alias is used because the uppercase 'H' spelling is
# deprecated since pandas 2.2 and emits a FutureWarning.
data = data.resample('h').sum()
data.head()


Unnamed: 0_level_0,value
timestamp,Unnamed: 1_level_1
2014-07-01 00:00:00,18971
2014-07-01 01:00:00,10866
2014-07-01 02:00:00,6693
2014-07-01 03:00:00,4433
2014-07-01 04:00:00,4379


In [11]:
# Create calendar features from the datetime index.
# Each entry maps a new column name to a function applied to every index
# timestamp; dict order determines the order in which columns are appended.
# NOTE: despite its name, 'is_weekday' stores the ISO weekday number
# (1 = Monday ... 7 = Sunday), not a boolean flag.
feature_extractors = {
    'day': lambda ts: ts.day,
    'day_name': lambda ts: ts.day_name(),
    'day_of_year': lambda ts: ts.dayofyear,
    'week_of_year': lambda ts: ts.weekofyear,
    'hour': lambda ts: ts.hour,
    'is_weekday': lambda ts: ts.isoweekday(),
}
for column, extract in feature_extractors.items():
    data[column] = [extract(ts) for ts in data.index]
data.head()

Unnamed: 0_level_0,value,day,day_name,day_of_year,week_of_year,hour,is_weekday
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014-07-01 00:00:00,18971,1,Tuesday,182,27,0,2
2014-07-01 01:00:00,10866,1,Tuesday,182,27,1,2
2014-07-01 02:00:00,6693,1,Tuesday,182,27,2,2
2014-07-01 03:00:00,4433,1,Tuesday,182,27,3,2
2014-07-01 04:00:00,4379,1,Tuesday,182,27,4,2


In [None]:
# install slim version (default)
# !pip install pycaret

In [13]:
# Initialise the PyCaret anomaly-detection experiment.
from pycaret.anomaly import *

# The list given for an ordinal feature defines the rank of each category, so
# the day names must be in calendar order. The previous list had 'Sunday'
# before 'Saturday', which swapped the ranks of the two weekend days. The
# order below (Monday..Sunday) also matches the ISO weekday numbering stored
# in the 'is_weekday' column.
# session_id fixes the random seed so runs are repeatable.
# 'is_weekday' is forced to numeric so it is not inferred as categorical.
s = setup(
    data,
    session_id=42,
    ordinal_features={
        'day_name': [
            'Monday',
            'Tuesday',
            'Wednesday',
            'Thursday',
            'Friday',
            'Saturday',
            'Sunday',
        ],
    },
    numeric_features=['is_weekday'],
)

IntProgress(value=0, description='Processing: ', max=3)

Text(value="Following data types have been inferred automatically, if they are correct press enter to continue…

Unnamed: 0,Data Type
value,Numeric
day,Numeric
day_name,Categorical
day_of_year,Numeric
week_of_year,Numeric
hour,Numeric
is_weekday,Numeric


KeyboardInterrupt: Interrupted by user

In [None]:
# List the models available in the PyCaret anomaly module
# (models() comes from the `from pycaret.anomaly import *` wildcard import).
models()

In [None]:
# Train the model registered under the id 'iforest'
# (presumably Isolation Forest -- confirm with models()).
iforest = create_model('iforest')
# Attach the trained model's output to the data; later cells read the
# 'Anomaly' column (1 = flagged) and the original 'value' column from it.
iforest_results = assign_model(iforest)
iforest_results.head()

In [None]:
# Preview the rows the model flagged as anomalous (Anomaly == 1).
anomaly_mask = iforest_results['Anomaly'] == 1
iforest_results.loc[anomaly_mask].head()

In [None]:
import plotly.graph_objects as go

# Base trace: the 'value' series on the y-axis against the datetime index.
# `px` (plotly.express) is imported in an earlier cell.
fig = px.line(
    iforest_results,
    x=iforest_results.index,
    y="value",
    title='NYC TAXI TRIPS - UNSUPERVISED ANOMALY DETECTION',
    template='plotly_dark',
)

# Select the flagged rows once and read both the dates and the y values from
# that single selection. This replaces a per-row `.loc[i]['value']` lookup in
# a Python loop, which was slow and would have returned a Series instead of a
# scalar for any duplicated index label.
anomalies = iforest_results[iforest_results['Anomaly'] == 1]
outlier_dates = anomalies.index
y_values = anomalies['value'].tolist()

# Overlay the anomalies as red markers on top of the line.
fig.add_trace(
    go.Scatter(
        x=outlier_dates,
        y=y_values,
        mode='markers',
        name='Anomaly',
        marker=dict(color='red', size=5),
    )
)

fig.show()

In [None]:
# Render PyCaret's default plot for the trained model (no `plot` argument
# is given, so the library's default plot type is used).
plot_model(iforest)

In [None]:
# Render the 'umap' plot type for the same trained model.
plot_model(iforest, plot = 'umap')