In [29]:
# Prerequisite (run once, not part of the analysis): install PyCaret into the
# kernel's environment so that the `pycaret.anomaly` import below succeeds.
#pip install pycaret

In [30]:
import pandas as pd
import plotly.express as px
from pycaret.anomaly import *

In [31]:
# Load the NYC taxi CSV from the NAB repository and convert the 'timestamp'
# column from text to datetimes, then preview the first rows.
NYC_TAXI_CSV = 'https://raw.githubusercontent.com/numenta/NAB/master/data/realKnownCause/nyc_taxi.csv'
data = pd.read_csv(NYC_TAXI_CSV).assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp'])
)
data.head()

Unnamed: 0,timestamp,value
0,2014-07-01 00:00:00,10844
1,2014-07-01 00:30:00,8127
2,2014-07-01 01:00:00,6210
3,2014-07-01 01:30:00,4656
4,2014-07-01 02:00:00,3820


In [32]:
# Add trailing rolling means of 'value' as columns 'MA48' and 'MA336'.
# The rows are half-hourly, so 48 rows span one day and 336 rows span one week.
# Each column starts with NaN until its first full window is available.
for window in (48, 336):
    data[f'MA{window}'] = data['value'].rolling(window).mean()

In [33]:
# Row counts of the two moving-average columns, in (MA48, MA336) order
tuple(len(data[col]) for col in ('MA48', 'MA336'))

(10320, 10320)

In [34]:
# Inspect the 48-row moving average (leading entries are NaN)
data.loc[:, 'MA48']

0                 NaN
1                 NaN
2                 NaN
3                 NaN
4                 NaN
             ...     
10315    18715.645833
10316    18694.437500
10317    18703.500000
10318    18696.479167
10319    18702.479167
Name: MA48, Length: 10320, dtype: float64

In [35]:
# Number of rows in the frame. All columns of a DataFrame share one index, so
# 'timestamp', 'value', 'MA48' and 'MA336' already have identical lengths.
# The former `data = data[:length]` sliced the frame to its own length, which
# changed nothing, so it has been removed.
length = len(data)

In [36]:
# Line chart of the raw series overlaid with both moving averages
series_to_plot = ['value', 'MA48', 'MA336']
fig = px.line(
    data,
    x="timestamp",
    y=series_to_plot,
    title='NYC Taxi Trips',
    template='plotly_dark',
)
fig.show()

In [37]:
# Drop the moving-average columns; they were only needed for the chart.
# drop() returns a new frame instead of mutating in place, and errors='ignore'
# lets this cell be re-run after the columns are already gone (the in-place
# version raised KeyError on a second run).
data = data.drop(columns=['MA48', 'MA336'], errors='ignore')
data

Unnamed: 0,timestamp,value
0,2014-07-01 00:00:00,10844
1,2014-07-01 00:30:00,8127
2,2014-07-01 01:00:00,6210
3,2014-07-01 01:30:00,4656
4,2014-07-01 02:00:00,3820
...,...,...
10315,2015-01-31 21:30:00,24670
10316,2015-01-31 22:00:00,25721
10317,2015-01-31 22:30:00,27309
10318,2015-01-31 23:00:00,26591


In [38]:
# Promote 'timestamp' from a regular column to the frame's index
# (the column itself is removed, which is set_index's default behaviour)
data = data.set_index(keys='timestamp')
data.head()

Unnamed: 0_level_0,value
timestamp,Unnamed: 1_level_1
2014-07-01 00:00:00,10844
2014-07-01 00:30:00,8127
2014-07-01 01:00:00,6210
2014-07-01 01:30:00,4656
2014-07-01 02:00:00,3820


In [39]:
# Keep only the rows timestamped on or after 1 January 2015
# (label slicing on the datetime index includes the start bound)
specific_time = pd.Timestamp(year=2015, month=1, day=1)
data = data.loc[specific_time:, :]

In [40]:
# Aggregate the half-hourly rows into hourly totals (sum of each hour's rows).
# The rule is given as a Timedelta rather than the 'H' string alias: 'H' is
# deprecated in recent pandas releases, while a one-hour Timedelta resolves to
# the same hourly offset on every version.
data = data.resample(pd.Timedelta(hours=1)).sum()
data

Unnamed: 0_level_0,value
timestamp,Unnamed: 1_level_1
2015-01-01 00:00:00,51700
2015-01-01 01:00:00,58584
2015-01-01 02:00:00,51507
2015-01-01 03:00:00,44134
2015-01-01 04:00:00,30799
...,...
2015-01-31 19:00:00,56577
2015-01-31 20:00:00,48276
2015-01-31 21:00:00,48389
2015-01-31 22:00:00,53030


In [41]:
# Create calendar features from the timestamp index.
# Each entry maps an output column to the value taken from one index timestamp.
# NOTE: despite its name, 'is_weekday' is not a boolean flag. It stores the
# ISO weekday number from isoweekday() (Monday=1 ... Sunday=7).
timestamp_features = {
    'day': lambda ts: ts.day,
    'day_name': lambda ts: ts.day_name(),
    'day_of_year': lambda ts: ts.dayofyear,
    'week_of_year': lambda ts: ts.weekofyear,
    'hour': lambda ts: ts.hour,
    'is_weekday': lambda ts: ts.isoweekday(),
}
for column, extract in timestamp_features.items():
    data[column] = [extract(ts) for ts in data.index]
data.head()

Unnamed: 0_level_0,value,day,day_name,day_of_year,week_of_year,hour,is_weekday
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-01-01 00:00:00,51700,1,Thursday,1,1,0,4
2015-01-01 01:00:00,58584,1,Thursday,1,1,1,4
2015-01-01 02:00:00,51507,1,Thursday,1,1,2,4
2015-01-01 03:00:00,44134,1,Thursday,1,1,3,4
2015-01-01 04:00:00,30799,1,Thursday,1,1,4,4


In [42]:
# Initialise the PyCaret anomaly-detection experiment on the feature frame.
# session_id is pinned so repeated runs use the same session
# (presumably the random seed; confirm against the PyCaret docs).
s = setup(data=data, session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Original data shape,"(744, 7)"
2,Transformed data shape,"(744, 20)"
3,Numeric features,6
4,Categorical features,1
5,Preprocess,True
6,Imputation type,simple
7,Numeric imputation,mean
8,Categorical imputation,mode
9,Maximum one-hot encoding,-1


In [43]:
# check list of available models
# (the ID column holds the string passed to create_model, e.g. 'iforest')
models()

Unnamed: 0_level_0,Name,Reference
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
abod,Angle-base Outlier Detection,pyod.models.abod.ABOD
cluster,Clustering-Based Local Outlier,pyod.models.cblof.CBLOF
cof,Connectivity-Based Local Outlier,pyod.models.cof.COF
iforest,Isolation Forest,pyod.models.iforest.IForest
histogram,Histogram-based Outlier Detection,pyod.models.hbos.HBOS
knn,K-Nearest Neighbors Detector,pyod.models.knn.KNN
lof,Local Outlier Factor,pyod.models.lof.LOF
svm,One-class SVM detector,pyod.models.ocsvm.OCSVM
pca,Principal Component Analysis,pyod.models.pca.PCA
mcd,Minimum Covariance Determinant,pyod.models.mcd.MCD


In [44]:
# Fit an Isolation Forest with fraction=0.1, then label the data.
# assign_model returns the frame with 'Anomaly' and 'Anomaly_Score' appended.
# NOTE(review): fraction is taken to be the expected share of outliers;
# confirm against the PyCaret docs.
iforest = create_model(model='iforest', fraction=0.1)
iforest_results = assign_model(model=iforest)
iforest_results

Unnamed: 0_level_0,value,day,day_name,day_of_year,week_of_year,hour,is_weekday,Anomaly,Anomaly_Score
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015-01-01 00:00:00,51700,1,Thursday,1,1,0,4,1,0.023103
2015-01-01 01:00:00,58584,1,Thursday,1,1,1,4,1,0.022519
2015-01-01 02:00:00,51507,1,Thursday,1,1,2,4,1,0.012140
2015-01-01 03:00:00,44134,1,Thursday,1,1,3,4,0,-0.001748
2015-01-01 04:00:00,30799,1,Thursday,1,1,4,4,0,-0.020535
...,...,...,...,...,...,...,...,...,...
2015-01-31 19:00:00,56577,31,Saturday,31,5,19,6,1,0.008566
2015-01-31 20:00:00,48276,31,Saturday,31,5,20,6,0,-0.006207
2015-01-31 21:00:00,48389,31,Saturday,31,5,21,6,0,-0.002989
2015-01-31 22:00:00,53030,31,Saturday,31,5,22,6,1,0.014720


In [45]:
# Show only the rows the model flagged as anomalous (Anomaly == 1)
iforest_results.loc[iforest_results['Anomaly'].eq(1)]

Unnamed: 0_level_0,value,day,day_name,day_of_year,week_of_year,hour,is_weekday,Anomaly,Anomaly_Score
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015-01-01 00:00:00,51700,1,Thursday,1,1,0,4,1,0.023103
2015-01-01 01:00:00,58584,1,Thursday,1,1,1,4,1,0.022519
2015-01-01 02:00:00,51507,1,Thursday,1,1,2,4,1,0.012140
2015-01-01 23:00:00,20620,1,Thursday,1,1,23,4,1,0.000070
2015-01-02 03:00:00,5090,2,Friday,2,1,3,5,1,0.000168
...,...,...,...,...,...,...,...,...,...
2015-01-31 07:00:00,11852,31,Saturday,31,5,7,6,1,0.002077
2015-01-31 18:00:00,53330,31,Saturday,31,5,18,6,1,0.003226
2015-01-31 19:00:00,56577,31,Saturday,31,5,19,6,1,0.008566
2015-01-31 22:00:00,53030,31,Saturday,31,5,22,6,1,0.014720


In [46]:
import plotly.graph_objects as go

# Line chart of the hourly series: timestamp index on the x-axis, value on the y-axis
fig = px.line(iforest_results, x=iforest_results.index, y="value", title='NYC TAXI TRIPS - UNSUPERVISED ANOMALY DETECTION', template = 'plotly_dark')

# Filter the flagged rows once and take both marker coordinates from that
# slice. The earlier per-timestamp `iforest_results.loc[i]['value']` lookup
# built a whole row Series for every anomaly, and would have returned a Series
# rather than a scalar if an index label were ever duplicated.
anomalies = iforest_results[iforest_results['Anomaly'] == 1]
outlier_dates = anomalies.index
y_values = anomalies['value'].tolist()

# Overlay the anomalies as red markers on top of the line
fig.add_trace(go.Scatter(
    x=outlier_dates,
    y=y_values,
    mode='markers',
    name='Anomaly',
    marker=dict(color='red', size=10),
))

fig.show()

In [47]:
import time
from datetime import datetime

# Get the current time in seconds since the epoch
# (time.time() returns a float, so sub-second precision is kept)
current_time = time.time()

In [50]:
# Convert the epoch seconds to a local datetime, then format it as a
# 'YYYY-MM-DD HH:MM:SS' string.
# NOTE: despite its name, datetime_obj holds that formatted str, not a
# datetime instance.
local_moment = datetime.fromtimestamp(current_time)
datetime_obj = local_moment.strftime('%Y-%m-%d %H:%M:%S')




In [51]:
# Display the formatted timestamp string
datetime_obj

'2023-06-05 09:58:24'

In [None]:
# Produce the final 'YYYY-MM-DD HH:MM:SS' string.
# datetime_obj is already a formatted str (it was created with .strftime()),
# so calling .strftime() on it unconditionally would raise
# AttributeError: 'str' object has no attribute 'strftime'.
# Format only when it is still a datetime; otherwise reuse the string as-is.
if isinstance(datetime_obj, str):
    formatted_time = datetime_obj
else:
    formatted_time = datetime_obj.strftime('%Y-%m-%d %H:%M:%S')

print(formatted_time)