## Anomaly detection on COVID timeseries
- option 1: anomaly based solely on the death timeseries
- option 2: anomaly based on the relationship between cases/deaths/tests/mobility/etc.


In [38]:
import numpy as np
import pandas as pd

from sklearn.ensemble import IsolationForest

import plotly.express as px

In [3]:
# Load the JHU CSSE US deaths time series (wide layout: one column per date).
df = pd.read_csv(
    'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/'
    'csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv'
)

In [4]:
# Preview the first five rows: identifier columns followed by one column per date.
df.head(n=5)

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,4/28/20,4/29/20,4/30/20,5/1/20,5/2/20,5/3/20,5/4/20,5/5/20,5/6/20,5/7/20
0,16,AS,ASM,16,60.0,,American Samoa,US,-14.271,-170.132,...,0,0,0,0,0,0,0,0,0,0
1,316,GU,GUM,316,66.0,,Guam,US,13.4443,144.7937,...,5,5,5,5,5,5,5,5,5,5
2,580,MP,MNP,580,69.0,,Northern Mariana Islands,US,15.0979,145.6739,...,2,2,2,2,2,2,2,2,2,2
3,630,PR,PRI,630,72.0,,Puerto Rico,US,18.2208,-66.5901,...,86,86,92,94,95,97,97,99,99,102
4,850,VI,VIR,850,78.0,,Virgin Islands,US,18.3358,-64.8963,...,4,4,4,4,4,4,4,4,4,4


In [10]:
# Select the per-day death-count columns by their header pattern (m/d/yy)
# instead of by position. A positional slice (df.columns[12:]) silently breaks
# if the upstream file gains or loses a metadata column, and it would also pull
# in the 'anomaly' label column if this cell is re-run after that column has
# been added to df.
is_date_col = df.columns.str.match(r'\d{1,2}/\d{1,2}/\d{2}$')
cols = df.columns[is_date_col]

# Feature matrix for the first model: one row per location, one feature per day.
X_train = df[cols]
X_train.head()

Unnamed: 0,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,...,4/28/20,4/29/20,4/30/20,5/1/20,5/2/20,5/3/20,5/4/20,5/5/20,5/6/20,5/7/20
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,5,5,5,5,5,5,5,5,5,5
2,0,0,0,0,0,0,0,0,0,0,...,2,2,2,2,2,2,2,2,2,2
3,0,0,0,0,0,0,0,0,0,0,...,86,86,92,94,95,97,97,99,99,102
4,0,0,0,0,0,0,0,0,0,0,...,4,4,4,4,4,4,4,4,4,4


In [13]:
# Fit an isolation forest on the wide matrix and label every training row
# in one step (1 = inlier, -1 = outlier).
rng = np.random.RandomState(42)
clf = IsolationForest(max_samples=100, random_state=rng)
y_pred_train = clf.fit_predict(X_train)

In [18]:
# Store the per-row labels next to the location metadata, then summarise them.
df['anomaly'] = y_pred_train
df['anomaly'].describe()

count    3261.000000
mean        0.747317
std         0.664570
min        -1.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: anomaly, dtype: float64

In [21]:
# State/territory of every row the forest labelled as an outlier (-1).
is_outlier = df['anomaly'].eq(-1)
df.loc[is_outlier, 'Province_State']

1                 Guam
3          Puerto Rico
13             Alabama
32             Alabama
41             Alabama
             ...      
3240      Rhode Island
3246           Vermont
3247          Virginia
3252    Grand Princess
3253          Michigan
Name: Province_State, Length: 412, dtype: object

In [23]:
# Display the column labels of df.
df.columns

Index(['UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2', 'Province_State',
       'Country_Region', 'Lat', 'Long_',
       ...
       '4/29/20', '4/30/20', '5/1/20', '5/2/20', '5/3/20', '5/4/20', '5/5/20',
       '5/6/20', '5/7/20', 'anomaly'],
      dtype='object', length=120)

In [74]:
# Reshape the wide table (one column per day) into long format: one row per
# (location, date) holding the cumulative death count for that day.
id_vars = ['UID', 'FIPS', 'Admin2', 'Province_State']
df_long = pd.melt(df, id_vars=id_vars, value_vars=cols, var_name='date', value_name='deaths')

# Column headers look like '1/22/20'; an explicit format avoids slow,
# per-value format inference and any month/day ambiguity.
df_long['date_time'] = pd.to_datetime(df_long['date'], format='%m/%d/%y')

# Nanoseconds since the epoch. Request int64 explicitly: plain 'int' is
# platform-dependent and, where it means int32, the cast from datetime64 fails.
df_long['date_time_int'] = df_long['date_time'].astype('int64')

# Previous day's cumulative count within each location. melt() keeps the date
# columns in their original order, so rows are chronological inside each UID.
# Shifting only the 'deaths' column avoids shifting every column of the frame.
df_long['deaths_lag'] = df_long.groupby('UID')['deaths'].shift(1)

# Daily (incident) deaths = today's cumulative count minus yesterday's.
# The first date of each location has no previous day and stays NaN.
df_long['deaths_incident'] = df_long['deaths'] - df_long['deaths_lag']
df_long.head()

Unnamed: 0,UID,FIPS,Admin2,Province_State,date,deaths,date_time,date_time_int,deaths_lag,deaths_incident
0,16,60.0,,American Samoa,1/22/20,0,2020-01-22,1579651200000000000,,
1,316,66.0,,Guam,1/22/20,0,2020-01-22,1579651200000000000,,
2,580,69.0,,Northern Mariana Islands,1/22/20,0,2020-01-22,1579651200000000000,,
3,630,72.0,,Puerto Rico,1/22/20,0,2020-01-22,1579651200000000000,,
4,850,78.0,,Virgin Islands,1/22/20,0,2020-01-22,1579651200000000000,,


In [75]:
# Pick the single county whose daily series is modelled on its own.
state = 'New York'
county = 'Nassau'  # alternative: 'New York'
in_county = (df_long['Province_State'] == state) & (df_long['Admin2'] == county)

# Drop only the rows that have no daily increment (the first date has no
# previous day). A blanket dropna() would also discard rows with a missing
# value in an unrelated identifier column such as FIPS, which can empty the
# frame for some locations. copy() makes df_train an independent frame so
# columns can be added to it later without touching df_long.
df_train = df_long.loc[in_county].dropna(subset=['deaths_incident']).copy()
df_train.head()

Unnamed: 0,UID,FIPS,Admin2,Province_State,date,deaths,date_time,date_time_int,deaths_lag,deaths_incident
5123,84036059,36059.0,Nassau,New York,1/23/20,0,2020-01-23,1579737600000000000,0.0,0.0
8384,84036059,36059.0,Nassau,New York,1/24/20,0,2020-01-24,1579824000000000000,0.0,0.0
11645,84036059,36059.0,Nassau,New York,1/25/20,0,2020-01-25,1579910400000000000,0.0,0.0
14906,84036059,36059.0,Nassau,New York,1/26/20,0,2020-01-26,1579996800000000000,0.0,0.0
18167,84036059,36059.0,Nassau,New York,1/27/20,0,2020-01-27,1580083200000000000,0.0,0.0


In [70]:
# Per-county model: look for unusual (date, daily deaths) points in one series.
X_train = df_train[['date_time_int', 'deaths_incident']]

# Seed with a plain integer. Passing the notebook-level `rng` RandomState here
# made the labels depend on how many fits had already drawn from it, so the
# result changed with execution order and on every re-run of this cell.
clf = IsolationForest(max_samples=100, random_state=42, contamination=0.1)
clf.fit(X_train)
y_pred_train = clf.predict(X_train)

# assign() returns a new frame, so the label column is never written into a
# slice of df_long (no SettingWithCopyWarning) and the cell is safe to re-run.
df_train = df_train.assign(anomaly=y_pred_train)

In [71]:
# Summary statistics of the per-county labels (values are 1 or -1).
df_train['anomaly'].describe()

count    106.000000
mean       0.792453
std        0.612831
min       -1.000000
25%        1.000000
50%        1.000000
75%        1.000000
max        1.000000
Name: anomaly, dtype: float64

In [72]:
# Daily deaths over time, coloured by the isolation-forest label.
# - x uses the parsed 'date_time' column so the axis is a real time axis; the
#   raw 'date' strings ('1/23/20') would be treated as categories.
# - the numeric 1 / -1 labels are mapped to strings so plotly draws two
#   discrete colours instead of a continuous colour bar.
plot_df = df_train.assign(
    label=df_train['anomaly'].map({1: 'normal', -1: 'anomaly'})
)
fig = px.scatter(
    plot_df,
    x='date_time',
    y='deaths_incident',
    color='label',
    labels={'date_time': 'date', 'deaths_incident': 'daily deaths', 'label': 'isolation forest'},
    title='Daily deaths with isolation-forest anomaly labels',
)
fig.show()

In [19]:
# Leftover interactive help lookup (`IsolationForest?`) removed: it is
# IPython-only syntax and produces no result. See the scikit-learn
# IsolationForest API reference for parameter documentation.