# Generating the anomalies  

Now that we have bootstrapped enough sample data, we need to fabricate some anomaly records. Otherwise, all of our data will be based on healthy updates, and we won't have any examples of anomalies on which to train a model.

In [19]:
import os
import requests
import random
import pandas as pd
import datetime
from datetime import date
import numpy as np, numpy.random
import matplotlib
import matplotlib.pyplot as plt
plt.style.use('ggplot')
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import warnings
warnings.filterwarnings('ignore')

## Approximating failure event metadata

To start, we'll take 5% of the rows in our bootstrapped data and artificially shrink or inflate the numbers.  

Anomaly events are updates where:  
- `errors` is more than 2 standard deviations above the mean, or `total_rows` (along with the columns derived from it) is more than 2 standard deviations below the mean
    - It is important to note that some of these records have columns that are dependent on one another. So if we change the number of `total_rows`, then we should adjust the number of `rows_updated` and `rows_created` accordingly.
- The `error_count` is non-zero  

We need to keep in mind that the seasonality we've identified – i.e., the Sunday / Monday effect – is *not* an anomaly. We should look for anomalies on these days too, understanding that the anomalous values could be different than on other weekdays.  


In [20]:
# Read in actuals and bootstrapped data
df = pd.read_csv('./data/revisions_ACTUALS.csv')
df['update_date'] = pd.to_datetime(df['update_date'])

bootstrapped_df = pd.read_csv('./data/revisions_BOOTSTRAPPED.csv')
bootstrapped_df['date'] = pd.to_datetime(bootstrapped_df['date'])

# Generate a sample of records that we'll turn into anomalies.
# `random_state` pins which rows are drawn, so the sample is identical on every run.
anomalies = bootstrapped_df.sample(frac=0.05, random_state=42)
anomalies['is_anom'] = 1
anomalies['anomaly_type'] = None

# Seed NumPy's global generator before the random draws below; without this the
# `rows_created` / `rows_updated` split differs on every Restart & Run All.
np.random.seed(42)

# Ensure that `rows_updated` and `rows_created` of the sampled records sum up to `total_rows`.
# `np.random.randint(0, high)` raises ValueError unless high >= 1, so records with
# fewer than one row (or a missing `total_rows`) get `rows_created = 0` instead.
anomalies['rows_created'] = anomalies['total_rows'].apply(
    lambda x: np.random.randint(0, int(x)) if x >= 1 else 0
)
anomalies['rows_updated'] = anomalies['total_rows'] - anomalies['rows_created']

# anomalies.to_csv('./data/raw_anomalies.csv')

##### Steps taken to generate and format the anomalies

- 10 records: Zeroed out all continuous variables
- 10 records: -1 std dev from mean for  `total_rows`, with `rows_updated`, `rows_created`, and `errors` updated accordingly.
- 10 records: -2 std dev from mean for  `total_rows`, with `rows_updated`, `rows_created`, and `errors` updated accordingly.
    - When values were <=0, I entered very small numbers.
- 10 records: +1 std dev from mean for `errors`
- 9 records: +2 std dev from mean for `errors`

In [21]:
# The descriptive statistics referenced when pre-processing the data for modeling
# should come from the actuals, rather than from the bootstrapped dataset.

cols = ['total_rows', 'rows_updated', 'rows_created', 'errors']
df.loc[:, cols].describe()

Unnamed: 0,total_rows,rows_updated,rows_created,errors
count,20.0,20.0,20.0,20.0
mean,29599.45,9262.05,20323.4,14.0
std,18729.524,5920.923,12877.728,24.251
min,1327.0,0.0,742.0,0.0
25%,2500.25,1475.0,1500.25,0.75
50%,40124.5,12088.0,27841.5,6.0
75%,42772.75,13377.25,29046.25,10.0
max,44016.0,14816.0,30986.0,93.0


In [22]:
np.random.seed(42)

tot_rows_std_dev = df.total_rows.std()
errors_std_dev = df.errors.std()

# Number of sampled records assigned to each anomaly type; the last type takes the remainder.
BUCKET_SIZE = 100


def _draw_rows_created(total_rows):
    """Draw a random `rows_created` in [0, total_rows) for a single record.

    Returns 0 when `total_rows` is below 1 (or missing), because
    `np.random.randint(0, high)` raises ValueError unless high >= 1.
    """
    return np.random.randint(0, int(total_rows)) if total_rows >= 1 else 0


def _shrink_by_std_devs(records, n_std_devs, label):
    """Return a copy of `records` with `total_rows` and `errors` lowered by `n_std_devs` std devs.

    Both columns are floored at 0. `rows_created` is re-drawn at random and
    `rows_updated` is set to the remainder, so the two still sum to `total_rows`.
    `label` is written to `anomaly_type`.
    """
    shrunk = records.copy()
    shrunk['total_rows'] = (shrunk['total_rows'] - tot_rows_std_dev * n_std_devs).clip(lower=0)
    shrunk['rows_created'] = shrunk['total_rows'].apply(_draw_rows_created)
    shrunk['rows_updated'] = shrunk['total_rows'] - shrunk['rows_created']
    # Assuming here that, if `total_rows` goes down by n standard deviations, then the errors will too
    shrunk['errors'] = (shrunk['errors'] - errors_std_dev * n_std_devs).clip(lower=0)
    shrunk['anomaly_type'] = label
    return shrunk


def _inflate_errors(records, min_std_devs, max_std_devs, label):
    """Return a copy of `records` with `errors` raised by a random number of std devs.

    Each record gets its own integer offset drawn from
    [errors_std_dev * min_std_devs, errors_std_dev * max_std_devs).
    `label` is written to `anomaly_type`.
    """
    inflated = records.copy()
    low = int(errors_std_dev * min_std_devs)
    # randint needs low < high; widen a degenerate range (very small std dev) by one.
    high = max(int(errors_std_dev * max_std_devs), low + 1)
    inflated['errors'] = inflated['errors'] + np.random.randint(low, high, size=len(inflated))
    inflated['anomaly_type'] = label
    return inflated


### Zeroing out all continuous variables

# Buckets are positional slices starting at row 0 so that no sampled record is dropped,
# and each is copied so that the edits below never touch `anomalies` itself.
anomaly_1 = anomalies.iloc[0:BUCKET_SIZE].copy()

for col in cols:
    anomaly_1[col] = 0

# Annotate anomalies with an anomaly type description
anomaly_1['anomaly_type'] = "Dataset published no records"


### -1 std dev from mean for `total_rows`, with `rows_updated`, `rows_created`, and `errors`
#                                                                                   updated accordingly.
anomaly_2 = _shrink_by_std_devs(
    anomalies.iloc[BUCKET_SIZE:2 * BUCKET_SIZE],
    1,
    "Minus approx. 1 standard deviation in `total_rows`",
)


### -2 std dev from mean for `total_rows`, with `rows_updated`, `rows_created`, and `errors`
#                                                                                   updated accordingly.
anomaly_3 = _shrink_by_std_devs(
    anomalies.iloc[2 * BUCKET_SIZE:3 * BUCKET_SIZE],
    2,
    "Minus approx. 2 standard deviations in 'total_rows'",
)


### +1 std dev from mean for `errors`

anomaly_4 = _inflate_errors(
    anomalies.iloc[3 * BUCKET_SIZE:4 * BUCKET_SIZE],
    1,
    1.5,
    "Plus approx. 1 standard deviation in 'errors'",
)


### +2 or more std dev from mean for `errors`

anomaly_5 = _inflate_errors(
    anomalies.iloc[4 * BUCKET_SIZE:],
    2,
    3.5,
    "Plus approx. 2 (or more) standard deviation in 'errors'",
)


### Concatenate records
frames = [anomaly_1, anomaly_2, anomaly_3, anomaly_4, anomaly_5]

anomalies_clean = pd.concat(frames)
# anomalies_clean.to_csv('./data/formatted_anomalies.csv', index=False)

In [23]:
# Fix schema of bootstrapped dataset so it can be merged with anomalies, and merge

# Index both frames by the parsed `date`. Passing a Series (rather than the column
# name) to `set_index` keeps `date` available as a regular column as well.
anomalies_clean = anomalies_clean.set_index(pd.to_datetime(anomalies_clean['date']))

bootstrapped_df = bootstrapped_df.set_index(pd.to_datetime(bootstrapped_df['date']))
# Every bootstrapped record is a healthy update, so flag it as a non-anomaly.
bootstrapped_df['is_anom'] = 0
bootstrapped_df['is_anom'] = bootstrapped_df['is_anom'].astype('int64')
# Take a real copy: plain assignment would only alias `bootstrapped_df`, so a later
# edit to either name would silently change both.
bootstrapped_df_copy = bootstrapped_df.copy()

# The bootstrapped frame has no `anomaly_type` column; concat fills it with NaN for those rows.
merged = pd.concat([bootstrapped_df_copy, anomalies_clean])

# Sanity check: exactly the fabricated records are flagged as anomalies.
assert merged['is_anom'].sum() == len(anomalies_clean)

merged.head()

Unnamed: 0_level_0,total_rows,rows_updated,rows_created,errors,date,day_of_week,axis_label,is_anom,anomaly_type
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2024-02-23,31814.0,3210.0,28604,12.0,2024-02-23,Friday,Friday 2024-02-23,0,
2024-02-22,32641.0,7499.0,25142,28.0,2024-02-22,Thursday,Thursday 2024-02-22,0,
2024-02-21,27080.0,17965.0,9115,5.0,2024-02-21,Wednesday,Wednesday 2024-02-21,0,
2024-02-20,33074.0,2722.0,30352,8.0,2024-02-20,Tuesday,Tuesday 2024-02-20,0,
2024-02-19,2092.0,53.0,2039,11.0,2024-02-19,Monday,Monday 2024-02-19,0,


In [24]:
# Persist the combined healthy + anomaly records; the index is not written to the file.
merged.to_csv(path_or_buf='./data/merged.csv', index=False)