In [1]:
import pandas as pd

In [3]:
# Versions of the outage data, each paired with the short label used to refer
# to it in the rest of the notebook.
snapshot_files = [
    ('feb', "../data_sacheen/azure_aws_failure_descriptions.parquet"),
    ('mar3', "../data_sacheen/azure_aws_gcp_03_03.parquet"),
    ('mar24', "../data_sacheen/cloud_data_with_gcp_loc_march_24.parquet"),
]
datasets = {label: pd.read_parquet(path) for label, path in snapshot_files}

## My deduplication results

The `orig` set is the dataset with AWS and Azure, reindexed, with GCP appended:

In [4]:
# Load the combined outage table, then show the number of rows per vendor.
orig = pd.read_parquet("../data/outages.parquet")
orig.groupby(by="vendor").size()

vendor
AWS      456
Azure    287
GCP      355
dtype: int64

Only keep events in 2018 and later:

The deduplication function:

In [6]:
def deduplicate(df, subset=None):
    """Drop rows that describe the same event more than once.

    Rows are treated as duplicates when they agree on every column listed in
    ``subset``. The first occurrence is kept (the ``drop_duplicates``
    default) and the surviving rows keep their original index labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to deduplicate. It is not modified.
    subset : sequence of str, optional
        Columns that together identify an event. When omitted, the columns
        ``service_name``, ``location``, ``event_start_time`` and
        ``event_end_time`` are used.

    Returns
    -------
    pandas.DataFrame
        A new frame with the duplicate rows removed.

    Raises
    ------
    KeyError
        If a column named in ``subset`` is missing from ``df``.
    """
    if subset is None:
        subset = ['service_name', 'location', 'event_start_time', 'event_end_time']
    return df.drop_duplicates(subset=list(subset))

My results after deduplication:

In [14]:
# Run the deduplication over the full table, then recount rows per vendor.
deduped = orig.pipe(deduplicate)
deduped.groupby(by="vendor").size()

vendor
AWS      329
Azure    153
GCP      279
dtype: int64

## Comparison of provided datasets

The set I'm using for `orig` is the same as the February dataset, just with GCP appended:

In [6]:
# Per-vendor row counts of the AWS/Azure part of `orig` ...
core_vendor_rows = orig[orig.vendor.isin(['AWS', 'Azure'])]
orig_counts = core_vendor_rows.groupby('vendor').size()
# ... against the per-vendor row counts of the 'feb' dataset.
feb_counts = datasets['feb'].groupby('vendor').size()
(orig_counts == feb_counts).all()

True

This is how the dataset changed between versions:

In [7]:
def vendor_set(dataset):
    """Return the vendor labels counted by ``dataset.vendor.value_counts()`` as a set."""
    labels = dataset.vendor.value_counts().index
    return set(labels.tolist())


# Every vendor label that occurs in at least one of the three dataset versions.
all_vendors = (
    vendor_set(datasets['feb'])
    | vendor_set(datasets['mar3'])
    | vendor_set(datasets['mar24'])
)

def get_vendor_count(date, vendor):
    """Return the number of rows in ``datasets[date]`` that belong to ``vendor``.

    Parameters
    ----------
    date : str
        Key of the dataset version in the module-level ``datasets`` dict.
    vendor : str
        Vendor label to count.

    Returns
    -------
    int
        The row count, or 0 when ``vendor`` does not occur in that dataset.

    Raises
    ------
    KeyError
        If ``date`` is not a key of ``datasets`` or the frame has no
        ``vendor`` column.
    """
    # Do the dataset lookup and the groupby outside any KeyError handler: only
    # a missing *vendor* should be reported as 0. A misspelled dataset label
    # must fail loudly instead of producing a column of zeros.
    per_vendor = datasets[date].groupby("vendor").size()
    # Series.get returns the default for an absent label; int() gives the same
    # plain-int return type on both paths.
    return int(per_vendor.get(vendor, 0))

# Rows-per-vendor table: one row per vendor label, one column per dataset
# version (in the order the versions appear in `datasets`).
# `all_vendors` is a set, and the iteration order of a set of strings can
# differ from one kernel session to the next, so sort the labels to keep the
# row order reproducible. key=str keeps the sort from failing should the
# labels not all be strings.
vendor_labels = sorted(all_vendors, key=str)
data_counts = pd.DataFrame(
    {date: [get_vendor_count(date, vd) for vd in vendor_labels] for date in datasets},
    index=pd.Index(vendor_labels, name='vendor'),
)

data_counts

Unnamed: 0_level_0,feb,mar3,mar24
vendor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,0,0,1
Azure,287,287,284
GCP,0,364,355
Google Apps,0,1,0
AWS,456,331,325
