In [4]:
import warnings
# Silence every warning for the rest of the session; added because of warnings
# about glyphs missing from the font.
# NOTE(review): a blanket 'ignore' filter also hides deprecation warnings from
# libraries used further down — consider narrowing it.
warnings.filterwarnings('ignore') # I was getting some warnings about missing glyphs in the font

In [5]:
# Maps each `location` label to a time zone name understood by pandas.
# Built once when the cell runs instead of once per row.
# Entries that used the fixed-offset "EST" zone (no daylight saving) now use
# "US/Eastern", and Ireland uses "Europe/Dublin" instead of "GMT", so that
# summer events get the correct local hour.
# NOTE(review): "West US 2" maps to US/Central while the sibling "West US"
# entry maps to US/Pacific — confirm the intended zone for that and the other
# multi-region labels.
_LOCATION_TIMEZONES = {
    "N. Virginia":  "US/Eastern",
    "Ireland":  "Europe/Dublin",
    "Global":  "GMT",
    "N. California":  "US/Pacific",
    "Sydney":  "Australia/Sydney",
    "Oregon":  "US/Pacific",
    "GovCloud":  "US/Pacific",
    "London":  "Europe/London",
    "Ohio":  "US/Eastern",
    "Sao Paulo":  "America/Sao_Paulo",
    "Tokyo":  "Asia/Tokyo",
    "Mumbai":  "Asia/Kolkata",
    "Singapore":  "Asia/Singapore",
    "Seoul":  "Asia/Seoul",
    "Frankfurt":  "Europe/Berlin",
    "Paris":  "Europe/Paris",
    "US-West":  "US/Pacific",
    "East US":  "US/Eastern",
    "global":  "GMT",
    "RCA – Resources using IPv4 addressing – West and South India":  "Asia/Kolkata",
    "West Europe":  "Europe/Amsterdam",
    "West US":  "US/Pacific",
    "Australia East":  "Australia/Sydney",  # based on https://azure.microsoft.com/en-us/global-infrastructure/regions/
    "North Central US":  "America/Chicago",  # based on https://azure.microsoft.com/en-us/global-infrastructure/regions/
    "North Europe and West Europe":  "Europe/Berlin",
    "UK West":  "Europe/London",
    "South Central US":  "US/Central",
    "RCA – Storage – West US":  "US/Pacific",
    "West India and South India":  "Asia/Kolkata",
    "Latency between North Europe and North America":  "GMT",
    "France Central":  "Europe/Paris",
    "East Asia":  "Asia/Hong_Kong",  # based on https://azure.microsoft.com/en-us/global-infrastructure/regions/
    "Australia Southeast":  "Australia/Melbourne",  # based on https://azure.microsoft.com/en-us/global-infrastructure/regions/
    "Korea South":  "Asia/Seoul",
    "Southeast Asia":  "Asia/Singapore",
    "West Europe and North Europe":  "Europe/Berlin",
    "Latency and Slow I/O issues in East US":  "US/Eastern",
    "Networking in West US":  "US/Pacific",
    "UK South":  "Europe/London",
    "West US 2":  "US/Central",
    "East US and West US":  "US/Central",
    "UK South and UK West":  "Europe/London",
    "North Europe":  "Europe/Berlin",
    "UK South/UK West":  "Europe/London",
    "West Central US":  "US/Central",
    "West Europe | Mitigated":  "Europe/Amsterdam",
    "Data Processing in East US":  "US/Eastern",
    "Australia East/Southeast":  "Australia/Melbourne",
    "Canada Central":  "Canada/Central",
    "Japan East":  "Japan",
    "Multiple Azure Services impacted in West Europe":  "Europe/Amsterdam",
    "Service availability issue in North Europe":  "Europe/Berlin",
    "Service Availability Issue in North Europe":  "Europe/Berlin",
    "South East Asia":  "Asia/Singapore",
    "us-central1": "US/Central", # Iowa
    "us-east4": "US/Eastern", # Northern Virginia
    "us-east1": "US/Eastern", # South Carolina
    "europe-west2": "Europe/London", # London
    "europe-west1": "Europe/Brussels", # St Ghislain
    "europe-west4": "Europe/Amsterdam", # Eemshaven
    "asia-southeast1": "Asia/Singapore", # Jurong West
    "europe-north1": "Europe/Helsinki", # Hamina
    "us-west1": "US/Pacific", # Oregon
    "Canada": "Canada/Eastern" # GCP region in Montreal
}


def add_hour_of_week_and_year(row):
    """Add local-time ``hour_of_week`` and ``year`` fields to one outage row.

    The row's ``event_start_time`` is converted to the time zone configured
    for its ``location`` and two fields are written back onto the row:

    * ``hour_of_week`` -- ``dayofweek * 24 + hour`` of the local start time,
      i.e. 0 for Monday 00:xx up to 167 for Sunday 23:xx.
    * ``year`` -- calendar year of the local start time.

    Args:
        row: mapping-like object (e.g. a row passed by ``DataFrame.apply``)
            with a ``location`` label and a timezone-aware
            ``event_start_time`` timestamp.

    Returns:
        The same ``row`` object, modified in place.

    Raises:
        KeyError: if ``row['location']`` has no configured time zone.
    """
    location = row['location']
    try:
        tz_name = _LOCATION_TIMEZONES[location]
    except KeyError:
        # Same exception type as a plain dict lookup, but it names the culprit.
        raise KeyError(f"no time zone configured for location {location!r}") from None

    # tz_convert accepts the zone name directly, so no explicit pytz object is needed.
    local_start = row['event_start_time'].tz_convert(tz_name)
    row['hour_of_week'] = local_start.dayofweek * 24 + local_start.hour
    row['year'] = local_start.year
    return row

In [6]:
import pandas as pd
import matplotlib.pyplot as plt

# Read both parquet inputs: `df` from classified.parquet, `original` from outages.parquet.
df, original = (
    pd.read_parquet(path)
    for path in ("../data/classified.parquet", "../data/outages.parquet")
)

# Show the column sets of both inputs before they are combined.
for frame in (original, df):
    print(frame.columns)

# Index-aligned join of the two frames, keeping only rows that have a description.
dfj = df.join(original)
dfj = dfj[dfj['description'].notna()]

Index(['service_id', 'service_name', 'location', 'status', 'event_start_time',
       'event_end_time', 'description', 'vendor', 'first_notification',
       'last_notification', 'monitor', 'org_type', 'half_desc'],
      dtype='object')
Index(['services', 'severity', 'range', 'users', 'cause', 'duration',
       'affected'],
      dtype='object')


In [7]:
def deduplicate(df):
    """Drop repeated outage records and print how many were removed per vendor.

    Two rows are duplicates when they agree on ``service_name``, ``location``,
    ``event_start_time`` and ``event_end_time``; the first occurrence is kept.

    Args:
        df: DataFrame containing the four key columns plus ``vendor``.

    Returns:
        A DataFrame with the duplicate rows removed.
    """
    key_columns = ['service_name', 'location', 'event_start_time', 'event_end_time']

    # A boolean mask identifies the dropped rows directly, which stays correct
    # even if the index contains repeated labels (an index difference would not).
    is_duplicate = df.duplicated(key_columns)
    dfnew = df[~is_duplicate]
    removed_vendors = df.loc[is_duplicate, 'vendor'].value_counts()

    print("Removed:")
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    for vd, cnt in removed_vendors.items():
        print(f'- {vd}: {cnt}')

    return dfnew

# Remove repeated outage records from the joined frame (prints per-vendor removal counts).
dfj = deduplicate(dfj)

Removed:
- Azure: 133
- AWS: 127
- GCP: 70


In [8]:
# Outage length in minutes; computed first, while both columns are still
# numeric values that are interpreted as seconds below.
dfj['duration_min'] = (dfj['event_end_time'] - dfj['event_start_time']) / 60.0

# Turn both columns into timezone-aware UTC timestamps.
for time_col in ('event_start_time', 'event_end_time'):
    dfj[time_col] = pd.to_datetime(dfj[time_col], unit="s", utc=True)

# Row-wise: derive hour_of_week and year, after which the raw timestamps
# are no longer needed.
dfj = dfj.apply(add_hour_of_week_and_year, axis=1)
dfj = dfj.drop(columns=['event_start_time', 'event_end_time'])

In [9]:
# Count, per vendor, the events whose `year` is before 2018; these are dropped below.
removed_events_by_vendor = dfj[dfj.year < 2018].vendor.value_counts()
print("Removed events before 2018:")
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for vd, cnt in removed_events_by_vendor.items():
    print(f'- {vd}: {cnt}')

# Keep events from 2018 onwards; the helper `year` column is dropped after filtering.
dfj = dfj[dfj.year >= 2018].drop(['year'], axis='columns')

Removed events before 2018:
- AWS: 184
- GCP: 73
- Azure: 17


In [10]:
def clean_emptys(cell):
    """Substitute a 'not provided' marker for empty cells.

    An empty numpy array becomes the list ``['not provided']`` and an empty
    string becomes ``'not provided'``. Any other value is handed back as is.
    The checks compare exact types, so subclasses of ``ndarray`` or ``str``
    are passed through unchanged.
    """
    from numpy import ndarray

    kind = type(cell)
    if kind is ndarray and not len(cell):
        return ['not provided']
    if kind is str and cell == '':
        return 'not provided'
    return cell

# Apply clean_emptys to every cell of the frame.
# NOTE(review): DataFrame.applymap is deprecated in recent pandas releases in
# favour of DataFrame.map — confirm the target pandas version before switching.
dfj = dfj.applymap(clean_emptys)

In [11]:
def broaden_causes(cell):
    """Map a detailed cause label to its broader category.

    Returns the broader category name, or ``None`` when the label does not
    belong to any category.
    """
    broader_by_cause = {
        'code error': 'code error',
        'maintenance side effect': 'maintenance side effect',
        'configuration error': 'configuration error',
        'deployment task': 'configuration error',
        'internal api issue': 'network',
        'internal network issue': 'network',
        'environmental conditions': 'external',
        'shock event': 'external',
        'third party': 'external',
        'increased load': 'load',
        'not provided': 'unknown',
        'unhealthy unit': 'unit',
    }
    # Every detailed label belongs to exactly one category, so a flat lookup
    # gives the same answer as scanning the categories in order.
    return broader_by_cause.get(cell)

# Derive the coarser cause category for every row; labels that broaden_causes
# does not recognise end up as None in the new column.
dfj['cause_broader'] = dfj.cause.map(broaden_causes)


In [12]:
# Display the columns present after the preprocessing steps so far.
dfj.columns

Index(['services', 'severity', 'range', 'users', 'cause', 'duration',
       'affected', 'service_id', 'service_name', 'location', 'status',
       'description', 'vendor', 'first_notification', 'last_notification',
       'monitor', 'org_type', 'half_desc', 'duration_min', 'hour_of_week',
       'cause_broader'],
      dtype='object')

In [41]:
# For each column that came from `df`, list the distinct values present in `dfj`.
for col in df.columns:
    print(col)
    values = dfj[col].tolist()
    if type(dfj[col].iloc[0]) == str:
        # Plain string column: the values themselves are the labels.
        distinct = set(values)
    else:
        # Otherwise each cell is treated as a collection of labels and flattened.
        distinct = {label for entry in values for label in entry}
    print(distinct, '\n')

services
{'multiple', 'one', 'not provided'} 

severity
{'degraded performance', 'unavailable', 'not provided', 'visual'} 

range
{'multiple regions', 'single availability zone', 'not provided', 'single region'} 

users
{'some', 'all', 'not provided'} 

cause
{'configuration error', 'internal api issue', 'third party', 'not provided', 'shock event', 'environmental conditions', 'code error', 'deployment task', 'internal network issue', 'unhealthy unit', 'maintenance side effect', 'increased load'} 

duration
{'continuous', 'not provided', 'intermittent'} 

affected
{'user interface', 'external requests (apis)', 'external network/connectivity', 'not provided', 'storage', 'internal network', 'certificates/licenses', 'nodes/devices/instances', 'internal communication interfaces', 'processing backend'} 



In [10]:
# Target dtype for each column, grouped by dtype for readability.
columns_by_dtype = {
    'str': ['cause', 'cause_broader', 'description', 'duration', 'half_desc',
            'location', 'monitor', 'org_type', 'range', 'service_id',
            'service_name', 'services', 'users'],
    'object': ['affected', 'severity'],
    'float': ['duration_min', 'status'],
    'Int64': ['first_notification', 'last_notification'],
    'Int32': ['hour_of_week'],
}
dtype_by_column = {
    column: dtype
    for dtype, columns in columns_by_dtype.items()
    for column in columns
}
dfj = dfj.astype(dtype_by_column)


In [11]:
# Rows whose range is a single availability zone are reported, then folded
# into the 'single region' category.
is_single_az = dfj['range'] == 'single availability zone'
print(f'Amount of outages in a single AWS availability zone: {int(is_single_az.sum())}')

dfj.loc[is_single_az, 'range'] = 'single region'

Amount of outages in a single AWS availability zone: 13


In [12]:
# Report how many events remain per vendor after all filtering steps.
print("Final counts for all vendors:")
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for vd, cnt in dfj.vendor.value_counts().items():
    print(f'- {vd}: {cnt}')

Final counts for all vendors:
- GCP: 205
- AWS: 144
- Azure: 132


In [13]:
# Persist the preprocessed frame as a parquet file.
dfj.to_parquet("../data/preprocessed.parquet")