In [1]:
from collections import defaultdict
import pandas as pd
import math
# Load the two study datasets. Missing cells are replaced with '' so that
# every column can be handled as plain strings in the cells below.
df = pd.read_csv("csi_failure_dataset_open_source_systems.csv").fillna('')
df_cloud_incidents = pd.read_csv("cloud_incidents_gcp_aws_azure.csv").fillna('')

### Section 3:: Finding 1: Among 55 cloud incidents, 11 (20%) were caused by CSI failures, showing their catastrophic consequences.

In [2]:
# Finding 1: share of cloud incidents whose failure type is "CSI".
cloud_incidents = list(df_cloud_incidents['Issue Link'])
cloud_failure_types = list(df_cloud_incidents['Failure Type'])
# Membership depends on the failure type only, so filter that column directly.
csi_failures = [failure_type for failure_type in cloud_failure_types if failure_type == "CSI"]
total_incidents = len(cloud_incidents)
# Rounded to two decimals like the other percentage cells; an empty incident
# file reports 0.0 instead of raising ZeroDivisionError.
csi_percentage = round(len(csi_failures) / total_incidents * 100, 2) if total_incidents else 0.0
print("Total cloud incidents: %s, number of incidents "
      "caused by CSI failures: %s (%s%%)" % (total_incidents, len(csi_failures), csi_percentage))


Total cloud incidents: 55, number of incidents caused by CSI failures: 11 (20.0%)


### Section 4:: Table 1: Target systems and the number of corresponding CSI failures studied in this paper

In [3]:
# Table 1: number of CSI failures per (upstream, downstream) system pair.
upstream_systems = list(df['Upstream System'])
downstream_systems = list(df['Downstream System'])
total_issues = len(upstream_systems)

# Tally every issue under an "<upstream>_<downstream>" key.
dic = defaultdict(int)
for upstream, downstream in zip(upstream_systems, downstream_systems):
    dic[upstream + "_" + downstream] += 1

# Fixed row order of the table.
system_order = [
    ["Spark", "Hive"], ["Spark", "YARN"], ["Spark", "HDFS"], ["Spark", "Kafka"],
    ["Flink", "Kafka"], ["Flink", "YARN"], ["Flink", "Hive"], ["Flink", "HDFS"],
    ["Hive", "Spark"], ["Hive", "HBase"], ["Hive", "HDFS"], ["Hive", "Kafka"], ["Hive", "YARN"],
    ["HBase", "HDFS"], ["YARN", "HDFS"],
]
upstream_sys_col = [pair[0] for pair in system_order]
downstream_sys_col = [pair[1] for pair in system_order]
freq_col = [dic[pair[0] + "_" + pair[1]] for pair in system_order]

# The final "Total" row reports the number of issues in the dataset.
table1 = pd.DataFrame([
    upstream_sys_col + ["Total"],
    downstream_sys_col + [""],
    freq_col + [total_issues],
]).transpose()
table1.columns = ['Upstream', 'Downstream','# CSI Failures']
table1

Unnamed: 0,Upstream,Downstream,# CSI Failures
0,Spark,Hive,26
1,Spark,YARN,19
2,Spark,HDFS,8
3,Spark,Kafka,5
4,Flink,Kafka,12
5,Flink,YARN,14
6,Flink,Hive,8
7,Flink,HDFS,3
8,Hive,Spark,6
9,Hive,HBase,3


### Section 5:: Table 2: Categorization by failure planes
### Section 5:: Finding 2: Data- and management-plane interactions contribute to significant percentages of CSI failures: 51% of CSI failures in our dataset manifest at the data plane, and 32% of CSI failures manifest at the management plane. Control-plane interactions contribute to 17%.

In [4]:
# Table 2: number and percentage of CSI failures per failure plane.
issues = list(df['Failure Plane'])
total_issues = len(issues)
count = pd.Series(issues).value_counts()
planes = ["Control", "Data", "Management"]

vals = []
percentages = []
for plane in planes:
    vals.append(count[plane])
    # Percentage of all issues, rounded to two decimals and rendered as text.
    percentages.append(str(round(count[plane]/total_issues * 100, 2)) + "%")

# dtype=object keeps the mixed text/number cells as they are.
table2 = pd.DataFrame(
    list(zip(planes + ["Total"], vals + [total_issues], percentages + ["100%"])),
    columns=['Plane', '# Failures','Percentage'],
    dtype=object,
)
table2

Unnamed: 0,Plane,# Failures,Percentage
0,Control,20,16.67%
1,Data,61,50.83%
2,Management,39,32.5%
3,Total,120,100%


### Section 5:: Finding 3: Most (89/120) CSI failures are manifested through crashing behavior

In [5]:
# Finding 3: count failures per (plane, consequence) and the number of
# failures whose consequence is one of the crashing behaviours.
planes = list(df['Failure Plane'])
consequences = list(df['Failure Consequence'])
# Computed here so the cell does not depend on a total left behind by an
# earlier cell.
total_issues = len(consequences)
plane_to_consequences = defaultdict(lambda: defaultdict(int))
crashing_consequences = ["Runtime crash/hang", "Startup failure", "Job/task failure", "Job/task startup failure", "Job/task crash/hang"]
num_crashing_consequences = 0
for plane, consequence in zip(planes, consequences):
    # Normalise once so the per-plane tally and the crashing check see the
    # same label; previously only the tally used the stripped value, so a
    # label with stray whitespace was counted in the table but not as a crash.
    consequence = consequence.strip()
    plane_to_consequences[plane][consequence] += 1
    if consequence in crashing_consequences:
        num_crashing_consequences += 1
print("Number of CSI failures that manifest through crashing behaviour: %s/%s" % (num_crashing_consequences, total_issues))

Number of CSI failures that manifest through crashing behaviour: 89/120


### Section 5:: Table 3: Failure symptoms

In [6]:
# Table 3 (control plane): failure symptoms and their frequencies.
control_plane_rows = ["Runtime crash/hang", "Startup failure", "Performance issue", "Data loss", "Unexpected Behaviour"]
# Look up each symptom in the per-plane tally built in the previous cell.
freq = [plane_to_consequences["Control"][impact] for impact in control_plane_rows]
control_plane_table = pd.DataFrame([control_plane_rows, freq]).transpose()
control_plane_table.columns = ['Impact','#']
control_plane_table

Unnamed: 0,Impact,#
0,Runtime crash/hang,8
1,Startup failure,4
2,Performance issue,3
3,Data loss,2
4,Unexpected Behaviour,3


In [7]:
# Table 3 (data plane): failure symptoms and their frequencies.
# NOTE(review): "Performance issues" is plural here, unlike the label used in
# the control- and management-plane cells; presumably it matches the spelling
# in the dataset -- confirm.
data_plane_rows = ["Job/task failure", "Job/task startup failure", "Wrong results", "Performance issues", "Resource leak", "Usability"]
freq = [plane_to_consequences["Data"][impact] for impact in data_plane_rows]
data_plane_table = pd.DataFrame([data_plane_rows, freq]).transpose()
data_plane_table.columns = ['Impact','#']
data_plane_table

Unnamed: 0,Impact,#
0,Job/task failure,47
1,Job/task startup failure,6
2,Wrong results,3
3,Performance issues,2
4,Resource leak,2
5,Usability,1


In [8]:
# Table 3 (management plane): failure symptoms and their frequencies.
management_plane_rows = ["Job/task crash/hang","Reduced observability", "Unexpected Behaviour", "Performance issue"]
freq = [plane_to_consequences["Management"][impact] for impact in management_plane_rows]
management_plane_table = pd.DataFrame([management_plane_rows, freq]).transpose()
management_plane_table.columns = ['Impact','#']

management_plane_table

Unnamed: 0,Impact,#
0,Job/task crash/hang,24
1,Reduced observability,8
2,Unexpected Behaviour,5
3,Performance issue,2


### Section 6:: Table 4: Data properties in which discrepancies of data-plane CSI failures are rooted

In [9]:
# Table 4: data properties behind data-plane CSI failures.
data_plane_abstraction = list(df['Data Plane: Abstraction'])
data_plane_properties = list(df['Data Plane: Properties'])

# Rows with an empty property value are skipped; every remaining row counts
# towards both its property and the data-plane total.
property_to_count = defaultdict(int)
for prop in data_plane_properties:
    if not prop:
        continue
    property_to_count[prop] += 1
total_data_plane = sum(property_to_count.values())

property_rows = ["Address", "Schema: Structure", "Schema: Value", "Custom Property", "API semantics"]
freq = [property_to_count[prop] for prop in property_rows]
# Closing "Total" row.
property_rows.append("Total")
freq.append(total_data_plane)
table4 = pd.DataFrame({'Data Property': property_rows, '#': freq}, dtype=object)
table4

Unnamed: 0,Data Property,#
0,Address,10
1,Schema: Structure,14
2,Schema: Value,18
3,Custom Property,8
4,API semantics,11
5,Total,61


### Section 6:: Finding 4: Discrepancies of data-plane CSI failures lie in many different data properties. The majority (50/61) of data-plane CSI failures are caused by metadata, namely typical metadata (42/61) such as addresses/names and data schemas, and custom metadata (8/61). The others (11/61) are caused by custom properties and API semantics.

In [10]:
# Finding 4: group the Table 4 property counts into typical metadata,
# custom metadata, and API semantics.
typical_metadata_props = ["Address", "Schema: Structure", "Schema: Value"]
custom_metadata_prop = "Custom Property"
api_semantics_prop = "API semantics"

typical_metadata_total = sum(property_to_count[prop] for prop in typical_metadata_props)
custom_metadata_total = property_to_count[custom_metadata_prop]
api_semantics_total = property_to_count[api_semantics_prop]
metadata_total = typical_metadata_total + custom_metadata_total

print("Data-Plane CSI failures caused by typical metadata: %s/%s" % (typical_metadata_total, total_data_plane))
print("Data-Plane CSI failures caused by custom metadata: %s/%s" % (custom_metadata_total, total_data_plane))
print("Data-Plane CSI failures caused by metadata (typical metadata & custom metadata): %s/%s" % (metadata_total, total_data_plane))
# NOTE(review): the label below mentions custom properties, but only the
# "API semantics" count is printed ("Custom Property" is reported above as
# custom metadata) -- confirm the intended wording.
print("Data-Plane CSI failures caused by custom properties & API semantics: %s/%s" % (api_semantics_total, total_data_plane))

Data-Plane CSI failures caused by typical metadata: 42/61
Data-Plane CSI failures caused by custom metadata: 8/61
Data-Plane CSI failures caused by metadata (typical metadata & custom metadata): 50/61
Data-Plane CSI failures caused by custom properties & API semantics: 11/61


### Section 6:: Table 5: Data abstractions in which discrepancies of data-plane CSI failures are rooted

In [11]:
# Table 5: data-plane CSI failures broken down by data abstraction (rows)
# and data property (columns), with row and column totals.
property_rows = ["Address", "Schema: Structure", "Schema: Value", "Custom Property", "API semantics"]
abstractions = ["Table", "File", "Stream", "KV Tuple"]

abstraction_to_properties = defaultdict(lambda: defaultdict(int))
abstraction_to_count = defaultdict(int)
for abstraction, prop in zip(data_plane_abstraction, data_plane_properties):
    # Skip rows without a data-plane property, the same filter that defines
    # total_data_plane, so the row totals stay consistent with the grand total.
    if not prop:
        continue
    abstraction_to_properties[abstraction][prop] += 1
    abstraction_to_count[abstraction] += 1

# One table row per abstraction: label, per-property counts, row total.
# .get() is used so that looking up an absent abstraction adds no entries.
table5_rows = []
for abstraction in abstractions:
    per_property = [abstraction_to_properties.get(abstraction, {}).get(prop, 0)
                    for prop in property_rows]
    table5_rows.append([abstraction] + per_property + [abstraction_to_count[abstraction]])

# Closing "Total" row: per-property sums over the listed abstractions, then
# the overall data-plane total.
column_totals = [sum(row[position + 1] for row in table5_rows)
                 for position in range(len(property_rows))]
table5_rows.append(["Total"] + column_totals + [total_data_plane])

# Header and rows are built as new lists, so neither `abstractions` nor
# `property_rows` is modified as a side effect of building the table.
table5 = pd.DataFrame(
    table5_rows,
    columns=["Data Abstraction"] + property_rows + ["Total"],
    dtype=object,
)
table5

Unnamed: 0,Data Abstraction,Address,Schema: Structure,Schema: Value,Custom Property,API semantics,Total
0,Table,1,13,16,0,5,35
1,File,8,0,0,8,2,18
2,Stream,1,1,2,0,4,8
3,KV Tuple,0,0,0,0,0,0
4,Total,10,14,18,8,11,61


In [12]:
# Tally discrepancy patterns per failure plane; read by the discrepancy
# pattern tables in later cells.
planes = list(df['Failure Plane'])
discrepancies = list(df['Discrepancy pattern type'])
discrepancy_to_count = defaultdict(lambda: defaultdict(int))
for failure_plane, pattern in zip(planes, discrepancies):
    per_plane = discrepancy_to_count[failure_plane]
    per_plane[pattern] += 1

### Section 6:: Finding 5: Complicated data abstractions (e.g., tables) are more error-prone to CSI failures, compared with simple data abstractions. 57% (35/61) of data-plane CSI failures are induced by table-related operations.

In [13]:
# Finding 5: data-plane failures whose data abstraction is "Table".
table_related_issues = abstraction_to_count["Table"]
finding5_message = "Data-Plane CSI failures caused by table-related operations: %s/%s" % (
    table_related_issues,
    total_data_plane,
)
print(finding5_message)

Data-Plane CSI failures caused by table-related operations: 35/61


### Section 6:: Finding 6: 25% (15/61) of data-plane CSI failures are root-caused by data serialization.

In [14]:
# Finding 6: data-plane failures marked as involving data serialization.
data_serde = list(df['Data Plane: Involves Data Serialization'])
# Only rows whose value is exactly "Yes" are counted.
involves_data_serde_count = data_serde.count("Yes")
involves_data_serde_percentage = round(involves_data_serde_count/total_data_plane * 100, 2)
print("Data-Plane CSI failures caused by "
      "data serialization: %s%% (%s/%s)" % (
          involves_data_serde_percentage,
          involves_data_serde_count,
          total_data_plane,
      ))


Data-Plane CSI failures caused by data serialization: 24.59% (15/61)


### Section 6:: Table 6: Data-Plane Discrepancy patterns

In [15]:
# Table 6: discrepancy patterns of data-plane CSI failures.
data_plane_discrepancy_rows = ["Type Confusion", "Unsupported Operations","Unspoken Convention", "Undefined Values", "Wrong API Assumptions"]
freq = [discrepancy_to_count["Data"][pattern] for pattern in data_plane_discrepancy_rows]
# The "Total" row sums only the patterns listed above.
total_freq = sum(freq)
table6 = pd.DataFrame([data_plane_discrepancy_rows + ["Total"], freq + [total_freq]]).transpose()
table6.columns = ['Discrepancy Pattern','#']
table6

Unnamed: 0,Discrepancy Pattern,#
0,Type Confusion,12
1,Unsupported Operations,15
2,Unspoken Convention,9
3,Undefined Values,7
4,Wrong API Assumptions,18
5,Total,61


### Section 6:: Table 7: Management-Plane Discrepancy patterns
### Section 6:: Finding 7: CSI-failure-inducing configuration issues are very different from traditional configuration issues of individual systems. The former is mostly about failures of coherently configuring multiple involved systems, while the latter is mainly on correctness checking of erroneous configuration values.

In [16]:
# Management-plane discrepancy patterns.
# NOTE(review): the heading above labels this Table 7, but the result is bound
# to `table8`; the name is left unchanged here.
management_plane_discrepancy_rows = ["Ignorance", "Unexpected Override", "Inconsistent Context", "Mishandling Configuration Value"]
freq = [discrepancy_to_count["Management"][pattern] for pattern in management_plane_discrepancy_rows]
# The "Total" row sums only the patterns listed above.
total_freq = sum(freq)
table8 = pd.DataFrame([management_plane_discrepancy_rows + ["Total"], freq + [total_freq]]).transpose()
table8.columns = ['Discrepancy Pattern','#']
table8

Unnamed: 0,Discrepancy Pattern,#
0,Ignorance,12
1,Unexpected Override,6
2,Inconsistent Context,10
3,Mishandling Configuration Value,2
4,Total,30


### Section 6:: Finding 8: Parameter-related configuration issues are the majority (21/30) of configuration-induced CSI failures. The rest (9/30) are in configuration components of the involved systems.

In [17]:
# Finding 8: split configuration-induced CSI failures into parameter-related
# and component-related ones.
configuration_categorizations = list(df["Management Plane: Configuration Parameter or Component"])
# Rows with any other value (including blanks) belong to neither category.
parameter = configuration_categorizations.count("Parameter")
component = configuration_categorizations.count("Component")
# The denominator is the number of categorised rows. The unused
# `total_configuration_plane` initialiser was dropped: it was never read and
# only shadowed the real total's name.
total_config_plane = component + parameter
print("Configuration-induced CSI failures caused by parameter misconfig: %s/%s" % (parameter, total_config_plane))
print("Configuration-induced CSI failures caused by component misconfig: %s/%s" % (component, total_config_plane))

Configuration-induced CSI failures caused by parameter misconfig: 21/30
Configuration-induced CSI failures caused by component misconfig: 9/30


### Section 6:: Table 8: Control-Plane Discrepancy patterns
### Section 6:: Finding 10: Most control-plane CSI failures are rooted in discrepancies of implicit properties, including implicit API semantics and state/resource inconsistencies

In [18]:
# Control-plane discrepancy patterns.
# NOTE(review): the heading above labels this Table 8, but the result is bound
# to `table7`; the name is left unchanged here.
control_plane_discrepancy_rows = ["API Semantics Violation", "State/Resource Inconsistency", "Feature inconsistency"]
freq = [discrepancy_to_count["Control"][pattern] for pattern in control_plane_discrepancy_rows]
# The "Total" row sums only the patterns listed above.
total_freq = sum(freq)
table7 = pd.DataFrame([control_plane_discrepancy_rows + ["Total"], freq + [total_freq]]).transpose()
table7.columns = ['Discrepancy Pattern','#']
table7

Unnamed: 0,Discrepancy Pattern,#
0,API Semantics Violation,13
1,State/Resource Inconsistency,5
2,Feature inconsistency,2
3,Total,20


### Section 7:: Table 9: Fix patterns of the evaluated CSI failures

In [19]:
# Table 9: fix patterns of the studied CSI failures.
fix_patterns = list(df['Fix: Pattern'])
table9_rows = ["Checking", "Error handling", "Interaction", "Others"]

# Count every fix pattern value found in the dataset.
fix_pattern_to_count = defaultdict(int)
for fix_pattern in fix_patterns:
    fix_pattern_to_count[fix_pattern] += 1

freq = [fix_pattern_to_count[row_label] for row_label in table9_rows]

# The "Total" row is the number of issues, not the sum of the listed rows.
table9 = pd.DataFrame(
    list(zip(table9_rows + ["Total"], freq + [len(fix_patterns)])),
    columns=['Fix Pattern','#'],
    dtype=object,
)
table9

Unnamed: 0,Fix Pattern,#
0,Checking,38
1,Error handling,8
2,Interaction,69
3,Others,5
4,Total,120


### Finding 12: In 40% (46/115) CSI failures, the merged fixes improve condition checking and error handling instead of repairing the failed interactions.

In [20]:
# Finding 12: share of fixed CSI failures whose fix adds condition checking
# or error handling rather than repairing the interaction itself.
no_fix = fix_pattern_to_count["Others"]
# Issues counted under "Others" are excluded from the denominator.
total_issues_with_fix = len(fix_patterns) - no_fix
error_handling_fixes = fix_pattern_to_count["Error handling"]
condition_checking_fixes = fix_pattern_to_count["Checking"]
afterthought_fixes = error_handling_fixes + condition_checking_fixes
# Rounded to two decimals like the other percentage cells, so a ratio that is
# not exactly representable does not print a long float tail.
afterthought_percentage = round(afterthought_fixes / total_issues_with_fix * 100, 2)
print("CSI failures fixed by improving condition checking "
      "and error handling instead of repairing "
      "the failed interactions: %s%% (%s/%s)" %
      (afterthought_percentage, afterthought_fixes, total_issues_with_fix))

CSI failures fixed by improving condition checking and error handling instead of repairing the failed interactions: 40.0% (46/115)


### Finding 13: In 69% (79/115) CSI failures, fixes were applied to code in the upstream system specific to interaction with a downstream system. Furthermore, among these 79 cases, fixes for 68 (86%) cases resided in dedicated “connector” modules.

In [21]:
# Finding 13: where fixes were applied (connector modules vs. scattered code
# specific to the downstream interaction).
fix_locations = list(df['Fix: Location'])
fix_location_to_count = defaultdict(int)
# Labels are stripped of surrounding whitespace before counting.
for cleaned_location in (raw_location.strip() for raw_location in fix_locations):
    fix_location_to_count[cleaned_location] += 1

code_specific_to_interaction_connector = fix_location_to_count["Code specific to downstream interaction: Connector"]
code_specific_to_interaction_scattered = fix_location_to_count["Code specific to downstream interaction: Scattered"]
code_specific_to_interaction = code_specific_to_interaction_connector + code_specific_to_interaction_scattered

# Rows labelled "N.A." are excluded from the denominator.
no_fix_or_documentation = fix_location_to_count["N.A."]
total_fixes = len(fix_locations) - no_fix_or_documentation

interaction_share = round(code_specific_to_interaction/total_fixes * 100, 2)
connector_share = round(code_specific_to_interaction_connector/code_specific_to_interaction * 100, 2)
print("Fixes applied to code in the upstream system specific to interaction "
      "with a downstream system: %s/%s (%s%%)" % (code_specific_to_interaction, total_fixes, interaction_share))
print("Fixes applied to code in the upstream system "
      "in dedicated 'connector' modules: %s/%s (%s%%)" % (code_specific_to_interaction_connector,
                                                       code_specific_to_interaction, connector_share))

Fixes applied to code in the upstream system specific to interaction with a downstream system: 79/115 (68.7%)
Fixes applied to code in the upstream system in dedicated 'connector' modules: 68/79 (86.08%)
