In [None]:
from collections import defaultdict
import pandas as pd
import math
# Load the CSI failure dataset and replace missing cells with empty strings
# so the cells below can treat every value uniformly (truthiness, .strip()).
df = pd.read_csv("csi_failure_dataset_open_source_systems.csv").fillna('')

### Section 4:: Table 1: Target systems and the number of corresponding CSI failures studied in this paper

In [None]:
# Table 1: number of CSI failures per (upstream, downstream) system pair.
upstream_systems = list(df['Upstream System'])
downstream_systems = list(df['Downstream System'])
total_issues = len(upstream_systems)

# Tally failures under keys of the form "<upstream>_<downstream>".
dic = defaultdict(int)
for upstream, downstream in zip(upstream_systems, downstream_systems):
    dic["_".join((upstream, downstream))] += 1

# Row order of the table, grouped by upstream system.
system_order = [
    ["Spark", "Hive"], ["Spark", "YARN"], ["Spark", "HDFS"], ["Spark", "Kafka"],
    ["Flink", "Kafka"], ["Flink", "YARN"], ["Flink", "Hive"], ["Flink", "HDFS"],
    ["Hive", "Spark"], ["Hive", "HBase"], ["Hive", "HDFS"], ["Hive", "Kafka"], ["Hive", "YARN"],
    ["HBase", "HDFS"],
    ["YARN", "HDFS"],
]

upstream_sys_col = [pair[0] for pair in system_order]
downstream_sys_col = [pair[1] for pair in system_order]
freq_col = [dic["_".join(pair)] for pair in system_order]

# Build the frame row-wise (one list per column) and flip it; the last row is the grand total.
table1 = pd.DataFrame([
    upstream_sys_col + ["Total"],
    downstream_sys_col + [""],
    freq_col + [total_issues],
]).T
table1.columns = ['Upstream', 'Downstream', '# CSI Failures']
table1

### Section 5:: Table 2: Categorization by failure planes

In [None]:
# Table 2: count and share of CSI failures for each failure plane.
issues = list(df['Failure Plane'])
total_issues = len(issues)
count = pd.Series(issues).value_counts()

planes = ["Control", "Data", "Management"]
vals = []
percentages = []
for plane in planes:
    plane_total = count[plane]
    vals.append(plane_total)
    # Percentage of all failures, rounded to two decimals and rendered as text.
    percentages.append(str(round(plane_total / total_issues * 100, 2)) + "%")

table2 = pd.DataFrame([
    planes + ["Total"],
    vals + [total_issues],
    percentages + ["100%"],
]).T
table2.columns = ['Plane', '# Failures', 'Percentage']
table2

### Section 5:: Finding 3: Most (89/120) CSI failures are manifested through crashing behavior

In [None]:
# Finding 3: tally failure consequences per plane and count how many of them
# are crash-like (crash, hang, or startup failure at system or job/task level).
planes = list(df['Failure Plane'])
consequences = list(df['Failure Consequence'])
plane_to_consequences = defaultdict(lambda: defaultdict(int))
crashing_consequences = ["Runtime crash/hang", "Startup failure", "Job/task failure", "Job/task startup failure", "Job/task crash/hang"]
num_crashing_consequences = 0
for plane, consequence in zip(planes, consequences):
    # Normalize once and use the same value for both the per-plane tally and
    # the crash check; previously the check used the raw value, so entries
    # with stray whitespace were tallied but not counted as crashes.
    consequence = consequence.strip()
    plane_to_consequences[plane][consequence] += 1
    if consequence in crashing_consequences:
        num_crashing_consequences += 1
print("Number of CSI failures that manifest through crashing behaviour: %s" % num_crashing_consequences)

### Section 5:: Table 3: Failure symptoms

In [None]:
# Table 3 (control plane): frequency of each failure symptom.
# Relies on `plane_to_consequences` built by the Finding 3 cell.
control_plane_rows = ["Runtime crash/hang", "Startup failure", "Performance issue", "Data loss", "Unexpected Behaviour"]
freq = [plane_to_consequences["Control"][impact] for impact in control_plane_rows]
control_plane_table = pd.DataFrame([control_plane_rows, freq]).T
control_plane_table.columns = ['Impact', '#']
control_plane_table

In [None]:
# Table 3 (data plane): frequency of each failure symptom.
# Relies on `plane_to_consequences` built by the Finding 3 cell.
data_plane_rows = ["Job/task failure", "Job/task startup failure", "Wrong results", "Performance issues", "Resource leak", "Usability"]
freq = [plane_to_consequences["Data"][impact] for impact in data_plane_rows]
data_plane_table = pd.DataFrame([data_plane_rows, freq]).T
data_plane_table.columns = ['Impact', '#']
data_plane_table

In [None]:
# Table 3 (management plane): frequency of each failure symptom.
# Relies on `plane_to_consequences` built by the Finding 3 cell.
management_plane_rows = ["Job/task crash/hang", "Reduced observability", "Unexpected Behaviour", "Performance issue"]
freq = [plane_to_consequences["Management"][impact] for impact in management_plane_rows]
management_plane_table = pd.DataFrame([management_plane_rows, freq]).T
management_plane_table.columns = ['Impact', '#']

management_plane_table

### Section 6:: Table 4: Data properties in which discrepancies of data-plane CSI failures are rooted

In [None]:
# Table 4: how often each data property is the root of a data-plane failure.
data_plane_abstraction = list(df['Data Plane: Abstraction'])
data_plane_properties = list(df['Data Plane: Properties'])

# Only rows with a non-empty property value are counted.
property_to_count = defaultdict(int)
for prop in filter(None, data_plane_properties):
    property_to_count[prop] += 1
total_data_plane = sum(property_to_count.values())

property_rows = ["Address", "Schema: Structure", "Schema: Value", "Custom Property", "API semantics"]
freq = [property_to_count[name] for name in property_rows]

# Close the table with a grand-total row.
property_rows = property_rows + ["Total"]
freq = freq + [total_data_plane]

table4 = pd.DataFrame([property_rows, freq]).T
table4.columns = ['Data Property', '#']
table4

### Finding 4: Discrepancies of data-plane CSI failures lie in many different data properties. The majority (50/61) of data-plane CSI failures are caused by metadata, namely typical metadata (42/61) such as addresses/names and data schemas, and custom metadata (8/61). The others (11/61) are caused by custom properties and API semantics. 

In [None]:
# Finding 4: split the data-plane failures into those rooted in typical
# metadata (addresses/names and schemas) and those rooted in custom metadata.
# Relies on `property_to_count` built by the Table 4 cell.
typical_metadata_props = ["Address", "Schema: Structure", "Schema: Value"]
custom_metadata_props = ["Custom Property"]

# Sum the per-property counts for each group.
typical_metadata_total = sum(property_to_count[prop] for prop in typical_metadata_props)
custom_metadata_total = sum(property_to_count[prop] for prop in custom_metadata_props)

# Report the computed totals (not the lists of property names).
print("CSI failures caused by typical metadata: %s" % typical_metadata_total)
print("CSI failures caused by custom metadata: %s" % custom_metadata_total)

### Section 6:: Table 5: Data abstractions in which discrepancies of data-plane CSI failures are rooted

In [None]:
# Table 5: cross-tabulation of data abstraction (rows) against data property (columns).
# Relies on `data_plane_abstraction`, `data_plane_properties` and
# `total_data_plane` from the Table 4 cell.
property_rows = ["Address", "Schema: Structure", "Schema: Value", "Custom Property", "API semantics"]
abstractions = ["Table", "File", "Stream", "KV Tuple"]

# Count every (abstraction, property) pair, plus a per-abstraction total.
abstraction_to_properties = defaultdict(lambda: defaultdict(int))
abstraction_to_count = defaultdict(int)
for abstraction, prop in zip(data_plane_abstraction, data_plane_properties):
    abstraction_to_properties[abstraction][prop] += 1
    abstraction_to_count[abstraction] += 1

# First column holds the abstraction labels; this is the same list object as
# `abstractions`, so the "Total" label appended below shows up in both.
cols = [abstractions]
for _prop in property_rows:
    # .get() is used so that absent combinations are not inserted into the tally.
    per_abstraction = [
        abstraction_to_properties.get(abstraction, {}).get(_prop, 0)
        for abstraction in abstractions
    ]
    cols.append(per_abstraction + [sum(per_abstraction)])

# Last column: total per abstraction, then the overall data-plane total.
totals_col = [abstraction_to_count[abstraction] for abstraction in abstractions]
totals_col.append(total_data_plane)
cols.append(totals_col)

cols[0].append("Total")
property_rows = ["Data Abstraction"] + property_rows + ["Total"]

table5 = pd.DataFrame(cols).T
table5.columns = property_rows
table5

In [None]:
# Count discrepancy patterns separately for each failure plane; the nested
# mapping is indexed as discrepancy_to_count[plane][pattern].
planes = list(df['Failure Plane'])
discrepancies = list(df['Discrepancy pattern type'])
discrepancy_to_count = defaultdict(lambda: defaultdict(int))
for plane, discrepancy in zip(planes, discrepancies):
    per_plane = discrepancy_to_count[plane]
    per_plane[discrepancy] += 1

### Section 6:: Table 6: Data-Plane Discrepancy patterns

In [None]:
# Table 6: frequency of each data-plane discrepancy pattern, with a total row.
# Relies on `discrepancy_to_count` built by the preceding cell.
data_plane_discrepancy_rows = ["Type Confusion", "Unsupported Operations", "Unspoken Convention", "Undefined Values", "Wrong API Assumptions"]
freq = [discrepancy_to_count["Data"][pattern] for pattern in data_plane_discrepancy_rows]
total_freq = sum(freq)
table6 = pd.DataFrame([
    data_plane_discrepancy_rows + ["Total"],
    freq + [total_freq],
]).T
table6.columns = ['Discrepancy Pattern', '#']
table6

### Section 6:: Table 7: Management-Plane Discrepancy patterns

In [None]:
# Frequency of each management-plane discrepancy pattern, with a total row.
# Relies on `discrepancy_to_count` built by an earlier cell.
# NOTE(review): the result is stored as `table8` although the section heading
# labels this Table 7 - confirm the intended numbering.
management_plane_discrepancy_rows = ["Ignorance", "Unexpected Override", "Inconsistent Context", "Mishandling Configuration Value"]
freq = [discrepancy_to_count["Management"][pattern] for pattern in management_plane_discrepancy_rows]
total_freq = sum(freq)
table8 = pd.DataFrame([
    management_plane_discrepancy_rows + ["Total"],
    freq + [total_freq],
]).T
table8.columns = ['Discrepancy Pattern', '#']
table8

### Section 6:: Table 8: Control-Plane Discrepancy patterns

In [None]:
# Frequency of each control-plane discrepancy pattern, with a total row.
# Relies on `discrepancy_to_count` built by an earlier cell.
# NOTE(review): the result is stored as `table7` although the section heading
# labels this Table 8 - confirm the intended numbering.
control_plane_discrepancy_rows = ["API Semantics Violation", "State/Resource Inconsistency", "Feature inconsistency"]
freq = [discrepancy_to_count["Control"][pattern] for pattern in control_plane_discrepancy_rows]
total_freq = sum(freq)
table7 = pd.DataFrame([
    control_plane_discrepancy_rows + ["Total"],
    freq + [total_freq],
]).T
table7.columns = ['Discrepancy Pattern', '#']
table7

### Section 7:: Table 9: Fix patterns of the evaluated CSI failures

In [None]:
# Table 9: how often each fix pattern was applied.
fix_patterns = list(df['Fix: Pattern'])
table9_rows = ["Checking", "Error handling", "Interaction", "Others"]

fix_pattern_to_count = defaultdict(int)
for pattern in fix_patterns:
    fix_pattern_to_count[pattern] += 1

freq = [fix_pattern_to_count[row] for row in table9_rows]

# The total row reports the number of dataset rows, not the sum of the listed patterns.
table9 = pd.DataFrame([
    table9_rows + ["Total"],
    freq + [len(fix_patterns)],
]).T
table9.columns = ['Fix Pattern', '#']
table9

### Finding 13: In 69% (79/115) of CSI failures, fixes were applied to code in the upstream system specific to interaction with a downstream system. Furthermore, among these 79 cases, fixes for 68 (86%) cases resided in dedicated “connector” modules.

In [None]:
# Finding 13: where fixes were applied. Reports (a) the share of fixes that
# touched upstream code specific to downstream interaction and (b) among those,
# the share that lived in dedicated connector modules.
fix_locations = list(df['Fix: Location'])
fix_location_to_count = defaultdict(int)
for location in fix_locations:
    fix_location_to_count[location.strip()] += 1

code_specific_to_interaction_connector = fix_location_to_count["Code specific to downstream interaction: Connector"]
code_specific_to_interaction_scattered = fix_location_to_count["Code specific to downstream interaction: Scattered"]
code_specific_to_interaction = code_specific_to_interaction_connector + code_specific_to_interaction_scattered

# Rows marked "N.A." are excluded from the fix total.
no_fix_or_documentation = fix_location_to_count["N.A."]
total_fixes = len(fix_locations) - no_fix_or_documentation


def _percentage(part, whole):
    """Return part/whole as a percentage rounded to 2 decimals (0.0 if whole is 0)."""
    return round(part / whole * 100, 2) if whole else 0.0


print("Fixes applied to code in the upstream system specific to interaction "
      "with a downstream system: %s (%s%%)"
      % (code_specific_to_interaction, _percentage(code_specific_to_interaction, total_fixes)))
# The connector share is relative to the interaction-specific fixes (the
# "among these cases" part of the finding), not to all fixes.
print("Fixes applied to code in the upstream system "
      "in dedicated 'connector' modules: %s (%s%%)"
      % (code_specific_to_interaction_connector,
         _percentage(code_specific_to_interaction_connector, code_specific_to_interaction)))