In [None]:
# Input locations -- fill these in before running the rest of the notebook.
# Pattern file ("t"/"v"/"e" lines) that is parsed further down into `graphs`.
germ_output_path = ""
# CSV read into `nodes_df`; its "proposal_timestamp" column is parsed as datetime.
nodes_path = ""
# CSV read into `edges_df`; its "due_date" column is parsed as datetime.
edges_path = "" 
# Text file with one "t ..." header line per pattern followed by one integer per line.
debtors_path = ""
# Only referenced by commented-out code in this notebook.
node_label_mapping_path = ""

In [None]:
from collections import defaultdict
import os
import sys
import textwrap

import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
from dateutil.relativedelta import relativedelta
from math import ceil
from numpy import trapz

# Make every relative path resolve against the experiments home directory.
# Raises KeyError when the EXPERIMENTS_HOMEDIR environment variable is not set.
os.chdir(os.environ["EXPERIMENTS_HOMEDIR"])

%matplotlib inline

In [None]:
def _value_label_to_str(value_label):
    return {
        "0": "<=11%",
        "1": "12-31%",
        "2": "31-52%",
        "3": "52-79%",
        "4": ">79%",
    }[value_label]

def render_graph(g, node_labels=None, edge_labels=None, seed=None):
    """Draw ``g`` with a spring layout and show the figure.

    Parameters
    ----------
    g : networkx graph
        Graph to draw.
    node_labels : dict, optional
        Mapping node -> text. Node labels are drawn only when this is given
        and non-empty.
    edge_labels : dict, optional
        Mapping (u, v) -> text. When None, a label is built for every edge
        from its "label" attribute and the percentage range of its
        "value_label" attribute. An explicit empty dict draws no edge labels.
    seed : int, optional
        Forwarded to ``nx.spring_layout`` so the same graph is laid out the
        same way on every run. None (the default) keeps the random layout.
    """
    pos = nx.spring_layout(g, seed=seed)
    nx.draw(g, pos)
    # `is None` rather than truthiness: an explicit {} must not fall back to
    # the default labels, which require "label"/"value_label" string attributes.
    if edge_labels is None:
        edge_labels = {
            (u, v): d["label"] + "\n" + _value_label_to_str(d["value_label"])
            for u, v, d in g.edges(data=True)
        }
    nx.draw_networkx_edge_labels(g, pos, edge_labels=edge_labels)
    if node_labels:
        nx.draw_networkx_labels(g, pos, labels=node_labels)
    # else:
    #     nx.draw_networkx_labels(
    #         g,
    #         pos,
    #         labels=dict([
    #             (u, node_label_2_name[int(d["label"])]) for u,d in g.nodes(data=True)
    #         ])
    #     )
    plt.show()
    
def calculate_confidence(graph, graphs):
    """Return support(pattern) / support(body) for a mined pattern.

    The body is the edge-induced subgraph of ``graph["graph"]`` that keeps
    every edge whose integer "label" is strictly below the largest edge label
    of the pattern. ``graphs`` is searched, in order, for the first entry whose
    graph is isomorphic to that body (node "label", edge "label" and edge
    "value_label" must all match).

    Parameters
    ----------
    graph : dict
        Entry with keys "graph" (networkx graph) and "support".
    graphs : iterable of dict
        Candidate entries with the same keys.

    Returns
    -------
    float or int
        ``graph["support"] / match["support"]`` for the first match, or 0 when
        the pattern has no edges or no candidate matches the body.
    """
    pattern = graph["graph"]
    edges = list(pattern.edges(data=True))
    if not edges:
        # max() over an empty sequence raises ValueError; an edgeless pattern
        # has no body, so report it like "no body found".
        return 0

    max_label = max(int(data["label"]) for _, _, data in edges)
    body_edges = [(u, v) for u, v, data in edges if int(data["label"]) < max_label]
    body = pattern.edge_subgraph(body_edges)

    def _same_edge(e1, e2):
        return e1["label"] == e2["label"] and e1["value_label"] == e2["value_label"]

    def _same_node(n1, n2):
        return n1["label"] == n2["label"]

    for candidate in graphs:
        if nx.is_isomorphic(
            candidate["graph"], body, edge_match=_same_edge, node_match=_same_node
        ):
            return graph["support"] / candidate["support"]
    return 0

def load_projections(path, max_num_projections=sys.maxsize):
    """Read a projections file into ``{graph_number: [nx.DiGraph, ...]}``.

    Line types handled (dispatch is on the first character):
    - "t ..." starts a new graph section; the third space-separated field is
      the graph number.
    - "p ..." starts a new projection inside the current section.
    - "e from to label value_label ..." adds an edge to the current projection;
      ``from``, ``to`` and ``label`` are stored as ints, ``value_label`` as str.

    Parameters
    ----------
    path : str
        File to read.
    max_num_projections : int, optional
        Upper bound on projections kept per graph; the rest of a section is
        skipped once the bound is reached.

    Returns
    -------
    collections.defaultdict
        Graph number -> list of projections. Empty when the file has no "p" lines.
    """
    projections = defaultdict(list)
    current_projection = None
    graph_number = None
    skip = False

    with open(path) as projections_file:
        for line in projections_file:
            if line.startswith("t"):
                # Flush the pending projection under the graph it belongs to
                # *before* switching sections; otherwise it would be filed
                # under the next graph number.
                if current_projection is not None:
                    projections[graph_number].append(current_projection)
                    current_projection = None
                skip = False
                graph_number = int(line.split(" ")[2])
            elif skip:
                continue
            elif line.startswith("p"):
                if current_projection is not None:
                    projections[graph_number].append(current_projection)
                    current_projection = None
                if len(projections[graph_number]) >= max_num_projections:
                    skip = True
                    continue
                current_projection = nx.DiGraph()
            elif line.startswith("e") and current_projection is not None:
                # "e" lines seen before the first "p" of a section are ignored.
                # split() without an argument also strips the trailing newline
                # from the last field.
                from_, to, label, value_label = line.split()[1:5]
                current_projection.add_edge(
                    int(from_), int(to), label=int(label), value_label=value_label
                )

    # The last projection in the file has no following "p"/"t" line to flush it.
    if current_projection is not None:
        projections[graph_number].append(current_projection)
    return projections

def _get_diff_months(end_date, min_date):
    delta = relativedelta(end_date, min_date)
    return delta.years * 12 + delta.months

In [None]:
# node_label_2_name = dict(
#     pd.read_csv(node_label_mapping_path)[["label", "idx"]].values
# )
# print(node_label_2_name)

In [None]:
# Load the node and edge tables and parse their date columns as datetimes.
nodes_df = pd.read_csv(nodes_path).assign(
    proposal_timestamp=lambda frame: pd.to_datetime(frame["proposal_timestamp"])
)
edges_df = pd.read_csv(edges_path).assign(
    due_date=lambda frame: pd.to_datetime(frame["due_date"])
)

In [None]:
# Parse the debtors file into {graph_number: [debtor idx, ...]}.
# A line starting with "t" opens a new section (its last space-separated field
# is the graph number); every other non-blank line is one integer.
debtors_set = {}
debtors_num_agg = []
graph_number = None
with open(debtors_path) as debtors_file:
    for line in debtors_file:
        if line.startswith("t"):
            # Store the section that just ended; empty sections are not stored.
            if debtors_num_agg:
                debtors_set[graph_number] = debtors_num_agg
            debtors_num_agg = []
            graph_number = int(line.split(" ")[-1])
        elif line.strip():
            debtors_num_agg.append(int(line.strip()))

# The last section has no following "t" line, so it must be stored here;
# without this the final pattern is missing from `debtors_set`.
if debtors_num_agg:
    debtors_set[graph_number] = debtors_num_agg

def plot_time_to_bankruptcy_hist(debtors, plot=True):
    """Median month difference between proposal timestamp and earliest due date.

    ``debtors`` is merged with the module-level ``nodes_df`` on "idx"; the
    selected nodes are then joined to ``edges_df`` (edge "src_id" == node "id").
    For every "src_id" the row with the smallest "due_date" is kept and
    ``_get_diff_months(proposal_timestamp, due_date)`` is computed for it.

    Parameters
    ----------
    debtors : pandas.DataFrame
        Frame with an "idx" column.
    plot : bool, optional
        When true, also show a 20-bin histogram of the differences over 0-100.

    Returns
    -------
    The median of the per-source differences.
    """
    debtor_nodes = nodes_df.merge(debtors, on="idx")
    debtor_edges = edges_df.merge(
        debtor_nodes[["idx", "id", "proposal_timestamp"]],
        left_on="src_id",
        right_on="id",
    ).sort_values(by="due_date")

    earliest_rows = debtor_edges.groupby('src_id').due_date.idxmin()
    first_default_df = debtor_edges.loc[earliest_rows]
    diffs = first_default_df.apply(
        lambda row: _get_diff_months(row["proposal_timestamp"], row["due_date"]),
        axis=1,
    )

    if plot:
        ax = diffs.hist(bins=20, figsize=(8,3), range=(0,100))
        ax.set_title(f"Time to bankruptcy (median={round(diffs.median(),2)}) months")
        plt.show()
    return diffs.median()
    
def plot_cumulative_hist(debtors, plot=True):
    """Average cumulative "value_percentage" curve over the selected debtors.

    ``debtors`` is merged with the module-level ``nodes_df`` on "idx"; the
    selected nodes are joined to ``edges_df`` (edge "src_id" == node "id").
    For every "src_id" the edges are bucketed by ``label_custom`` offset from
    that source's smallest ``label_custom`` (named "quarter" here -- presumably
    a quarter index, confirm against the data), "value_percentage" is summed
    per bucket and accumulated. The per-source curves are aligned on the
    offset, gaps are forward-filled, and the curves are averaged.

    Parameters
    ----------
    debtors : pandas.DataFrame
        Frame with an "idx" column.
    plot : bool, optional
        When true, also plot the averaged curve with the y axis fixed to 0-100.

    Returns
    -------
    pandas.Series
        Averaged cumulative curve indexed by offset.
    """
    selected_nodes_df = nodes_df.merge(debtors, on="idx")
    selected_edges_df = edges_df.merge(
        selected_nodes_df[["idx", "id", "proposal_timestamp"]],
        left_on="src_id",
        right_on="id",
    )
    selected_edges_df = selected_edges_df.sort_values(by="due_date")
    accum = []
    for src_id in selected_edges_df.src_id.drop_duplicates():
        sub_df = selected_edges_df[selected_edges_df.src_id == src_id].copy()
        sub_df["quarter"] = sub_df["label_custom"] - sub_df["label_custom"].min()
        report_df = sub_df.groupby("quarter")[["value_percentage"]].sum()
        cumulative_series = report_df.value_percentage.cumsum()
        accum.append(cumulative_series)
    # DataFrame.ffill() replaces fillna(method="ffill"), whose `method`
    # argument is deprecated in recent pandas releases; the result is the same.
    stats_df = pd.concat(accum, axis=1).sort_index().ffill().mean(axis=1)
    if plot:
        ax = stats_df.plot(figsize=(8,3))
        ax.set_title("Debt accumulation")
        ax.set_ylim(0,100)
        plt.show()
    return stats_df

In [None]:
# Parse the pattern file into `graphs`: a list of
# {"graph": nx.DiGraph, "support": int, "number": int} entries.
# "t" lines open a pattern (last field = support, second-to-last = number),
# "v" lines add a labelled node, "e" lines add an edge with "label" and
# "value_label" attributes (all kept as strings).
tmp_g = None
support = None
graph_number = None
graphs = []
with open(germ_output_path) as germ_file:
    for line in germ_file:
        parsed = line.strip().split(" ")
        if line.startswith("t"):
            # A DiGraph is falsy while it has no nodes, so empty patterns are skipped.
            if tmp_g:
                graphs.append({"graph": tmp_g, "support": support, "number": graph_number})
            tmp_g = nx.DiGraph()
            support = int(parsed[-1])
            graph_number = int(parsed[-2])
        elif line.startswith("v"):
            tmp_g.add_node(parsed[1], label=parsed[2])
        elif line.startswith("e"):
            tmp_g.add_edge(parsed[1], parsed[2], label=parsed[3], value_label=parsed[4])

# The last pattern has no following "t" line to trigger the append above;
# without this flush it would be silently dropped.
if tmp_g:
    graphs.append({"graph": tmp_g, "support": support, "number": graph_number})

for g in graphs:
    g["confidence"] = calculate_confidence(g, graphs)

In [None]:
# Report every pattern, most confident first: print its statistics, draw it,
# and collect the median time difference and the area under its averaged
# cumulative curve.
medians, aucs = [], []
ranked_graphs = sorted(graphs, key=lambda d: d["confidence"], reverse=True)
for row in ranked_graphs:
    # if not any(map(lambda v: v["label"] == '7', dict(row["graph"].nodes(data=True)).values())):
    #     continue
    # if not any(map(lambda e: e[2]["value_label"] == '3', list(row["graph"].edges(data=True)))):
    #     continue
    print("******************************************************")
    print(f"Number: {row['number']}")
    print(f"Support: {row['support']}")
    print(f"Confidence: {round(row['confidence']*100, 2)} %")
    render_graph(row["graph"])

    debtor_ids = debtors_set[row["number"]]
    median_ = plot_time_to_bankruptcy_hist(pd.DataFrame(debtor_ids, columns=["idx"]))
    medians.append(median_)
    hist_df = plot_cumulative_hist(pd.DataFrame(debtor_ids, columns=["idx"]))
    aucs.append(trapz(hist_df))

## Correlation analysis

In [None]:
# Same statistics as the report loop, without plotting, plus for each pattern
# the list of value-label codes ("0" .. "4", in that order) that occur on at
# least one of its edges.
medians, aucs, value_labels = [], [], []
for row in sorted(graphs, key=lambda d: d["confidence"], reverse=True):
    present_codes = {data["value_label"] for _, _, data in row["graph"].edges(data=True)}
    value_labels.append([code for code in ("0", "1", "2", "3", "4") if code in present_codes])

    debtor_ids = debtors_set[row["number"]]
    median_ = plot_time_to_bankruptcy_hist(pd.DataFrame(debtor_ids, columns=["idx"]), False)
    medians.append(median_)
    hist_df = plot_cumulative_hist(pd.DataFrame(debtor_ids, columns=["idx"]), False)
    aucs.append(trapz(hist_df))

In [None]:
# One row per pattern: median, AUC, the value-label codes it contains, and one
# boolean column per code telling whether the pattern contains that code.
stats_df = pd.DataFrame(
    zip(medians, aucs, value_labels),
    columns=["median", "auc", "value_labels"],
)
for flag_column, code in (
    ("very_small", "0"),
    ("small", "1"),
    ("medium", "2"),
    ("large", "3"),
    ("very_large", "4"),
):
    # Bind `code` as a default argument so each lambda tests its own code.
    stats_df[flag_column] = stats_df.value_labels.apply(lambda vl, code=code: code in vl)

In [None]:
# Global: correlation between "median" and "auc" over all patterns
# (Series.corr with its default method).
stats_df["median"].corr(stats_df["auc"])

In [None]:
# Very large: patterns with at least one edge whose value label is "4".
select_df = stats_df.loc[stats_df["very_large"].eq(True)]
print(select_df.shape[0])
select_df["median"].corr(select_df["auc"])

In [None]:
# Large: patterns with at least one edge whose value label is "3".
select_df = stats_df.loc[stats_df["large"].eq(True)]
print(select_df.shape[0])
select_df["median"].corr(select_df["auc"])

In [None]:
# Medium: patterns with at least one edge whose value label is "2".
select_df = stats_df.loc[stats_df["medium"].eq(True)]
print(select_df.shape[0])
select_df["median"].corr(select_df["auc"])

In [None]:
# Small: patterns with at least one edge whose value label is "1".
select_df = stats_df.loc[stats_df["small"].eq(True)]
print(select_df.shape[0])
select_df["median"].corr(select_df["auc"])

In [None]:
# Very small: patterns with at least one edge whose value label is "0".
select_df = stats_df.loc[stats_df["very_small"].eq(True)]
print(select_df.shape[0])
select_df["median"].corr(select_df["auc"])

# Projections analysis

In [None]:
# nodes_df = pd.read_csv(nodes_path)
# nodes_df.id = nodes_df.id.astype(str)
# 
# edges_df = pd.read_csv(edges_path)
# edges_df.src_id = edges_df.src_id.astype(str)
# edges_df.dst_id = edges_df.dst_id.astype(str)
# 
# node_data_dict = dict([(row["idx"], row) for _, row in nodes_df.iterrows()])
# node_idx_to_id = dict([(row["idx"], row["id"]) for _, row in nodes_df.iterrows()])
# edge_data_dict = dict([((str(row["src_id"]), row["dst_id"]), row) for _, row in edges_df.iterrows()])

In [None]:
# projections = load_projections(f"{germ_output_path}.projections")

In [None]:
# def render_projection(g):
#     node_labels = dict([(node_idx, "\n".join(textwrap.wrap(node_data_dict[node_idx]["name"], 10))) for node_idx in g.nodes])
#     edge_labels = dict([
#         (
#             (from_, to), 
#             "\n".join([
#                 str(edge_data_dict[(node_idx_to_id[from_], node_idx_to_id[to])]["label_monthly"]), 
#                 str(edge_data_dict[(node_idx_to_id[from_], node_idx_to_id[to])]["due_date"])
#             ])
#         ) for from_, to in g.edges
#     ])
#     render_graph(g, node_labels=node_labels, edge_labels=edge_labels)

In [None]:
# render_projection(projections[...][...])