In [None]:
# Input locations used by the loading cell further down.
# NOTE(review): every path defaults to "" — presumably these are overridden by a
# notebook runner or edited by hand before execution; confirm.
graph_path = ""  # graph file read with nx.read_gpickle
nodes_path = ""  # CSV read into nodes_df
edges_path = ""  # CSV read into edges_df
insolvency_data_path = ""  # CSV read into insolvency_data_df

In [None]:
import networkx as nx
import pandas as pd
import numpy as np
import os

# Switch the working directory to the location given by the EXPERIMENTS_HOMEDIR
# environment variable, so relative paths resolve against it.
# Raises KeyError when the variable is not set.
os.chdir(os.environ["EXPERIMENTS_HOMEDIR"])

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Load the graph plus the node, edge and insolvency tables.
# NOTE(review): nx.read_gpickle unpickles the file, so only point it at trusted
# inputs; the function was removed in networkx 3.0 — confirm the pinned version.
g = nx.read_gpickle(graph_path)

# Identifier columns are cast to str right after reading.
nodes_df = pd.read_csv(nodes_path).assign(id=lambda frame: frame["id"].astype(str))
edges_df = pd.read_csv(edges_path).astype({"src_id": str, "dst_id": str})

insolvency_data_df = pd.read_csv(insolvency_data_path)

# Quick size check of the loaded graph.
print(f"Num nodes: {g.number_of_nodes()}")
print(f"Num edges: {g.number_of_edges()}")

In [None]:
# One row per graph node with its in-degree / out-degree.
in_degrees_df = pd.DataFrame(g.in_degree(), columns=["id", "in_degree"])
out_degrees_df = pd.DataFrame(g.out_degree(), columns=["id", "out_degree"])

# Combine both degree tables and attach the node attributes from nodes_df.
# Both joins are inner joins on "id", so nodes missing from nodes_df drop out.
degrees_df = pd.merge(in_degrees_df, out_degrees_df, on="id").merge(nodes_df, on="id")

## Region stats

In [None]:
# Per-region row counts for edges whose source node has out-degree > 0,
# plus each region's share of the total.
# NOTE: this relies on the CSV-backed `edges_df` (src_id / insolvency_id columns).
# A later cell rebinds `edges_df` to a graph-derived frame without those columns,
# so re-running this cell after that one fails.
region_stats_df = (
    out_degrees_df[out_degrees_df["out_degree"] > 0]
    .merge(edges_df, left_on="id", right_on="src_id")
    .merge(insolvency_data_df, on="insolvency_id")
    .groupby("region")
    .agg(count=("id", "count"))
)
# Column name fixed: it was previously misspelled "percentate".
region_stats_df["percentage"] = region_stats_df["count"] / region_stats_df["count"].sum() * 100
region_stats_df.round(1)

## Degree stats

In [None]:
# In-degree overview: the ten rows with the highest in-degree
degrees_df.sort_values("in_degree", ascending=False).head(10)

In [None]:
# Category overview: total in-degree per category and its share of the grand total
overview_df = (
    degrees_df.groupby("category")[["in_degree"]]
    .sum()
    .assign(in_degree_perc=lambda totals: totals["in_degree"] / totals["in_degree"].sum() * 100)
)
overview_df.round(2)

In [None]:
# Out-degree overview: the ten rows with the highest out-degree
degrees_df.sort_values("out_degree", ascending=False).head(10)

## Other stats

In [None]:
# Flatten the graph's edge attributes into a frame, one row per edge.
# NOTE: this rebinds `edges_df`, replacing the CSV-backed frame loaded earlier;
# cells that need its src_id / dst_id columns must run before this one.
# Raises KeyError if any edge lacks one of the attributes read below.
edges_df = pd.DataFrame(
    [
        (
            src,
            dst,
            attrs["due_date"],
            attrs["label"],
            attrs["value"],
            attrs["value_percentage"],
            attrs["publish_date"],
        )
        for src, dst, attrs in g.edges(data=True)
    ],
    columns=["from", "to", "due_date", "label", "value", "value_percentage", "publish_date"],
)
edges_df["due_date"] = pd.to_datetime(edges_df["due_date"])
# Vectorised datetime accessor instead of a per-row lambda.
edges_df["due_date_year"] = edges_df["due_date"].dt.year

# Keep only the nodes that are an endpoint of at least one edge.
node_ids_set = set(edges_df["from"]) | set(edges_df["to"])
business_nodes_df = nodes_df[nodes_df["id"].isin(node_ids_set)]

In [None]:
# Number of edges per due-date year; count() tallies the non-null due_date
# values within each year group.
hist_df = edges_df[["due_date_year", "due_date"]].groupby("due_date_year").count()
ax = hist_df.plot(kind="bar", legend=False, title="Histogram of due date (yearly)", figsize=(12, 3))
# The bars are laid out by year along x and their height is the count, so the
# labels must read Year (x) / Frequency (y) — they were swapped before.
ax.set_xlabel("Year")
ax.set_ylabel("Frequency");

In [None]:
# Distribution of edge due dates over 100 bins
edges_df["due_date"].hist(bins=100, figsize=(12, 4))

In [None]:
# Histogram of proposal timestamps for the nodes that take part in at least one edge
pd.to_datetime(business_nodes_df["proposal_timestamp"]).hist(bins=100, figsize=(12, 4))

In [None]:
# Histogram of edge publish dates (parsed to datetimes on the fly)
pd.to_datetime(edges_df["publish_date"]).hist(bins=100, figsize=(12, 4))

In [2]:
# Distribution of the value_percentage edge attribute over 100 bins.
ax = edges_df["value_percentage"].hist(bins=100, figsize=(12, 3))
ax.set_title("Histogram of receivable value ratios")
# A histogram puts the measured quantity on x and the bin counts on y, so the
# labels must read Ratio (x) / Frequency (y) — they were swapped before.
ax.set_xlabel("Ratio")
# The trailing semicolon suppresses the cell's text output (replaces the bare `pass`).
ax.set_ylabel("Frequency");

In [None]:
# Absolute second-order difference of the 100-bin value_percentage histogram
# counts, drawn as bars on a logarithmic y axis.
hist, bins = np.histogram(edges_df["value_percentage"], bins=100)
diff = np.abs(np.diff(hist, n=2))
ax = pd.Series(diff).plot.bar(logy=True, figsize=(12, 3))
# Hide every second x tick label so the remaining ones stay legible.
for label in ax.get_xticklabels()[1::2]:
    label.set_visible(False)