In [None]:
import pandas as pd
import networkx as nx
import seaborn as sns
sns.set()
%pylab inline

In [None]:
# Paths of the tab-separated node and edge tables that the next cell loads.
# NOTE(review): both are empty placeholders here — presumably they are filled in
# (by hand or by a parameterizing tool) before the notebook is run; confirm.
nodes_tsv = ""
edges_tsv = ""

In [None]:
# Load the node and edge tables; both files are tab-separated UTF-8 text.
nodes_df = pd.read_csv(
    nodes_tsv,
    sep="\t",
    encoding="utf-8",
)
edges_df = pd.read_csv(
    edges_tsv,
    sep="\t",
    encoding="utf-8",
)

# Basic Stats

In [None]:
# Number of non-null ids for each node type, shown as a one-column table.
nodes_df.groupby("node_type")[["id"]].count()

In [None]:
# Bar chart of the number of non-null ids per region.
(
    nodes_df
    .groupby("region_id")[["id"]]
    .count()
    .plot(kind="bar", title="Number of insolvencies per region", figsize=(14, 4))
);

In [None]:
# Parse the start dates, then snap each one to the first day of its month so
# that all edges starting in the same month share one bucket key.
edges_df["start_date"] = pd.to_datetime(edges_df["start_date"])
edges_df["day_month"] = edges_df["start_date"].apply(
    lambda stamp: stamp.replace(day=1)
)

# Monthly series: count of non-null source ids per month bucket, in date order.
ins_per_month = (
    edges_df
    .groupby("day_month")["source_id"]
    .count()
    .sort_index()
)
ins_per_month.plot(title="Number of insolvencies per month", figsize=(12, 4));

In [None]:
# Zoom in on the recent part of the monthly series.
# `pd.datetime` was deprecated in pandas 1.0 and removed in 2.0, so the cutoff
# is built with `pd.Timestamp` instead.
# NOTE(review): the comparison is strict, so a bucket stamped exactly
# 2016-01-01 (January 2016) is left out, and there is no upper bound even
# though the title mentions 2018 — confirm both are intended.
ins_per_month[ins_per_month.index > pd.Timestamp("2016-01-01")]\
    .plot(figsize=(12,4), title="Number of insolvencies per month (2016-2018)");

# Graph Stats

In [None]:
# Build a directed graph: one vertex per row of nodes_df, one arc per row of
# edges_df (from source_id to target_id).
g = nx.DiGraph()

for _, node in nodes_df.iterrows():
    # Use item access for the columns. On a row produced by iterrows(), the
    # attribute `node.name` is the Series' own name (the row's index label),
    # not the value of the "name" column, so attribute access would store the
    # wrong value as the vertex name.
    g.add_node(
        node["id"],
        name=node["name"],
        node_type=node["node_type"],
        person_type=node["person_type"],
    )

for _, edge in edges_df.iterrows():
    g.add_edge(
        edge["source_id"],
        edge["target_id"],
        edge_type=edge["edge_type"],
    )

# Link-analysis scores used by the cells below: PageRank per vertex, and the
# HITS hub / authority scores per vertex.
pr = nx.pagerank(g)
hits_h, hits_a = nx.hits(g)

In [None]:
# Turn the PageRank mapping into an (id, pr) table and attach the node metadata.
pr_df = pd.DataFrame(list(pr.items()), columns=["id", "pr"]).merge(nodes_df, on="id")

# Histogram of PageRank over debtor nodes, with a logarithmic count axis.
ax = pr_df.loc[pr_df["node_type"] == "debtor", "pr"].hist(bins=100, xrot=45)
ax.set_yscale("log")
ax.set_title("Debtor PageRank Hist (LOG SCALE!)");

In [None]:
# Histogram of PageRank over creditor nodes, linear count axis.
ax = pr_df.loc[pr_df["node_type"] == "creditor", "pr"].hist(bins=100, xrot=45)
ax.set_title("Creditor PageRank Hist (Normal SCALE!)");

In [None]:
# Histogram of PageRank over administrator nodes, linear count axis.
# Skipped entirely when there are no administrator rows.
if (pr_df["node_type"] == "administrator").any():
    ax = pr_df.loc[pr_df["node_type"] == "administrator", "pr"].hist(bins=100, xrot=45)
    ax.set_title("Administrator PageRank Hist (Normal SCALE!)");

In [None]:
# One (id, score) table per HITS score, joined on id and enriched with the
# node metadata.
hits_hubs_df = pd.DataFrame(list(hits_h.items()), columns=["id", "hub"])
hits_auth_df = pd.DataFrame(list(hits_a.items()), columns=["id", "auth"])
hits_df = (
    hits_hubs_df
    .merge(hits_auth_df, on="id")
    .merge(nodes_df, on="id")
)

# Histogram of hub scores over debtor nodes, linear count axis.
ax = hits_df.loc[hits_df["node_type"] == "debtor", "hub"].hist(bins=100, xrot=45)
ax.set_title("Debtor HubScore Hist (Normal SCALE!)");

In [None]:
# Histogram of authority scores over debtor nodes, with a logarithmic count axis.
ax = hits_df.loc[hits_df["node_type"] == "debtor", "auth"].hist(bins=100, xrot=45)
ax.set_yscale("log")
ax.set_title("Debtor AuthScore Hist (LOG SCALE!)");

In [None]:
# Histogram of authority scores over creditor nodes, linear count axis.
ax = hits_df.loc[hits_df["node_type"] == "creditor", "auth"].hist(bins=100, xrot=45)
ax.set_title("Creditor AuthScore Hist (Normal SCALE!)");

In [None]:
# Histogram of authority scores over administrator nodes, with a logarithmic
# count axis. Skipped entirely when there are no administrator rows.
if (hits_df["node_type"] == "administrator").any():
    ax = hits_df.loc[hits_df["node_type"] == "administrator", "auth"].hist(bins=100, xrot=45)
    ax.set_yscale("log")
    ax.set_title("Administrator AuthScore Hist (LOG SCALE!)");

In [None]:
# Histogram of authority scores over administrator nodes, linear count axis.
# Skipped entirely when there are no administrator rows.
if (hits_df["node_type"] == "administrator").any():
    ax = hits_df.loc[hits_df["node_type"] == "administrator", "auth"].hist(bins=100, xrot=45)
    ax.set_title("Administrator AuthScore Hist (Normal SCALE!)");