In [1]:
import argparse
import codecs
import datetime as dt
import os
import re
from sets import Set

import psycopg2
import psycopg2.extras

In [2]:
def get_senate(reference_number):
    """Return the senate identifier encoded in an insolvency reference number.

    A reference number such as ``'KSPL 27 INS 2047 / 2014'`` yields ``'KSPL-27'``:
    the upper-case letter group and the number that precede ``INS``, joined by a hyphen.

    Args:
        reference_number: the reference number string; may be ``None`` or empty
            (e.g. a NULL database column).

    Returns:
        ``'<letters>-<number>'``, or ``None`` when the input is missing or does not
        contain the ``<letters> <number> INS `` pattern.
    """
    # Guard first: re.search raises TypeError when handed None.
    if not reference_number:
        return None
    matcher = re.search(r'([A-Z]+) ([0-9]+) INS .*', reference_number)
    if matcher is None:
        return None
    return matcher.group(1) + "-" + matcher.group(2)

In [3]:
# Connection settings come from the environment so that no password is stored in the notebook.
# ISIR_DB_PASSWORD is mandatory (a KeyError is raised when it is not set); the other settings
# fall back to the values this notebook has always used.
conn = psycopg2.connect(database=os.environ.get("ISIR_DB_NAME", "isir_prod_db"),
                        user=os.environ.get("ISIR_DB_USER", "developer123"),
                        password=os.environ["ISIR_DB_PASSWORD"],
                        port=int(os.environ.get("ISIR_DB_PORT", "6667")),
                        host=os.environ.get("ISIR_DB_HOST", "isir.datlab.cz"))
# DictCursor rows can be indexed by column name, which the export code relies on.
cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)

In [18]:
# Trailing legal-form suffixes stripped from a creditor name once punctuation and whitespace
# have been removed (so "s.r.o." has already become "sro" by the time this is applied).
legal_form_removal_regexp='(sro|as|ks|spol|spolsro|družstvo|sp|splikv|vos|ops|soukromáspolečnostsručenímomezeným)$'
# Punctuation and whitespace removed from creditor names before comparison. The hyphen is kept
# last inside the brackets so it is a literal character: written between '"' and '&' it forms
# the range '"'..'&', which strips '#', '$' and '%' and leaves hyphens in place.
# The same pattern is used by Python's re module and by PostgreSQL's regexp_replace.
char_removal_regexp=r'[,."&+\s-]*'
# Directory that receives the exported nodes_*/edges_* TSV files.
output_folder = '/home/ipython/data/isir/networks_2.0/'

In [19]:
# Creditors tracked by the export: display name -> identifier string.
# NOTE(review): the identifiers look like company registration numbers - confirm with the data owner.
creditors = {
    u"GE Money Bank a.s.": "25672720",
    u"Česká spořitelna, a.s": "45244782",
    u"CETELEM ČR, a.s.": "25085689",
    u"Provident Financial s.r.o.": "25621351",
    u"PROFI CREDIT Czech, a.s.": "61860069",
    u"SMART Capital, a.s.": "26865297",
    u"Home Credit, a.s.": "26978636",
    u"Essox s.r.o.": "26764652",
    u"Telefónica Czech Republic,a.s.": "60193336",
    u"T-Mobile Czech Republic, a.s.": "64949681",
    u"Všeobecná zdravotní pojišťovna České Republiky": "41197518",
    u"Komerční banka, a.s.": "45317054",
    u"ČEZ Prodej, s.r.o": "27232433",
    u"COFIDIS s.r.o.": "27179907",
    u"Československá obchodní banka,a.s.": "1350",
    u"Profidebt, s.r.o.": "27221971",
    u"Citibank Europe, plc": "28198131",
    u"Raiffeisenbank, a.s.": "49240901",
    u"Česká podnikatelská pojišťovna, a.s.": "63998530",
    u"Kooperativa pojišťovna, a.s.": "47116617",
    u"Bohemia Faktoring, s.r.o.": "27242617",
}
# Reverse lookup: identifier string -> display name.
creditors_inverted = dict(zip(creditors.values(), creditors.keys()))

In [20]:
# Normalised keys of the tracked creditors: lower-cased name with the characters matched by
# char_removal_regexp removed and then the trailing legal form stripped. The database side
# applies the same two patterns, so these keys can be compared with the values it returns.
creditor_ids = []
for cname in creditors.keys():
    without_punctuation = re.sub(char_removal_regexp, '', cname.lower())
    creditor_ids.append(re.sub(legal_form_removal_regexp, '', without_punctuation))

In [21]:
def get_senate_num(reference_number):
    """Return only the senate number from an insolvency reference number.

    For ``'KSPL 27 INS 2047 / 2014'`` this is the string ``'27'``: the digits between
    the leading upper-case letter group and ``INS``.

    Args:
        reference_number: the reference number string; may be ``None`` or empty.

    Returns:
        The number as a string, or ``None`` when the input is missing or does not
        contain the ``<letters> <number> INS `` pattern.
    """
    # Guard first: re.search raises TypeError when handed None.
    if not reference_number:
        return None
    matcher = re.search(r'([A-Z]+) ([0-9]+) INS .*', reference_number)
    return matcher.group(2) if matcher else None

In [22]:
# Manual check of get_senate on a sample reference number; the recorded cell output is 'KSPL-27'.
get_senate('KSPL 27 INS 2047 / 2014')

'KSPL-27'

In [25]:
def _normalized_creditor_sql(column):
    """Return the SQL expression that normalises the creditor name held in ``column``.

    The expression carries two ``%s`` placeholders, to be bound in this order to
    ``char_removal_regexp`` and ``legal_form_removal_regexp``.
    """
    return "regexp_replace(regexp_replace(lower(" + column + "), %s, '', 'g'), %s, '', 'g')"


def _tracked_creditors_sql(extra_columns=""):
    """Return the query selecting the normalised creditor ids listed in ``creditor_ids`` with row counts.

    ``extra_columns`` is spliced into the select list (it must end with ``", "``).
    Binds five parameters: the two normalisation regexps, the same two again, then ``creditor_ids``.
    """
    normalized = _normalized_creditor_sql("ft.creditor")
    return " ".join([
        "SELECT " + normalized + " as creditor_id, " + extra_columns + "count(*) as c",
        "FROM v_creditors_receivables ft",
        "WHERE " + normalized + " = ANY(%s)",
        "GROUP BY creditor_id",
        "ORDER BY c DESC",
        "LIMIT 1000",
    ])


def _selected_insolvencies_sql():
    """Return the query selecting ids of insolvencies of one year and region owing a tracked creditor.

    Binds nine parameters: the five of ``_tracked_creditors_sql``, the two normalisation
    regexps once more, then the year and the region id.
    """
    return " ".join([
        "SELECT it.id",
        "FROM insolvency_tab it",
        "JOIN v_creditors_receivables ft2 ON it.id=ft2.insolvency_id",
        "JOIN (" + _tracked_creditors_sql() + ") as ct",
        "ON " + _normalized_creditor_sql("ft2.creditor") + " = ct.creditor_id",
        "WHERE ft2.creditor is not null",
        "AND EXTRACT(YEAR FROM proposal_timestamp) = %s",
        "AND it.region_id IN (%s)",
        "GROUP BY it.id",
    ])


def _senate_of(row):
    """Return the senate id for ``row['reference_number']``, or None when it is missing or unparsable."""
    reference_number = row['reference_number']
    return get_senate(reference_number) if reference_number else None


def _write_row(tsv_file, row):
    """Decode the UTF-8 byte string ``row`` and write it, newline-terminated, to ``tsv_file``."""
    tsv_file.write(unicode(row, 'utf-8') + u"\n")


def export_graph(year, region, region_name):
    """Export the insolvency network of one region and year as two TSV files.

    Writes ``nodes_<region_name>_<year>.tsv`` and ``edges_<region_name>_<year>.tsv`` into
    ``output_folder``. Nodes are the tracked creditors, the insolvencies of the given year
    and region that have a receivable from a tracked creditor (type ``debtor``), the
    administrators of those insolvencies (ids prefixed ``adm_``) and the senates parsed
    from their reference numbers (ids prefixed ``jud_``). Edges are ``appoints``,
    ``administers``, ``owes``, ``share_address`` and ``judges``; an edge is written only
    when both of its endpoints were written as nodes.

    Args:
        year: calendar year matched against ``proposal_timestamp``; also used in the file names.
        region: region id matched against ``insolvency_tab.region_id``.
        region_name: label used in the output file names.

    Returns:
        None. Relies on the module-level ``cur``, ``creditor_ids``, ``output_folder`` and
        the two normalisation regexps.
    """
    normalization = (char_removal_regexp, legal_form_removal_regexp)
    tracked_params = normalization + normalization + (creditor_ids,)
    selected_params = tracked_params + normalization + (year, region)

    cur.execute(_tracked_creditors_sql("max(ft.creditor) as creditor_name, "), tracked_params)
    creditors_db = cur.fetchall()

    cur.execute(" ".join([
        "SELECT at.id, at.name",
        "FROM administrators_tab at",
        "JOIN insolvencies_administrators_tab iat ON iat.administrator_id=at.id",
        "JOIN (" + _selected_insolvencies_sql() + ") as its ON iat.insolvency_id=its.id",
        "GROUP BY at.id, at.name",
    ]), selected_params)
    administrators = cur.fetchall()

    cur.execute(" ".join([
        "SELECT it3.id as id, it3.debtor_name as debtor_name, it3.person_type as person_type,"
        " it3.reference_number as reference_number",
        "FROM insolvency_tab it3",
        "JOIN (" + _selected_insolvencies_sql() + ") as insolvencies ON it3.id=insolvencies.id",
    ]), selected_params)
    insolvencies = cur.fetchall()

    # One id set per node type: administrator and insolvency ids come from different tables,
    # so a shared set would let an id of one type vouch for an equal id of the other.
    creditor_node_ids = set()
    administrator_node_ids = set()
    insolvency_node_ids = set()
    senate_node_ids = set()

    nodes_path = output_folder + 'nodes_%s_%d.tsv' % (region_name, year)
    with codecs.open(nodes_path, "w", encoding="utf-8") as nodes_file:
        _write_row(nodes_file, "id\ttype\tname\tperson_type")
        for creditor in creditors_db:
            creditor_node_ids.add(creditor['creditor_id'])
            _write_row(nodes_file, "%s\t%s\t%s\t" % (creditor['creditor_id'], "creditor",
                                                      creditor['creditor_name']))
        for administrator in administrators:
            administrator_node_ids.add(administrator['id'])
            _write_row(nodes_file, "adm_%s\t%s\t%s\t" % (administrator['id'], "administrator",
                                                          administrator['name']))
        for insolvency in insolvencies:
            insolvency_node_ids.add(insolvency['id'])
            _write_row(nodes_file, "%s\t%s\t%s\t%s" % (insolvency['id'], "debtor",
                                                        insolvency['debtor_name'],
                                                        insolvency['person_type']))
        for insolvency in insolvencies:
            senate = _senate_of(insolvency)
            # Many insolvencies share a senate; emit each senate node only once.
            if not senate or senate in senate_node_ids:
                continue
            senate_node_ids.add(senate)
            _write_row(nodes_file, "jud_%s\t%s\tjud_%s\t" % (senate, "senate", senate))

    cur.execute("SELECT it.id AS insolvency_id, it.reference_number, iat.administrator_id \
                 FROM insolvencies_administrators_tab iat JOIN insolvency_tab it ON iat.insolvency_id=it.id")
    administrator_debtor_edges = cur.fetchall()

    cur.execute("SELECT insolvency_id, " + _normalized_creditor_sql("creditor") + " as creditor_id \
                 FROM v_creditors_receivables WHERE creditor is not null", normalization)
    creditor_debtor_edges = cur.fetchall()

    cur.execute("SELECT it1.id as insolvency_id1, it2.id as insolvency_id2 FROM insolvency_tab it1 JOIN insolvency_tab it2 ON it1.debtor_address = it2.debtor_address \
                 WHERE it1.debtor_address is not null  \
                       AND it2.debtor_address is not null \
                       AND it1.debtor_name <> it2.debtor_name \
                       AND it1.region_id IN (%s) \
                       AND it2.region_id IN (%s) \
                       AND it1.id <> it2.id", (region, region))
    addresses = cur.fetchall()

    edges_path = output_folder + 'edges_%s_%d.tsv' % (region_name, year)
    with codecs.open(edges_path, "w", encoding="utf-8") as edges_file:
        _write_row(edges_file, "Source\tTarget\trelation_type")
        for administrator_debtor in administrator_debtor_edges:
            administrator_id = administrator_debtor['administrator_id']
            insolvency_id = administrator_debtor['insolvency_id']
            if administrator_id not in administrator_node_ids or insolvency_id not in insolvency_node_ids:
                continue
            senate = _senate_of(administrator_debtor)
            # Without a parsable senate there is no jud_ node to attach the edge to.
            if senate:
                _write_row(edges_file, "jud_%s\tadm_%s\tappoints" % (senate, administrator_id))
            _write_row(edges_file, "adm_%s\t%s\tadministers" % (administrator_id, insolvency_id))
        for creditor_debtor in creditor_debtor_edges:
            if (creditor_debtor['creditor_id'] in creditor_node_ids
                    and creditor_debtor['insolvency_id'] in insolvency_node_ids):
                _write_row(edges_file, "%s\t%s\towes" % (creditor_debtor['insolvency_id'],
                                                          creditor_debtor['creditor_id']))
        for address in addresses:
            if (address['insolvency_id1'] in insolvency_node_ids
                    and address['insolvency_id2'] in insolvency_node_ids):
                _write_row(edges_file, "%s\t%s\tshare_address" % (address['insolvency_id1'],
                                                                   address['insolvency_id2']))
        for insolvency in insolvencies:
            senate = _senate_of(insolvency)
            if not senate:
                continue
            _write_row(edges_file, "jud_%s\t%s\tjudges" % (senate, insolvency['id']))

In [None]:
# Export one nodes/edges file pair per region for every year in the range below.
cur.execute('SELECT id, name FROM regions_tab')
regions = cur.fetchall()  # fetched up front: export_graph reuses the same cursor
for region_row in regions:
    region_id, region_name = region_row
    print("Processing region: %s" % region_name)
    # File-name label: lower-cased region name without spaces.
    region_label = region_name.lower().replace(' ', '')
    for year in range(2015, 2016):
        print("\tYear: %d" % year)
        export_graph(year, region_id, region_label)

In [None]:

# NOTE(review): `graph` is not defined in any cell visible here and this call passes no
# arguments; it looks like a leftover scratch cell - confirm and remove before sharing.
graph.add_edge()