In [2]:
import pandas as pd
import calmap
import numpy as np
import os

In [3]:
from pathlib import Path
from os import environ, write

import re
from datetime import datetime, time
import datetime as dt
from collections import defaultdict

import requests

In [4]:
def load_data():
    if environ.get("USE_LOCAL_FILES"):
        paths = Path("extracted").rglob("*.csv")
        df_parts = []

        for path in sorted(paths):

            print("Reading for analysis:", path)

            with open(path, "r", encoding="utf-8") as fp:
                df_part = pd.read_csv(fp, delimiter=",", dtype=str)
                df_parts.append(df_part)

            # if df is None:
            #     df = df_part
            # else:
            #     df = df.append(df_part, ignore_index=True)

        df = pd.concat(df_parts, ignore_index=True)

    else:
        raise NotImplementedError("Remote data not yet implemented")
    
    df['date_of_birth'] = pd.to_datetime(df['date_of_birth'], errors = 'coerce')
    df['date_of_proceeding'] = pd.to_datetime(df['date_of_proceeding'], errors = 'coerce')
    df['date_of_publication'] = pd.to_datetime(df['date_of_publication'], errors = 'coerce')

    return df

In [5]:
df = load_data()

Reading for analysis: extracted/buckets/insolvenzenard/insolvenzbekanntmachungen-scraper/2021-01-17T08-05-41.jsonl.csv
Reading for analysis: extracted/buckets/insolvenzenard/insolvenzbekanntmachungen-scraper/2021-01-24T08-05-43.jsonl.csv
Reading for analysis: extracted/buckets/insolvenzenard/insolvenzbekanntmachungen-scraper/2021-01-31T03-50-40.jsonl.csv
Reading for analysis: extracted/buckets/insolvenzenard/insolvenzbekanntmachungen-scraper/2021-02-07T03-47-56.jsonl.csv
Reading for analysis: extracted/buckets/insolvenzenard/insolvenzbekanntmachungen-scraper/2021-02-14T03-54-29.jsonl.csv
Reading for analysis: extracted/buckets/insolvenzenard/insolvenzbekanntmachungen-scraper/2021-02-21T04-08-08.jsonl.csv
Reading for analysis: extracted/buckets/insolvenzenard/insolvenzbekanntmachungen-scraper/2021-02-28T04-20-16.jsonl.csv
Reading for analysis: extracted/buckets/insolvenzenard/insolvenzbekanntmachungen-scraper/2021-03-07T07-20-48.jsonl.csv
Reading for analysis: extracted/buckets/insolven

Reading for analysis: extracted/buckets/insolvenzenard/insolvenzbekanntmachungen-scraper/2021-10-16T20-42-50.jsonl.csv
Reading for analysis: extracted/buckets/insolvenzenard/insolvenzbekanntmachungen-scraper/2021-10-17T05-51-55.jsonl.csv
Reading for analysis: extracted/buckets/insolvenzenard/insolvenzbekanntmachungen-scraper/2021-10-17T10-40-49.jsonl.csv
Reading for analysis: extracted/buckets/insolvenzenard/insolvenzbekanntmachungen-scraper/2021-10-20T00-26-46.jsonl.csv
Reading for analysis: extracted/buckets/insolvenzenard/insolvenzbekanntmachungen-scraper/2021-10-20T07-31-07.jsonl.csv
Reading for analysis: extracted/buckets/insolvenzenard/insolvenzbekanntmachungen-scraper/2021-10-20T13-43-00.jsonl.csv
Reading for analysis: extracted/buckets/insolvenzenard/insolvenzbekanntmachungen-scraper/2021-10-20T20-17-09.jsonl.csv
Reading for analysis: extracted/buckets/insolvenzenard/insolvenzbekanntmachungen-scraper/2021-10-21T06-38-42.jsonl.csv
Reading for analysis: extracted/buckets/insolven

In [6]:
df.drop_duplicates(subset ="description_hash", keep = "last", inplace = True)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 840283 entries, 159 to 1819831
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   _key                 34818 non-null   object        
 1   case_nr              840283 non-null  object        
 2   court                840283 non-null  object        
 3   date_of_birth        601075 non-null  datetime64[ns]
 4   date_of_proceeding   744511 non-null  datetime64[ns]
 5   date_of_publication  840283 non-null  datetime64[ns]
 6   description_hash     840283 non-null  object        
 7   federal_state        840283 non-null  object        
 8   file_name            430671 non-null  object        
 9   kind                 839738 non-null  object        
 10  title                430671 non-null  object        
 11  type_of_proceeding   840283 non-null  object        
 12  zipcode              836266 non-null  object        
 13  detail_form

In [20]:
for plz in df.zipcode.dropna().unique():
    if isinstance(plz, float): 
        print(plz, type(plz))

In [25]:
unique_plzs = df.zipcode.dropna().unique()
unique_plzs.sort()
unique_plzs

array(['00000', '00037', '00165', ..., '99996', '99998', '99999'],
      dtype=object)

# Bundesweit

### Filtern nach Insolvenzart

In [31]:
df1 = df.replace(to_replace = ["nw"],
                   value ="Nordrhein-Westfalen")

In [32]:
df2 = df1.replace(to_replace = ["bb"],
                   value ="Brandenburg")

In [33]:
df3 = df2.replace(to_replace = ["sh"],
                   value ="Schleswig-Holstein")

In [34]:
df4 = df3.replace(to_replace = ["ns"],
                   value ="Niedersachsen")

In [35]:
df5 = df4.replace(to_replace = ["by"],
                   value ="Bayern")

In [36]:
df6 = df5.replace(to_replace = ["rp"],
                   value ="Rheinland-Pfalz")

In [37]:
df7 = df6.replace(to_replace = ["sn"],
                   value ="Sachsen")

In [38]:
df8 = df7.replace(to_replace = ["bw"],
                   value ="Baden-Württemberg")

In [39]:
df9 = df8.replace(to_replace = ["be"],
                   value ="Berlin")

In [40]:
df10 = df9.replace(to_replace = ["st"],
                   value ="Brandenburg")

In [41]:
df11 = df10.replace(to_replace = ["he"],
                   value ="Hessen")

In [42]:
df12 = df11.replace(to_replace = ["hh"],
                   value ="Hamburg")

In [43]:
df13 = df12.replace(to_replace = ["sl"],
                   value ="Saarland")

In [44]:
df14 = df13.replace(to_replace = ["hb"],
                   value ="Bremen")

In [45]:
df15 = df14.replace(to_replace = ["th"],
                   value ="Thüringen")

In [46]:
df16 = df15.replace(to_replace = ["mv"],
                   value ="Mecklenburg-Vorpommern")

In [47]:
series = pd.Series(df16.zipcode.unique())

In [47]:
series.dtypes

dtype('O')

In [48]:
# IN = Regelinsolvenz

def in_kind(df16):
    in_unternehmen = (df16[df16['kind'] == 'in'])
    return in_unternehmen

# Funktion def wird ausgeführt
data_in = in_kind(df16)

In [49]:
def in_eröffnung(data_in):
    in_eröffnet = data_in[data_in['type_of_proceeding'].isin(['Eröffnungen', 'Eroeffnung', 'Eroeffnung_Insolvenzverfahren', 'Eroeffnungen'])]
    return in_eröffnet

# Funktion def wird ausgeführt
data_eröffnet_in = in_eröffnung(data_in)

In [50]:
data_nw_ik = data_eröffnet_in[data_eröffnet_in["federal_state"].isin(['Nordrhein-Westfalen'])]


In [51]:
date_range_nw_ik = data_nw_ik[data_nw_ik.date_of_publication.between('2021-10-07', '2021-11-07')]

In [52]:
date_range_nw_ik.head()

Unnamed: 0,_key,case_nr,court,date_of_birth,date_of_proceeding,date_of_publication,description_hash,federal_state,file_name,kind,title,type_of_proceeding,zipcode,detail_form_name,format,name,register,request_fingerprint
1607891,,145 IN 426/21,Wuppertal,1950-10-22,2021-10-05,2021-10-07,8de4d43a44ad47de7423662557dbcb14b41ac63e8db7f0...,Nordrhein-Westfalen,,in,,Eröffnungen,42281,tbl_ergebnis:12:frm_detail,neu,"Denz, Hans-Albert",,3c0654cd788642fa18457dc8258bb759f173cc88a081ea...
1608139,,80 IN 453/21,Bochum,NaT,2021-10-05,2021-10-07,a9cbece12d176566010efe9b428bdcf9a1521201915ede...,Nordrhein-Westfalen,,in,,Eröffnungen,45657,tbl_ergebnis:80:frm_detail,neu,YourPlane AG,"Recklinghausen, HRB 533",a5556a24b30ca4cfea91aed0ce6c56ffd58b04e9711e80...
1608142,,70k IN 233/20,Köln,NaT,2021-10-01,2021-10-07,296305dcfb8c83eddc49878f1ea489e7ac2a7bfe3d6e19...,Nordrhein-Westfalen,,in,,Eröffnungen,50676,tbl_ergebnis:77:frm_detail,neu,Xclusivhair UG (haftungsbeschränkt),"Köln, HRB 88008",bbcd5b7842a86046274123e9d19960f9b213f7f7663661...
1608144,,43 IN 450/21,Bielefeld,NaT,2021-10-05,2021-10-07,b614c8e60fab2603a6dbbf02fd0e967c847be79ac545a8...,Nordrhein-Westfalen,,in,,Eröffnungen,32584,tbl_ergebnis:76:frm_detail,neu,WTL Werkzeugtechnik GmbH,"Bad Oeynhausen, HRB 3049",378bab9b0cb207af34658c716d82b0a4b9501a0471fc74...
1608158,,503 IN 109/20,Düsseldorf,NaT,2021-10-05,2021-10-07,fc37b17084bcac6482875de5f5d43bcfbf25ea301d8cf5...,Nordrhein-Westfalen,,in,,Eröffnungen,90172,tbl_ergebnis:67:frm_detail,neu,Travel to Fairs GmbH,"Düsseldorf, HRB 90172",592d6a354d6cde2da30a4bc6a02bd1dc7728483355cc61...


In [53]:
eröffnung_court_nw = date_range_nw_ik.groupby(['zipcode'], as_index=False).count()
eröffnung_court_nw.sample(10)

Unnamed: 0,zipcode,_key,case_nr,court,date_of_birth,date_of_proceeding,date_of_publication,description_hash,federal_state,file_name,kind,title,type_of_proceeding,detail_form_name,format,name,register,request_fingerprint
261,59394,0,2,2,2,2,2,2,2,0,2,0,2,2,2,2,0,2
95,44139,0,2,2,2,2,2,2,2,0,2,0,2,2,2,2,0,2
162,47918,0,1,1,1,1,1,1,1,0,1,0,1,1,1,1,0,1
253,59065,0,1,1,1,1,1,1,1,0,1,0,1,1,1,1,0,1
119,45356,0,1,1,1,1,1,1,1,0,1,0,1,1,1,1,0,1
66,41065,0,1,1,1,1,1,1,1,0,1,0,1,1,1,1,0,1
273,83590,0,1,1,0,1,1,1,1,0,1,0,1,1,1,1,1,1
77,41469,0,1,1,1,1,1,1,1,0,1,0,1,1,1,1,0,1
6,13868,0,1,1,0,1,1,1,1,0,1,0,1,1,1,1,1,1
256,59227,0,1,1,1,1,1,1,1,0,1,0,1,1,1,1,0,1


In [74]:
for plz in eröffnung_court_nw.zipcode:
    if not isinstance(plz, str):
        print(plz, type(plz))

In [82]:
df_osm_plz

Unnamed: 0_level_0,ags,landkreis,bundesland
zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
01067,14612000,,Sachsen
01069,14612000,,Sachsen
01097,14612000,,Sachsen
01099,14612000,,Sachsen
01108,14612000,,Sachsen
...,...,...,...
99988,16064074,Unstrut-Hainich-Kreis,Thüringen
99991,16064076,Unstrut-Hainich-Kreis,Thüringen
99994,16064043,Unstrut-Hainich-Kreis,Thüringen
99996,16064072,Unstrut-Hainich-Kreis,Thüringen


In [90]:
df_osm_plz = pd.read_csv("plz_de.csv", dtype=str)
df_osm_plz = df_osm_plz[["plz", "ags", "landkreis", "bundesland"]]

df_osm_plz.ags = df_osm_plz.ags.str.slice(0, 5)

df_osm_plz.rename(columns={"plz": "zipcode"}, inplace=True)
df_osm_plz = df_osm_plz.groupby("zipcode").last()


joined = eröffnung_court_nw.join(df_osm_plz, on=["zipcode"], how="left")
joined.sample(50)

Unnamed: 0,zipcode,_key,case_nr,court,date_of_birth,date_of_proceeding,date_of_publication,description_hash,federal_state,file_name,...,title,type_of_proceeding,detail_form_name,format,name,register,request_fingerprint,ags,landkreis,bundesland
26,26206,0,1,1,0,1,1,1,1,0,...,0,1,1,1,1,1,1,,,
170,48369,0,1,1,1,1,1,1,1,0,...,0,1,1,1,1,0,1,5566.0,Kreis Steinfurt,Nordrhein-Westfalen
163,48151,0,1,1,1,1,1,1,1,0,...,0,1,1,1,1,0,1,5515.0,,Nordrhein-Westfalen
278,95743,0,1,1,0,1,1,1,1,0,...,0,1,1,1,1,1,1,,,
264,59759,0,1,1,0,1,1,1,1,0,...,0,1,1,1,1,1,1,5958.0,Hochsauerlandkreis,Nordrhein-Westfalen
140,46397,0,1,1,1,1,1,1,1,0,...,0,1,1,1,1,0,1,5554.0,Kreis Borken,Nordrhein-Westfalen
100,44339,0,1,1,1,1,1,1,1,0,...,0,1,1,1,1,0,1,5913.0,,Nordrhein-Westfalen
31,30429,0,1,1,0,1,1,1,1,0,...,0,1,1,1,1,1,1,,,
48,33106,0,1,1,1,1,1,1,1,0,...,0,1,1,1,1,0,1,5774.0,Kreis Paderborn,Nordrhein-Westfalen
35,32257,0,2,2,2,2,2,2,2,0,...,0,2,2,2,2,0,2,5758.0,Kreis Herford,Nordrhein-Westfalen


In [None]:
nw_eröffnet_in_week.to_csv('bb_eröffnung_in_month.csv')

In [30]:
nw_eröffnet_ik_week = (
    data_nw_ik[["date_of_publication", "description_hash"]]
    .groupby([pd.Grouper(key="date_of_publication", freq="W-MON", label="left")])
    .count()
)

In [118]:
eröffnung_court_nw.to_csv('eröffnung_zip_nw_30T_IN.csv')

In [44]:
df.federal_state.unique()

array(['bb', 'nw', 'sh', 'ns', 'by', 'sn', 'bw', 'be', 'st', 'he', 'rp',
       'hb', 'mv', 'hh', 'sl', 'th', 'Bayern', 'Rheinland-Pfalz',
       'Thüringen', 'Schleswig-Holstein', 'Sachsen-Anhalt', 'Sachsen',
       'Saarland', 'Nordrhein-Westfalen', 'Niedersachsen',
       'Mecklenburg-Vorpommern', 'Hessen', 'Hamburg', 'Bremen',
       'Brandenburg', 'Berlin', 'Baden-Württemberg'], dtype=object)

In [None]:
# IK = Verbraucherinsolvenz

def ik_kind(df):
    ik_verbraucher = (df[df['kind'] == 'ik'])
    return ik_verbraucher

# Funktion def wird ausgeführt
data_ik = ik_kind(df)

### Art des Verfahrens

In [18]:
df.type_of_proceeding.unique()

array(['Entscheidungen_im_Verfahren', 'Eroeffnung',
       'Entscheidungen_im_Restschuldbefreiungsverfahren', 'Termine',
       'Entscheidungen_nach_Aufhebung',
       'Verwalter_Treuhaender_Verteilungsverzeichnis',
       'Entscheidungen_im_Verfahren_Aufhebung_Einstellung',
       'Restschuldbefreiung', 'Entscheidungen_im_Verfahren_mit_Termine',
       'Sonstiges', 'Bestimmung_Termine', 'InsO_d_Verw_Treuh_',
       'Sonstiges_ausserhalb_des_Verfahresn', 'Eroeffnungen',
       'Eroeffnung_Insolvenzverfahren',
       'Restschuldbefreiung_Erteilung_Versagung',
       'Abweisungen_mangels_Masse', 'Sicherungsmassnahmen',
       'Anordnung_Sicherungsmassnahmen',
       'Verteilungsverzeichnis_Nachtragsvert_Loeschung',
       'Entscheidungen_nach_Aufhebung_des_Verfahrens',
       'Sicherungsmassnahme_Aufhebung', 'InsO_Plan_Ueberwachung',
       'InsO_Plan_Aufhebung_Ueberwachung', 'Ueberwachte_Insolvenzplaene',
       'InsO_Plan_Ankuendigung_Ueberwachung',
       'Verteilungsverzeichnisse_(§_

In [25]:
def in_eröffnung(data_in):
    in_eröffnet = data_in[data_in['type_of_proceeding'].isin(['Eröffnungen', 'Eroeffnung', 'Eroeffnung_Insolvenzverfahren', 'Eroeffnungen'])]
    return in_eröffnet

# Funktion def wird ausgeführt
data_eröffnet_in = in_eröffnung(data_in)

In [36]:
data_eröffnet_in.to_csv('data_eröffnung_in_bund.csv')

# Verbraucher

In [None]:
def ik_eröffnung(df):
    ik_eröffnet = df[df['type_of_proceeding'].isin(['Eröffnungen', 'Eroeffnung', 'Eroeffnung_Insolvenzverfahren', 'Eroeffnungen'])]
    return ik_eröffnet

# Funktion def wird ausgeführt
data_eröffnet = ik_eröffnung(df)

In [None]:
def ik_masse(df):
    ik_mangels_masse = df[df['type_of_proceeding'].isin(['Abweisungen_mangels_Masse'])]
    return ik_mangels_masse

# Funktion def wird ausgeführt
data_masse = ik_masse(df)

In [None]:
def ik_sicher(df):
    ik_sicherung = df[df['type_of_proceeding'].isin(['Anordnung_Sicherungsmassnahmen', 'Sicherungsmassnahmen'])]
    return ik_sicherung

# Funktion def wird ausgeführt
data_sicher = ik_sicher(df)

### groupby Bundesland descrptionhash

### groupby zipcode Bundesweit

In [None]:
freq_in_nw_eröffnung_zipcode = data_nw_eröffnet.groupby(['zipcode'], as_index=False).count()
freq_in_nw_eröffnung_zipcode.sample(10)

### Groupby Inolvenzgericht

### Groupby week

In [32]:
data_eröffnet_in_week = (
    data_eröffnet_in[["date_of_publication", "description_hash"]]
    .groupby(pd.Grouper(key="date_of_publication", freq="W-MON", label="left"))
    .count()
)

In [33]:
data_eröffnet_in_week.head()

Unnamed: 0_level_0,description_hash
date_of_publication,Unnamed: 1_level_1
2018-12-31,392
2019-01-07,249
2019-01-14,231
2019-01-21,233
2019-01-28,445


In [35]:
data_eröffnet_in_week.to_csv('freq_in_eröffnung_bund.csv')

### Alter 

In [None]:
freq_in_nw_eröffnung_birth = freq_in_nw_eröffnung_birth.groupby(['date_of_birth'], as_index=False).count()
freq_in_nw_eröffnung_birth.sample(10)

In [6]:
data_nw = df[df["federal_state"].isin(['Nordrhein-Westfalen', "nw"])]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 204015 entries, 1341 to 1819821
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   _key                 8959 non-null    object        
 1   case_nr              204015 non-null  object        
 2   court                204015 non-null  object        
 3   date_of_birth        103479 non-null  datetime64[ns]
 4   date_of_proceeding   203559 non-null  datetime64[ns]
 5   date_of_publication  204015 non-null  datetime64[ns]
 6   description_hash     204015 non-null  object        
 7   federal_state        204015 non-null  object        
 8   file_name            101866 non-null  object        
 9   kind                 203915 non-null  object        
 10  title                101866 non-null  object        
 11  type_of_proceeding   204015 non-null  object        
 12  zipcode              203738 non-null  object        
 13  detail_for

In [7]:
data_nw.isnull().sum() # die Hälfte der Einträge hat keinen Wert in den geburtsdaten. Es macht also keinen Sinn hier eine Berechnung anzuwenden, da die Fehlerquote ienfach hoch wäre.

_key                   195056
case_nr                     0
court                       0
date_of_birth          100536
date_of_proceeding        456
date_of_publication         0
description_hash            0
federal_state               0
file_name              102149
kind                      100
title                  102149
type_of_proceeding          0
zipcode                   277
detail_form_name       101866
format                 101866
name                   101866
register               187410
request_fingerprint    101866
dtype: int64

In [9]:
def nw_eröffnung(data_nw):
    nw_eröffnet = data_nw[data_nw['type_of_proceeding'].isin(['Eröffnungen', 'Eroeffnung', 'Eroeffnung_Insolvenzverfahren'])]
    return nw_eröffnet

# Funktion def wird ausgeführt
data_nw_eröffnet = nw_eröffnung(data_nw)

In [11]:
data_nw_eröffnet.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32800 entries, 21335 to 1818646
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   _key                 507 non-null    object        
 1   case_nr              32800 non-null  object        
 2   court                32800 non-null  object        
 3   date_of_birth        26980 non-null  datetime64[ns]
 4   date_of_proceeding   32666 non-null  datetime64[ns]
 5   date_of_publication  32800 non-null  datetime64[ns]
 6   description_hash     32800 non-null  object        
 7   federal_state        32800 non-null  object        
 8   file_name            9769 non-null   object        
 9   kind                 32781 non-null  object        
 10  title                9769 non-null   object        
 11  type_of_proceeding   32800 non-null  object        
 12  zipcode              32790 non-null  object        
 13  detail_form_name     2303

In [10]:
data_nw_eröffnet.isnull().sum() # 21,57 % der Werte fehlen !

_key                   32293
case_nr                    0
court                      0
date_of_birth           5820
date_of_proceeding       134
date_of_publication        0
description_hash           0
federal_state              0
file_name              23031
kind                      19
title                  23031
type_of_proceeding         0
zipcode                   10
detail_form_name        9769
format                  9769
name                    9769
register               27911
request_fingerprint     9769
dtype: int64