In [271]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os



In [272]:
# Import banking data
# Note: This data was extracted on 1999

# Each record describes characteristics of a client
client = pd.read_csv('./data/raw/data_berka/client.asc', sep=';')

# Each record describes static characteristics of an account
account = pd.read_csv('./data/raw/data_berka/account.asc', sep=';')

# Each record describes a credit card issued to an account
card = pd.read_csv('./data/raw/data_berka/card.asc', sep=';')

# Each record describes demographic characteristics of a district
district = pd.read_csv('./data/raw/data_berka/district.asc', sep=';')

# Each record relates together a client with an account
# i.e. this relation describes the rights of clients to operate accounts
disp = pd.read_csv('./data/raw/data_berka/disp.asc', sep=';')

# Each record describes characteristics of a payment order (debits only)
order = pd.read_csv('./data/raw/data_berka/order.asc', sep=';')

# Each record describes one transaction on an account
trans = pd.read_csv('./data/raw/data_berka/trans.asc', sep=';', low_memory=False)

# Each record describes a loan granted for a given account
loan = pd.read_csv('./data/raw/data_berka/loan.asc', sep=';')

In [273]:
def checkEmpty(df):
    empty = {}
    for column in df.columns[0:]:
            empty[column] = df[column].astype(str).str.contains(r'^\s*$', na=False)
    return empty 

In [274]:
def fillEmpty(df, empty):
    for column in df.columns[0:]:
        if df[column].dtype == object:
            df.loc[emptyrows[column], column] = 'Other'
    return df

In [275]:
def renameColumn(table, df):
    for column in df.columns[1:]:
        if table not in column:
            column_name =  table + '_' + column
            df.rename(columns={column : column_name}, inplace = True)
    return df

In [276]:
# Checking data
print(client.head())
# Checking structure
print(client.info())
client = renameColumn('client', client)
# Checking missing columns
client['client_birth_year'] = client['client_birth_number'].astype(str).str[:2].astype(int) 
client['client_birth_year'] = client['client_birth_year'] + 1900
client['client_birth_month'] = client['client_birth_number'].astype(str).str[2:4].astype(int)
client['gender'] = 'M'
client.loc[client['client_birth_month'] > 50, 'gender'] = 'F'
client.loc[client['client_birth_month'] > 50, 'client_birth_month'] = client['client_birth_month'] - 50
client['client_birth_day'] = client['client_birth_number'].astype(str).str[4:6].astype(int)
# Checking empty columns
emptyrows = checkEmpty(client)
for column in emptyrows:
    print("the empty rows in " + column + " are: ")
    print(client.loc[emptyrows[column], column])
client =  fillEmpty(client, emptyrows)
print(client.head())

   client_id  birth_number  district_id
0          1        706213           18
1          2        450204            1
2          3        406009            1
3          4        561201            5
4          5        605703            5
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5369 entries, 0 to 5368
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   client_id     5369 non-null   int64
 1   birth_number  5369 non-null   int64
 2   district_id   5369 non-null   int64
dtypes: int64(3)
memory usage: 126.0 KB
None
the empty rows in client_id are: 
Series([], Name: client_id, dtype: int64)
the empty rows in client_birth_number are: 
Series([], Name: client_birth_number, dtype: int64)
the empty rows in client_district_id are: 
Series([], Name: client_district_id, dtype: int64)
the empty rows in client_birth_year are: 
Series([], Name: client_birth_year, dtype: int32)
the empty rows in client_birth_month are: 
Ser

In [277]:
# Checking data
print(account.head())
# Checking structure
print(account.info())
account = renameColumn('account', account)
# Checking missing columns
print("The number of missing columns are: " +  str(account.isna().sum().sum()))
#Processing Date
account['account_year'] = account['account_date'].astype(str).str[:2].astype(int) 
account['account_year'] = account['account_year'] + 1900
account['account_month'] = account['account_date'].astype(str).str[2:4].astype(int)
account['account_day'] = account['account_date'].astype(str).str[4:6].astype(int)
# Checking empty columns
emptyrows = checkEmpty(account)
for column in emptyrows:
    print("the empty rows in " + column + " are: ")
    print(account.loc[emptyrows[column], column])
account =  fillEmpty(account, emptyrows)
print(account.head())

   account_id  district_id         frequency    date
0         576           55  POPLATEK MESICNE  930101
1        3818           74  POPLATEK MESICNE  930101
2         704           55  POPLATEK MESICNE  930101
3        2378           16  POPLATEK MESICNE  930101
4        2632           24  POPLATEK MESICNE  930102
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4500 entries, 0 to 4499
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   account_id   4500 non-null   int64 
 1   district_id  4500 non-null   int64 
 2   frequency    4500 non-null   object
 3   date         4500 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 140.8+ KB
None
The number of missing columns are: 0
the empty rows in account_id are: 
Series([], Name: account_id, dtype: int64)
the empty rows in account_district_id are: 
Series([], Name: account_district_id, dtype: int64)
the empty rows in account_frequency are: 
Series([], Name: acc

In [278]:
# Checking data
print(card.head())
# Checking structure
print(card.info())
#Rename Columns
card = renameColumn('card', card)
# Checking missing columns
print("The number of missing columns are: " +  str(card.isna().sum().sum()))
#Processing Date
card['card_issued_year'] = card['card_issued'].astype(str).str[:2].astype(int) 
card['card_issued_year'] = card['card_issued_year'] + 1900
card['card_issued_month'] = card['card_issued'].astype(str).str[2:4].astype(int)
card['card_issued_day'] = card['card_issued'].astype(str).str[4:6].astype(int)
# Checking empty columns
emptyrows = checkEmpty(card)
for column in emptyrows:
    print("the empty rows in " + column + " are: ")
    print(card.loc[emptyrows[column], column])
card =  fillEmpty(card, emptyrows)
print(card.head())

   card_id  disp_id     type           issued
0     1005     9285  classic  931107 00:00:00
1      104      588  classic  940119 00:00:00
2      747     4915  classic  940205 00:00:00
3       70      439  classic  940208 00:00:00
4      577     3687  classic  940215 00:00:00
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 892 entries, 0 to 891
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   card_id  892 non-null    int64 
 1   disp_id  892 non-null    int64 
 2   type     892 non-null    object
 3   issued   892 non-null    object
dtypes: int64(2), object(2)
memory usage: 28.0+ KB
None
The number of missing columns are: 0
the empty rows in card_id are: 
Series([], Name: card_id, dtype: int64)
the empty rows in card_disp_id are: 
Series([], Name: card_disp_id, dtype: int64)
the empty rows in card_type are: 
Series([], Name: card_type, dtype: object)
the empty rows in card_issued are: 
Series([], Name: card_issued, dtype: o

In [286]:
# Checking data
print(district.head())
# Checking structure
print(district.info())
# Checking missing columns
print("The number of missing columns are: " +  str(district.isna().sum().sum()))
#Checking values in A12, A15 column prior to converting to float
print(district['A12'].unique())
print(district['A15'].unique())
#Replacing '?' in A12, A15 column with 0.0 to remove impact
district.loc[district['A12'] == '?', 'A12'] = 0.0
district.loc[district['A15'] == '?', 'A15'] = 0
#Changing to column to Float, int
district['A12'] = district['A12'].astype(float)
district['A15'] = district['A15'].astype(int)
# Checking structure
print(district.info())


   A1           A2               A3       A4  A5  A6  A7  A8  A9    A10  \
0   1  Hl.m. Praha           Prague  1204953   0   0   0   1   1  100.0   
1   2      Benesov  central Bohemia    88884  80  26   6   2   5   46.7   
2   3       Beroun  central Bohemia    75232  55  26   4   1   5   41.7   
3   4       Kladno  central Bohemia   149893  63  29   6   2   6   67.4   
4   5        Kolin  central Bohemia    95616  65  30   4   1   6   51.4   

     A11       A12   A13  A14    A15    A16  
0  12541  0.290039  0.43  167  85677  99107  
1   8507  1.669922  1.85  132   2159   2674  
2   8980  1.950195  2.21  111   2824   2813  
3   9753  4.640625  5.05  109   5244   5892  
4   9307  3.849609  4.43  118   2616   3040  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A1      77 non-null     int64  
 1   A2      77 non-null     object 
 2   A3      77 non-nu

In [287]:
#Renaming district columns
district.rename(columns={'A1' : 'District_Code'}, inplace = True)
district.rename(columns={'A2' : 'District_Name'}, inplace = True)
district.rename(columns={'A3' : 'Region'}, inplace = True)
district.rename(columns={'A4' : 'n_inhabitants'}, inplace = True)
district.rename(columns={'A5' : 'municipalities_inh_499'}, inplace = True)
district.rename(columns={'A6' : 'municipalities_500_1999'}, inplace = True)
district.rename(columns={'A7' : 'municipalities_2000_9999'}, inplace = True)
district.rename(columns={'A8' : 'municipalities_10000'}, inplace = True)
district.rename(columns={'A9' : 'n_cities'}, inplace = True)
district.rename(columns={'A10' : 'ratio_urban_inh'}, inplace = True)
district.rename(columns={'A11' : 'average_salary'}, inplace = True)
district.rename(columns={'A12' : 'unemploymant_rate_95'}, inplace = True)
district.rename(columns={'A13' : 'unemploymant_rate_96'}, inplace = True)
district.rename(columns={'A14' : 'entrepreneurs_per_1000'}, inplace = True)
district.rename(columns={'A15' : 'committed_crimes_95'}, inplace = True)
district.rename(columns={'A16' : 'committed_crimes_96'}, inplace = True)

# Checking empty columns
emptyrows = checkEmpty(district)
for column in emptyrows:
    print("the empty rows in " + column + " are: ")
    print(district.loc[emptyrows[column], column])
district =  fillEmpty(district, emptyrows)
print(district.head())

the empty rows in District_Code are: 
Series([], Name: District_Code, dtype: int64)
the empty rows in District_Name are: 
Series([], Name: District_Name, dtype: object)
the empty rows in Region are: 
Series([], Name: Region, dtype: object)
the empty rows in n_inhabitants are: 
Series([], Name: n_inhabitants, dtype: int64)
the empty rows in municipalities_inh_499 are: 
Series([], Name: municipalities_inh_499, dtype: int64)
the empty rows in municipalities_500_1999 are: 
Series([], Name: municipalities_500_1999, dtype: int64)
the empty rows in municipalities_2000_9999 are: 
Series([], Name: municipalities_2000_9999, dtype: int64)
the empty rows in municipalities_10000 are: 
Series([], Name: municipalities_10000, dtype: int64)
the empty rows in n_cities are: 
Series([], Name: n_cities, dtype: int64)
the empty rows in ratio_urban_inh are: 
Series([], Name: ratio_urban_inh, dtype: float64)
the empty rows in average_salary are: 
Series([], Name: average_salary, dtype: int64)
the empty rows i

In [302]:
district.loc[district['unemploymant_rate_96'] > district['unemploymant_rate_95'], 'increase_in_unemploymant_rate'] = district['unemploymant_rate_96'] - district['unemploymant_rate_95']
district.loc[district['committed_crimes_96'] > district['committed_crimes_95'], 'increase_in_committed_crimes'] = district['committed_crimes_96'] - district['committed_crimes_95']
district.loc[district['unemploymant_rate_96'] < district['unemploymant_rate_95'], 'decrease_in_unemploymant_rate'] = district['unemploymant_rate_95'] - district['unemploymant_rate_96']
district.loc[district['committed_crimes_96'] < district['committed_crimes_95'], 'decrease_in_committed_crimes'] = district['committed_crimes_95'] - district['committed_crimes_96']
district['increase_in_unemploymant_rate'] = district['increase_in_unemploymant_rate'].fillna(0)
district['increase_in_committed_crimes'] = district['increase_in_committed_crimes'].fillna(0)
district['decrease_in_unemploymant_rate'] = district['decrease_in_unemploymant_rate'].fillna(0)
district['decrease_in_committed_crimes'] = district['decrease_in_committed_crimes'].fillna(0)

district.loc[:,'total_municipalities'] = district['municipalities_inh_499'] + district['municipalities_500_1999'] + district['municipalities_2000_9999'] + district['municipalities_10000']
district.loc[:, 'number_of_employed_population_96'] =  district['n_inhabitants'] - district['unemploymant_rate_96']/100 * district['n_inhabitants']
district.loc[:, 'number_of_employed_urban_population_96'] =  district['number_of_employed_population_96'] - district['number_of_employed_population_96'] / district['ratio_urban_inh']
district['number_of_employed_population_96'] = district['number_of_employed_population_96'].astype(int)
district['number_of_employed_urban_population_96'] = district['number_of_employed_urban_population_96'].astype(int)
print(district.head())

   District_Code District_Name           Region  n_inhabitants  \
0              1   Hl.m. Praha           Prague        1204953   
1              2       Benesov  central Bohemia          88884   
2              3        Beroun  central Bohemia          75232   
3              4        Kladno  central Bohemia         149893   
4              5         Kolin  central Bohemia          95616   

   municipalities_inh_499  municipalities_500_1999  municipalities_2000_9999  \
0                       0                        0                         0   
1                      80                       26                         6   
2                      55                       26                         4   
3                      63                       29                         6   
4                      65                       30                         4   

   municipalities_10000  n_cities  ratio_urban_inh  ...  \
0                     1         1            100.0  ...   
1   

In [303]:
# Checking data
print(disp.head())
# Checking structure
print(disp.info())
#Rename Columns
disp = renameColumn('disp', disp)
# Checking missing columns
print("The number of missing columns are: " +  str(disp.isna().sum().sum()))
# Checking empty columns
emptyrows = checkEmpty(disp)
for column in emptyrows:
    print("the empty rows in " + column + " are: ")
    print(disp.loc[emptyrows[column], column])
disp =  fillEmpty(disp, emptyrows)
print(disp.head())

   disp_id  client_id  account_id       type
0        1          1           1      OWNER
1        2          2           2      OWNER
2        3          3           2  DISPONENT
3        4          4           3      OWNER
4        5          5           3  DISPONENT
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5369 entries, 0 to 5368
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   disp_id     5369 non-null   int64 
 1   client_id   5369 non-null   int64 
 2   account_id  5369 non-null   int64 
 3   type        5369 non-null   object
dtypes: int64(3), object(1)
memory usage: 167.9+ KB
None
The number of missing columns are: 0
the empty rows in disp_id are: 
Series([], Name: disp_id, dtype: int64)
the empty rows in disp_client_id are: 
Series([], Name: disp_client_id, dtype: int64)
the empty rows in disp_account_id are: 
Series([], Name: disp_account_id, dtype: int64)
the empty rows in disp_type are: 
Series([],

In [304]:
# Checking data
print(order.head())
# Checking structure
print(order.info())
#Rename Columns
order = renameColumn('order', order)
# Checking missing columns
print("The number of missing columns are: " +  str(order.isna().sum().sum()))
# Checking empty columns
emptyrows = checkEmpty(order)
for column in emptyrows:
    print("the empty rows in " + column + " are: ")
    print(order.loc[emptyrows[column], column])
order =  fillEmpty(order, emptyrows)
print(order.head())

   order_id  account_id bank_to  account_to  amount k_symbol
0     29401           1      YZ    87144583  2452.0     SIPO
1     29402           2      ST    89597016  3372.7     UVER
2     29403           2      QR    13943797  7266.0     SIPO
3     29404           3      WX    83084338  1135.0     SIPO
4     29405           3      CD    24485939   327.0         
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6471 entries, 0 to 6470
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   order_id    6471 non-null   int64  
 1   account_id  6471 non-null   int64  
 2   bank_to     6471 non-null   object 
 3   account_to  6471 non-null   int64  
 4   amount      6471 non-null   float64
 5   k_symbol    6471 non-null   object 
dtypes: float64(1), int64(3), object(2)
memory usage: 303.5+ KB
None
The number of missing columns are: 0
the empty rows in order_id are: 
Series([], Name: order_id, dtype: int64)
the empty rows in or

In [305]:
# Checking data
print(trans.head())
# Checking structure
print(trans.info())
#Rename Columns
trans = renameColumn('trans', trans)
# Checking missing columns
print("The number of missing columns are: " +  str(trans.isna().sum().sum()))

#Processing Date
trans['trans_year'] = trans['trans_date'].astype(str).str[:2].astype(int) 
trans['trans_year'] = trans['trans_year'] + 1900
trans['trans_month'] = trans['trans_date'].astype(str).str[2:4].astype(int)
trans['trans_day'] = trans['trans_date'].astype(str).str[4:6].astype(int)

emptyrows = checkEmpty(trans)
#check na per column 
for column in emptyrows:
    print("the empty rows in " + column + " are: ")
    print(trans[column].isna().sum())

# Checking empty columns
for column in emptyrows:
    print("the empty rows in " + column + " are: ")
    print(trans.loc[emptyrows[column], column])
trans =  fillEmpty(trans, emptyrows)
print(trans.head())

   trans_id  account_id    date    type operation  amount  balance k_symbol  \
0    695247        2378  930101  PRIJEM     VKLAD   700.0    700.0      NaN   
1    171812         576  930101  PRIJEM     VKLAD   900.0    900.0      NaN   
2    207264         704  930101  PRIJEM     VKLAD  1000.0   1000.0      NaN   
3   1117247        3818  930101  PRIJEM     VKLAD   600.0    600.0      NaN   
4    579373        1972  930102  PRIJEM     VKLAD   400.0    400.0      NaN   

  bank  account  
0  NaN      NaN  
1  NaN      NaN  
2  NaN      NaN  
3  NaN      NaN  
4  NaN      NaN  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1056320 entries, 0 to 1056319
Data columns (total 10 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   trans_id    1056320 non-null  int64  
 1   account_id  1056320 non-null  int64  
 2   date        1056320 non-null  int64  
 3   type        1056320 non-null  object 
 4   operation   873206 non-null   object 
 5 

In [151]:
print(trans['k_symbol'].value_counts())
print(trans['bank'].value_counts())
print(trans['account'].value_counts())

UROK           183114
SLUZBY         155832
SIPO           118065
Other           53433
DUCHOD          30338
POJISTNE        18500
UVER            13580
SANKC. UROK      1577
Name: k_symbol, dtype: int64
QR    22285
AB    21720
ST    21711
YZ    21582
GH    21499
EF    21293
KL    21234
UV    21167
OP    21094
IJ    20525
WX    20178
MN    19623
CD    19597
Name: bank, dtype: int64
0.0           21881
66487163.0      140
13943797.0      130
95437645.0       95
69415771.0       94
              ...  
58251345.0        1
74396675.0        1
5624032.0         1
54632151.0        1
29951918.0        1
Name: account, Length: 7665, dtype: int64


In [306]:
# Checking data
print(loan.head())
# Checking structure
print(loan.info())
#Rename Columns
loan = renameColumn('loan', loan)
# Checking missing columns
print("The number of missing columns are: " +  str(loan.isna().sum().sum()))
#Processing Date
loan['loan_year'] = loan['loan_date'].astype(str).str[:2].astype(int) 
loan['loan_year'] = loan['loan_year'] + 1900
loan['loan_month'] = loan['loan_date'].astype(str).str[2:4].astype(int)
loan['loan_day'] = loan['loan_date'].astype(str).str[4:6].astype(int)
# Checking empty columns
emptyrows = checkEmpty(loan)
for column in emptyrows:
    print("the empty rows in " + column + " are: ")
    print(loan.loc[emptyrows[column], column])
loan =  fillEmpty(loan, emptyrows)
print(loan.head())

   loan_id  account_id    date  amount  duration  payments status
0     5314        1787  930705   96396        12    8033.0      B
1     5316        1801  930711  165960        36    4610.0      A
2     6863        9188  930728  127080        60    2118.0      A
3     5325        1843  930803  105804        36    2939.0      A
4     7240       11013  930906  274740        60    4579.0      A
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 682 entries, 0 to 681
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   loan_id     682 non-null    int64  
 1   account_id  682 non-null    int64  
 2   date        682 non-null    int64  
 3   amount      682 non-null    int64  
 4   duration    682 non-null    int64  
 5   payments    682 non-null    float64
 6   status      682 non-null    object 
dtypes: float64(1), int64(5), object(1)
memory usage: 37.4+ KB
None
The number of missing columns are: 0
the empty rows in loan_id 

In [372]:
# Merging account and disp table
base_table = pd.merge(account, disp, left_on = 'account_id', right_on = 'disp_account_id', how='inner' )
# checking for type = 'OWNER'
base_table =  base_table[base_table['disp_type'] == 'OWNER']
# Merging Account to base_table
base_table = base_table.merge(client, left_on = 'disp_client_id', right_on = 'client_id', how='inner')
#Merging card to base_table
base_table = base_table.merge(card, left_on = 'disp_id', right_on = 'card_disp_id', how='inner')
#Merging district to base_table for account
# base_table = base_table.merge(district, left_on = 'account_district_id', right_on = 'A1', how='inner')
#Merging district to base_table for client
base_table = base_table.merge(district, left_on = 'client_district_id', right_on = 'District_Code', how='inner')
#Merging order to base_table for client
base_table = base_table.merge(order, left_on = 'account_id', right_on = 'order_account_id', how='inner')
#Merging trans to base_table for client
base_table = base_table.merge(trans, left_on = 'account_id', right_on = 'trans_account_id', how='inner')
#Merging loan to base_table for client
base_table = base_table.merge(loan, left_on = 'account_id', right_on = 'loan_account_id', how='inner')
print(base_table.head())
print(base_table.info())

   account_id  account_district_id account_frequency  account_date  \
0       10065                   76  POPLATEK MESICNE        960330   
1       10065                   76  POPLATEK MESICNE        960330   
2       10065                   76  POPLATEK MESICNE        960330   
3       10065                   76  POPLATEK MESICNE        960330   
4       10065                   76  POPLATEK MESICNE        960330   

   account_year  account_month  account_day  disp_id  disp_client_id  \
0          1996              3           30    12070           12378   
1          1996              3           30    12070           12378   
2          1996              3           30    12070           12378   
3          1996              3           30    12070           12378   
4          1996              3           30    12070           12378   

   disp_account_id  ... loan_id  loan_account_id  loan_date  loan_amount  \
0            10065  ...    7050            10065     970923       3112

In [327]:
print(base_table.loc[base_table['loan_year'] == 1996])


        account_id  account_district_id account_frequency  account_date  \
915           5593                   15  POPLATEK MESICNE        960227   
916           5593                   15  POPLATEK MESICNE        960227   
917           5593                   15  POPLATEK MESICNE        960227   
918           5593                   15  POPLATEK MESICNE        960227   
919           5593                   15  POPLATEK MESICNE        960227   
...            ...                  ...               ...           ...   
102003        1862                   19    POPLATEK TYDNE        950415   
102004        1862                   19    POPLATEK TYDNE        950415   
102005        1862                   19    POPLATEK TYDNE        950415   
102006        1862                   19    POPLATEK TYDNE        950415   
102007        1862                   19    POPLATEK TYDNE        950415   

        account_year  account_month  account_day  disp_id  disp_client_id  \
915             1996  

In [374]:
summary_trans_ksymbol_count = base_table.loc[base_table['trans_year'] == 1996].groupby(['District_Code', 'trans_k_symbol'])[['trans_id']].agg(['count'])
base_table = pd.merge(base_table, summary_trans_ksymbol_count, left_on = ['District_Code', 'trans_k_symbol'], right_on = ['District_Code', 'trans_k_symbol'], how = 'left')
base_table.rename(columns={('trans_id', 'count') : 'district_trans_ksymbol_count'}, inplace = True)
summary_trans_kymbol_amount = base_table.loc[base_table['trans_year'] == 1996].groupby(['District_Code', 'trans_k_symbol'])[['trans_amount']].agg(['sum'])
base_table = pd.merge(base_table, summary_trans_kymbol_amount, left_on = ['District_Code', 'trans_k_symbol'], right_on = ['District_Code', 'trans_k_symbol'], how = 'left')
base_table.rename(columns={('trans_amount', 'sum') : 'district_trans_kymbol_amount'}, inplace = True)
print(summary_trans_ksymbol_count)
print(summary_trans_kymbol_amount)

summary_trans_operation_count = base_table.loc[base_table['trans_year'] == 1996].groupby(['District_Code', 'trans_operation'])[['trans_id']].agg(['count'])
base_table = pd.merge(base_table, summary_trans_operation_count, left_on = ['District_Code', 'trans_operation'], right_on = ['District_Code', 'trans_operation'], how = 'left')
base_table.rename(columns={('trans_id', 'count') : 'district_trans_operation_count'}, inplace = True)
summary_trans_operation_amount = base_table.loc[base_table['trans_year'] == 1996].groupby(['District_Code', 'trans_operation'])[['trans_amount']].agg(['sum'])
base_table = pd.merge(base_table, summary_trans_operation_amount, left_on = ['District_Code', 'trans_operation'], right_on = ['District_Code', 'trans_operation'], how = 'left')
base_table.rename(columns={('trans_amount', 'sum') : 'district_trans_operation_amount'}, inplace = True)
print(summary_trans_operation_count)
print(summary_trans_operation_amount)


summary_trans_type_count = base_table.loc[base_table['trans_year'] == 1996].groupby(['District_Code', 'trans_type'])[['trans_id']].agg(['count'])
base_table = pd.merge(base_table, summary_trans_type_count, left_on = ['District_Code', 'trans_type'], right_on = ['District_Code', 'trans_type'], how = 'left')
base_table.rename(columns={('trans_id', 'count') : 'district_trans_type_count'}, inplace = True)
summary_trans_type_amount = base_table.loc[base_table['trans_year'] == 1996].groupby(['District_Code', 'trans_type'])[['trans_amount']].agg(['sum'])
base_table = pd.merge(base_table, summary_trans_type_amount, left_on = ['District_Code', 'trans_type'], right_on = ['District_Code', 'trans_type'], how = 'left')
base_table.rename(columns={('trans_amount', 'sum') : 'district_trans_type_amount'}, inplace = True)
print(summary_trans_type_count)
print(summary_trans_type_amount)
print(base_table.head())



                             trans_id
                                count
District_Code trans_k_symbol         
1             Other                95
              POJISTNE             12
              SIPO                134
              SLUZBY              164
              UROK                310
...                               ...
76            UROK                  9
77            SIPO                  8
              SLUZBY                8
              UROK                 16
              UVER                  6

[239 rows x 1 columns]
                             trans_amount
                                      sum
District_Code trans_k_symbol             
1             Other              393705.0
              POJISTNE            72492.0
              SIPO               539588.0
              SLUZBY               2948.8
              UROK                56260.6
...                                   ...
76            UROK                 2514.5
77            SIPO      



                              trans_id
                                 count
District_Code trans_operation         
1             PREVOD NA UCET       311
              PREVOD Z UCTU         54
              VKLAD                259
              VYBER                628
              VYBER KARTOU           9
...                                ...
76            VKLAD                 22
              VYBER                 31
77            PREVOD NA UCET        14
              VKLAD                 21
              VYBER                 34

[219 rows x 1 columns]
                              trans_amount
                                       sum
District_Code trans_operation             
1             PREVOD NA UCET     1391725.8
              PREVOD Z UCTU      2046722.0
              VKLAD              5735066.0
              VYBER              5895555.8
              VYBER KARTOU         16800.0
...                                    ...
76            VKLAD               655930.0




                         trans_id
                            count
District_Code trans_type         
1             PRIJEM          623
              VYBER            31
              VYDAJ           917
2             PRIJEM          161
              VYDAJ           321
...                           ...
76            PRIJEM           31
              VYBER             2
              VYDAJ            29
77            PRIJEM           37
              VYDAJ            48

[154 rows x 1 columns]
                         trans_amount
                                  sum
District_Code trans_type             
1             PRIJEM        7838048.6
              VYBER          388107.0
              VYDAJ         6915974.6
2             PRIJEM        1322171.6
              VYDAJ         1233748.0
...                               ...
76            PRIJEM         658444.5
              VYBER           21719.0
              VYDAJ          522302.2
77            PRIJEM         358247.4
      

In [373]:
summary_card_type_count = base_table.loc[base_table['card_issued_year'] == 1996].groupby(['District_Code', 'card_type'])[['card_id']].agg(['count'])
base_table = pd.merge(base_table, summary_card_type_count, left_on = ['District_Code', 'card_type'], right_on = ['District_Code', 'card_type'], how = 'left')
base_table.rename(columns={('card_id', 'count') : 'district_card_type_count'}, inplace = True)
print(summary_card_type_count)
print(base_table.head())



                        card_id
                          count
District_Code card_type        
1             junior       1017
2             classic       844
              junior       1395
5             junior        783
14            classic       410
15            classic       582
27            classic       858
28            classic      2415
32            classic       254
              junior       1929
34            classic      2070
38            junior       1317
39            gold         1610
50            classic      3140
52            classic      1504
55            gold          202
58            classic      1740
59            classic       427
62            classic       283
64            classic      1098
70            classic      1804
74            classic       832
   account_id  account_district_id account_frequency  account_date  \
0       10065                   76  POPLATEK MESICNE        960330   
1       10065                   76  POPLATEK MESICNE        

In [367]:
summary_loan_status_count = base_table.loc[base_table['loan_year'] == 1996].groupby(['District_Code', 'loan_status'])[['loan_id']].agg(['count'])
base_table = pd.merge(base_table, summary_loan_status_count, left_on = ['District_Code', 'loan_status'], right_on = ['District_Code', 'loan_status'], how = 'left')
base_table.rename(columns={('loan_id', 'count') : 'district_loan_status_count'}, inplace = True)
summary_loan_status_amount = base_table.loc[base_table['loan_year'] == 1996].groupby(['District_Code', 'loan_status'])[['loan_amount']].agg(['sum'])
base_table = pd.merge(base_table, summary_loan_status_amount, left_on = ['District_Code', 'loan_status'], right_on = ['District_Code', 'loan_status'], how = 'left')
base_table.rename(columns={('loan_amount', 'sum') : 'district_loan_status_amount'}, inplace = True)
print(summary_loan_status_count)
print(summary_loan_status_amount.head())
print(base_table.head())



                          loan_id
                            count
District_Code loan_status        
1             C              1473
              D               301
9             A              1156
12            C               678
14            C               795
15            A               582
19            C               275
27            C               388
28            C              2415
31            A              1096
32            A               254
37            A               784
              C               297
38            C              1317
39            A              1610
40            A               198
47            C               296
50            C               293
52            A               260
              C              1504
55            C               202
61            A              1134
62            C               283
64            A               407
              C               355
68            C               426
70            

In [368]:
summary_account_frequency_count = base_table.loc[base_table['account_year'] == 1996].groupby(['District_Code', 'account_frequency'])[['account_id']].agg(['count'])
base_table = pd.merge(base_table, summary_account_frequency_count, left_on = ['District_Code', 'account_frequency'], right_on = ['District_Code', 'account_frequency'], how = 'left')
base_table.rename(columns={('account_id', 'count') : 'district_account_frequency_count'}, inplace = True)
print(summary_account_frequency_count.head())
print(base_table.head())



                                account_id
                                     count
District_Code account_frequency           
1             POPLATEK MESICNE        4194
2             POPLATEK MESICNE         338
              POPLATEK TYDNE           147
5             POPLATEK MESICNE        1179
6             POPLATEK MESICNE         105
   account_id  account_district_id account_frequency  account_date  \
0       10065                   76  POPLATEK MESICNE        960330   
1       10065                   76  POPLATEK MESICNE        960330   
2       10065                   76  POPLATEK MESICNE        960330   
3       10065                   76  POPLATEK MESICNE        960330   
4       10065                   76  POPLATEK MESICNE        960330   

   account_year  account_month  account_day  disp_id  disp_client_id  \
0          1996              3           30    12070           12378   
1          1996              3           30    12070           12378   
2          1996    

In [370]:
base_table.to_csv(os.path.join('./data/interim',r'base_table.csv'))