In [135]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [136]:
# Import banking data
# Note: this data was extracted in 1999

# Directory holding the raw ';'-separated .asc tables; defined once so the
# location can be changed in a single place.
DATA_DIR = './data/raw/data_berka'

# Each record describes characteristics of a client
client = pd.read_csv(f'{DATA_DIR}/client.asc', sep=';')

# Each record describes static characteristics of an account
account = pd.read_csv(f'{DATA_DIR}/account.asc', sep=';')

# Each record describes a credit card issued to an account
card = pd.read_csv(f'{DATA_DIR}/card.asc', sep=';')

# Each record describes demographic characteristics of a district
district = pd.read_csv(f'{DATA_DIR}/district.asc', sep=';')

# Each record relates together a client with an account
# i.e. this relation describes the rights of clients to operate accounts
disp = pd.read_csv(f'{DATA_DIR}/disp.asc', sep=';')

# Each record describes characteristics of a payment order (debits only)
order = pd.read_csv(f'{DATA_DIR}/order.asc', sep=';')

# Each record describes one transaction on an account
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk.
trans = pd.read_csv(f'{DATA_DIR}/trans.asc', sep=';', low_memory=False)

# Each record describes a loan granted for a given account
loan = pd.read_csv(f'{DATA_DIR}/loan.asc', sep=';')

In [137]:
def checkEmpty(df):
    """Flag the blank cells of every column in ``df``.

    Each column is cast to text and matched against a whitespace-only
    pattern, so a cell is flagged when its text form is empty or consists
    solely of whitespace. Missing values (NaN) are not flagged.

    Returns a dict mapping each column label to a boolean Series
    (True = blank cell) aligned with ``df``'s index.
    """
    blank_pattern = r'^\s*$'
    return {
        label: df[label].astype(str).str.contains(blank_pattern, na=False)
        for label in df.columns
    }

In [138]:
def fillEmpty(df, empty):
    """Replace the flagged cells of ``df``'s object-dtype columns with 'Other'.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to clean. It is modified in place.
    empty : dict
        Maps each column label of ``df`` to a boolean mask selecting the
        rows to overwrite (the structure returned by ``checkEmpty``).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after the replacement.

    Raises
    ------
    KeyError
        If an object-dtype column of ``df`` has no entry in ``empty``.
    """
    for column in df.columns:
        # Only object-dtype (text) columns receive the placeholder; other
        # dtypes are left untouched.
        if df[column].dtype == object:
            # Use the mask supplied by the caller. The previous version read a
            # notebook-level global here and silently ignored this argument.
            df.loc[empty[column], column] = 'Other'
    return df

In [139]:
# Preview the first rows of the client table
print(client.head())
# Column dtypes and non-null counts (info() prints its report and returns
# None, which print() then echoes)
print(client.info())
# Total number of NaN cells across the whole table
print(f"The number of missing columns are: {client.isna().sum().sum()}")
# Per-column masks of blank (empty or whitespace-only) cells
emptyrows = checkEmpty(client)
for column in emptyrows.keys():
    print(f"the empty rows in {column} are: ")
    print(client.loc[emptyrows[column], column])
# Overwrite the blank cells of text columns with 'Other', then preview again
client = fillEmpty(client, emptyrows)
print(client.head())

   client_id  birth_number  district_id
0          1        706213           18
1          2        450204            1
2          3        406009            1
3          4        561201            5
4          5        605703            5
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5369 entries, 0 to 5368
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   client_id     5369 non-null   int64
 1   birth_number  5369 non-null   int64
 2   district_id   5369 non-null   int64
dtypes: int64(3)
memory usage: 126.0 KB
None
The number of missing columns are: 0
the empty rows in client_id are: 
Series([], Name: client_id, dtype: int64)
the empty rows in birth_number are: 
Series([], Name: birth_number, dtype: int64)
the empty rows in district_id are: 
Series([], Name: district_id, dtype: int64)
   client_id  birth_number  district_id
0          1        706213           18
1          2        450204            1
2        

In [141]:
# Preview the first rows of the account table
print(account.head())
# Column dtypes and non-null counts (info() prints its report and returns
# None, which print() then echoes)
print(account.info())
# Total number of NaN cells across the whole table
print(f"The number of missing columns are: {account.isna().sum().sum()}")
# Per-column masks of blank (empty or whitespace-only) cells
emptyrows = checkEmpty(account)
for column in emptyrows.keys():
    print(f"the empty rows in {column} are: ")
    print(account.loc[emptyrows[column], column])
# Overwrite the blank cells of text columns with 'Other', then preview again
account = fillEmpty(account, emptyrows)
print(account.head())

   account_id  district_id         frequency    date
0         576           55  POPLATEK MESICNE  930101
1        3818           74  POPLATEK MESICNE  930101
2         704           55  POPLATEK MESICNE  930101
3        2378           16  POPLATEK MESICNE  930101
4        2632           24  POPLATEK MESICNE  930102
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4500 entries, 0 to 4499
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   account_id   4500 non-null   int64 
 1   district_id  4500 non-null   int64 
 2   frequency    4500 non-null   object
 3   date         4500 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 140.8+ KB
None
The number of missing columns are: 0
the empty rows in account_id are: 
Series([], Name: account_id, dtype: int64)
the empty rows in district_id are: 
Series([], Name: district_id, dtype: int64)
the empty rows in frequency are: 
Series([], Name: frequency, dtype: object)
t

In [142]:
# Preview the first rows of the card table
print(card.head())
# Column dtypes and non-null counts (info() prints its report and returns
# None, which print() then echoes)
print(card.info())
# Total number of NaN cells across the whole table
print(f"The number of missing columns are: {card.isna().sum().sum()}")
# Per-column masks of blank (empty or whitespace-only) cells
emptyrows = checkEmpty(card)
for column in emptyrows.keys():
    print(f"the empty rows in {column} are: ")
    print(card.loc[emptyrows[column], column])
# Overwrite the blank cells of text columns with 'Other', then preview again
card = fillEmpty(card, emptyrows)
print(card.head())

   card_id  disp_id     type           issued
0     1005     9285  classic  931107 00:00:00
1      104      588  classic  940119 00:00:00
2      747     4915  classic  940205 00:00:00
3       70      439  classic  940208 00:00:00
4      577     3687  classic  940215 00:00:00
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 892 entries, 0 to 891
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   card_id  892 non-null    int64 
 1   disp_id  892 non-null    int64 
 2   type     892 non-null    object
 3   issued   892 non-null    object
dtypes: int64(2), object(2)
memory usage: 28.0+ KB
None
The number of missing columns are: 0
the empty rows in card_id are: 
Series([], Name: card_id, dtype: int64)
the empty rows in disp_id are: 
Series([], Name: disp_id, dtype: int64)
the empty rows in type are: 
Series([], Name: type, dtype: object)
the empty rows in issued are: 
Series([], Name: issued, dtype: object)
   card_id  disp_id    

In [144]:
# Preview the first rows of the district table
print(district.head())
# Column dtypes and non-null counts (info() prints its report and returns
# None, which print() then echoes)
print(district.info())
# Total number of NaN cells across the whole table
print("The number of missing columns are: " +  str(district.isna().sum().sum()))
# Per-column masks of blank (empty or whitespace-only) cells.
# The raw dict of masks is deliberately not printed: it is one full-length
# boolean Series per column, and the loop below already shows the flagged rows.
emptyrows = checkEmpty(district)
for column in emptyrows:
    print("the empty rows in " + column + " are: ")
    print(district.loc[emptyrows[column], column])
# Overwrite the blank cells of text columns with 'Other', then preview again
district =  fillEmpty(district, emptyrows)
print(district.head())

   A1           A2               A3       A4  A5  A6  A7  A8  A9    A10  \
0   1  Hl.m. Praha           Prague  1204953   0   0   0   1   1  100.0   
1   2      Benesov  central Bohemia    88884  80  26   6   2   5   46.7   
2   3       Beroun  central Bohemia    75232  55  26   4   1   5   41.7   
3   4       Kladno  central Bohemia   149893  63  29   6   2   6   67.4   
4   5        Kolin  central Bohemia    95616  65  30   4   1   6   51.4   

     A11   A12   A13  A14    A15    A16  
0  12541  0.29  0.43  167  85677  99107  
1   8507  1.67  1.85  132   2159   2674  
2   8980  1.95  2.21  111   2824   2813  
3   9753  4.64  5.05  109   5244   5892  
4   9307  3.85  4.43  118   2616   3040  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A1      77 non-null     int64  
 1   A2      77 non-null     object 
 2   A3      77 non-null     object 
 3   A4  

In [145]:
# Preview the first rows of the disp (client-to-account relation) table
print(disp.head())
# Column dtypes and non-null counts (info() prints its report and returns
# None, which print() then echoes)
print(disp.info())
# Total number of NaN cells across the whole table
print(f"The number of missing columns are: {disp.isna().sum().sum()}")
# Per-column masks of blank (empty or whitespace-only) cells
emptyrows = checkEmpty(disp)
for column in emptyrows.keys():
    print(f"the empty rows in {column} are: ")
    print(disp.loc[emptyrows[column], column])
# Overwrite the blank cells of text columns with 'Other', then preview again
disp = fillEmpty(disp, emptyrows)
print(disp.head())

   disp_id  client_id  account_id       type
0        1          1           1      OWNER
1        2          2           2      OWNER
2        3          3           2  DISPONENT
3        4          4           3      OWNER
4        5          5           3  DISPONENT
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5369 entries, 0 to 5368
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   disp_id     5369 non-null   int64 
 1   client_id   5369 non-null   int64 
 2   account_id  5369 non-null   int64 
 3   type        5369 non-null   object
dtypes: int64(3), object(1)
memory usage: 167.9+ KB
None
The number of missing columns are: 0
the empty rows in disp_id are: 
Series([], Name: disp_id, dtype: int64)
the empty rows in client_id are: 
Series([], Name: client_id, dtype: int64)
the empty rows in account_id are: 
Series([], Name: account_id, dtype: int64)
the empty rows in type are: 
Series([], Name: type, dtype: objec

In [146]:
# Preview the first rows of the payment-order table
print(order.head())
# Column dtypes and non-null counts (info() prints its report and returns
# None, which print() then echoes)
print(order.info())
# Total number of NaN cells across the whole table
print(f"The number of missing columns are: {order.isna().sum().sum()}")
# Per-column masks of blank (empty or whitespace-only) cells
emptyrows = checkEmpty(order)
for column in emptyrows.keys():
    print(f"the empty rows in {column} are: ")
    print(order.loc[emptyrows[column], column])
# Overwrite the blank cells of text columns with 'Other', then preview again
order = fillEmpty(order, emptyrows)
print(order.head())

   order_id  account_id bank_to  account_to  amount k_symbol
0     29401           1      YZ    87144583  2452.0     SIPO
1     29402           2      ST    89597016  3372.7     UVER
2     29403           2      QR    13943797  7266.0     SIPO
3     29404           3      WX    83084338  1135.0     SIPO
4     29405           3      CD    24485939   327.0         
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6471 entries, 0 to 6470
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   order_id    6471 non-null   int64  
 1   account_id  6471 non-null   int64  
 2   bank_to     6471 non-null   object 
 3   account_to  6471 non-null   int64  
 4   amount      6471 non-null   float64
 5   k_symbol    6471 non-null   object 
dtypes: float64(1), int64(3), object(2)
memory usage: 303.5+ KB
None
The number of missing columns are: 0
the empty rows in order_id are: 
Series([], Name: order_id, dtype: int64)
the empty rows in ac

In [148]:
# Preview the first rows of the transaction table
print(trans.head())
# Column dtypes and non-null counts (info() prints its report and returns
# None, which print() then echoes)
print(trans.info())
# Total number of NaN cells across the whole table
print("The number of missing columns are: " +  str(trans.isna().sum().sum()))

# NaN count per column.
# Iterate over trans's own columns: at this point `emptyrows` still holds the
# masks of whichever table was inspected last, so looping over it would use
# another table's column names.
for column in trans.columns:
    print("the empty rows in " + column + " are: ")
    print(trans[column].isna().sum())

# Per-column masks of blank (empty or whitespace-only) cells
emptyrows = checkEmpty(trans)
for column in emptyrows:
    print("the empty rows in " + column + " are: ")
    print(trans.loc[emptyrows[column], column])
# Overwrite the blank cells of text columns with 'Other', then preview again.
# NaN cells are not flagged by checkEmpty, so they stay NaN.
trans =  fillEmpty(trans, emptyrows)
print(trans.head())

   trans_id  account_id    date    type operation  amount  balance k_symbol  \
0    695247        2378  930101  PRIJEM     VKLAD   700.0    700.0      NaN   
1    171812         576  930101  PRIJEM     VKLAD   900.0    900.0      NaN   
2    207264         704  930101  PRIJEM     VKLAD  1000.0   1000.0      NaN   
3   1117247        3818  930101  PRIJEM     VKLAD   600.0    600.0      NaN   
4    579373        1972  930102  PRIJEM     VKLAD   400.0    400.0      NaN   

  bank  account  
0  NaN      NaN  
1  NaN      NaN  
2  NaN      NaN  
3  NaN      NaN  
4  NaN      NaN  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1056320 entries, 0 to 1056319
Data columns (total 10 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   trans_id    1056320 non-null  int64  
 1   account_id  1056320 non-null  int64  
 2   date        1056320 non-null  int64  
 3   type        1056320 non-null  object 
 4   operation   873206 non-null   object 
 5 

In [151]:
# Frequency table of each distinct value in three transaction columns,
# printed one after another (value_counts() leaves NaN out by default).
print(
    trans['k_symbol'].value_counts(),
    trans['bank'].value_counts(),
    trans['account'].value_counts(),
    sep='\n',
)

UROK           183114
SLUZBY         155832
SIPO           118065
Other           53433
DUCHOD          30338
POJISTNE        18500
UVER            13580
SANKC. UROK      1577
Name: k_symbol, dtype: int64
QR    22285
AB    21720
ST    21711
YZ    21582
GH    21499
EF    21293
KL    21234
UV    21167
OP    21094
IJ    20525
WX    20178
MN    19623
CD    19597
Name: bank, dtype: int64
0.0           21881
66487163.0      140
13943797.0      130
95437645.0       95
69415771.0       94
              ...  
58251345.0        1
74396675.0        1
5624032.0         1
54632151.0        1
29951918.0        1
Name: account, Length: 7665, dtype: int64


In [149]:
# Preview the first rows of the loan table
print(loan.head())
# Column dtypes and non-null counts (info() prints its report and returns
# None, which print() then echoes)
print(loan.info())
# Total number of NaN cells across the whole table
print(f"The number of missing columns are: {loan.isna().sum().sum()}")
# Per-column masks of blank (empty or whitespace-only) cells
emptyrows = checkEmpty(loan)
for column in emptyrows.keys():
    print(f"the empty rows in {column} are: ")
    print(loan.loc[emptyrows[column], column])
# Overwrite the blank cells of text columns with 'Other', then preview again
loan = fillEmpty(loan, emptyrows)
print(loan.head())

   loan_id  account_id    date  amount  duration  payments status
0     5314        1787  930705   96396        12    8033.0      B
1     5316        1801  930711  165960        36    4610.0      A
2     6863        9188  930728  127080        60    2118.0      A
3     5325        1843  930803  105804        36    2939.0      A
4     7240       11013  930906  274740        60    4579.0      A
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 682 entries, 0 to 681
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   loan_id     682 non-null    int64  
 1   account_id  682 non-null    int64  
 2   date        682 non-null    int64  
 3   amount      682 non-null    int64  
 4   duration    682 non-null    int64  
 5   payments    682 non-null    float64
 6   status      682 non-null    object 
dtypes: float64(1), int64(5), object(1)
memory usage: 37.4+ KB
None
The number of missing columns are: 0
the empty rows in loan_id 

NameError: name 'loan' is not defined