# ORGANIZING IMPORTS

In [17]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import datetime

# READING DATA FILES

In [18]:
# Import banking data
# Note: This data was extracted on 1999

# Each record describes characteristics of a client
client = pd.read_csv('./data/raw/data_berka/client.asc', sep=';')

# Each record describes static characteristics of an account
account = pd.read_csv('./data/raw/data_berka/account.asc', sep=';')

# Each record describes a credit card issued to an account
card = pd.read_csv('./data/raw/data_berka/card.asc', sep=';')

# Each record describes demographic characteristics of a district
district = pd.read_csv('./data/raw/data_berka/district.asc', sep=';')

# Each record relates together a client with an account
# i.e. this relation describes the rights of clients to operate accounts
disp = pd.read_csv('./data/raw/data_berka/disp.asc', sep=';')

# Each record describes characteristics of a payment order (debits only)
order = pd.read_csv('./data/raw/data_berka/order.asc', sep=';')

# Each record describes one transaction on an account
trans = pd.read_csv('./data/raw/data_berka/trans.asc', sep=';', low_memory=False)

# Each record describes a loan granted for a given account
loan = pd.read_csv('./data/raw/data_berka/loan.asc', sep=';')

## FUNCTION CHECKS FOR EMPTY VALUES IN A CELL

In [19]:
#To check for the empty values in the data

def checkEmpty(df):
        """Will check for empty cells in all columns for dataframe and return dict containing column name 
        and list of booleans containing empty rows.

        Parameters
        ----------
        df : dataframe,
            dataframe to be evaluated for empty cells

        """
        empty = {}
        for column in df.columns[0:]:
            empty[column] = df[column].astype(str).str.contains(r'^\s*$', na=False)
        return empty 

# FUNCTION WILL FILL EMPTY CELLS WITH OTHER IF DATA TYPE OF COLUMN IS STRING

In [20]:
#To Fill the Empty rows with Value 'Other'

def fillEmpty(df, empty):
    """Will fill empty cells in all String columns for dataframe with Other

        Parameters
        ----------
        df : dataframe,
            dataframe to be evaluated for empty cells
        empty : dict,
            Dict for columns to be updated   
    """
    for column in df.columns[0:]:
        if df[column].dtype == object:
            df.loc[empty[column], column] = 'Other'
    return df

# FUNCTION WILL APPEND TABLE NAME TO ALL COLUMNS

In [21]:
#To Rename Colum names with the Prefix - table name

def renameColumn(table, df):
    """Will rename columns in a dataframe by appending table name at the begining

        Parameters
        ----------
        table : string,
            name of the table to be appended at start 
        df : dataframe,
            dataframe to be evaluated for empty cells   
    """
    for column in df.columns[1:]:
        if table not in column:
            column_name =  table + '_' + column
            df.rename(columns={column : column_name}, inplace = True)
    return df

# CLIENT DATA 

In [22]:
#initial Client Data
client.head(2)

Unnamed: 0,client_id,birth_number,district_id
0,1,706213,18
1,2,450204,1


## PROCESSING CLIENT DATA

In [23]:
#Renames Columns with the client Prefix
client = renameColumn('client', client)

# Checking missing columns
print("The number of missing columns are: " +  str(client.isna().sum().sum()))

#Getting Birth Year
client['client_birth_year'] = client['client_birth_number'].astype(str).str[:2].astype(int)
client['client_birth_year'] = client['client_birth_year'] + 1900

#Client Birth Month & Birth Day
client['client_birth_month'] = client['client_birth_number'].astype(str).str[2:4].astype(int)
client['client_birth_day'] = client['client_birth_number'].astype(str).str[4:6].astype(int)

#Client Gender
client['client_gender'] = 'M'
client.loc[client['client_birth_month'] > 50, 'client_gender'] = 'F'
client.loc[client['client_birth_month'] > 50, 'client_birth_month'] = client['client_birth_month'] - 50

#calculate the age of clients
client['client_age'] = 1997 - client['client_birth_year']

#Age Group for clients
client['client_age_group'] = client['client_age'] // 10 * 10

#client Category as per thier Age Group
client["client_category"]=np.where(client['client_age_group'] < 21 ,"Young(< 21)",
                                     (np.where(client['client_age_group'] < 55 , "Adult(21-55)" ,"Senior Citizen(> 55)")))

# Checking empty columns
emptyrows = checkEmpty(client)
client =  fillEmpty(client, emptyrows)
client.head()

The number of missing columns are: 0


Unnamed: 0,client_id,client_birth_number,client_district_id,client_birth_year,client_birth_month,client_birth_day,client_gender,client_age,client_age_group,client_category
0,1,706213,18,1970,12,13,F,27,20,Young(< 21)
1,2,450204,1,1945,2,4,M,52,50,Adult(21-55)
2,3,406009,1,1940,10,9,F,57,50,Adult(21-55)
3,4,561201,5,1956,12,1,M,41,40,Adult(21-55)
4,5,605703,5,1960,7,3,F,37,30,Adult(21-55)


# ACCOUNT DATA

In [24]:
#Initial Account Data
account.head(2)

Unnamed: 0,account_id,district_id,frequency,date
0,576,55,POPLATEK MESICNE,930101
1,3818,74,POPLATEK MESICNE,930101


## PROCESSING ACCOUNT DATA

In [25]:
#Renames Column names
account = renameColumn('account', account)

# Checking missing columns
print("The number of missing columns are: " +  str(account.isna().sum().sum()))

#Processing Date ( Year, Month & Day)
account['account_year'] = account['account_date'].astype(str).str[:2].astype(int) 
account['account_year'] = account['account_year'] + 1900
account['account_month'] = account['account_date'].astype(str).str[2:4].astype(int)
account['account_day'] = account['account_date'].astype(str).str[4:6].astype(int)

#getting Length of relationship (LOR) of the accounts
account['account_lor'] = 1997 - account['account_year']


# Checking empty columns
emptyrows = checkEmpty(account)
account =  fillEmpty(account, emptyrows)
account.head(2)

The number of missing columns are: 0


Unnamed: 0,account_id,account_district_id,account_frequency,account_date,account_year,account_month,account_day,account_lor
0,576,55,POPLATEK MESICNE,930101,1993,1,1,4
1,3818,74,POPLATEK MESICNE,930101,1993,1,1,4


# DISPOSITION DATA

In [26]:
#initial Disposition Data
disp.head(2)

Unnamed: 0,disp_id,client_id,account_id,type
0,1,1,1,OWNER
1,2,2,2,OWNER


## PROCESSING DISPOSITION DATA

In [27]:
#Rename Columns
disp = renameColumn('disp', disp)

# Checking missing columns
print("The number of missing columns are: " +  str(disp.isna().sum().sum()))

# Checking empty columns
emptyrows = checkEmpty(disp)
disp =  fillEmpty(disp, emptyrows)
disp.head(2)

The number of missing columns are: 0


Unnamed: 0,disp_id,disp_client_id,disp_account_id,disp_type
0,1,1,1,OWNER
1,2,2,2,OWNER


## MERGING ACCOUNT AND CLIENT DATA

In [28]:
#Merging Account and Disposition 
account_disp_merge = account.merge(disp, left_on = 'account_id', right_on = 'disp_account_id', how = 'left')

In [29]:
account_disp_merge.head(1)

Unnamed: 0,account_id,account_district_id,account_frequency,account_date,account_year,account_month,account_day,account_lor,disp_id,disp_client_id,disp_account_id,disp_type
0,576,55,POPLATEK MESICNE,930101,1993,1,1,4,692,692,576,OWNER


In [30]:
#Merging Client Data with the Account and Disposition

client_disp_merge = account_disp_merge.merge(client, left_on = 'disp_client_id', right_on = 'client_id', how = 'left')
client_disp_merge.loc[client_disp_merge['disp_type'] == 'DISPONENT','acc_has_disp'] = 1
client_disp_merge['acc_has_disp'] = client_disp_merge['acc_has_disp'].fillna(0)

#Check for accounts with more than one owner
summary = client_disp_merge[client_disp_merge['disp_type'] == 'OWNER'].groupby('disp_account_id')['disp_client_id'].count()
summary.unique()

#There are no accounts with more than one owner

array([1], dtype=int64)

## AGGREGATING AND MERGING DISPOSITION DATA BASED ON ACCOUNT

In [31]:
account_client_count = client_disp_merge.groupby(['disp_account_id'])[['acc_has_disp']].agg('sum')
client_disp_merge = account_client_count.merge(client_disp_merge, left_on = 'disp_account_id', right_on = 'disp_account_id', how = 'left')
client_disp_merge = client_disp_merge[client_disp_merge['disp_type'] == 'OWNER']

#Rename columns and drop duplicated or redundant columns
client_disp_merge.drop(['disp_client_id', 'client_birth_number', 'acc_has_disp_x'], axis=1, inplace=True)
client_disp_merge.rename(columns={'acc_has_disp_y' : 'num_disp'}, inplace = True)
client_disp_merge.head()

Unnamed: 0,disp_account_id,account_id,account_district_id,account_frequency,account_date,account_year,account_month,account_day,account_lor,disp_id,...,client_id,client_district_id,client_birth_year,client_birth_month,client_birth_day,client_gender,client_age,client_age_group,client_category,num_disp
0,1,1,18,POPLATEK MESICNE,950324,1995,3,24,2,1,...,1,18,1970,12,13,F,27,20,Young(< 21),0.0
1,2,2,1,POPLATEK MESICNE,930226,1993,2,26,4,2,...,2,1,1945,2,4,M,52,50,Adult(21-55),0.0
3,3,3,5,POPLATEK MESICNE,970707,1997,7,7,0,4,...,4,5,1956,12,1,M,41,40,Adult(21-55),0.0
5,4,4,12,POPLATEK MESICNE,960221,1996,2,21,1,6,...,6,12,1919,9,22,M,78,70,Senior Citizen(> 55),0.0
6,5,5,15,POPLATEK MESICNE,970530,1997,5,30,0,7,...,7,15,1929,1,25,M,68,60,Senior Citizen(> 55),0.0


# DISTRICT DATA

In [32]:
district.head(1)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,1,Hl.m. Praha,Prague,1204953,0,0,0,1,1,100.0,12541,0.29,0.43,167,85677,99107


## PROCESSING DISTRICT DATA

In [33]:
# Checking missing columns
print("The number of missing columns are: " +  str(district.isna().sum().sum()))

#Checking values in A12, A15 column prior to converting to float
print(district['A12'].unique())
print(district['A15'].unique())

The number of missing columns are: 0
['0.29' '1.67' '1.95' '4.64' '3.85' '2.95' '2.26' '1.25' '3.39' '0.56'
 '0.45' '3.83' '2.77' '1.42' '3.13' '1.12' '2.38' '2.83' '2.65' '1.51'
 '1.10' '1.79' '1.39' '2.47' '2.64' '0.65' '1.62' '2.82' '3.38' '3.52'
 '2.80' '5.75' '6.43' '1.02' '3.33' '4.46' '7.08' '7.34' '6.49' '3.32'
 '2.41' '1.72' '2.79' '2.28' '1.78' '1.89' '4.83' '2.51' '2.52' '2.53'
 '1.60' '1.88' '4.69' '3.73' '3.24' '3.45' '4.76' '1.29' '3.79' '5.74'
 '3.51' '5.77' '4.09' '?' '6.63' '5.93' '3.80' '4.75' '5.38' '4.73' '4.01']
['85677' '2159' '2824' '5244' '2616' '2640' '4289' '5179' '2987' '3810'
 '3475' '3804' '1597' '6604' '1845' '1874' '1003' '1740' '999' '1563'
 '2299' '1089' '2879' '5198' '1822' '6041' '1029' '1580' '818' '2985'
 '1328' '4340' '4650' '5323' '3384' '5796' '4147' '2653' '4947' '6949'
 '6445' '1658' '4085' '2166' '2080' '2854' '6079' '1655' '1660' '2123'
 '3496' '2564' '1850' '18721' '3659' '3729' '2212' '2595' '1879' '2112'
 '2719' '1562' '4484' '2157' '2247'

In [34]:
#Replacing '?' in A12, A15 column with 0.0 to remove impact
district.loc[district['A12'] == '?', 'A12'] = 0.0
district.loc[district['A15'] == '?', 'A15'] = 0

#Changing to column to Float, int
district['A12'] = district['A12'].astype(float)
district['A15'] = district['A15'].astype(int)

#Renaming district columns
district.rename(columns={'A1' : 'District_Code'}, inplace = True)
district.rename(columns={'A2' : 'District_Name'}, inplace = True)
district.rename(columns={'A3' : 'Region'}, inplace = True)
district.rename(columns={'A4' : 'n_inhabitants'}, inplace = True)
district.rename(columns={'A5' : 'municipalities_inh_499'}, inplace = True)
district.rename(columns={'A6' : 'municipalities_500_1999'}, inplace = True)
district.rename(columns={'A7' : 'municipalities_2000_9999'}, inplace = True)
district.rename(columns={'A8' : 'municipalities_10000'}, inplace = True)
district.rename(columns={'A9' : 'n_cities'}, inplace = True)
district.rename(columns={'A10' : 'ratio_urban_inh'}, inplace = True)
district.rename(columns={'A11' : 'average_salary'}, inplace = True)
district.rename(columns={'A12' : 'unemploymant_rate_95'}, inplace = True)
district.rename(columns={'A13' : 'unemploymant_rate_96'}, inplace = True)
district.rename(columns={'A14' : 'entrepreneurs_per_1000'}, inplace = True)
district.rename(columns={'A15' : 'committed_crimes_95'}, inplace = True)
district.rename(columns={'A16' : 'committed_crimes_96'}, inplace = True)

# Checking empty columns
emptyrows = checkEmpty(district)
district =  fillEmpty(district, emptyrows)
district.head(3)

Unnamed: 0,District_Code,District_Name,Region,n_inhabitants,municipalities_inh_499,municipalities_500_1999,municipalities_2000_9999,municipalities_10000,n_cities,ratio_urban_inh,average_salary,unemploymant_rate_95,unemploymant_rate_96,entrepreneurs_per_1000,committed_crimes_95,committed_crimes_96
0,1,Hl.m. Praha,Prague,1204953,0,0,0,1,1,100.0,12541,0.29,0.43,167,85677,99107
1,2,Benesov,central Bohemia,88884,80,26,6,2,5,46.7,8507,1.67,1.85,132,2159,2674
2,3,Beroun,central Bohemia,75232,55,26,4,1,5,41.7,8980,1.95,2.21,111,2824,2813


In [35]:
#Calculating Increase and Decrease in the unemployment rates & Committed crimes 

district.loc[district['unemploymant_rate_96'] > district['unemploymant_rate_95'], 'increase_in_unemploymant_rate'] = district['unemploymant_rate_96'] - district['unemploymant_rate_95']
district.loc[district['committed_crimes_96'] > district['committed_crimes_95'], 'increase_in_committed_crimes'] = district['committed_crimes_96'] - district['committed_crimes_95']
district.loc[district['unemploymant_rate_96'] < district['unemploymant_rate_95'], 'decrease_in_unemploymant_rate'] = district['unemploymant_rate_95'] - district['unemploymant_rate_96']
district.loc[district['committed_crimes_96'] < district['committed_crimes_95'], 'decrease_in_committed_crimes'] = district['committed_crimes_95'] - district['committed_crimes_96']


district['increase_in_unemploymant_rate'] = district['increase_in_unemploymant_rate'].fillna(0)
district['increase_in_committed_crimes'] = district['increase_in_committed_crimes'].fillna(0)
district['decrease_in_unemploymant_rate'] = district['decrease_in_unemploymant_rate'].fillna(0)
district['decrease_in_committed_crimes'] = district['decrease_in_committed_crimes'].fillna(0)

#COMMENTS 

district.loc[:,'total_municipalities'] = district['municipalities_inh_499'] + district['municipalities_500_1999'] + district['municipalities_2000_9999'] + district['municipalities_10000']
district.loc[:, 'number_of_employed_population_96'] =  district['n_inhabitants'] - district['unemploymant_rate_96']/100 * district['n_inhabitants']
district.loc[:, 'number_of_employed_urban_population_96'] =  district['number_of_employed_population_96'] - district['number_of_employed_population_96'] / district['ratio_urban_inh']
district['number_of_employed_population_96'] = district['number_of_employed_population_96'].astype(int)
district['number_of_employed_urban_population_96'] = district['number_of_employed_urban_population_96'].astype(int)
district.head(3)

Unnamed: 0,District_Code,District_Name,Region,n_inhabitants,municipalities_inh_499,municipalities_500_1999,municipalities_2000_9999,municipalities_10000,n_cities,ratio_urban_inh,...,entrepreneurs_per_1000,committed_crimes_95,committed_crimes_96,increase_in_unemploymant_rate,increase_in_committed_crimes,decrease_in_unemploymant_rate,decrease_in_committed_crimes,total_municipalities,number_of_employed_population_96,number_of_employed_urban_population_96
0,1,Hl.m. Praha,Prague,1204953,0,0,0,1,1,100.0,...,167,85677,99107,0.14,13430.0,0.0,0.0,1,1199771,1187773
1,2,Benesov,central Bohemia,88884,80,26,6,2,5,46.7,...,132,2159,2674,0.18,515.0,0.0,0.0,114,87239,85371
2,3,Beroun,central Bohemia,75232,55,26,4,1,5,41.7,...,111,2824,2813,0.26,0.0,0.0,11.0,86,73569,71805


## MERGING DISTRICT DATA TO CLIENT ACCOUNT AND DISPOSITION TABLE

In [36]:
#Merging District data with the data we have so far

client_disp_dist_merge = client_disp_merge.merge(district, left_on = 'client_district_id', right_on='District_Code', how = 'left' )

#Dropping redundant columns
client_disp_merge.drop(['account_district_id', 'account_date', 'disp_account_id', 'account_date','client_district_id','disp_account_id'], axis=1, inplace=True)

client_disp_merge.head(3)

Unnamed: 0,account_id,account_frequency,account_year,account_month,account_day,account_lor,disp_id,disp_type,client_id,client_birth_year,client_birth_month,client_birth_day,client_gender,client_age,client_age_group,client_category,num_disp
0,1,POPLATEK MESICNE,1995,3,24,2,1,OWNER,1,1970,12,13,F,27,20,Young(< 21),0.0
1,2,POPLATEK MESICNE,1993,2,26,4,2,OWNER,2,1945,2,4,M,52,50,Adult(21-55),0.0
3,3,POPLATEK MESICNE,1997,7,7,0,4,OWNER,4,1956,12,1,M,41,40,Adult(21-55),0.0


# ORDER DATA

In [37]:
#Initial Order Data
order.head(1)

Unnamed: 0,order_id,account_id,bank_to,account_to,amount,k_symbol
0,29401,1,YZ,87144583,2452.0,SIPO


## PROCESSING ORDER DATA

In [38]:
#Rename Columns
order = renameColumn('order', order)

# Checking missing columns
print("The number of missing columns are: " +  str(order.isna().sum().sum()))

# Checking empty columns and fill them
emptyrows = checkEmpty(order)
order =  fillEmpty(order, emptyrows)
order.head(2)


The number of missing columns are: 0


Unnamed: 0,order_id,order_account_id,order_bank_to,order_account_to,order_amount,order_k_symbol
0,29401,1,YZ,87144583,2452.0,SIPO
1,29402,2,ST,89597016,3372.7,UVER


## AGGREGATING RESULTS IN ORDER DATA

In [39]:
#Calculating Order Frequency
order_freq1 = order.groupby(by=['order_account_id'], as_index=True)['order_amount'].agg(["count","sum"])
order_ksymbol_freq = order.groupby(by=['order_account_id', 'order_k_symbol'])['order_amount'].sum().unstack('order_k_symbol', fill_value=0)
order_freq1.head(2)


Unnamed: 0_level_0,count,sum
order_account_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,2452.0
2,2,10638.7


In [40]:
order_ksymbol_freq.head(3)

order_k_symbol,LEASING,Other,POJISTNE,SIPO,UVER
order_account_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.0,0.0,0.0,2452.0,0.0
2,0.0,0.0,0.0,7266.0,3372.7
3,0.0,327.0,3539.0,1135.0,0.0


## MERGING AGGREGATED RESULTS FOR ORDER TO PREVIOUSLY MERGED TABLE

In [41]:
#Merging the Data

processed_table = client_disp_dist_merge.merge(order_freq1, left_on = 'account_id', right_on = 'order_account_id', how = 'left')
processed_table = processed_table.merge(order_ksymbol_freq, left_on = 'account_id', right_on = 'order_account_id', how = 'left')

In [42]:
#Renaming Required Columns
processed_table.rename(columns={'count' : 'num_order','sum' : 'total_order_amount' }, inplace = True)

In [43]:
#Data till this point - Account,Client, Disposition, Order,District 

processed_table.head(3)

Unnamed: 0,disp_account_id,account_id,account_district_id,account_frequency,account_date,account_year,account_month,account_day,account_lor,disp_id,...,total_municipalities,number_of_employed_population_96,number_of_employed_urban_population_96,num_order,total_order_amount,LEASING,Other,POJISTNE,SIPO,UVER
0,1,1,18,POPLATEK MESICNE,950324,1995,3,24,2,1,...,76,68330,67284,1.0,2452.0,0.0,0.0,0.0,2452.0,0.0
1,2,2,1,POPLATEK MESICNE,930226,1993,2,26,4,2,...,1,1199771,1187773,2.0,10638.7,0.0,0.0,0.0,7266.0,3372.7
2,3,3,5,POPLATEK MESICNE,970707,1997,7,7,0,4,...,100,91380,89602,3.0,5001.0,0.0,327.0,3539.0,1135.0,0.0


# LOAN DATA

In [44]:
#Initial Loan Data
loan.head(2)

Unnamed: 0,loan_id,account_id,date,amount,duration,payments,status
0,5314,1787,930705,96396,12,8033.0,B
1,5316,1801,930711,165960,36,4610.0,A


## PROCESSING LOAN DATA

In [45]:
#Rename Columns
loan = renameColumn('loan', loan)
# Checking missing columns
print("The number of missing columns are: " +  str(loan.isna().sum().sum()))


#Processing Date to get Year, Month & Day
loan['loan_year'] = loan['loan_date'].astype(str).str[:2].astype(int) 
loan['loan_year'] = loan['loan_year'] + 1900
loan['loan_month'] = loan['loan_date'].astype(str).str[2:4].astype(int)
loan['loan_day'] = loan['loan_date'].astype(str).str[4:6].astype(int)


#Current Loan Status - Getting the meaning of the Loan Status
loan["cur_loan_Status"] = 'Not Available'
loan.loc[loan["loan_status"]=="A","cur_loan_Status"]="Closed"
loan.loc[loan["loan_status"]=="B","cur_loan_Status"]="Closed - Defaulted"
loan.loc[loan["loan_status"]=="C","cur_loan_Status"]="Running"
loan.loc[loan["loan_status"]=="D","cur_loan_Status"]="Running - in Debt"


#Loan Category based on Loan Amount - Low, Medium & High
loan["loan_category"]=np.where(loan['loan_amount'] < 196940 ,"Low",
                                     (np.where(loan['loan_amount'] < 393880.0 , "Medium" ,"High")))

 

#creating target Variable - (loan_granted)
loan['loan_granted'] = 0
loan.loc[loan["loan_year"]  == 1997,'loan_granted'] = 1

# Checking empty columns
emptyrows = checkEmpty(loan)
loan =  fillEmpty(loan, emptyrows)
loan.head(3)

The number of missing columns are: 0


Unnamed: 0,loan_id,loan_account_id,loan_date,loan_amount,loan_duration,loan_payments,loan_status,loan_year,loan_month,loan_day,cur_loan_Status,loan_category,loan_granted
0,5314,1787,930705,96396,12,8033.0,B,1993,7,5,Closed - Defaulted,Low,0
1,5316,1801,930711,165960,36,4610.0,A,1993,7,11,Closed,Low,0
2,6863,9188,930728,127080,60,2118.0,A,1993,7,28,Closed,Low,0


## MERGING LOAN DATA TO PROCESSED TABLE

In [46]:
#Check if loan has unique account id
#loan = loan.groupby('loan_account_id').sum()

#Merging the Loan data to the processed_table
processed_table = processed_table.merge(loan, left_on = 'account_id', right_on = 'loan_account_id', how = 'left')
processed_table.head()

Unnamed: 0,disp_account_id,account_id,account_district_id,account_frequency,account_date,account_year,account_month,account_day,account_lor,disp_id,...,loan_amount,loan_duration,loan_payments,loan_status,loan_year,loan_month,loan_day,cur_loan_Status,loan_category,loan_granted
0,1,1,18,POPLATEK MESICNE,950324,1995,3,24,2,1,...,,,,,,,,,,
1,2,2,1,POPLATEK MESICNE,930226,1993,2,26,4,2,...,80952.0,24.0,3373.0,A,1994.0,1.0,5.0,Closed,Low,0.0
2,3,3,5,POPLATEK MESICNE,970707,1997,7,7,0,4,...,,,,,,,,,,
3,4,4,12,POPLATEK MESICNE,960221,1996,2,21,1,6,...,,,,,,,,,,
4,5,5,15,POPLATEK MESICNE,970530,1997,5,30,0,7,...,,,,,,,,,,


# CARD DATA

In [47]:
#Initial Card Data
card.head(2)

Unnamed: 0,card_id,disp_id,type,issued
0,1005,9285,classic,931107 00:00:00
1,104,588,classic,940119 00:00:00


## PROCESSING CARD DATA

In [48]:
#Rename Columns
card = renameColumn('card', card)
# Checking missing columns
print("The number of missing columns are: " +  str(card.isna().sum().sum()))

#Processing Date to get Year,Month and issued_day
card['card_issued_year'] = card['card_issued'].astype(str).str[:2].astype(int) 
card['card_issued_year'] = card['card_issued_year'] + 1900
card['card_issued_month'] = card['card_issued'].astype(str).str[2:4].astype(int)
card['card_issued_day'] = card['card_issued'].astype(str).str[4:6].astype(int)

#change the date format
card['card_date_issued'] = pd.to_datetime(card['card_issued']).dt.date

#Length of Relationship for the Card (card_LOR)
card["card_LOR"] = datetime.datetime(1997,1,1) - (pd.to_datetime(card.card_date_issued))

#removing negative values for LOR
card['card_LOR'] = abs(card['card_LOR'])

#creating target Variable - (card_issued_97) 
card['card_issued_97'] = 0 
card.loc[card["card_issued_year"]  == 1997,'card_issued_97'] = 1



# Checking empty columns
emptyrows = checkEmpty(card)
card =  fillEmpty(card, emptyrows)
card.head(2)

The number of missing columns are: 0


Unnamed: 0,card_id,card_disp_id,card_type,card_issued,card_issued_year,card_issued_month,card_issued_day,card_date_issued,card_LOR,card_issued_97
0,1005,9285,classic,931107 00:00:00,1993,11,7,1993-11-07,1151 days,0
1,104,588,classic,940119 00:00:00,1994,1,19,1994-01-19,1078 days,0


In [49]:
#Checking Cards per Account

card_disp_merge = disp.merge(card, left_on = 'disp_id', right_on = 'card_disp_id', how = 'inner')
summary = card_disp_merge.groupby('disp_account_id')['card_id'].agg('count')
summary.unique()

#All accounts have have one card assoiciated with the account

array([1], dtype=int64)

## MERGING CARD DATA TO PROCESSED TABLE

In [50]:
#Merging Card Data to the processed_table

processed_table = processed_table.merge(card_disp_merge, left_on = 'account_id', right_on = 'disp_account_id', how = 'left')
processed_table.drop(['disp_account_id_x', 'disp_id_x', 'disp_type_x', 'disp_id_y', 'disp_account_id_y', 'disp_type_y', 'card_disp_id'], axis=1, inplace=True)
processed_table.info()

KeyError: 'disp_account_id'

# TRANSACTION DATA

In [None]:
#initial trans data
trans.head(2)

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account
0,695247,2378,930101,PRIJEM,VKLAD,700.0,700.0,,,
1,171812,576,930101,PRIJEM,VKLAD,900.0,900.0,,,


## PROCESSING TRANSACTION DATA

In [None]:
#Rename Columns
trans = renameColumn('trans', trans)
# Checking missing columns
print("The number of missing columns are: " +  str(trans.isna().sum().sum()))

#Chcking for empty rows
emptyrows = checkEmpty(trans)

#check na per column 
#for column in emptyrows:
    #print("the empty rows in " + column + " are: ")
    #print(trans[column].isna().sum())

#Fill Empty Rows
trans =  fillEmpty(trans, emptyrows)

The number of missing columns are: 2208738


In [None]:
#Rename Columns 

trans.rename(columns={'k_symbol' : 'trans_k_symbol'}, inplace = True)
trans.rename(columns={'bank' : 'partner_bank'}, inplace = True)
trans.rename(columns={'account' : 'partner_account'}, inplace = True)
trans.rename(columns={'operation' : 'trans_operation'}, inplace = True)
trans.rename(columns={'type' : 'trans_type'}, inplace = True)
trans.rename(columns={'date' : 'trans_date'}, inplace = True)
trans.rename(columns={'amount' : 'trans_amount'}, inplace = True)
trans.rename(columns={'balance' : 'trans_balance'}, inplace = True)

#renaming the trans_type and trans_operation values 

trans['trans_type'] = trans['trans_type'].replace(['PRIJEM', 'VYDAJ', 'VYBER'], ['Credit', 'Withdrawal', 'Withdrawal'])

trans['trans_operation'] = trans['trans_operation'].replace(['VYBER KARTOU', 'VKLAD','PREVOD Z UCTU','VYBER','PREVOD NA UCET'], 
                                                  ['card_withdrawal', 'credit_cash','credit_otherbank','cash_withdrawal','transfer_otherbank'])



# Extract the transaction day
trans['trans_day'] = trans['trans_date'].astype(str).str[-2:].astype(int)

# Extract the transaction month
trans['trans_month'] = trans['trans_date'].astype(str).str[2:4].astype(int)

#Extract the transaction year
trans['trans_year'] = trans['trans_date'].astype(str).str[:2].astype(int) + 1900


#changing the trans_date format
trans['trans_date'] = '19' + trans['trans_date'].astype(str)
trans['trans_date'] = pd.to_datetime(trans['trans_date'], format='%Y%m%d').dt.strftime("%d-%m-%Y")
trans['date'] = pd.DatetimeIndex(trans['trans_date']).date

In [None]:
#Renaming trans_account_id to trans_account_id
trans.rename(columns={'trans_account_id' : 'account_id'}, inplace = True)

In [None]:
#RECENY FREQUENCY AND MONETARY VALUES FOR TRANSACTIONS

# Calculating RECENCY of the transactions
recency_df = trans.groupby(by='account_id', as_index=False)['date'].max()

#naming extracted last_trans_date column
recency_df.columns = ['account_id','last_trans_date']
recent_date=recency_df.last_trans_date.max()

#Calculating recency from the last date
recency_df['trans_recency'] = recency_df['last_trans_date'].apply(lambda x: (recent_date - x).days)
recency_df.head(1)



# Calculating FREQUENCY (How often a tansaction is made) of the transaction
freq_df = trans.groupby(by=['account_id'], as_index=False)['trans_id'].count()
freq_df.columns = ['account_id','trans_frequency']
freq_df.head(1)

# Calculating Monetary 
monetary_df = trans.groupby(by='account_id',as_index=False).agg({'trans_amount': 'sum'})
monetary_df.columns = ['account_id','monetary']
monetary_df.head(1)


# RFM Table
# Merge recency dataframe with frequency dataframe
temp_df = recency_df.merge(freq_df,on='account_id')

# Merge monetary_df to temp_df 
rfm_df = temp_df.merge(monetary_df,on='account_id')
rfm_df.head(3)

Unnamed: 0,account_id,last_trans_date,trans_recency,trans_frequency,monetary
0,1,1998-12-31,0,239,375174.5
1,2,1998-12-31,0,478,3151479.3
2,3,1998-12-31,0,117,295021.8


In [None]:
# RFM SCORE - ranking r,f,m  and normalize the values to calculate  the rfm score 


# Rank each metric Recency , Frequency & Monetary
rfm_df['r_rank'] = rfm_df['trans_recency'].rank( ascending=False)
rfm_df['f_rank'] = rfm_df['trans_frequency'].rank(ascending=True)
rfm_df['m_rank'] = rfm_df['monetary'].rank(ascending=True)


# normalize each rank with Max rank
rfm_df['r_rank_norm']=(rfm_df['r_rank']/rfm_df['r_rank'].max())*100
rfm_df['f_rank_norm']=(rfm_df['f_rank']/rfm_df['f_rank'].max())*100
rfm_df['m_rank_norm']=(rfm_df['m_rank']/rfm_df['m_rank'].max())*100


# RFM_Score = 0.15*R_rank_norm + 0.28*F_rank_norm + 0.57M_rank_norm**
rfm_df['rfm_Score']=0.15*rfm_df['r_rank_norm']+0.28*rfm_df['f_rank_norm']+0.57*rfm_df['m_rank_norm']
rfm_df=rfm_df.round(0)

#dropping varibales used to calculate rfm_score
rfm_df = rfm_df.drop(columns =['r_rank', 'f_rank','m_rank','r_rank_norm','f_rank_norm','m_rank_norm'])

# # Segment Accounts based on RFM score
# 0 - 30 - Low Value
# 30 - 70 - Gold
# 70 - 100 - Platinum
rfm_df["account_segment"]=np.where(rfm_df['rfm_Score'] > 70 ,"Platinum",
                                   (np.where(rfm_df['rfm_Score'] < 30 , "Low value" ,"Gold")))




In [None]:
rfm_df.head(3)

Unnamed: 0,account_id,last_trans_date,trans_recency,trans_frequency,monetary,rfm_Score,account_segment
0,1,1998-12-31,0,239,375174.0,43.0,Gold
1,2,1998-12-31,0,478,3151479.0,93.0,Platinum
2,3,1998-12-31,0,117,295022.0,30.0,Gold


## MERGE AGGREGATED TRANSACTION DATA TO PROCESSED TABLE

In [None]:
#Merging rfm_df to the trans

processed_table =  processed_table.merge(rfm_df, left_on = 'account_id', right_on = 'account_id', how = 'left')
#p =  pd.merge(trans,rfm_df,'left',on = 'account_id')

In [None]:
#Calculating cash_credit & cash_withdrawal
trans["cash_credit"] = 0
trans["cash_withdrawal"] = 0


trans.loc[trans['trans_operation'] == 'cash_credit', 'cash_credit'] = trans.loc[trans['trans_operation'] == 'cash_credit','trans_amount']


trans.loc[trans['trans_operation'] == 'cash_withdrawal', 'cash_withdrawal'] = trans.loc[trans['trans_operation'] == 'cash_withdrawal','trans_amount']


#Grouping the aggregated values
trans_temp = trans.groupby(['account_id'], as_index = False).agg({'trans_amount' : 'sum','cash_credit' : 'sum',
                                                               'cash_withdrawal' : 'sum'})



In [None]:
trans_temp.rename(columns={'trans_amount' : 'total_trans_amount'}, inplace = True)

In [None]:
#Dropping cash_credit , cash_withdrawal from trans and Merging trans_temp to trans

trans = trans.drop(columns =['cash_credit', 'cash_withdrawal'])

processed_table = pd.merge(processed_table,trans_temp,on = 'account_id',how = 'left')

In [None]:
#Average Balance and Amount Transacted per account 

avg_df = trans.groupby(by='account_id', as_index=False).agg({'trans_balance': 'mean','trans_amount':'mean' })
avg_df.columns = ['account_id','trans_avgbalance','trans_avgamount']
avg_df.trans_avgbalance = avg_df.trans_avgbalance.round()
avg_df.trans_avgamount = avg_df.trans_avgamount.round()
avg_df.head(1)

Unnamed: 0,account_id,trans_avgbalance,trans_avgamount
0,1,15994.0,1570.0


## MERGING AVERAGE TO PROCESSED DATA

In [None]:
#Merging avg_df to Processed_table
processed_table =  processed_table.merge(avg_df, left_on = 'account_id', right_on = 'account_id', how = 'left')


In [None]:
trans.rename(columns={'account_id' : 'trans_account_id'}, inplace = True)

## CREATING AGGREAGATED RESULTS FOR TRANSACTION

In [None]:
trans_k_symbol_agg = trans.loc[trans['trans_year'] == 1996].groupby(['trans_account_id', 'trans_k_symbol'])['trans_amount'].agg(['count','sum']).unstack('trans_k_symbol')
processed_table =  processed_table.merge(trans_k_symbol_agg, left_on = 'account_id', right_on = 'trans_account_id', how = 'left')

trans_operation_agg = trans.loc[trans['trans_year'] == 1996].groupby(['trans_account_id', 'trans_operation'])['trans_amount'].agg(['count','sum']).unstack('trans_operation')
processed_table =  processed_table.merge(trans_operation_agg, left_on = 'account_id', right_on = 'trans_account_id', how = 'left')

trans_type_agg = trans.loc[trans['trans_year'] == 1996].groupby(['trans_account_id', 'trans_type'])['trans_amount'].agg(['count','sum']).unstack('trans_type')
processed_table =  processed_table.merge(trans_type_agg, left_on = 'account_id', right_on = 'trans_account_id', how = 'left')


  return merge(


In [None]:
processed_table.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4500 entries, 0 to 4499
Columns: 112 entries, account_id to ('sum', 'Withdrawal')
dtypes: float64(68), int32(12), int64(18), object(13), timedelta64[ns](1)
memory usage: 3.7+ MB


In [None]:
processed_table.head()

Unnamed: 0,account_id,account_district_id,account_frequency,account_date,account_year,account_month,account_day,account_lor,client_id,client_district_id,...,"(count, transfer_otherbank)","(sum, card_withdrawal)","(sum, cash_withdrawal)","(sum, credit_cash)","(sum, credit_otherbank)","(sum, transfer_otherbank)","(count, Credit)","(count, Withdrawal)","(sum, Credit)","(sum, Withdrawal)"
0,1,18,POPLATEK MESICNE,950324,1995,3,24,2,1,18,...,12.0,,26155.2,5100.0,44148.0,29424.0,29.0,45.0,50066.0,55579.2
1,2,1,POPLATEK MESICNE,930226,1993,2,26,4,2,1,...,12.0,,187382.2,23500.0,263068.0,87192.0,29.0,56.0,288542.9,274574.2
2,3,5,POPLATEK MESICNE,970707,1997,7,7,0,4,5,...,,,,,,,,,,
3,4,12,POPLATEK MESICNE,960221,1996,2,21,1,6,12,...,11.0,,13877.6,800.0,55530.0,18100.0,17.0,22.0,56920.5,31977.6
4,5,15,POPLATEK MESICNE,970530,1997,5,30,0,7,15,...,,,,,,,,,,


## DROPPING AND RENAMING COLUMNS

In [None]:
processed_table.drop(['account_date', 'client_district_id', 'loan_account_id', 'loan_date', 'card_issued', 'card_date_issued'], axis=1, inplace=True)

processed_table.rename(columns={('count', 'DUCHOD') : 'num_of_oldage_pension'}, inplace = True)
processed_table.rename(columns={('count', 'Other') : 'num_of_other'}, inplace = True)
processed_table.rename(columns={('count', 'POJISTNE') : 'num_of_insurrance_payment'}, inplace = True)
processed_table.rename(columns={('count', 'SANKC. UROK') : 'num_of_interest_negativebalance'}, inplace = True)
processed_table.rename(columns={('count', 'SLUZBY') : 'num_of_statement_payment'}, inplace = True)
processed_table.rename(columns={('count', 'UROK') : 'num_of_interest_credited'}, inplace = True)
processed_table.rename(columns={('count', 'UVER') : 'num_of_loan_payment'}, inplace = True)
processed_table.rename(columns={('count', 'SIPO') : 'num_of_household_payment'}, inplace = True)

processed_table.rename(columns={('sum', 'DUCHOD') : 'amount_of_oldage_pension'}, inplace = True)
processed_table.rename(columns={('sum', 'Other') : 'amount_of_other'}, inplace = True)
processed_table.rename(columns={('sum', 'POJISTNE') : 'amount_of_insurrance_payment'}, inplace = True)
processed_table.rename(columns={('sum', 'SANKC. UROK') : 'amount_of_interest_negativebalance'}, inplace = True)
processed_table.rename(columns={('sum', 'SLUZBY') : 'amount_of_statement_payment'}, inplace = True)
processed_table.rename(columns={('sum', 'UROK') : 'amount_of_interest_credited'}, inplace = True)
processed_table.rename(columns={('sum', 'UVER') : 'amount_of_loan_payment'}, inplace = True)
processed_table.rename(columns={('sum', 'SIPO') : 'amount_of_household_payment'}, inplace = True)

processed_table.rename(columns={('count', 'card_withdrawal') : 'num_of_card_withdrawal'}, inplace = True)
processed_table.rename(columns={('count', 'cash_withdrawal') : 'num_of_cash_withdrawal'}, inplace = True)
processed_table.rename(columns={('count', 'credit_cash') : 'num_of_credit_cash'}, inplace = True)
processed_table.rename(columns={('count', 'credit_otherbank') : 'num_of_credit_otherbank'}, inplace = True)
processed_table.rename(columns={('count', 'transfer_otherbank') : 'num_of_transfer_otherbank'}, inplace = True)

processed_table.rename(columns={('sum', 'card_withdrawal') : 'amount_of_card_withdrawal'}, inplace = True)
processed_table.rename(columns={('sum', 'cash_withdrawal') : 'amount_of_cash_withdrawal'}, inplace = True)
processed_table.rename(columns={('sum', 'credit_cash') : 'amount_of_credit_cash'}, inplace = True)
processed_table.rename(columns={('sum', 'credit_otherbank') : 'amount_of_credit_otherbank'}, inplace = True)
processed_table.rename(columns={('sum', 'transfer_otherbank') : 'amount_of_transfer_otherbank'}, inplace = True)


processed_table.rename(columns={('count', 'Credit') : 'num_of_credits'}, inplace = True)
processed_table.rename(columns={('count', 'Withdrawal') : 'num_of_withdrawals'}, inplace = True)

processed_table.rename(columns={('sum', 'Credit') : 'amount_of_credit'}, inplace = True)
processed_table.rename(columns={('sum', 'Withdrawal') : 'amount_of_withdrawals'}, inplace = True)


In [None]:
for column in processed_table.columns[0:]:
    print(column)

account_id
account_district_id
account_frequency
account_year
account_month
account_day
account_lor
client_id
client_birth_year
client_birth_month
client_birth_day
client_gender
client_age
client_age_group
client_category
num_disp
District_Code
District_Name
Region
n_inhabitants
municipalities_inh_499
municipalities_500_1999
municipalities_2000_9999
municipalities_10000
n_cities
ratio_urban_inh
average_salary
unemploymant_rate_95
unemploymant_rate_96
entrepreneurs_per_1000
committed_crimes_95
committed_crimes_96
increase_in_unemploymant_rate
increase_in_committed_crimes
decrease_in_unemploymant_rate
decrease_in_committed_crimes
total_municipalities
number_of_employed_population_96
number_of_employed_urban_population_96
num_order
total_order_amount
LEASING
Other
POJISTNE
SIPO
UVER
loan_id
loan_amount
loan_duration
loan_payments
loan_status
loan_year
loan_month
loan_day
cur_loan_Status
loan_category
loan_granted
disp_client_id
card_id
card_type
card_issued_year
card_issued_month
card_iss

In [None]:
processed_table.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4500 entries, 0 to 4499
Columns: 106 entries, account_id to amount_of_withdrawals
dtypes: float64(66), int32(12), int64(16), object(11), timedelta64[ns](1)
memory usage: 3.5+ MB


# WRITING DATA TO CSV FILE

In [None]:
processed_table.to_csv(os.path.join('./data/processed',r'processed_table.csv'))