In [9]:
import pandas as pd
import numpy as np

In [10]:
# Read the customer table and the label table; both are indexed by their 'cif' column.
AML = pd.read_csv('large.csv', index_col='cif')
labels = pd.read_csv('jeopardy.csv', index_col='cif')

Removing customers who are not using their account

In [11]:
# Keep only the rows with strictly positive turnover, then restrict the
# labels to exactly the same index (same rows, same order).
has_turnover = AML['turnover'].gt(0)
AML = AML.loc[has_turnover]
labels = labels.loc[AML.index, :]

Adding AML and CPI country scores

In [12]:
# Challenge country-id list, indexed by lower-cased country name.
country = (
    pd.read_csv('country.csv')
    .assign(country_name=lambda frame: frame['country_name'].str.lower())
    .set_index('country_name')
)

# Downloaded AML risk score and ranking per country, indexed the same way.
# Column names are supplied explicitly via `names`.
country_aml = (
    pd.read_csv('AML_risk_ranking.csv',
                names=['Country', 'country_score', 'country_ranking'])
    .assign(Country=lambda frame: frame['Country'].str.lower())
    .set_index('Country')
)

# Attach the AML score/ranking to the country list (matched on lower-cased name).
country = country.join(country_aml)

# Attach the country columns to each customer row, matching the customer's
# 'nationality' value against 'country_id'.
AML = AML.join(country.set_index('country_id'), on='nationality')

In [14]:
# Downloaded corruption-index cross-over table; only the columns needed for
# the join are kept.
corr_index = pd.read_csv('corruption_index_cross-over.csv')
cpi_columns = ['country_id', 'country_name', 'CPI Score 2018', 'Rank']
corr_index_df = corr_index.loc[:, cpi_columns]

# Attach the CPI score/ranking to each customer row, matching the customer's
# 'nationality' value against 'country_id'.
AML = AML.join(corr_index_df.set_index('country_id'), on='nationality')

In [15]:
# Give the joined score/ranking columns consistent '<source>_risk_<kind>' names.
risk_column_names = {
    "country_score": "aml_risk_score",
    "country_ranking": "aml_risk_ranking",
    "CPI Score 2018": "cpi_risk_score",
    "Rank": "cpi_risk_ranking",
}
AML = AML.rename(columns=risk_column_names)

Adding normalized and log-transformed features

In [16]:
# ATM amounts expressed relative to turnover.
for atm_col in ('atm_withdrawal', 'atm_deposit'):
    AML[atm_col + '_norm'] = AML[atm_col].div(AML['turnover'])

# Turnover per transaction.
AML['transaction_avg'] = AML['turnover'].div(AML['transaction_count'])

# log10(1 + x) version of each listed column, stored as '<column>_log'.
# The tuple order fixes the order in which the new columns are appended.
log_sources = (
    'turnover',
    'atm_withdrawal',
    'atm_deposit',
    'transaction_count',
    'distinct_counterparties',
    'atm_withdrawal_norm',
    'atm_deposit_norm',
    'inactive_days_average',
)
for source in log_sources:
    AML[source + '_log'] = np.log10(1 + AML[source])

Adding a new category

In [17]:
# Start from the existing category, then move every row flagged is_pep == 1
# into its own category code 3.
pep_rows = AML['is_pep'].eq(1)
AML['new_category'] = AML['category']
AML.loc[pep_rows, 'new_category'] = 3

Adding new names for categories

In [18]:
# Human-readable label for each new_category code; rows whose code is not
# listed here keep the empty-string default.
category_labels = (
    (0, 'normal_ind'),
    (3, 'pep_ind'),
    (1, 'company'),
    (2, 'institution'),
)
AML['category_name'] = ''
for code, label in category_labels:
    AML.loc[AML['new_category'] == code, 'category_name'] = label

Adding flag columns for customers with no ATM deposits or no ATM withdrawals

In [19]:
# 0/1 flags: 1 where the corresponding ATM amount is exactly zero, else 0.
for atm_col in ('atm_deposit', 'atm_withdrawal'):
    AML['no_' + atm_col] = AML[atm_col].eq(0).astype('int64')

In [20]:
# Write the enriched table to disk; the index is written as the first column.
AML.to_csv('large_cleaned.csv', index=True)