In [4]:
import pandas as pd
import numpy as np
import timeit

In [5]:
segment_dict = {'MM':1, 'MA':2, 'PB':3 }

subsegment_dict = {'XNA':-1,
                   'EMPL':1, 
                   'RETIRED':2, 
                   'STUD':3, 
                   'UNEMPL':4, 
                   'CHILD_ADOL':5, 
                   'SELF_EMPL':6, 
                   'PUBL_SERV':7, 
                   'DOCTOR':8, 
                   'LAWYER':9, 
                   'OWN_EMPL_ACT':99, 
                   'ASSOC_FAM':99, 
                   'CIV_ENG':99, 
                   'FREELANCER':99, 
                   'PUBL_ACC':99, 
                   'NOTAR':99, 
                   'DENTIST':99, 
                   'VETERINARIES':99, 
                   'PHARM':99, 
                   'MEDIATORI':99, 
                   'PUBL_ACC':99, 
                   'OWN_EMPL_RET':99, 
                   'OTHERS':99}

gender_dict = {'F':1, 'M':2}

marital_status_dict = {'X': -1, 'C':1, 'N':2, 'D':3, 'V':4} # Married, not married, divorced, widow

education_dict = {'XNA':-1, 
                  'Alte forme de invatamant':1,
                  'Scoala primara':2, 
                  'Scoala profesionala':3, 
                  'Scoala postliceala':4,
                  'Gimnaziu':5,
                  'Colegiu':6,
                  'Liceu':7,
                  'Master':8,
                  'Universitate':9
                 }

bcr_employee_dict = {'N':0, 'Y':1}
workout_flag_dict = {'N':0, 'Y':1}

rating_value_dict = {'N':-1, 'R':1, 'D2':2, 'D1':3, 'C2':4, 'C1':5, 'B2':6, 'B1':7, 'A2':8, 'A1':9}

marketing_agreement_dict = {'X':-1, 'N':0, 'Y':1}

In [39]:
def process_data(df):
    df['CIC'] = df['CIC'].astype(np.uint32)
    
    df['AGE'] = (df.apply(lambda x: (x['DAX'].year - int(x['BIRTH_DATE'][:4])) * 12 +
                                     x['DAX'].month - int(x['BIRTH_DATE'][5:7])
                                     if x['DAX'].year > 1900 else -1                 # data quality
                          , axis=1).astype(np.int8))
    df.drop('BIRTH_DATE', axis = 1, inplace = True)

    df['TENOR'] = (df.apply(lambda x: (x['DAX'].year - int(x['CLIENT_START_DATE'][:4])) * 12 +
                                       x['DAX'].month - int(x['CLIENT_START_DATE'][5:7])
                                       if not pd.isnull(x['CLIENT_START_DATE'])
                                          and int(x['CLIENT_START_DATE'][:4]) >= 1990   # data quality
                                       else -1
                          , axis=1).astype(np.int8))
    df.drop('CLIENT_START_DATE', axis = 1, inplace = True)
    
    df['GENDER'] = df['GENDER'].map(lambda x: gender_dict[x]).astype(np.int8)
    df['MARITAL_STATUS'] = df['MARITAL_STATUS'].map(lambda x: marital_status_dict[x]).astype(np.int8)
    df['SEGMENT'] = df['SEGMENT'].map(lambda x: segment_dict[x]).astype(np.int8)
    df['SUBSEGMENT'] = df['SUBSEGMENT'].map(lambda x: subsegment_dict['XNA' if pd.isnull(x)
                                                                      else x]).astype(np.int8)
    df['EDUCATION'] = df['EDUCATION'].map(lambda x: education_dict['XNA' if pd.isnull(x) else x]).astype(np.int8)
        
    df['BCR_EMPLOYEE'] = df['BCR_EMPLOYEE'].map(lambda x: bcr_employee_dict[x]).astype(np.int8)
    df['WORKOUT_FLAG'] = df['WORKOUT_FLAG'].map(lambda x: workout_flag_dict[x]).astype(np.int8)

    df['RATING_VALUE'] = df['RATING_VALUE'].map(lambda x: rating_value_dict[x]).astype(np.int8)
   
    df['MARKETING_AGREEMENT'] = df['MARKETING_AGREEMENT'].map(lambda x: marketing_agreement_dict['XNA' if pd.isnull(x)
                                                                                                 else x]).astype(np.int8)

    df['ACCOUNT'] = (df.apply(lambda x: 6 if x['TOTAL'] != 0 else
                                        5 if x['CLASIC'] != 0 else
                                        3 if x['COMOD'] != 0 else
                                        2 if x['CAMPUS'] != 0 else
                                        1 if x['JUNIOR'] != 0 else 4, axis=1).astype(np.int8))
    df.drop(['JUNIOR', 'CAMPUS', 'COMOD', 'CLASIC', 'TOTAL'], axis=1, inplace=True)

    df['PBS_TYPE'] = df['PBS_TYPE'].map(lambda x: -1 if x == 'XNA' else 1).astype(np.int8)

    c_int8 = ['FLAG_ACTIVE_34', 'UNSECURED', 'SECURED', 'CREDITCARD', 'OVERDRAFT', 
              'DEPOZIT', 'SAVING_PLAN', 'MAXICONT',
              'PPI', 'UL_KI', 'INDX_LINK', 'HEALTH', 'ACP',
              'ASSET', 'TITLURI', 'AUR', 'PENSII',
              'DIRECT_DEBIT', 'STANDING_ORDER', 'TRANZACTII_POS', 'NET_BANKING']   
    df[c_int8] = df[c_int8].apply(np.int8)

    df['PAD'] = df['PAD'].map(lambda x: -1 if np.isnan(x) else x).astype(np.int8)
    
    c_int16 = ['BRANCH_CODE', 'CLIENT_DPD']
    df[c_int16] = df[c_int16].apply(np.uint16)
    
    df['PROFESSION'] = df['PROFESSION'].astype(np.int16)
       
    c_float16 = ['CM1_A', 'CM1_L', 'NFC', 'BALANCE_MAX_CAS', 'BALANCE_AVG_CAS']   
    df[c_float16] = df[c_float16].apply(np.float16)

    df['BALANCE_MIN_CA_3MONTHS'] = df['BALANCE_MIN_CA_3MONTHS'].map(lambda x: -1 if np.isnan(x) else x).astype(np.float16)
    df['BALANCE_MAX_DEPOSITS_3MONTHS'] = df['BALANCE_MAX_DEPOSITS_3MONTHS'].map(lambda x: -1 if np.isnan(x) else x).astype(np.float16)
    
    df['NO_SALARY'] = df['NO_SALARY'].map(lambda x: -1 if np.isnan(x) else x).astype(np.int8)
    df['NO_CASH_LAST_MONTH'] = df['NO_CASH_LAST_MONTH'].map(lambda x: -1 if np.isnan(x) else x).astype(np.int8)
    df['NO_OUTGOINGS_MONTH'] = df['NO_OUTGOINGS_MONTH'].map(lambda x: -1 if np.isnan(x) else x).astype(np.int8)

    df['SALARY'] = df['SALARY'].map(lambda x: -1 if np.isnan(x) else x).astype(np.float16)
    df['CASH_LAST_MONTH'] = df['CASH_LAST_MONTH'].map(lambda x: -1 if np.isnan(x) else x).astype(np.float16)
    df['OUTGOINGS_MONTH'] = df['OUTGOINGS_MONTH'].map(lambda x: -1 if np.isnan(x) else x).astype(np.float16)
    
    return df

In [40]:
tic0 = timeit.default_timer()

reader = pd.read_csv('../data/C_CLIENTS_DATA_TABLE.dsv', sep=';', chunksize=100000, parse_dates=[0])

#for chunk in reader:
#    df = process_data(chunk)
#    break
    
df = pd.concat([process_data(chunk) for chunk in reader])

print('Load time: ', timeit.default_timer() - tic0)

Load time:  188.24463845102582


In [54]:
tic0 = timeit.default_timer()

df.to_pickle('../cache/c_clients.pkl')
#df.to_hdf('../data/processed/c_clients.hdf', 'dump', mode = 'w')

for d in df['DAX'].unique():
    df[df['DAX'] == d].to_pickle('../cache/c_' + pd.to_datetime(str(d)).strftime('%Y%m') + '.pkl')

print('Save time: ', timeit.default_timer() - tic0)

Save time:  0.2832836279994808


In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 53 columns):
DAX                             100000 non-null datetime64[ns]
CIC                             100000 non-null uint32
SEGMENT                         100000 non-null int8
SUBSEGMENT                      100000 non-null int8
GENDER                          100000 non-null int8
MARITAL_STATUS                  100000 non-null int8
EDUCATION                       100000 non-null int8
PROFESSION                      100000 non-null int16
BCR_EMPLOYEE                    100000 non-null int8
WORKOUT_FLAG                    100000 non-null int8
RATING_VALUE                    100000 non-null int8
BRANCH_CODE                     100000 non-null uint16
CLIENT_DPD                      100000 non-null uint16
MARKETING_AGREEMENT             100000 non-null int8
FLAG_ACTIVE_34                  100000 non-null int8
UNSECURED                       100000 non-null int8
SECURED                  

In [24]:
df.head()

Unnamed: 0,DAX,CIC,SEGMENT,SUBSEGMENT,GENDER,MARITAL_STATUS,EDUCATION,PROFESSION,BCR_EMPLOYEE,WORKOUT_FLAG,...,BALANCE_MAX_DEPOSITS_3MONTHS,SALARY,NO_SALARY,CASH_LAST_MONTH,NO_CASH_LAST_MONTH,OUTGOINGS_MONTH,NO_OUTGOINGS_MONTH,AGE,TENOR,ACCOUNT
0,2015-01-01,12431937,1,3,2,2,9,27048,0,0,...,-1.0,-1.0,-1,-1.0,-1,-1.0,-1,29,39,2
1,2015-01-01,12434396,1,3,2,2,9,27048,0,0,...,-1.0,-1.0,-1,-1.0,-1,-1.0,-1,11,39,2
2,2015-01-01,12454618,1,3,2,2,-1,25085,0,0,...,-1.0,-1.0,-1,-1.0,-1,-1.0,-1,69,39,5
3,2015-01-01,12495110,1,1,2,2,9,24218,0,1,...,-1.0,-1.0,-1,-1.0,-1,-1.0,-1,-50,38,5
4,2015-01-01,12504209,1,1,1,1,5,25383,0,0,...,-1.0,-1.0,-1,-1.0,-1,-1.0,-1,-4,38,6
