In [91]:
# imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
%matplotlib inline

In [37]:
# ingest the data file used for training
train = pd.read_csv("./dataset/train.csv", low_memory=False)

In [3]:
%%bash
# Directory structure
ls -ltra --block-size=M

total 1M
drwxr-xr-x 48 raavan root   1M Nov  1 21:08 ..
drwxr-xr-x  2 raavan raavan 1M Nov  1 21:09 .ipynb_checkpoints
drwxrwxr-x  2 raavan raavan 1M Nov  2 10:34 dataset
-rw-rw-r--  1 raavan raavan 1M Nov  2 10:39 vigu.ipynb
drwxrwxr-x  5 raavan raavan 1M Nov  2 10:39 .
drwxrwxr-x  8 raavan raavan 1M Nov  2 10:40 .git


In [4]:
len(train.columns)

377

In [5]:
len(train)

300000

In [6]:
train.columns

Index(['UCIC_ID', 'NO_OF_Accs', 'HNW_CATEGORY', 'vintage', 'EMAIL_UNSUBSCRIBE',
       'OCCUP_ALL_NEW', 'city', 'dependents', 'zip', 'FINAL_WORTH_prev1',
       ...
       'Query_Resolved_PrevQ1', 'Complaint_Logged_PrevQ1',
       'Complaint_Resolved_PrevQ1', 'NO_OF_CHEQUE_BOUNCE_V1',
       'Percent_Change_in_Credits', 'Percent_Change_in_FT_Bank',
       'Percent_Change_in_FT_outside', 'Percent_Change_in_Self_Txn',
       'Percent_Change_in_Big_Expenses', 'Responders'],
      dtype='object', length=377)

In [9]:
train.head()

Unnamed: 0,UCIC_ID,NO_OF_Accs,HNW_CATEGORY,vintage,EMAIL_UNSUBSCRIBE,OCCUP_ALL_NEW,city,dependents,zip,FINAL_WORTH_prev1,...,Query_Resolved_PrevQ1,Complaint_Logged_PrevQ1,Complaint_Resolved_PrevQ1,NO_OF_CHEQUE_BOUNCE_V1,Percent_Change_in_Credits,Percent_Change_in_FT_Bank,Percent_Change_in_FT_outside,Percent_Change_in_Self_Txn,Percent_Change_in_Big_Expenses,Responders
0,170114,1,2_Preferred,1947,,HOUSEWIFE,BANGALORE,0.0,562114.0,HIGH,...,,,,,-94.757314,,0.0,,,0
1,488013,1,1_Imperia,3082,,SALARIED,MUMBAI,0.0,400015.0,HIGH,...,,,,,160.024566,,0.0,,0.0,0
2,435239,1,2_Preferred,572,,SELF_EMPLOYED,HILI,,733126.0,HIGH,...,,,,,273.771918,,10.0,,,1
3,331646,1,3_Classic,773,,SELF_EMPLOYED,NAMAKKAL,0.0,637410.0,HIGH,...,,,,,,,,,,0
4,226900,1,2_Preferred,1627,,HOUSEWIFE,MUMBAI,0.0,400053.0,HIGH,...,,,,,399.997265,,7.0,,,0


In [38]:
nulls = train.isnull().sum().sort_values(ascending=False).head(147)

In [39]:
remove_features = nulls.index.tolist()

In [41]:
train.drop(remove_features, axis=1, inplace=True)

In [None]:
train.drop('city', inplace=True, axis=1)

In [42]:
train.head()

Unnamed: 0,UCIC_ID,NO_OF_Accs,HNW_CATEGORY,vintage,OCCUP_ALL_NEW,city,dependents,zip,FINAL_WORTH_prev1,ENGAGEMENT_TAG_prev1,...,I_CNR_PrevQ2,I_NRV_PrevQ1,I_NRV_PrevQ2,CR_AMB_Drop_Build_1,CR_AMB_Drop_Build_2,CR_AMB_Drop_Build_3,CR_AMB_Drop_Build_4,CR_AMB_Drop_Build_5,Percent_Change_in_Credits,Responders
0,170114,1,2_Preferred,1947,HOUSEWIFE,BANGALORE,0.0,562114.0,HIGH,LOW,...,2154.266667,900750.2,1034232.0,-0.211875,-0.069396,-0.022604,-0.151781,0.454159,-94.757314,0
1,488013,1,1_Imperia,3082,SALARIED,MUMBAI,0.0,400015.0,HIGH,MEDIUM,...,14239.06,52729820.0,56656580.0,0.241488,0.167411,-0.071992,-0.013259,-0.30289,160.024566,0
2,435239,1,2_Preferred,572,SELF_EMPLOYED,HILI,,733126.0,HIGH,LOW,...,13.57,180150.8,26547.2,0.459032,46.068404,0.048818,-0.906061,0.66384,273.771918,1
3,331646,1,3_Classic,773,SELF_EMPLOYED,NAMAKKAL,0.0,637410.0,HIGH,LOW,...,2028.69,856120.9,850697.3,0.0,0.0,0.0,0.0,0.019374,,0
4,226900,1,2_Preferred,1627,HOUSEWIFE,MUMBAI,0.0,400053.0,HIGH,LOW,...,74.343333,172639.8,55725.59,8.270783,-0.272225,-0.150621,1.3914,-0.538117,399.997265,0


In [49]:
train.select_dtypes(exclude=[np.number]).columns

Index(['HNW_CATEGORY', 'OCCUP_ALL_NEW', 'city', 'FINAL_WORTH_prev1',
       'ENGAGEMENT_TAG_prev1', 'EFT_SELF_TRANSFER_PrevQ1',
       'Billpay_Active_PrevQ1_N', 'Billpay_Reg_ason_Prev1_N',
       'Charges_cnt_PrevQ1_N', 'FRX_PrevQ1_N', 'RBI_Class_Audit',
       'gender_bin'],
      dtype='object')

In [50]:
train.HNW_CATEGORY.value_counts()

2_Preferred    154549
3_Classic      107078
1_Imperia       38373
Name: HNW_CATEGORY, dtype: int64

In [79]:
ohe = OneHotEncoder(sparse=False)
le = LabelEncoder()

In [77]:
for feature in train.columns.values:
    
    if train[feature].dtypes == 'object':
        print(feature)
        train[feature] = le.fit_transform(train[feature].fillna('0'))

HNW_CATEGORY
OCCUP_ALL_NEW
FINAL_WORTH_prev1
ENGAGEMENT_TAG_prev1
EFT_SELF_TRANSFER_PrevQ1
Billpay_Active_PrevQ1_N
Billpay_Reg_ason_Prev1_N
Charges_cnt_PrevQ1_N
FRX_PrevQ1_N
RBI_Class_Audit
gender_bin


In [87]:
for feature in train.columns.values:
    
    if feature in ['OCCUP_ALL_NEW','FINAL_WORTH_prev1','ENGAGEMENT_TAG_prev1',\
                   'EFT_SELF_TRANSFER_PrevQ1','Billpay_Active_PrevQ1_N',\
                   'Billpay_Reg_ason_Prev1_N','Charges_cnt_PrevQ1_N','FRX_PrevQ1_N',\
                   'RBI_Class_Audit','gender_bin']:
        print(feature)
        train[feature] = ohe.fit_transform(train[feature].values.reshape(-1,1))

OCCUP_ALL_NEW
FINAL_WORTH_prev1
ENGAGEMENT_TAG_prev1
EFT_SELF_TRANSFER_PrevQ1
Billpay_Active_PrevQ1_N
Billpay_Reg_ason_Prev1_N
Charges_cnt_PrevQ1_N
FRX_PrevQ1_N
RBI_Class_Audit
gender_bin


In [93]:
train.isnull().sum().sort_values(ascending=False)

Recency_of_IB_TXN            135988
custinit_DR_cnt_prev4        126456
custinit_DR_amt_prev4        126456
custinit_DR_cnt_prev6        122657
custinit_DR_amt_prev6        122657
custinit_DR_cnt_prev5        119421
custinit_DR_amt_prev5        119421
custinit_DR_amt_prev3        116621
custinit_DR_cnt_prev3        116621
custinit_DR_cnt_prev1        115852
custinit_DR_amt_prev1        115852
custinit_DR_amt_prev2        114828
custinit_DR_cnt_prev2        114828
Percent_Change_in_Credits     96415
Recency_of_CR_TXN             33194
dependents                    26382
Recency_of_DR_TXN             23252
Recency_of_BRANCH_TXN         18368
zip                            1050
Recency_of_Activity             477
BRANCH_C_prev3                    0
MB_D_prev3                        0
MB_C_prev3                        0
IB_D_prev3                        0
COUNT_MB_D_prev2                  0
COUNT_POS_C_prev2                 0
IB_C_prev3                        0
BRANCH_D_prev3              

In [97]:
len(train.columns)

229

In [92]:
train = StandardScaler().fit_transform(train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').