# SPR Data analysis

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%pylab inline
pylab.rcParams['figure.figsize'] = (10, 6)
color = sns.color_palette()



Populating the interactive namespace from numpy and matplotlib


**Dataset Size:**

First, let us check the number of rows in the train and test files:
- Number of rows in train: 13'647'309
- Number of rows in test: 929'615
- Number of clients (train dataset): 956'645

**Dataset columns:**
  
  
Main columns :   

- fecha_dato 	The table is partitioned by this column
- ncodpers 	Customer code
- ind_empleado 	Employee index: A active, B ex-employee, F filial, N not employee, P passive
- pais_residencia 	Customer's Country residence
- sexo 	Customer's sex
- age 	Age
- fecha_alta 	The date on which the customer became the first holder of a contract in the bank
- ind_nuevo 	New customer Index. 1 if the customer registered in the last 6 months.
- antiguedad 	Customer seniority (in months)
- indrel 	1 (First/Primary), 99 (Primary customer during the month but not at the end of the month)
- ult_fec_cli_1t 	Last date as primary customer (if he isn't at the end of the month)
- indrel_1mes 	Customer type at the beginning of the month: 1 (First/Primary customer), 2 (co-owner), P (Potential), 3 (former primary), 4 (former co-owner)
- tiprel_1mes 	Customer relation type at the beginning of the month: A (active), I (inactive), P (former customer), R (Potential)
- indresi 	Residence index (S (Yes) or N (No) if the residence country is the same as the bank country)
- indext 	Foreigner index (S (Yes) or N (No) if the customer's birth country is different from the bank country)
- conyuemp 	Spouse index. 1 if the customer is spouse of an employee
- canal_entrada 	channel used by the customer to join
- indfall 	Deceased index. N/S
- tipodom 	Address type. 1, primary address
- cod_prov 	Province code (customer's address)
- nomprov 	Province name
- ind_actividad_cliente 	Activity index (1, active customer; 0, inactive customer)
- renta 	Gross income of the household
- segmento 	Segmentation: 01 - VIP, 02 - Individuals, 03 - college graduates
    
    
target columns : 
    
- ind_ahor_fin_ult1 	Saving Account
- ind_aval_fin_ult1 	Guarantees
- ind_cco_fin_ult1 	Current Accounts
- ind_cder_fin_ult1 	Derivada Account
- ind_cno_fin_ult1 	Payroll Account
- ind_ctju_fin_ult1 	Junior Account
- ind_ctma_fin_ult1 	Más particular Account
- ind_ctop_fin_ult1 	particular Account
- ind_ctpp_fin_ult1 	particular Plus Account
- ind_deco_fin_ult1 	Short-term deposits
- ind_deme_fin_ult1 	Medium-term deposits
- ind_dela_fin_ult1 	Long-term deposits
- ind_ecue_fin_ult1 	e-account
- ind_fond_fin_ult1 	Funds
- ind_hip_fin_ult1 	Mortgage
- ind_plan_fin_ult1 	Pensions
- ind_pres_fin_ult1 	Loans
- ind_reca_fin_ult1 	Taxes
- ind_tjcr_fin_ult1 	Credit Card
- ind_valo_fin_ult1 	Securities
- ind_viv_fin_ult1 	Home Account
- ind_nomina_ult1 	Payroll
- ind_nom_pens_ult1 	Pensions
- ind_recibo_ult1 	Direct Debit    
    

## Data stats

In [3]:
# Relative directory that holds the input CSV files.
data_path = "data/"
# Read the test set with pandas' default dtype inference.
test_df = pd.read_csv(filepath_or_buffer=data_path + "test_ver2.csv")

Number of distinct clients (`ncodpers`) in the test set:

In [4]:
# Number of distinct customer codes; dropna=False counts NaN as a value,
# matching what len(Series.unique()) would report.
test_df['ncodpers'].nunique(dropna=False)

929615

In [9]:
# Show the dtype pandas inferred for every column of the test set.
test_df.dtypes

fecha_dato                object
ncodpers                   int64
ind_empleado              object
pais_residencia           object
sexo                      object
age                        int64
fecha_alta                object
ind_nuevo                  int64
antiguedad                 int64
indrel                     int64
ult_fec_cli_1t            object
indrel_1mes              float64
tiprel_1mes               object
indresi                   object
indext                    object
conyuemp                  object
canal_entrada             object
indfall                   object
tipodom                    int64
cod_prov                 float64
nomprov                   object
ind_actividad_cliente      int64
renta                     object
segmento                  object
dtype: object

For comparison, the column dtypes of the train DataFrame:
```
fecha_dato               datetime64[ns]
ncodpers                          int64
ind_empleado                     object
pais_residencia                  object
sexo                             object
age                              object
fecha_alta               datetime64[ns]
ind_nuevo                       float64
antiguedad                       object
indrel                          float64
ult_fec_cli_1t                   object
indrel_1mes                     float64
tiprel_1mes                      object
indresi                          object
indext                           object
conyuemp                        float64
canal_entrada                    object
indfall                          object
tipodom                         float64
cod_prov                        float64
nomprov                          object
ind_actividad_cliente           float64
renta                           float64
segmento                         object
ind_ahor_fin_ult1                 int64
ind_aval_fin_ult1                 int64
ind_cco_fin_ult1                  int64
ind_cder_fin_ult1                 int64
ind_cno_fin_ult1                  int64
ind_ctju_fin_ult1                 int64
ind_ctma_fin_ult1                 int64
ind_ctop_fin_ult1                 int64
ind_ctpp_fin_ult1                 int64
ind_deco_fin_ult1                 int64
ind_deme_fin_ult1                 int64
ind_dela_fin_ult1                 int64
ind_ecue_fin_ult1                 int64
ind_fond_fin_ult1                 int64
ind_hip_fin_ult1                  int64
ind_plan_fin_ult1                 int64
ind_pres_fin_ult1                 int64
ind_reca_fin_ult1                 int64
ind_tjcr_fin_ult1                 int64
ind_valo_fin_ult1                 int64
ind_viv_fin_ult1                  int64
ind_nomina_ult1                 float64
ind_nom_pens_ult1               float64
ind_recibo_ult1                   int64
dtype: object
```

In [10]:
# Convert both columns to a numeric dtype; with errors="coerce", any value that
# cannot be parsed as a number becomes NaN instead of raising.
for _col in ("age", "renta"):
    test_df[_col] = pd.to_numeric(test_df[_col], errors="coerce")

In [13]:
# Report, in order: the number of missing `renta` values, the number of present
# values, and the total row count.
# The three numbers are formatted into a single string so the cell prints the
# same text under both Python 2 and Python 3; the bare `print a, b, c`
# statement is a SyntaxError on Python 3.
renta_is_missing = test_df['renta'].isnull()
print("%s %s %s" % (renta_is_missing.sum(), (~renta_is_missing).sum(), test_df['renta'].shape[0]))

227965 701650 929615


In [11]:
# Frequency of each distinct `renta` value, most frequent first.
test_df["renta"].value_counts()

451931.22    354
463625.16    111
181042.20     91
128318.52     91
105260.88     90
488798.49     84
127141.50     73
283325.67     70
132335.73     67
236690.34     67
104563.80     63
227267.49     56
111933.69     55
273387.54     55
227397.72     53
174407.10     53
155930.43     51
218030.01     50
163432.47     44
208961.79     43
168733.62     43
135522.15     42
555605.55     41
152601.99     41
148736.19     41
248233.11     39
91673.25      38
288997.44     38
47550.39      38
326853.09     37
            ... 
131759.64      1
94696.23       1
203864.91      1
361948.38      1
102371.79      1
65207.94       1
67300.74       1
52590.42       1
160343.28      1
129977.07      1
144322.65      1
40388.10       1
88111.74       1
80455.77       1
74479.98       1
107185.83      1
66941.49       1
47097.72       1
68218.74       1
28025.40       1
57714.39       1
54004.59       1
119457.39      1
86670.87       1
319321.71      1
193227.72      1
110094.21      1
141067.95     

In [14]:
# Summary statistics of `renta`, restricted to the rows where it is present.
test_df.loc[test_df['renta'].notnull(), 'renta'].describe()

count    7.016500e+05
mean     1.340879e+05
std      2.323120e+05
min      1.202730e+03
25%      6.849114e+04
50%      1.014905e+05
75%      1.555408e+05
max      2.889440e+07
Name: renta, dtype: float64