In [None]:
import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline

import seaborn as sns

import os

# Pin the working directory to where the notebook was started, so the
# relative data paths used below resolve against it.
path = os.getcwd()
os.chdir(path)

In [None]:
# Load the training set straight from the zipped CSV (read_csv's default
# compression='infer' picks the codec from the '.zip' extension).
trn = pd.read_csv(
    filepath_or_buffer='./santander-product-recommendation/train_ver2.csv.zip',
)

# EDA

In [None]:
# (number of rows, number of columns) of the loaded training frame.
trn.shape

In [None]:
# Peek at the first few rows to see what the raw records look like.
trn.head()

In [None]:
# Print the leading values of every column, one column at a time, with a
# blank line after each so wide frames stay readable.
for col in trn:
    preview = trn[col].head()
    print('{}\n'.format(preview))

In [None]:
# Summary of the frame: column names, dtypes and memory usage.
trn.info()

**It is crucial to know what each column indicates.**

## 1.1 Quantitative columns observation

Compute basic statistics for the int- and float-typed columns among the first 24 columns, which hold the customer features.

In [None]:
# Names of the int64/float64 columns among the first 24 columns.
num_cols = [
    name
    for name, dtype in trn.dtypes.iloc[:24].items()
    if dtype in ('int64', 'float64')
]

# Count, mean, std, min, quartiles and max for each of those columns.
trn[num_cols].describe()

- <u>ncodpers</u>: works like an id
- <u>ind_nuevo</u>: the bottom 75% is 0, and the rest is 1.
- <u>indrel</u>: the bottom 75% is 1, and the rest is 99.
- <u>tipodom</u>: all values are 1, *and hence is unimportant for feature engineering*.
- <u>cod_prov</u>: ranges from 1 to 52; it is stored as a number but behaves as a qualitative column, because it encodes an area code.
- <u>ind_actividad_cliente</u>: the bottom 50% is 0, and the rest is 1.
- <u>renta</u>: regular quantitative data.

## 1.2 Qualitative columns observation

In [None]:
# Names of the object-dtype columns among the first 24 columns
# (the customer-feature part of the frame).
cat_cols = [
    name
    for name, dtype in trn.dtypes.iloc[:24].items()
    if dtype in ('O',)
]

# For object columns describe() reports count, unique, top and freq.
trn[cat_cols].describe()

Worth noting:

1. 'age' should be quantitative, but is categorized as qualitative.
2. 'antiguedad' should also be quantitative.

In [None]:
# List the distinct values of every column in cat_cols.
# Values are cast to str before np.unique, so missing values appear as
# the string 'nan' in the sorted result.
for col in cat_cols:
    uniq = np.unique(trn[col].astype(str))
    print(
        '-' * 50,
        '# col {}, n_uniq {}, uniq {}'.format(col, len(uniq), uniq),
        sep='\n',
    )

**IT IS EXTREMELY IMPORTANT TO WRITE DOWN YOUR ANALYSIS NOTES.**

## 1.3 Visual observation

'ncodpers', which works like an id, and 'renta' have too many unique values, so we skip them in the visualization for now.

In [None]:
# Draw one count plot per column, leaving out the columns in skip_cols.
skip_cols = ['ncodpers', 'renta']
for col in trn.columns:
    if col not in skip_cols:
        print('-' * 50)
        print('col : ', col)

        # A fresh figure per column; the Axes is handed to seaborn
        # explicitly rather than relying on pyplot's current-axes state.
        f, ax = plt.subplots(figsize=(12, 9))
        sns.countplot(data=trn, x=col, alpha=0.5, ax=ax)

        plt.show()

**IT IS EXTREMELY IMPORTANT TO WRITE DOWN YOUR ANALYSIS NOTES.**

### 1.3.1 Temporal data visual observation

Some of the binary visualizations don't help much with feature engineering, so we now bring the temporal dimension into the analysis.

In [None]:
# Monthly totals of every label column (columns 24 onward), drawn as a
# stacked bar chart with one colour band per label.
label_cols = trn.columns[24:].tolist()

# One grouped pass over the frame instead of one groupby per label column.
# groupby sorts its keys, so rows are in ascending 'fecha_dato' order.
monthly_sums = trn.groupby('fecha_dato')[label_cols].sum()

# Take the x labels from the same index as the sums, so labels and bars
# cannot get out of step when the file is not stored in date order
# (unique() returns values in order of appearance, groupby in sorted order).
months = monthly_sums.index.tolist()

# label_over_time[i] holds the per-month total of label_cols[i].
label_over_time = [monthly_sums[name].tolist() for name in label_cols]

# label_sum_over_time[i] holds the per-month total of labels i..end.
# Drawing these from largest (i = 0) to smallest on the same Axes lets each
# later series cover the lower part of the previous one, which produces
# the stacked appearance.
label_sum_over_time = [
    np.asarray(label_over_time[i:]).sum(axis=0)
    for i in range(len(label_cols))
]

color_list = ['#F5B7B1', '#D2B4DE', '#AED6F1', '#A2D9CE', '#ABEBC6', '#F9E79F',
              '#F5CBA7', '#CCD1D1']
n_colors = len(color_list)  # colours are reused cyclically across labels

f, ax = plt.subplots(figsize=(20, 10))
for i in range(len(label_cols)):
    sns.barplot(x=months, y=label_sum_over_time[i],
                color=color_list[i % n_colors], alpha=0.7, ax=ax)

# Proxy rectangles give the legend one swatch per label in plotting order.
legend_handles = [
    plt.Rectangle((0, 0), 1, 1, fc=color_list[i % n_colors], edgecolor='none')
    for i in range(len(label_cols))
]
ax.set(xlabel='fecha_dato', ylabel='sum of label columns',
       title='Label totals per month (stacked)')
ax.legend(legend_handles, label_cols, loc=1, ncol=2, prop={'size': 16})