# `AZDIAS` + `CUSTOMERS` dataset

# 00. Importing packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
try:
    from tqdm import tqdm
except:
    !pip install tqdm
    from tqdm import tqdm    
%matplotlib inline


### my utils
from sklearn.preprocessing import OrdinalEncoder
from eda_utils import *

# 01. Loading datasets

In [2]:
customers = pd.read_csv("../arvato_data_processed/customers.csv", low_memory = False, index_col = 'LNR')
azdias = pd.read_csv("../arvato_data_processed/azdias.csv", low_memory = False, index_col = 'LNR')

customers_feature_types =  pd.read_csv("../arvato_data_processed/customers_feature_types.csv", low_memory = False,index_col='Attribute' )
azdias_feature_types =  pd.read_csv("../arvato_data_processed/azdias_feature_types.csv", low_memory = False,index_col='Attribute' )

print(customers_feature_types.shape,azdias_feature_types.shape)

customers_features = set(customers_feature_types['Attribute.1'])
azdias_features = set(azdias_feature_types['Attribute.1']) 

(333, 2) (332, 2)


# 02. Common features

In [3]:
common_features=  customers_features.intersection(azdias_features)
len(common_features)

330

We have 330 features in common, let's see what features are missing. We know from previous notebooks that `customers` initially had 3 more features than `azdias` dataset:

In [4]:
customers_features.difference(azdias_features)

{'CUSTOMER_GROUP', 'ONLINE_PURCHASE', 'PRODUCT_GROUP'}

In [5]:
azdias_features.difference(customers_features)

{'KKK', 'REGIOTYP'}

So, after applying filtering for missingness in `azdias` and `customers` we have 330 common features.

Three features present in `customers` are lacking entires in `azdias`:
1. `CUSTOMER_GROUP`
2. `ONLINE_PURCHASE`
3. `PRODUCT_GROUP`

These features are related to the customer-related information for which we **naturally don't have general population equivalent**: 1. what is the group of the customer (single/multiple buyer), 2. whether it bought something online from us, 3. what category of product was it

Whereas customers lack two features present in general population:

1. `KKK` - purchasing power
2. `REGIOTYP` - neighbourhood, one of several possible classes: unknown, upper class, conservatives, upper middle class, middle class, lower middle class, traditional workers, marginal groups

In [6]:
azdias = azdias[common_features]
customers = customers[common_features]

In [7]:
customers['dataset'] = 'customers'
azdias['dataset'] = 'azdias'

In [8]:
joint_dataset = pd.concat([customers,azdias])
assert customers.shape[0]+azdias.shape[0]==joint_dataset.shape[0]

joint_dataset.head(5)

Unnamed: 0_level_0,D19_VERSI_DATUM,KBA13_HERST_EUROPA,KBA05_VORB0,GEBURTSJAHR,D19_SAMMELARTIKEL_RZ,CJT_TYP_6,KBA05_ZUL3,KBA13_AUDI,KBA05_ANTG2,SEMIO_PFLICHT,...,KBA13_CCM_3000,RELAT_AB,KBA13_KRSHERST_FORD_OPEL,KBA13_KRSZUL_NEU,D19_NAHRUNGSERGAENZUNG_RZ,KBA13_SEG_VAN,KBA05_HERST3,KBA13_VORB_0,KBA05_ANHANG,dataset
LNR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9626,10,2.0,4.0,0,6,5.0,2.0,4.0,2.0,2,...,3.0,1.0,3.0,3.0,0,4.0,3.0,4.0,1.0,customers
143872,10,4.0,4.0,0,6,5.0,3.0,4.0,2.0,4,...,5.0,3.0,1.0,3.0,0,3.0,1.0,4.0,3.0,customers
143873,9,5.0,3.0,0,0,5.0,4.0,2.0,0.0,3,...,3.0,1.0,1.0,1.0,5,4.0,3.0,4.0,3.0,customers
143874,10,4.0,2.0,1960,6,3.0,3.0,2.0,3.0,5,...,4.0,1.0,3.0,2.0,0,4.0,3.0,3.0,0.0,customers
143888,10,3.0,5.0,0,0,5.0,4.0,3.0,2.0,3,...,4.0,5.0,2.0,3.0,0,3.0,1.0,4.0,0.0,customers


# 03. Features

## 03.01. Standardizing features

Upon inspecting metadata attributes in `DIAS Attributes - Values 2017.xlsx` I've found that there are a couple of features that need re-encoding:

- `LP_FAMILIE_GROB` must be standardized. As initially there are multiple labels that refer to the same class!
    - from documentation: 
     ```
     1 -> single
     2 - > couple
     3,4,5 -> single parent
     6,7,8 -> family
     9,10,11 -> multiperson household
     
     ```
     
     
I will replace all secondary values for a class with the first value of a class. Also, there seems to be a value `0` present, but for which we don't have any information, I'll replace `0` with NA

In [9]:

print("Before standarization we have classes: {}".format(np.unique(joint_dataset['LP_FAMILIE_GROB'].dropna())))


joint_dataset['LP_FAMILIE_GROB'].replace(4,3,inplace=True)
joint_dataset['LP_FAMILIE_GROB'].replace(5,3,inplace=True)

joint_dataset['LP_FAMILIE_GROB'].replace(7,6,inplace=True)
joint_dataset['LP_FAMILIE_GROB'].replace(8,6,inplace=True)

joint_dataset['LP_FAMILIE_GROB'].replace(10,9,inplace=True)
joint_dataset['LP_FAMILIE_GROB'].replace(11,9,inplace=True)

joint_dataset['LP_FAMILIE_GROB'].replace(0,np.nan,inplace=True)


print("After standarization we have classes: {}".format(np.unique(joint_dataset['LP_FAMILIE_GROB'].dropna())))


Before standarization we have classes: [0. 1. 2. 3. 4. 5.]
After standarization we have classes: [1. 2. 3.]


- `LP_STATUS_GROB` is a feature that encodes the same class with different labels. From the provided metadata in Exel file we know:
   ```
   1,2 - > low-income earners
   3,4,5 -> average,erners
   6,7 -> independants
   8,9 -> houseowners
   10 -> top earners
   
   ```
   

In [10]:
print("Before standarization we have classes: {}".format(np.unique(joint_dataset['LP_STATUS_GROB'].dropna())))

joint_dataset['LP_STATUS_GROB'].replace(2,1,inplace=True)

joint_dataset['LP_STATUS_GROB'].replace(4,3,inplace=True)
joint_dataset['LP_STATUS_GROB'].replace(5,3,inplace=True)

joint_dataset['LP_STATUS_GROB'].replace(7,6,inplace=True)

joint_dataset['LP_STATUS_GROB'].replace(9,8,inplace=True)

print("After standarization we have classes: {}".format(np.unique(joint_dataset['LP_STATUS_GROB'].dropna())))


Before standarization we have classes: [1. 2. 3. 4. 5.]
After standarization we have classes: [1. 3.]


- `PRAEGENDE_JUGENDJAHRE` could be simplified, thus I create a new feature: `PRAEGENDE_JUGENDJAHRE_SIM` to collapse a couple of classes together based on metadata:


initial classes:
```
1	40ies - war years (Mainstream, O+W)
2	40ies - reconstruction years (Avantgarde, O+W)
3	50ies - economic miracle (Mainstream, O+W)
4	50ies - milk bar / Individualisation (Avantgarde, O+W)
5	60ies - economic miracle (Mainstream, O+W)
6	60ies - generation 68 / student protestors (Avantgarde, W)
7	60ies - opponents to the building of the Wall (Avantgarde, O)
8	70ies - family orientation (Mainstream, O+W)
9	70ies - peace movement (Avantgarde, O+W)
10	80ies - Generation Golf (Mainstream, W)
11	80ies - ecological awareness (Avantgarde, W)
12	80ies - FDJ / communist party youth organisation (Mainstream, O)
13	80ies - Swords into ploughshares (Avantgarde, O)
14	90ies - digital media kids (Mainstream, O+W)
15	90ies - ecological awareness (Avantgarde, O+W)
```
simplified classes:

```
    1,2 -> 40ies
    3,4 -> 50ies
    5,6,7 -> 60ies
    8,9 -> 70ies
    10,11,12,13 -: 80ies
    14,15 -> 90ies

```


In [11]:
joint_dataset['PRAEGENDE_JUGENDJAHRE_SIM']= joint_dataset['PRAEGENDE_JUGENDJAHRE']

print("Before standarization we have classes: {}".format(np.unique(joint_dataset['PRAEGENDE_JUGENDJAHRE'].dropna())))

joint_dataset['PRAEGENDE_JUGENDJAHRE_SIM'].replace(2,1,inplace=True)

joint_dataset['PRAEGENDE_JUGENDJAHRE_SIM'].replace(4,3,inplace=True)

joint_dataset['PRAEGENDE_JUGENDJAHRE_SIM'].replace([6,7],5,inplace=True)
 
joint_dataset['PRAEGENDE_JUGENDJAHRE_SIM'].replace(9,8,inplace=True)


joint_dataset['PRAEGENDE_JUGENDJAHRE_SIM'].replace([11,12,13],10,inplace=True)
 

joint_dataset['PRAEGENDE_JUGENDJAHRE_SIM'].replace(15,14,inplace=True)

print("After standarization we have classes: {}".format(np.unique(joint_dataset['PRAEGENDE_JUGENDJAHRE_SIM'].dropna())))


Before standarization we have classes: [ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15.]
After standarization we have classes: [ 1.  3.  5.  8. 10. 14.]


- `CAMEO_DEUINTL_2015` can be also generalized/simplified: to `CAMEO_DEUINTL_2015_SIM`

original annotations
```
11	Wealthy Households-Pre-Family Couples & Singles
12	Wealthy Households-Young Couples With Children
13	Wealthy Households-Families With School Age Children
14	Wealthy Households-Older Families &  Mature Couples
15	Wealthy Households-Elders In Retirement
21	Prosperous Households-Pre-Family Couples & Singles
22	Prosperous Households-Young Couples With Children
23	Prosperous Households-Families With School Age Children
24	Prosperous Households-Older Families & Mature Couples
25	Prosperous Households-Elders In Retirement
31	Comfortable Households-Pre-Family Couples & Singles
32	Comfortable Households-Young Couples With Children
33	Comfortable Households-Families With School Age Children
34	Comfortable Households-Older Families & Mature Couples
35	Comfortable Households-Elders In Retirement
41	Less Affluent Households-Pre-Family Couples & Singles
42	Less Affluent Households-Young Couples With Children
43	Less Affluent Households-Families With School Age Children
44	Less Affluent Households-Older Families & Mature Couples
45	Less Affluent Households-Elders In Retirement
51	Poorer Households-Pre-Family Couples & Singles
52	Poorer Households-Young Couples With Children
53	Poorer Households-Families With School Age Children
54	Poorer Households-Older Families & Mature Couples
55	Poorer Households-Elders In Retirement

```

simplified:

```
11,12,13,14,15 -> wealthy

21,22,23,24,25 -> prosperous

31,32,33,34,35 -> comfortable

41,42,43,44,45 -> less affluent

51,52,53,54,55 -> poorer

```

also, the `XX` is replaced as unknown:

In [12]:
joint_dataset['CAMEO_DEUINTL_2015_SIM'] = joint_dataset['CAMEO_DEUINTL_2015']

 
print("Before standarization we have classes: {}".format(np.unique(joint_dataset['CAMEO_DEUINTL_2015_SIM'].dropna())))

joint_dataset['CAMEO_DEUINTL_2015_SIM'].replace([12,13,14,15],11,inplace=True)
 
joint_dataset['CAMEO_DEUINTL_2015_SIM'].replace([22,223,24,25],21,inplace=True)
 
joint_dataset['CAMEO_DEUINTL_2015_SIM'].replace([32,33,34,35],31,inplace=True)

joint_dataset['CAMEO_DEUINTL_2015_SIM'].replace([42,43,44,45],41,inplace=True)

joint_dataset['CAMEO_DEUINTL_2015_SIM'].replace([52,53,54,55],51,inplace=True)
    
joint_dataset['CAMEO_DEUINTL_2015_SIM'].replace('XX',np.nan,inplace=True)    
    
print("After standarization we have classes: {}".format(np.unique(joint_dataset['CAMEO_DEUINTL_2015_SIM'].dropna())))


Before standarization we have classes: ['12' '13' '14' '15' '22' '23' '24' '25' '31' '32' '33' '34' '35' '41'
 '43' '44' '45' '51' '52' '54' '55' 'XX']
After standarization we have classes: ['12' '13' '14' '15' '22' '23' '24' '25' '31' '32' '33' '34' '35' '41'
 '43' '44' '45' '51' '52' '54' '55']


- encode (check types)
- impute
- correlate
- save independent?

## 03.02 Impute features

In [13]:
from sklearn.impute import IterativeImputer

ImportError: cannot import name 'IterativeImputer'

## 03.02. Encode Features

joint dataset comprises of three types of features: 

1. numerical - will be left unchanged
2. ordinal - will be encoded with label encoder
3. categorical -> will be encoded with label encoder


I will encode ordinal and categorical features as standardized integers. As seen above some classes comprise of high-value numbers.

In [None]:
# Assert that the feature types in AZDIAS and Customers are the same for ALL
common_features = pd.merge(customers_feature_types,azdias_feature_types,on='Attribute.1')
assert common_features[common_features['Type_x']!=common_features["Type_y"]].shape[0]==0


In [None]:
str_features = list(common_features[common_features['Type_x']!='numerical']['Attribute.1'])

In [None]:
OE = OrdinalEncoder()

In [None]:
joint_dataset[str_features]  = OE.fit_transform( joint_dataset[str_features]  )