# `AZDIAS` + `CUSTOMERS` dataset

# 00. Importing packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
try:
    from tqdm import tqdm
except:
    !pip install tqdm
    from tqdm import tqdm    
%matplotlib inline


### my utils
from sklearn.preprocessing import OrdinalEncoder
from eda_utils import *

# 01. Loading datasets

In [None]:
# Options shared by every CSV read in this cell.
read_opts = dict(low_memory=False)

# Pre-processed person-level tables, indexed by the LNR column.
customers = pd.read_csv("../arvato_data_processed/customers.csv", index_col='LNR', **read_opts)
azdias = pd.read_csv("../arvato_data_processed/azdias.csv", index_col='LNR', **read_opts)

# Per-feature metadata tables, indexed by the 'Attribute' column.
customers_feature_types = pd.read_csv(
    "../arvato_data_processed/customers_feature_types.csv", index_col='Attribute', **read_opts
)
azdias_feature_types = pd.read_csv(
    "../arvato_data_processed/azdias_feature_types.csv", index_col='Attribute', **read_opts
)

print(customers_feature_types.shape, azdias_feature_types.shape)

# Distinct values of the 'Attribute.1' column of each metadata table
# (presumably the feature names; verify against the CSV layout).
customers_features = set(customers_feature_types['Attribute.1'])
azdias_features = set(azdias_feature_types['Attribute.1'])

# 02. Common features

In [None]:
# Features listed in both metadata tables.
common_features = customers_features & azdias_features
len(common_features)

We have 330 features in common, let's see what features are missing. We know from previous notebooks that `customers` initially had 3 more features than `azdias` dataset:

In [None]:
# Features present only in the customers metadata.
customers_features - azdias_features

In [None]:
# Features present only in the azdias metadata.
azdias_features - customers_features

So, after applying filtering for missingness in `azdias` and `customers` we have 330 common features.

Three features present in `customers` are lacking entries in `azdias`:
1. `CUSTOMER_GROUP`
2. `ONLINE_PURCHASE`
3. `PRODUCT_GROUP`

These features hold customer-specific information for which we **naturally don't have a general-population equivalent**: 1. the customer's group (single/multiple buyer), 2. whether the customer bought something online from us, 3. which product category was bought.

Whereas customers lack two features present in general population:

1. `KKK` - purchasing power
2. `REGIOTYP` - neighbourhood, one of several possible classes: unknown, upper class, conservatives, upper middle class, middle class, lower middle class, traditional workers, marginal groups

In [None]:
# A set is not a valid DataFrame indexer (pandas >= 2.0 raises a TypeError) and
# its iteration order varies between sessions. A sorted list gives both frames
# the same, reproducible column order.
common_feature_columns = sorted(common_features)

azdias = azdias[common_feature_columns]
customers = customers[common_feature_columns]

In [None]:
# Tag every row with the table it came from, so the origin stays visible
# after the two frames are concatenated.
for frame, origin in ((customers, 'customers'), (azdias, 'azdias')):
    frame['dataset'] = origin

In [None]:
# Stack customers on top of azdias; no row may be lost or duplicated.
joint_dataset = pd.concat([customers, azdias])
assert len(joint_dataset) == len(customers) + len(azdias)

joint_dataset.head()

In [None]:
# free memory
# The individual frames are no longer needed once joint_dataset exists.
del customers, azdias

# 03. Features

## 03.01. Standardizing features

Upon inspecting metadata attributes in `DIAS Attributes - Values 2017.xlsx` I've found that there are a couple of features that need re-encoding:

- `LP_FAMILIE_GROB` must be standardized, as initially there are multiple labels that refer to the same class!
    - from documentation: 
     ```
     1 -> single
     2 -> couple
     3,4,5 -> single parent
     6,7,8 -> family
     9,10,11 -> multiperson household
     
     ```
     
     
I will replace all secondary values for a class with the first value of that class. There also seems to be a value `0` present for which we don't have any documentation, so I'll replace `0` with NA.

In [None]:

# Collapse the redundant LP_FAMILIE_GROB codes onto the first code of each
# class; 0 is undocumented and therefore treated as missing.
lp_familie_grob_map = {
    4: 3, 5: 3,      # single parent
    7: 6, 8: 6,      # family
    10: 9, 11: 9,    # multiperson household
    0: np.nan,       # undocumented value
}

print("Before standarization we have classes: {}".format(np.unique(joint_dataset['LP_FAMILIE_GROB'].dropna())))

# Assign the result back instead of calling replace(..., inplace=True) on
# joint_dataset[...]: the chained in-place form modifies a temporary object
# and leaves joint_dataset untouched when pandas Copy-on-Write is active.
joint_dataset['LP_FAMILIE_GROB'] = joint_dataset['LP_FAMILIE_GROB'].replace(lp_familie_grob_map)

print("After standarization we have classes: {}".format(np.unique(joint_dataset['LP_FAMILIE_GROB'].dropna())))


- `LP_STATUS_GROB` is a feature that encodes the same class with different labels. From the provided metadata in the Excel file we know:
   ```
   1,2 -> low-income earners
   3,4,5 -> average earners
   6,7 -> independents
   8,9 -> houseowners
   10 -> top earners
   
   ```
   

In [None]:
# Collapse the redundant LP_STATUS_GROB codes onto the first code of each class.
lp_status_grob_map = {
    2: 1,          # low-income earners
    4: 3, 5: 3,    # average earners
    7: 6,          # independents
    9: 8,          # houseowners
}

print("Before standarization we have classes: {}".format(np.unique(joint_dataset['LP_STATUS_GROB'].dropna())))

# Assign back rather than using a chained replace(..., inplace=True), which
# does not modify joint_dataset when pandas Copy-on-Write is active.
joint_dataset['LP_STATUS_GROB'] = joint_dataset['LP_STATUS_GROB'].replace(lp_status_grob_map)

print("After standarization we have classes: {}".format(np.unique(joint_dataset['LP_STATUS_GROB'].dropna())))


- `PRAEGENDE_JUGENDJAHRE` could be simplified, thus I create a new feature: `PRAEGENDE_JUGENDJAHRE_SIM` to collapse a couple of classes together based on metadata:


initial classes:
```
1	40ies - war years (Mainstream, O+W)
2	40ies - reconstruction years (Avantgarde, O+W)
3	50ies - economic miracle (Mainstream, O+W)
4	50ies - milk bar / Individualisation (Avantgarde, O+W)
5	60ies - economic miracle (Mainstream, O+W)
6	60ies - generation 68 / student protestors (Avantgarde, W)
7	60ies - opponents to the building of the Wall (Avantgarde, O)
8	70ies - family orientation (Mainstream, O+W)
9	70ies - peace movement (Avantgarde, O+W)
10	80ies - Generation Golf (Mainstream, W)
11	80ies - ecological awareness (Avantgarde, W)
12	80ies - FDJ / communist party youth organisation (Mainstream, O)
13	80ies - Swords into ploughshares (Avantgarde, O)
14	90ies - digital media kids (Mainstream, O+W)
15	90ies - ecological awareness (Avantgarde, O+W)
```
simplified classes:

```
    1,2 -> 40ies
    3,4 -> 50ies
    5,6,7 -> 60ies
    8,9 -> 70ies
    10,11,12,13 -> 80ies
    14,15 -> 90ies

```


In [None]:
# Map every PRAEGENDE_JUGENDJAHRE code onto the first code of its decade.
praegende_decade_map = {
    2: 1,                      # 40ies
    4: 3,                      # 50ies
    6: 5, 7: 5,                # 60ies
    9: 8,                      # 70ies
    11: 10, 12: 10, 13: 10,    # 80ies
    15: 14,                    # 90ies
}

print("Before standarization we have classes: {}".format(np.unique(joint_dataset['PRAEGENDE_JUGENDJAHRE'].dropna())))

# Build the simplified column in one assignment. A chained
# replace(..., inplace=True) on joint_dataset[...] would not modify the frame
# when pandas Copy-on-Write is active. The original column is left unchanged.
joint_dataset['PRAEGENDE_JUGENDJAHRE_SIM'] = joint_dataset['PRAEGENDE_JUGENDJAHRE'].replace(praegende_decade_map)

print("After standarization we have classes: {}".format(np.unique(joint_dataset['PRAEGENDE_JUGENDJAHRE_SIM'].dropna())))


- `CAMEO_DEUINTL_2015` can also be generalized/simplified into `CAMEO_DEUINTL_2015_SIM`:

original annotations
```
11	Wealthy Households-Pre-Family Couples & Singles
12	Wealthy Households-Young Couples With Children
13	Wealthy Households-Families With School Age Children
14	Wealthy Households-Older Families &  Mature Couples
15	Wealthy Households-Elders In Retirement
21	Prosperous Households-Pre-Family Couples & Singles
22	Prosperous Households-Young Couples With Children
23	Prosperous Households-Families With School Age Children
24	Prosperous Households-Older Families & Mature Couples
25	Prosperous Households-Elders In Retirement
31	Comfortable Households-Pre-Family Couples & Singles
32	Comfortable Households-Young Couples With Children
33	Comfortable Households-Families With School Age Children
34	Comfortable Households-Older Families & Mature Couples
35	Comfortable Households-Elders In Retirement
41	Less Affluent Households-Pre-Family Couples & Singles
42	Less Affluent Households-Young Couples With Children
43	Less Affluent Households-Families With School Age Children
44	Less Affluent Households-Older Families & Mature Couples
45	Less Affluent Households-Elders In Retirement
51	Poorer Households-Pre-Family Couples & Singles
52	Poorer Households-Young Couples With Children
53	Poorer Households-Families With School Age Children
54	Poorer Households-Older Families & Mature Couples
55	Poorer Households-Elders In Retirement

```

simplified:

```
11,12,13,14,15 -> wealthy

21,22,23,24,25 -> prosperous

31,32,33,34,35 -> comfortable

41,42,43,44,45 -> less affluent

51,52,53,54,55 -> poorer

```

Also, the value `XX` is treated as unknown (replaced with NaN):

In [None]:
# 'XX' is a placeholder without a documented meaning: treat it as missing.
# Assign back rather than using a chained replace(..., inplace=True), which
# does not modify joint_dataset when pandas Copy-on-Write is active.
joint_dataset['CAMEO_DEUINTL_2015'] = joint_dataset['CAMEO_DEUINTL_2015'].replace('XX', np.nan)

# The presence of 'XX' means the column cannot be purely numeric, so the codes
# are presumably stored as strings or mixed types (TODO confirm against the
# CSV). Coerce to numbers so that the integer mapping below actually matches;
# 'XX' and any other non-numeric entry become NaN.
cameo_codes = pd.to_numeric(joint_dataset['CAMEO_DEUINTL_2015'], errors='coerce')

print("Before standarization we have classes: {}".format(np.unique(cameo_codes.dropna())))

# Tens digit = wealth class (1 wealthy .. 5 poorer), ones digit = life stage.
# Map x2..x5 onto x1 so that each wealth class keeps a single code:
# 12..15 -> 11, 22..25 -> 21, 32..35 -> 31, 42..45 -> 41, 52..55 -> 51.
cameo_wealth_map = {
    10 * wealth + stage: 10 * wealth + 1
    for wealth in range(1, 6)
    for stage in range(2, 6)
}

joint_dataset['CAMEO_DEUINTL_2015_SIM'] = cameo_codes.replace(cameo_wealth_map)

print("After standarization we have classes: {}".format(np.unique(joint_dataset['CAMEO_DEUINTL_2015_SIM'].dropna())))


In [None]:
# Distinct non-missing classes of the simplified CAMEO feature.
cameo_sim_observed = joint_dataset['CAMEO_DEUINTL_2015_SIM'].dropna()
np.unique(cameo_sim_observed)

- encode (check types)
- impute
- correlate
- save independent?

## 03.02. Encode Features

The joint dataset comprises three types of features: 

1. numerical - will be left unchanged
2. ordinal - will be encoded with label encoder
3. categorical -> will be encoded with label encoder


I will encode ordinal and categorical features as standardized integers. As seen above, some classes consist of high-value numbers.

In [None]:
# Assert that the feature types in AZDIAS and Customers are the same for ALL
common_features = pd.merge(customers_feature_types,azdias_feature_types,on='Attribute.1')
assert common_features[common_features['Type_x']!=common_features["Type_y"]].shape[0]==0


Extract feature names that **ARE NOT NUMERICAL** (that need label encoding)

In [None]:
# Names of all shared features whose declared type is not 'numerical'.
non_numerical_rows = common_features['Type_x'] != 'numerical'
str_features = set(common_features.loc[non_numerical_rows, 'Attribute.1'])

# Add the two simplified features created during standardization.
str_features |= {'PRAEGENDE_JUGENDJAHRE_SIM', 'CAMEO_DEUINTL_2015_SIM'}
len(str_features)

In [None]:
# Display (n_rows, n_columns) of the joint dataset before the encoders are created.
joint_dataset.shape

In [None]:
from sklearn.preprocessing import LabelEncoder

# One separate, not yet fitted encoder per non-numerical feature, keyed by feature name.
LabelEncoders = dict((feature_name, LabelEncoder()) for feature_name in str_features)

In [None]:
%%time
# Percentage of missing cells per column, most incomplete columns first.
missing_pct = joint_dataset.isnull().sum().mul(100.0).div(joint_dataset.shape[0])
empty_cells = missing_pct.sort_values(ascending=False)

# Bar plot of the 50 columns with the highest share of missing values.
empty_cells.head(50).plot.bar(figsize=(20, 3))

In [None]:
# Display (n_rows, n_columns) of the joint dataset right before label encoding.
joint_dataset.shape

In [None]:
# Label-encode every non-numerical feature, leaving missing cells as NaN so
# that they can be imputed later.
for idx, feature in enumerate(str_features):
    print("{}/{}: {}".format(idx+1, len(str_features), feature))

    # Positional boolean mask of the cells that hold a value.
    not_na_mask = joint_dataset[feature].notnull().to_numpy()

    # Build the encoded column as a float array (NaN = missing) and assign the
    # whole column at once. The former chained assignment
    # `joint_dataset[feature][mask] = ...` writes into a temporary object and
    # leaves joint_dataset unchanged when pandas Copy-on-Write is active.
    encoded = np.full(len(joint_dataset), np.nan)
    encoded[not_na_mask] = LabelEncoders[feature].fit_transform(
        joint_dataset.loc[not_na_mask, feature]
    )
    joint_dataset[feature] = encoded

In [None]:
# Display (n_rows, n_columns) of the joint dataset right after label encoding.
joint_dataset.shape

In [None]:
%%time
# Percentage of missing cells per column, most incomplete columns first.
missing_pct = joint_dataset.isnull().sum().mul(100.0).div(joint_dataset.shape[0])
empty_cells = missing_pct.sort_values(ascending=False)

# Bar plot of the 50 columns with the highest share of missing values.
empty_cells.head(50).plot.bar(figsize=(20, 3))

## 03.03. Impute features

In [None]:
# Remove the origin label from the feature matrix before imputation, but keep
# it in `dataset_labels` so rows can still be attributed to customers/azdias.
# The guard makes the cell safe to re-run (a second drop would raise KeyError).
if 'dataset' in joint_dataset.columns:
    dataset_labels = joint_dataset.pop('dataset')

In [None]:
%%time
import sklearn
# The experimental flag must be imported before IterativeImputer itself.
from sklearn.experimental import enable_iterative_imputer
print('The scikit-learn version is {}.'.format(sklearn.__version__))
from sklearn.impute import IterativeImputer

# With n_nearest_features set, the predictor features are drawn at random for
# each imputed column, so a fixed random_state is needed for reproducible results.
# NOTE(review): the original comment here read "10 is fine" - presumably a
# larger n_nearest_features is acceptable too; confirm before changing it.
ie = IterativeImputer(n_nearest_features=3, random_state=42)

joint_dataset_imputed = ie.fit_transform(joint_dataset) # ~20min, returns numpy array

# Re-attach the original row index: the imputer returns a bare array, and
# without it the row identifiers are replaced by 0..n-1 in the saved file.
joint_dataset_imputed_df = pd.DataFrame(
    joint_dataset_imputed,
    index=joint_dataset.index,
    columns=joint_dataset.columns,
)
joint_dataset_imputed_df.to_csv("../arvato_data_processed/joint_dataset_imputed.csv")

In [None]:
# The first CSV column is the index written by to_csv. Read it back as the
# index so it does not become a spurious data column (e.g. 'Unnamed: 0').
joint_dataset_imputed_df = pd.read_csv(
    "../arvato_data_processed/joint_dataset_imputed.csv",
    low_memory=False,
    index_col=0,
)

In [None]:
# Imputed values are continuous; round the encoded (non-numerical) features
# back to whole-number class codes.
n_features = len(str_features)
for position, feature in enumerate(str_features, start=1):
    print("{}/{} {}".format(position, n_features, feature))
    rounded = joint_dataset_imputed_df[feature].round()
    joint_dataset_imputed_df[feature] = rounded

# TODO
if it was imputed, then compare the number of 'uniques', perhaps round

In [None]:
# For every encoded feature, count the distinct non-missing classes before and
# after imputation, collecting one record per feature.
count_rows = []
for idx, f in enumerate(str_features):
    print("{}/{}: {}".format(idx+1, len(str_features), f))

    count_rows.append({
        'Attribute': f,
        'Before_imputation': len(np.unique(joint_dataset[f].dropna())),
        'After_imputation': len(np.unique(joint_dataset_imputed_df[f].dropna())),
    })

imputed_features_counts = pd.DataFrame(
    count_rows, columns=['Attribute', 'Before_imputation', 'After_imputation']
)
# Features with the most classes first.
imputed_features_counts = imputed_features_counts.sort_values('Before_imputation', ascending=False)
imputed_features_counts

In [None]:
# Features whose number of distinct classes changed through imputation.
count_changed = imputed_features_counts['Before_imputation'].ne(imputed_features_counts['After_imputation'])
imputed_features_counts[count_changed]

In [None]:
# Distribution of the simplified CAMEO feature before imputation (missing values excluded).
cameo_sim_before = joint_dataset['CAMEO_DEUINTL_2015_SIM'].dropna()
plt.hist(cameo_sim_before)

In [None]:
# Distribution of the simplified CAMEO feature after imputation.
cameo_sim_after = joint_dataset_imputed_df['CAMEO_DEUINTL_2015_SIM'].dropna()
plt.hist(cameo_sim_after)

In [None]:
# replace seemingly new classes with NANs
for feature in enumerate(str_features):
    print("{}/{}: {}".format(idx+1,len(str_features),f))
    imputed_clusters = np.unique(joint_dataset_imputed_df[feature].dropna())
    original_clusters = np.unique(joint_dataset[feature].dropna())


    unobserved_classes = set(imputed_clusters).difference( set(original_clusters) )
    for uclass in unobserved_classes:
        joint_dataset_imputed_df[feature].replace(uclass,np.nan, inplace=True)
        