# Abschnitt 4.8: Autoencoder mit kategorialen Daten
### 1) Daten einlesen

In [1]:
import pandas as pd
from math import ceil

# Show up to 20 columns when a DataFrame is rendered.
pd.options.display.max_columns = 20

# Raw CSV served directly from the GitHub repository.
data_url = (
    r'https://github.com/tplusone/hanser_ml_zeitreihen/blob/master/Daten/music_movies_interests_pref.csv?raw=true'
)
df = pd.read_csv(filepath_or_buffer=data_url)
df.head()

Unnamed: 0,Alcohol,Education,Age,Smoking,Bmi,Horror,Thriller,Comedy,Romantic,Sci-fi,...,Passive sport,Active sport,Gardening,Celebrities,Shopping,Science and technology,Theatre,Fun with friends,Adrenaline sports,Pets
0,drink a lot,college/bachelor degree,20.0,never smoked,18.066167,1,0,1,1,1,...,0,1,1,0,1,1,0,1,1,1
1,drink a lot,college/bachelor degree,19.0,never smoked,21.829952,0,0,1,0,1,...,0,0,0,0,0,0,0,1,0,1
2,drink a lot,secondary school,20.0,tried smoking,21.629649,0,1,1,0,1,...,1,0,0,0,1,0,1,1,1,1
3,drink a lot,college/bachelor degree,22.0,former smoker,19.943213,1,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,social drinker,secondary school,20.0,tried smoking,20.415225,1,1,1,0,0,...,0,0,1,0,0,0,0,1,0,0


### 2) Einteilung in Trainings- und Testdaten

In [2]:
# Columns that are not fed to the autoencoder (Age is used later as a
# regression target).
meta_columns = ['Alcohol', 'Education', 'Age', 'Smoking', 'Bmi']

# Reproducible 90/10 split: sample the training rows with a fixed seed and use
# every remaining row as the test set.
df_train = df.sample(frac=.9, random_state=11)
df_test = df.drop(index=df_train.index)

X_train = df_train.drop(columns=meta_columns).to_numpy()
X_test = df_test.drop(columns=meta_columns).to_numpy()
X_train.shape, X_test.shape

((709, 60), (79, 60))

### 3) Autoencoder zusammenstellen

In [3]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.regularizers import l2

# Width of the autoencoder input and output. It is read from the training
# matrix so the model always matches the selected feature columns instead of
# relying on a hard-coded 60.
n_features = X_train.shape[1]

# The same L2 weight penalty is applied to the kernel of every Dense layer.
reg = l2(.0001)

# Named `inputs` (not `input`) so the Python builtin `input` is not shadowed.
# The layer keeps the name 'input' because it is looked up by that name later.
inputs = Input(shape=(n_features,), name='input')
layer = Dense(units=90, activation='relu',
              kernel_regularizer=reg,
              name='encoder_1')(inputs)
# 20-unit bottleneck: its output is the reduced representation.
layer = Dense(units=20, activation='relu',
              kernel_regularizer=reg,
              name='encoder_out')(layer)
# One sigmoid unit per input feature, trained with binary cross-entropy.
layer = Dense(units=n_features, activation='sigmoid',
              kernel_regularizer=reg,
              name='decoder_out')(layer)
autoencoder = Model(inputs, layer)
autoencoder.compile(loss='binary_crossentropy',
                    optimizer='adam', metrics=['accuracy'])
autoencoder.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 60)]              0         
_________________________________________________________________
encoder_1 (Dense)            (None, 90)                5490      
_________________________________________________________________
encoder_out (Dense)          (None, 20)                1820      
_________________________________________________________________
decoder_out (Dense)          (None, 60)                1260      
Total params: 8,570
Trainable params: 8,570
Non-trainable params: 0
_________________________________________________________________


### 4) Autoencoder anlernen

In [4]:
from tensorflow.keras.callbacks import (ModelCheckpoint, 
                                        EarlyStopping)
from tensorflow.keras.models import load_model

# Write the model to disk only when the validation loss improves, so the file
# always holds the best model seen so far.
check = ModelCheckpoint(filepath='autoencoder_ohe.h5',
                        monitor='val_loss', save_best_only=True)
# Stop after 10 epochs without val_loss improvement. restore_best_weights=True
# rolls the in-memory model back to its best epoch when training is stopped
# early, so the evaluation below refers to the same weights as the checkpoint
# instead of the weights from 10 epochs past the optimum.
early = EarlyStopping(monitor='val_loss', patience=10,
                      restore_best_weights=True)

# The autoencoder is trained to reproduce its own input (X -> X).
# NOTE(review): the test split is also used as validation data for early
# stopping and checkpointing, so the metrics reported on X_test below are
# optimistic; consider a separate validation split.
history = autoencoder.fit(X_train, X_train,
                          validation_data=(X_test, X_test),
                          epochs=200, batch_size=8,
                          callbacks=[check, early], verbose=0)

# Returns [loss, accuracy] as configured in compile().
autoencoder.evaluate(X_test, X_test)



[0.28548082520690143, 0.8951477]

### 5) Evaluation: Anzahl einzigartiger Predictions zählen

In [5]:
import numpy as np

def count_unique_pred(y_pred):
    """Collect the distinct rows of ``y_pred`` in order of first appearance.

    Every row is cast to integers (fractional parts are truncated) and its
    values are concatenated without a separator into one string key, e.g.
    ``[1., 0., 1.]`` becomes ``'101'``. Rows with the same key count as
    duplicates.

    Args:
        y_pred: 2-D NumPy array, or any iterable of 1-D NumPy arrays.

    Returns:
        list[str]: One key per distinct row, in the order the rows first
        occur. Empty input yields an empty list.
    """
    unique = []
    seen = set()  # O(1) membership test; `unique` keeps the insertion order
    for row in y_pred:
        # Builtin `int` replaces the `np.int` alias, which was removed in
        # NumPy 1.24 and now raises AttributeError.
        key = ''.join(map(str, row.astype(int).tolist()))
        if key not in seen:
            seen.add(key)
            unique.append(key)
    return unique

# Reconstruct the test rows and round every output to the nearest integer.
reconstruction = autoencoder.predict(X_test)
X_test_re = np.round(reconstruction)

# Count how many of the reconstructed rows differ from each other.
unique = count_unique_pred(X_test_re)
report = 'number of unique predictions: {}, total predictions: {}'
print(report.format(len(unique), len(X_test_re)))

number of unique predictions: 79, total predictions: 79


### 6) Encoder auskoppeln

In [6]:
from tensorflow.keras.models import load_model, Model

# Reload the model file written by the ModelCheckpoint callback.
autoencoder = load_model('autoencoder_ohe.h5')

# The encoder reuses the trained layers: it runs from the model input up to
# the output of the 20-unit 'encoder_out' layer.
encoder_layer = autoencoder.get_layer(name='encoder_out').output
input_layer = autoencoder.get_layer(name='input').input

encoder = Model(inputs=input_layer, outputs=encoder_layer)
encoder.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 60)]              0         
_________________________________________________________________
encoder_1 (Dense)            (None, 90)                5490      
_________________________________________________________________
encoder_out (Dense)          (None, 20)                1820      
Total params: 7,310
Trainable params: 7,310
Non-trainable params: 0
_________________________________________________________________


### 7) Daten transformieren

In [7]:
# Map both splits through the encoder to obtain the reduced representation.
X_train_dim, X_test_dim = encoder.predict(X_train), encoder.predict(X_test)
X_train_dim.shape, X_test_dim.shape

((709, 20), (79, 20))

### 8) Tests: Regression mit dimensionsreduzierten Daten auf Alter der Person

In [8]:
from sklearn.linear_model import LinearRegression

# Linear regression of Age on the encoder output; score() is evaluated on the
# test split.
linear = LinearRegression().fit(
    X_train_dim,
    df_train['Age'],
)
r2 = linear.score(
    X_test_dim,
    df_test['Age'],
)
print('r2: {:.3f}'.format(r2))

r2: 0.113


#### Zum Vergleich: Mit Originaldaten

In [9]:
# Baseline: the same regression of Age, fitted on the original (unencoded)
# feature matrix; score() is evaluated on the test split.
linear = LinearRegression().fit(
    X_train,
    df_train['Age'],
)
r2 = linear.score(
    X_test,
    df_test['Age'],
)
print('r2: {:.3f}'.format(r2))

r2: 0.099
