In [None]:
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from classifier_tf import ClassifierTFModel
import tensorflow as tf

In [None]:
# Read the comma-separated customer / lead-generator file into a DataFrame
# and preview its first rows.
df = pd.read_csv("data/CustomerData_LeadGenerator.csv", sep=',')
df.head()

In [None]:
# The dataset contains the stray token "ject" (seen in "q_OpeningHours"),
# maybe a typo. Substitute 0 for it wherever it occurs, then cast
# "q_OpeningHours" to float64 -- both steps in a single chain.
df = (
    df
    .replace("ject", 0)
    .astype({'q_OpeningHours': 'float64'})
)

In [None]:
def labels_to_class(arr):
    """Encode each row of binary labels as the integer it spells in base 2.

    The first column is the most significant bit, e.g.

        [[1, 0, 0, 0],
         [0, 0, 1, 1]]  ->  [8, 3]

    Parameters
    ----------
    arr : array-like of shape (n_rows, n_labels)
        Label matrix. Any 2-D array-like is accepted (NumPy array,
        nested list, ...).

    Returns
    -------
    list
        One class id per row, in row order.

    Raises
    ------
    ValueError
        If ``arr`` is neither two-dimensional nor an empty sequence.

    Notes
    -----
    The bit weights are stored as int64, so integer results are only exact
    for at most 63 label columns.
    """
    matrix = np.asarray(arr)

    # An empty 1-D input means "no rows at all": nothing to encode.
    if matrix.ndim == 1 and matrix.size == 0:
        return []
    if matrix.ndim != 2:
        raise ValueError(
            "labels_to_class expects a 2-D array of shape (n_rows, n_labels), "
            "got an array with {} dimension(s)".format(matrix.ndim)
        )

    n_labels = matrix.shape[1]
    # Weights 2^(n-1), ..., 2, 1 so that column 0 is the most significant bit.
    weights = 2 ** np.arange(n_labels - 1, -1, -1, dtype=np.int64)

    # One matrix-vector product replaces the per-element Python loop.
    return list(matrix @ weights)

In [None]:
# Convert the combinations of specialization labels into a class
# (the original note gives the range as 0~2^10).
# The last two "b_" columns are left out of the encoding
# (presumably the target columns used for training further down -- TODO confirm).
include_b = [col for col in df.columns if col.startswith("b_")][:-2]
# Only the first 300 rows are encoded and plotted.
binary = df[include_b].values[:300]

to_int = labels_to_class(binary)

fig, ax = plt.subplots(2, 1, figsize=(15, 4))

# Top panel: the class id obtained from each row's label combination.
ax[0].scatter(np.arange(len(to_int)), to_int, s=5)
ax[0].set_title("Label combination encoded as class id (first {} rows)".format(len(to_int)))
ax[0].set_ylabel("class id")

# Bottom panel: row-wise sum of the label columns.
ax[1].scatter(np.arange(len(to_int)), np.sum(binary, axis=1), s=5)
ax[1].set_title("Sum of label columns per row")
ax[1].set_ylabel("sum of labels")
ax[1].set_xlabel("row index")

# Keep the titles of the two stacked panels from overlapping.
fig.tight_layout()
plt.show()

### Analyze Input Data

In [None]:
# Plot all data that starts with "q_": one scatter panel per column,
# each plotted against the row index.
include_q = [col for col in df.columns if col.startswith("q_")]
X = df[include_q].values

fig, ax = plt.subplots(
    X.shape[-1], 1,
    figsize=(15, 2 * len(include_q)),
    sharex=True,
    squeeze=False,  # always return a 2-D array of Axes, even for a single panel
)
# There is a single column of panels; flatten it so ax[i] works for any
# number of "q_" columns (including exactly one).
ax = ax[:, 0]

# Explicit loop instead of list comprehensions executed only for side effects.
axes = []  # scatter handles, one per panel
for i, name in enumerate(include_q):
    axes.append(ax[i].scatter(np.arange(len(X)), X[:, i], s=5))
    ax[i].set_title(name)

# The x-axis is shared, so labelling the bottom panel is enough.
ax[-1].set_xlabel("row index")
plt.show()

In [None]:

# Some data seems to be correlated: overlay pairs of columns on twin y-axes
# (each series gets its own scale) and label every axis with the column it shows.
fig, ax = plt.subplots(2, figsize=(15, 5))

# Panel 1: total households (red, left axis) vs. 5th-quintile households (right axis).
ax[0].scatter(np.arange(len(X)), df['q_2017 Total Households'].values, c='r')
ax[0].set_ylabel('q_2017 Total Households', color='r')
ax1 = ax[0].twinx()
ax1.scatter(np.arange(len(X)), df['q_2017 HHs: 5th Quintile (68.759 and above)'].values)
# 'C0' is the first colour of the default cycle, i.e. the colour of the scatter above.
ax1.set_ylabel('q_2017 HHs: 5th Quintile (68.759 and above)', color='C0')

# Panel 2: university-educated population (left axis) vs. its share of the total (green, right axis).
ax[1].scatter(np.arange(len(X)), df['q_2017 Pop 15+/Edu: University, Fachhochschule'].values)
ax[1].set_ylabel('q_2017 Pop 15+/Edu: University, Fachhochschule', color='C0')
ax2 = ax[1].twinx()
ax2.scatter(np.arange(len(X)), df['q_Uni by Total Pop'].values, c='g')
ax2.set_ylabel('q_Uni by Total Pop', color='g')

ax[1].set_xlabel('row index')
fig.tight_layout()
# End on plt.show() so the cell does not echo the repr of the last scatter call.
plt.show()


In [None]:
# Report, for each of the two label columns, the sum of the column as a
# percentage of its length, followed by the sum itself.
label_kontakt = df['b_in_kontakt_gewesen'].values
label_gesamt = df['b_gekauft_gesamt'].values

# Compute each column sum once and reuse it for both printed figures.
ones_kontakt = np.sum(label_kontakt)
ones_gesamt = np.sum(label_gesamt)

print(
    "Percentage of 1 in 'b_in_kontakt_gewesen' dataset: {}%, length:{}".format(
        ones_kontakt / len(label_kontakt) * 100, ones_kontakt
    )
)
print(
    "Percentage of 1 in 'b_gekauft_gesamt' dataset: {}%, length:{}".format(
        ones_gesamt / len(label_gesamt) * 100, ones_gesamt
    )
)

### Train Classifier
The model will not train properly because there are too few '1' labels. Most batches will contain only '0' labels, so the model converges to predicting only '0', which still yields a high accuracy because '1' labels are so rare.

In [None]:
# Build the two-column target, split into train/test, then standardise.
# The scaler is fitted on the training rows only, so that no statistics of the
# test set leak into training; the test rows are transformed with the
# training mean/variance. X itself is left unscaled, which keeps this cell
# safe to re-run.
y = df[['b_in_kontakt_gewesen','b_gekauft_gesamt']].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=99
)

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# First line: feature matrix shapes; second line: label matrix shapes.
print("Train size: {}, Test size: {}".format(X_train.shape, X_test.shape))
print("Train size: {}, Test size: {}".format(y_train.shape, y_test.shape))

In [None]:
# Instantiate the classifier, passing it the path "./config.yaml", and train it
# on the split created above.
# NOTE(review): ClassifierTFModel comes from the local `classifier_tf` module,
# which is not shown here. Presumably the config file holds the model/training
# settings and the test split is used for evaluation during training --
# confirm against that module.
model = ClassifierTFModel("./config.yaml")
model.train(X_train, X_test, y_train, y_test)

In [None]:
# High accuracy but not trained properly
result = model.predict(X_test)
# Round the raw model outputs to whole numbers before comparing with the labels.
y_pred = np.round(result, 0)

# A sample counts as correct only when its whole predicted row equals the
# reference row (exact match across all label columns).
correct = [
    int(np.array_equal(expected_row, predicted_row))
    for expected_row, predicted_row in zip(y_test, y_pred)
]
accuracy_percent = np.sum(correct) / len(correct) * 100
print("Accuracy based on the test set: {0:0.3f}%".format(accuracy_percent))
