In [45]:
import os
import pandas as pd
from pathlib import Path

In [82]:
# Data file paths
data_dir = os.getenv("DATA")
dataset_df = pd.read_csv(Path(data_dir) / "dataset.csv")
dataset_df = dataset_df.sort_values(by=["id"])
for col in dataset_df:
    if dataset_df[col].dtype == "object":
        dataset_df[col] = dataset_df[col].astype("category")
dataset_df.head()

Unnamed: 0,id,Geschlecht,Alter,Interesse,Fahrerlaubnis,Regional_Code,Vorversicherung,Alter_Fzg,Vorschaden,Jahresbeitrag,Vertriebskanal,Kundentreue
0,1,Male,44,1,1,28,0,> 2 Years,Yes,40454.0,26.0,217
1,2,Male,76,0,1,3,0,1-2 Year,No,33536.0,26.0,183
2,3,Male,47,1,1,28,0,> 2 Years,Yes,38294.0,26.0,27
3,4,Male,21,0,1,11,1,< 1 Year,No,28619.0,152.0,203
4,5,Female,29,0,1,41,1,< 1 Year,No,27496.0,152.0,39


Conversions:
- id: drop 
    - we don't want to train on the ID as this would kill generalization
- regional code: one-hot
    - no inherent ranking
- vertriebs channel: one-hot
    - no inherent ranking
- vehicle age: ordinal
    - Inherently ranks the age of the vehicle
- Vorschaden: binary

In [83]:
target = dataset_df.pop("Interesse")
target = pd.concat([target, dataset_df["id"]], axis=1)

In [84]:
one_hot_df = pd.get_dummies(dataset_df, columns=['Regional_Code', 'Vertriebskanal'])
one_hot_df.head()

Unnamed: 0,id,Geschlecht,Alter,Fahrerlaubnis,Vorversicherung,Alter_Fzg,Vorschaden,Jahresbeitrag,Kundentreue,Regional_Code_0,...,Vertriebskanal_152.0,Vertriebskanal_153.0,Vertriebskanal_154.0,Vertriebskanal_155.0,Vertriebskanal_156.0,Vertriebskanal_157.0,Vertriebskanal_158.0,Vertriebskanal_159.0,Vertriebskanal_160.0,Vertriebskanal_163.0
0,1,Male,44,1,0,> 2 Years,Yes,40454.0,217,False,...,False,False,False,False,False,False,False,False,False,False
1,2,Male,76,1,0,1-2 Year,No,33536.0,183,False,...,False,False,False,False,False,False,False,False,False,False
2,3,Male,47,1,0,> 2 Years,Yes,38294.0,27,False,...,False,False,False,False,False,False,False,False,False,False
3,4,Male,21,1,1,< 1 Year,No,28619.0,203,False,...,True,False,False,False,False,False,False,False,False,False
4,5,Female,29,1,1,< 1 Year,No,27496.0,39,False,...,True,False,False,False,False,False,False,False,False,False


In [85]:
target

Unnamed: 0,Interesse,id
0,1,1
1,0,2
2,1,3
3,0,4
4,0,5
...,...,...
381104,0,381105
381105,0,381106
381106,0,381107
381107,0,381108


In [86]:
# Using dictionary comprehension to handle the encoding and then creating the DataFrame
encoded_data = {
    col: one_hot_df[col].cat.codes if one_hot_df[col].dtype.name == "category" else one_hot_df[col]
    for col in one_hot_df.columns
}

# Create DataFrame from the dictionary
encoded_df = pd.DataFrame(encoded_data).astype(float)

# Check dtypes of the resulting DataFrame
encoded_df.dtypes

id                      float64
Geschlecht              float64
Alter                   float64
Fahrerlaubnis           float64
Vorversicherung         float64
                         ...   
Vertriebskanal_157.0    float64
Vertriebskanal_158.0    float64
Vertriebskanal_159.0    float64
Vertriebskanal_160.0    float64
Vertriebskanal_163.0    float64
Length: 217, dtype: object

In [87]:
inputs_df = encoded_df.sample(n=1000, random_state=42)
inputs_df.head()
targets_df = target.iloc[inputs_df.index]
targets_df.sum() # Check if positives in class

Interesse          126
id           186970379
dtype: int64

In [88]:
targets_df.head()

Unnamed: 0,Interesse,id
200222,0,200223
49766,0,49767
172201,0,172202
160713,0,160714
53272,0,53273


In [89]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer, roc_auc_score, average_precision_score

# Define ROC AUC scorer
roc_auc_scorer = make_scorer(roc_auc_score, needs_proba=True, multi_class='ovo')

# Define AUC Precision-Recall scorer
pr_auc_scorer = make_scorer(average_precision_score, needs_proba=True)

k = 5  # Number of folds
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Initialize the Random Forest model
model = RandomForestClassifier(random_state=42)

# Cross-validate using ROC AUC
roc_auc_scores = cross_val_score(model,
                                 inputs_df.drop(columns=["id"]),
                                 targets_df.drop(columns=["id"]),
                                 cv=kf,
                                 scoring=roc_auc_scorer)
print(f'ROC AUC scores for each fold: {roc_auc_scores}')
print(f'Average ROC AUC: {roc_auc_scores.mean()}')

# Cross-validate using PR AUC
pr_auc_scores = cross_val_score(model,
                                inputs_df.drop(columns=["id"]),
                                targets_df.drop(columns=["id"]),
                                cv=kf,
                                scoring=pr_auc_scorer)
print(f'PR AUC scores for each fold: {pr_auc_scores}')
print(f'Average PR AUC: {pr_auc_scores.mean()}')

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


ROC AUC scores for each fold: [0.81623754 0.75686275 0.78361742 0.77464379 0.82845714]
Average ROC AUC: 0.7919637291051573


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


PR AUC scores for each fold: [0.3267542  0.3306909  0.26183702 0.28879949 0.42796537]
Average PR AUC: 0.3272093961971737


In [93]:
# Let's start by scaling the data (no feature can rule them all)
from sklearn.preprocessing import StandardScaler

# Scale the data
scaler = StandardScaler()
scaled_df = pd.DataFrame()
cols = encoded_df.columns.drop("id")
scaled_df = pd.DataFrame(scaler.fit_transform(encoded_df[cols]),
                         columns=cols)
scaled_df["id"] = encoded_df["id"]
scaled_df.to_csv(Path(data_dir) / "normal_dataset.csv", index=False)

In [94]:
# Let's start by scaling the data (no feature can rule them all)
from sklearn.preprocessing import MinMaxScaler

# Scale the data
scaler = MinMaxScaler()
scaled_df = pd.DataFrame()
scaled_df = pd.DataFrame(scaler.fit_transform(encoded_df[cols]),
                         columns=cols)
scaled_df["id"] = encoded_df["id"]
scaled_df.to_csv(Path(data_dir) / "minmax_dataset.csv", index=False)

In [91]:
encoded_df[encoded_df.columns.drop("id")].head()

Unnamed: 0,Geschlecht,Alter,Fahrerlaubnis,Vorversicherung,Alter_Fzg,Vorschaden,Jahresbeitrag,Kundentreue,Regional_Code_0,Regional_Code_1,...,Vertriebskanal_152.0,Vertriebskanal_153.0,Vertriebskanal_154.0,Vertriebskanal_155.0,Vertriebskanal_156.0,Vertriebskanal_157.0,Vertriebskanal_158.0,Vertriebskanal_159.0,Vertriebskanal_160.0,Vertriebskanal_163.0
0,1.0,44.0,1.0,0.0,2.0,1.0,40454.0,217.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,76.0,1.0,0.0,0.0,0.0,33536.0,183.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,47.0,1.0,0.0,2.0,1.0,38294.0,27.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,21.0,1.0,1.0,1.0,0.0,28619.0,203.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,29.0,1.0,1.0,1.0,0.0,27496.0,39.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
