In [None]:
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import optuna

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, HDBSCAN
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, silhouette_score
from sklearn.feature_selection import VarianceThreshold
from hdbscan import HDBSCAN
from hdbscan.validity import validity_index
from yellowbrick.cluster import silhouette_visualizer, kelbow_visualizer

from utils import (
    create_interactive_pie_charts,
    plot_feature_correlations,
    plot_feature_distributions,
)

In [None]:
# Read the semicolon-separated customer export (first column is the index)
# and discard the client identifier, which is not a feature.
data = (
    pd.read_csv("customer_data_test.csv", sep=";", index_col=0)
    .drop(columns=["ClientId"])
)

In [None]:
# Preview the first rows (DataFrame.head() defaults to 5) to eyeball the
# columns and value formats.
data.head()

In [None]:
# Column dtypes, non-null counts and memory usage.
data.info()

In [None]:
# Number of missing values in each column
data.isna().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row
# (with the default keep="first", the first occurrence is not counted).
data.duplicated().sum()

In [None]:
# Number of distinct values per column (NaN is excluded by default);
# useful to spot constant or near-constant features.
data.nunique()

In [None]:
# Per-column count of strictly negative entries
data.lt(0).sum()

In [None]:
# Summary statistics with extra tail percentiles (1%, 5%, 95%, 99%) so that
# extreme values stand out. `percentiles` is reused by the later describe() cells.
percentiles = [0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99]
data.describe(percentiles=percentiles)

In [None]:
# Columns measured in days; the next cell caps their values at 365.
# "PercActiveDays" matches the "Days" name filter but is handled together with
# the percentage columns, so it is excluded directly in the filter. This avoids
# the ValueError that list.remove() raises when the column is absent.
days_columns = [
    col for col in data.columns if "Days" in col and col != "PercActiveDays"
]
# These two are also day counts but do not carry "Days" in their name.
days_columns += ["LifeTime", "TimeToFirstDeposit"]
days_columns

In [None]:
# Cap every day-count column at 365. DataFrame.clip is vectorised, keeps NaN
# untouched and preserves dtypes, unlike a per-element apply(lambda ...).
data[days_columns] = data[days_columns].clip(upper=365)

In [None]:
# Percentage columns (name contains "Perc") plus the win coefficient;
# values above 1 in these columns are replaced with 1 in the following cell.
perc_coeff_columns = [col for col in data.columns if "Perc" in col] + ["WinCoefficient"]
perc_coeff_columns

In [None]:
# Cap percentage/coefficient columns at 1 with a single vectorised clip
# instead of a per-element apply(lambda ...); NaN values are left as they are.
data[perc_coeff_columns] = data[perc_coeff_columns].clip(upper=1)

In [None]:
# Negative TotalInactiveDays values are floored at 0 (vectorised clip instead
# of a per-element apply; NaN values are left as they are).
data["TotalInactiveDays"] = data["TotalInactiveDays"].clip(lower=0)

In [None]:
# Delete ActivePassiveRatio column.
# errors="ignore" keeps the cell idempotent: re-running it after the column is
# already gone no longer raises a KeyError.
data = data.drop(columns="ActivePassiveRatio", errors="ignore")

In [None]:
# Describe the data again after the capping/flooring steps above to confirm
# the value ranges now look as expected.
data.describe(percentiles=percentiles)

In [None]:
# Rank feature pairs by absolute correlation, listing each pair only once.
corr_matrix = data.corr().abs()

# Boolean mask selecting the strict upper triangle (k=1 skips the diagonal),
# so neither self-correlations nor mirrored (b, a) pairs survive.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# Long format, strongest pairs first, keeping only |corr| above 0.5.
most_correlated = (
    upper.stack()
    .sort_values(ascending=False)
    .loc[lambda pair_corr: pair_corr > 0.5]
)
most_correlated

In [None]:
# Candidate features to drop because they are highly correlated with others
# (presumably picked from the correlation ranking above; confirm with the author).
# NOTE(review): this cell only defines the list. The drop that would consume it
# is commented out in the next cell, so these columns currently stay in `data`.
to_drop = [
    "TeamCount",
    "TotalInactiveDays",
    "MaxCompetitionPercent",
    "BetCount",
    "CountActiveBetDays",
    "TurnoverBonusInEur",
    "PercActiveDays",
]

In [None]:
# data = data.drop(columns=to_drop)

### Sanity check of all features

In [None]:
# Summary statistics of the cleaned feature set, using the same tail
# percentiles as the earlier describe() calls.
data.describe(percentiles=percentiles)

In [None]:
# Project helper from utils; presumably plots the pairwise feature
# correlations of `data` (confirm against utils.plot_feature_correlations).
plot_feature_correlations(data)

In [None]:
# Project helper from utils; presumably plots one distribution per feature
# (confirm against utils.plot_feature_distributions).
plot_feature_distributions(data)

In [None]:
# Binned features analysis.
# Project helper from utils; presumably bins each feature into `num_bins`
# groups and shows the share of rows per bin (confirm against utils).
create_interactive_pie_charts(data, num_bins=10)

### GGRInEur analysis

In [None]:
# Distribution of GGRInEur, including the tail percentiles defined earlier.
data["GGRInEur"].describe(percentiles=percentiles)

In [None]:
# Share of users with a negative GGRInEur. The original printed the raw
# fraction (e.g. 0.12) followed by "%"; the ".2%" format spec multiplies by 100
# and appends the percent sign itself, so the number now matches its unit.
winners_share = (data["GGRInEur"] < 0).mean()
print(f"{winners_share:.2%} of users won more than they lost")

### Clustering

In [None]:
# Standardise every feature, then fit a PCA that keeps as many components as
# there are features so the full explained-variance spectrum can be inspected.
scaler = StandardScaler().fit(data)
scaled_data = scaler.transform(data)

pca = PCA(n_components=data.shape[1])
pca.fit(scaled_data)

In [None]:
# Cumulative explained variance as a function of the number of components kept.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# Start the x-axis at 1 so that x = k reads as "variance explained by the first
# k components"; plotting against the default index would shift it by one.
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.figure(figsize=(10, 6))
plt.plot(component_counts, cumulative_variance)
plt.xlabel("Number of components")
plt.ylabel("Cumulative explained variance")
plt.grid()
plt.show()

In [None]:
# Elbow analysis for KMeans on the standardised data over the k range (2, 12).
# A fixed random_state makes the detected elbow reproducible; the KMeans tuning
# cell below reads it through `visualizer.elbow_value_`.
visualizer = kelbow_visualizer(KMeans(random_state=42), scaled_data, k=(2, 12))

In [None]:
# Define the objective function for Optuna
def objective(trial):
    """Score one KMeans configuration by its silhouette score.

    The features are standardised, reduced with PCA and then clustered with
    KMeans. The number of clusters is not tuned; it is fixed to the elbow found
    by the elbow visualizer.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the PCA variance ratio and the KMeans
        ``n_init`` / ``max_iter`` settings.

    Returns
    -------
    float
        Silhouette score of the KMeans labels, computed in the same
        scaled + PCA space the clusterer was fitted on.

    Raises
    ------
    ValueError
        If the elbow visualizer did not detect an elbow.
    """
    # Define hyperparameters to tune
    n_components = trial.suggest_categorical("pca__n_components", [0.8, 0.9, 0.95, 0.99])
    n_init = trial.suggest_categorical("kmeans__n_init", [10, 20, 30])
    max_iter = trial.suggest_categorical("kmeans__max_iter", [300, 500, 1000])

    n_clusters = visualizer.elbow_value_
    if n_clusters is None:
        raise ValueError(
            "The elbow visualizer did not detect an elbow; set n_clusters explicitly."
        )

    # Scale first, then run PCA on the scaled features. The previous
    # ColumnTransformer applied both transformers to the raw columns in parallel
    # and concatenated the results, so PCA was fitted on unscaled data and the
    # clusterer received scaled features plus raw-data components.
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("pca", PCA(n_components=n_components)),
        ("clusterer", KMeans(
            n_clusters=n_clusters,
            n_init=n_init,
            max_iter=max_iter,
            random_state=42,  # reproducible centroid initialisation
        )),
    ])

    pipe.fit(data)

    # Evaluate in the feature space the clusterer actually saw.
    features = pipe[:-1].transform(data)
    return silhouette_score(features, pipe.named_steps["clusterer"].labels_)

# Create a study (seeded TPE sampler so repeated runs explore the same trials)
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

# Print the best parameters
print("Best parameters found: ", study.best_params)
# The objective returns a silhouette score, not DBCV.
print("Best silhouette score: ", study.best_value)

In [None]:
# Single source of truth for the HDBSCAN search space: used by the objective
# below and by the optional GridSampler.
search_space = {
    "pca__n_components": np.arange(0.1, 1, 0.1).tolist() + [0.99],
    "hdbscan__min_cluster_size": np.arange(100, 1100, 100).tolist(),
    "hdbscan__min_samples": np.arange(10, 110, 10).tolist(),
    "hdbscan__cluster_selection_epsilon": np.arange(0.1, 1.1, 0.1).tolist(),
    "hdbscan__cluster_selection_method": ['eom', 'leaf']
}


# Define the objective function for Optuna
def objective(trial):
    """Score one HDBSCAN configuration by its relative validity.

    The features are standardised, reduced with PCA and then clustered with
    HDBSCAN. ``HDBSCAN`` here is the class from the ``hdbscan`` package: its
    import comes after, and therefore shadows, the ``sklearn.cluster`` one.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample one value per entry of ``search_space``.

    Returns
    -------
    float
        ``relative_validity_`` of the fitted clusterer, an approximation of DBCV.
    """
    # Sample every hyperparameter from the shared search space (same names and
    # same order as before, so `study.best_params` keeps its keys).
    params = {
        name: trial.suggest_categorical(name, choices)
        for name, choices in search_space.items()
    }

    # Scale first, then run PCA on the scaled features. The previous
    # ColumnTransformer applied both transformers to the raw columns in parallel
    # and concatenated the results, so PCA was fitted on unscaled data.
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("pca", PCA(n_components=params["pca__n_components"])),
        ("clusterer", HDBSCAN(
            min_cluster_size=params["hdbscan__min_cluster_size"],
            min_samples=params["hdbscan__min_samples"],
            cluster_selection_epsilon=params["hdbscan__cluster_selection_epsilon"],
            cluster_selection_method=params["hdbscan__cluster_selection_method"],
            allow_single_cluster=False,
            # NOTE(review): relative_validity_ appears to need the minimum
            # spanning tree, hence gen_min_span_tree=True; confirm in the hdbscan docs.
            gen_min_span_tree=True
        ))
    ])

    pipe.fit(data)

    # Calculate approximation of DBCV
    return pipe.named_steps["clusterer"].relative_validity_


# Exhaustive alternative (one trial per grid point):
# study = optuna.create_study(direction="maximize", sampler=optuna.samplers.GridSampler(search_space))
# study.optimize(objective, n_trials=np.prod([len(v) for v in search_space.values()]))

# Create a study (seeded TPE sampler so repeated runs explore the same trials)
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

# Print the best parameters
print("Best parameters found: ", study.best_params)
print("Best DBCV score: ", study.best_value)

In [None]:
# Preprocessing: scale first, then run PCA on the scaled features.
# The previous ColumnTransformer applied both transformers to the raw columns
# in parallel and concatenated the results, so PCA was fitted on unscaled data.
# The variable name and the "col_transformer" step name are kept because a
# later cell reads pipe.named_steps["col_transformer"].
column_transformer = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=study.best_params["pca__n_components"])),
])

# Final pipeline: preprocessing followed by HDBSCAN with the best parameters
# found by the Optuna study.
pipe = Pipeline([
    ("col_transformer", column_transformer),
    ("clusterer", HDBSCAN(
        min_cluster_size=study.best_params["hdbscan__min_cluster_size"],
        min_samples=study.best_params["hdbscan__min_samples"],
        cluster_selection_epsilon=study.best_params["hdbscan__cluster_selection_epsilon"],
        cluster_selection_method=study.best_params["hdbscan__cluster_selection_method"],
        allow_single_cluster=False,
        gen_min_span_tree=True
    ))
])

In [None]:
# Fit the preprocessing step and the clusterer on the full dataset.
pipe.fit(data)

In [None]:
# Cluster label assigned to each row by the fitted clusterer step.
labels = pipe["clusterer"].labels_

In [None]:
# Cluster sizes: distinct labels and the number of rows per label
# (HDBSCAN marks noise points with the label -1).
np.unique(labels, return_counts=True)

In [None]:
# Relative validity of the final clustering: the same DBCV approximation that
# the Optuna objective maximised.
pipe.named_steps["clusterer"].relative_validity_

In [None]:
# Features the clusterer receives: the output of the fitted "col_transformer"
# step applied to `data`.
# NOTE(review): this displays the whole array; consider showing only its shape or a slice.
pipe.named_steps["col_transformer"].transform(data)