# Import packages

In [63]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.ensemble import RandomForestClassifier
import json
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Data preparation

In [49]:
# Load your dataset
df = pd.read_csv("/Users/lucakoster/Desktop/Marketing Case seminar/baqm-seminar-16/data/prepped_data.csv")

# Assuming 'no WD and no LPA' represents untreated customers
untreated_df = df[df['control_group'].isin(['no WD and no LPA', 'no WD and LPA'])]

# Check if the DataFrame is not empty
if not untreated_df.empty:
    # Define your features and target variable
    X = untreated_df.drop(['policy_nr_hashed', 'last_data_year', 'years_to_churn', 'control_group', 'premiums', 'last_brand', 'last_type', 'last_fuel_type', 'last_postcode', 'last_product', 'last_trend_nr_coverages', 'last_change_premium_abs', 'last_change_premium_perc', 'years_since_last_car_change'], axis=1)
    y = untreated_df['years_to_churn']

    # Split the dataset into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
else:
    print("No untreated customers found. Check the filtering criteria.")


  df = pd.read_csv("/Users/lucakoster/Desktop/Marketing Case seminar/baqm-seminar-16/data/prepped_data.csv")


# Setting up GridsearchCV

In [54]:

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    # Add more parameters here if needed
}

# test

# Initialize the Random Forest classifier
rf = RandomForestClassifier(random_state=42)

# Set up GridSearchCV or BayesSearchCV
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
# bayes_search = BayesSearchCV(rf, search_spaces, n_iter=32, cv=5, n_jobs=-1) # Uncomment for BayesSearchCV



# Setting up BayessearchCV

In [64]:
# Define the search space
search_spaces = {
    'n_estimators': Integer(100, 300),
    'max_depth': Categorical([None, 5, 10, 15]),
    'min_samples_split': Integer(2, 10)
    # Add more parameters and their ranges if needed
}

# Initialize the Random Forest classifier
rf = RandomForestClassifier(random_state=42)

# Set up BayesSearchCV
bayes_search = BayesSearchCV(rf, search_spaces, n_iter=32, cv=5, scoring='accuracy', n_jobs=-1, random_state=42)

# Now you can fit the model using BayesSearchCV
bayes_search.fit(X_train, y_train)

# Best parameters and best score
print("Best parameters:", bayes_search.best_params_)
print("Best score:", bayes_search.best_score_)

AttributeError: module 'numpy' has no attribute 'int'.
`np.int` was a deprecated alias for the builtin `int`. To avoid this error in existing code, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

# Training the model

In [55]:
# Fit the model
grid_search.fit(X_train, y_train)

# Best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)


Best parameters: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 300}
Best score: 0.9563075467419552


# Evaluate the model

In [53]:
# Use the best estimator to make predictions on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)  # Ensure to use the transformed version of X_test if applicable

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
# Calculate metrics with 'weighted' average for multiclass classification
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the metrics
print(f"Accuracy: {accuracy}")
print(f"Precision (Weighted): {precision}")
print(f"Recall (Weighted): {recall}")
print(f"F1 Score (Weighted): {f1}")
print(f"Confusion Matrix:\n{conf_matrix}")

Accuracy: 0.9456207892204043
Precision (Weighted): 0.934594239301392
Recall (Weighted): 0.9456207892204043
F1 Score (Weighted): 0.9267957081239083
Confusion Matrix:
[[1949    1    0]
 [  70   13    0]
 [  31   11    3]]
