# Import Packages

In [12]:
import pandas as pd
import numpy as np

# Load Data

In [13]:
# Read the raw churn extract and discard rows that are exact duplicates.
input_df = (
    pd.read_csv("./data/20240117_churn_data.csv", low_memory=False)
    .drop_duplicates()
)

# Prepare Data

## Functions

In [14]:
def tryconvert(value, default, *types):
    """Convert ``value`` with the first callable in ``types`` that accepts it.

    Each callable is tried in the order given. A callable that raises
    ValueError, TypeError or IndexError is skipped; any other exception
    propagates to the caller.

    Returns the first successful conversion, or ``default`` when every
    callable fails (or when no callables are supplied).
    """
    for converter in types:
        try:
            converted = converter(value)
        except (ValueError, TypeError, IndexError):
            # This converter rejected the value; move on to the next one.
            pass
        else:
            return converted
    return default

def count_decreases(group):
    """Count the rows of ``group`` whose value is lower than the row before.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame (for example one groupby group) that has a 'policy_nr_hashed'
        column, already in the order in which rows should be compared.

    Returns
    -------
    int
        Number of rows whose 'policy_nr_hashed' value is strictly smaller
        than the value in the preceding row. The first row has no
        predecessor and therefore never counts.
    """
    # NOTE(review): the compared column is 'policy_nr_hashed'; if the groups
    # are keyed by that same column the values are constant and the result is
    # always 0 -- confirm this is the intended column.
    current_values = group['policy_nr_hashed']
    # shift(1) lines every row up with the value of the row *before* it.
    prev_values = current_values.shift(1)
    # A decrease is a value that dropped below its predecessor. The earlier
    # version tested `current > previous`, which counted increases instead.
    decreases = current_values < prev_values
    # True counts as 1, so the sum is the number of decreases.
    return decreases.sum()

def last_non_zero(series):
    """Return the last entry of ``series`` that is neither 0 nor missing.

    Parameters
    ----------
    series : pandas.Series
        Values in the order in which "last" should be interpreted.

    Returns
    -------
    The final non-zero, non-null value of ``series``; 0 when the series is
    empty or holds only zeros / missing values.
    """
    # Turn zeros into missing values so a single dropna() removes both.
    # `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
    non_zero_values = series.replace(0, np.nan).dropna()
    if non_zero_values.empty:
        return 0
    return non_zero_values.iloc[-1]
    
def second_to_last(series):
    """Return the penultimate element of ``series``, selected by position.

    Raises IndexError when ``series`` holds fewer than two elements.
    """
    penultimate = series.iloc[-2]
    return penultimate

## Filter Customers

In [15]:
# Order the rows by policy-version year so that the per-policy shift(1) calls
# used further down compare each row with the previous year's row.
# NOTE(review): the earlier comment here read "only keep customers that joined
# after welcome discount was introduced", but this cell applies no such filter
# (it only sorts) -- confirm whether that filter was dropped on purpose.
input_df = input_df.sort_values("year_initiation_policy_version")

# Only keep policies that have at least two data-points (rows)
input_df["number_datapoints"] = input_df.groupby('policy_nr_hashed')[['policy_nr_hashed']].transform('count')
input_df = input_df[input_df["number_datapoints"] > 1]

# Drop every policy that has between-prolongation churn or cancellation churn
# flagged on any of its rows: both flag columns are summed per policy and then
# across the two columns, and only policies whose total is 0 are kept.
input_df['non_relevant_churn'] = input_df.groupby('policy_nr_hashed')[['d_churn_between_prolongations', 'd_churn_cancellation']].transform('sum').sum(axis=1)
input_df = input_df[input_df['non_relevant_churn'] == 0]

## Create Relevant Columns

In [16]:
# Flag policies that received a discount: True on every row of a policy whose
# lowest welcome_discount value is below 1.
# (presumably welcome_discount is a price multiplier where 1 means "no discount" -- TODO confirm)
input_df["has_discount"] = (input_df.groupby('policy_nr_hashed')[['welcome_discount']].transform('min') < 1)

# Per-row dict holding the year and the rounded main / supplementary / total premium.
# NOTE(review): in the visible part of the notebook this column is only referenced
# by a commented-out aggregation, and the row-wise apply is slow on large frames.
input_df['premium_data'] = input_df.apply(lambda row: {'year': int(row['year_initiation_policy_version']), 'main': np.round(row['premium_main_coverages'], 0), 'supp': np.round(row['premium_supplementary_coverages'], 0), 'total': np.round(row['total_premium'], 0)}, axis=1)

# Share of the total premium that comes from the main coverages
input_df['main_coverage_portion'] = input_df['premium_main_coverages'] / input_df['total_premium']

# Build one string key per row that identifies the car (brand + type + weight + fuel type),
# then compare it with the previous row of the same policy to detect a car change.
input_df['car_data'] = input_df["brand"] + input_df["type"] + input_df["weight"].astype(str) + input_df["fuel_type"]
input_df["lagged_car_data"] = input_df.groupby('policy_nr_hashed')['car_data'].shift(1)
# A change needs a previous row to compare against, so the first row of a policy is never flagged.
input_df["car_change"] = ((input_df["lagged_car_data"] != input_df["car_data"]) & ~input_df['lagged_car_data'].isnull())
# Year of the change on rows where the car changed, 0 on all other rows.
input_df["year_car_change"] = input_df["car_change"].astype("int") * input_df["year_initiation_policy_version"]

# Concatenate all mutation columns (names containing "mutation" and shorter than
# 12 characters) into one string per row; missing values become the text 'nan'
# after the cast and are stripped out again.
# NOTE(review): the replace also removes the letters 'nan' inside a genuine
# mutation name, should one contain them -- verify against the data.
input_df["all_mutations"] = input_df[[col for col in input_df.columns if (("mutation" in col) and (len(col) < 12))]].astype("str").sum(1).str.replace('nan', '')

# Tag rows whose mutations mention a policyholder replacement or a restored cancellation
input_df["policyholder_change"] = input_df["all_mutations"].str.contains("replacePolicyholder")
input_df["fake_alarm"] = input_df["all_mutations"].str.contains("restoreCancellation")

# Change in the number of coverages compared with the previous row of the same policy
# (NaN on the first row of each policy)
input_df['n_coverages_trend'] = input_df["n_coverages"] - input_df.groupby('policy_nr_hashed')['n_coverages'].shift(1)

# Calculate number of accident years: when accident_free_years is lower than on the
# previous row of the same policy, record the size of the drop; otherwise 0.
input_df["accident_years"] = ((input_df.groupby('policy_nr_hashed')['accident_free_years'].shift(1) > input_df['accident_free_years']).astype("int") * (input_df.groupby('policy_nr_hashed')['accident_free_years'].shift(1) - input_df['accident_free_years'])).fillna(0).replace(-0.0, 0)

# Create lagged premium difference (absolute and relative to the previous row's premium);
# both are NaN on the first row of each policy because there is no previous premium.
input_df["lagged_total_premium"] = input_df.groupby('policy_nr_hashed')['total_premium'].shift(1)
input_df["abs_diff_total_premium"] = input_df["total_premium"] - input_df["lagged_total_premium"]
input_df["perc_diff_total_premium"] = input_df["abs_diff_total_premium"] / input_df["lagged_total_premium"]

# display(input_df[input_df["policy_nr_hashed"] == "lrzJmX0"][["year_initiation_policy_version", "car_data"]])

In [17]:
# Customer / car attributes for which the second-to-last observed value is kept.
customer_data_columns = [
    'customer_age',
    'accident_free_years',
    'car_value',
    'age_car',
    'brand',
    'type',
    'weight',
    'fuel_type',
    'postcode',
    'product',
    'allrisk basis',
    'allrisk compleet',
    'allrisk royaal',
    'wa-extra',
]

# One named aggregation per attribute; the output column is called "last_<attribute>".
customer_data_agg = {
    f'last_{column_name}': pd.NamedAgg(column=column_name, aggfunc=second_to_last)
    for column_name in customer_data_columns
}

In [18]:
# Collapse the row-per-policy-version frame into one row per policy.
# Convention used below: `second_to_last` and the `[:-1]` slices take their
# value from / over all rows except the final one of each policy.
# (presumably the final row is the period whose churn outcome is the target, so
# features are built only from the rows before it -- TODO confirm)
final_df = (
    input_df
    # Row order inside each group drives 'first', second_to_last and the [:-1] slices.
    .sort_values("year_initiation_policy_version")
    .groupby("policy_nr_hashed")
    .agg(
        # Lowest discount value seen on any row of the policy
        welcome_discount=pd.NamedAgg(column="welcome_discount", aggfunc="min"),
        last_data_year=pd.NamedAgg(column="year_initiation_policy_version", aggfunc=second_to_last),
        # Note: read from "year_initiation_policy", not the "_version" column used above
        first_data_year=pd.NamedAgg(column="year_initiation_policy", aggfunc="first"),
        # Maximum of the prolongation-churn flag over all rows, including the final one
        churn=pd.NamedAgg(column="d_churn_around_prolongation", aggfunc="max"),
        control_group=pd.NamedAgg(column="welcome_discount_control_group", aggfunc=second_to_last),
        # premiums=pd.NamedAgg(column="premium_data", aggfunc=lambda x: x.to_list()),
        first_premium=pd.NamedAgg(column="total_premium", aggfunc='first'),
        last_premium=pd.NamedAgg(column="total_premium", aggfunc=second_to_last),
        first_split=pd.NamedAgg(column="main_coverage_portion", aggfunc='first'),
        last_split=pd.NamedAgg(column="main_coverage_portion", aggfunc=second_to_last),
        # last_<attribute> columns built from customer_data_columns
        **customer_data_agg,
        # Number of distinct car keys, final row excluded
        nr_cars=pd.NamedAgg(column="car_data", aggfunc=lambda x:len(set(x.to_list()[:-1]))),
        # Counts of flagged rows, final row excluded
        fake_alarm=pd.NamedAgg(column="fake_alarm", aggfunc=lambda x:np.sum(x.to_list()[:-1])),
        policyholder_change=pd.NamedAgg(column="policyholder_change", aggfunc=lambda x:np.sum(x.to_list()[:-1])),
        max_nr_coverages=pd.NamedAgg(column="n_coverages", aggfunc=lambda x:np.max(x.to_list()[:-1])),
        last_nr_coverages=pd.NamedAgg(column="n_coverages", aggfunc=second_to_last),
        last_trend_nr_coverages=pd.NamedAgg(column="n_coverages_trend", aggfunc=second_to_last),
        # Sum of accident-year drops, final row excluded
        accident_years=pd.NamedAgg(column="accident_years", aggfunc=lambda x:np.sum(x.to_list()[:-1])),
        # NOTE(review): last_non_zero scans every row, including the final one that the
        # other features exclude -- confirm this is intended.
        last_year_car_change=pd.NamedAgg(column="year_car_change", aggfunc=last_non_zero),
        last_change_premium_abs=pd.NamedAgg(column="abs_diff_total_premium", aggfunc=second_to_last),
        last_change_premium_perc=pd.NamedAgg(column="perc_diff_total_premium", aggfunc=second_to_last),
    )
    # Turn the policy_nr_hashed group key back into a regular column
    .reset_index()
)

In [20]:
# Years between the last feature year and the most recent car change.
# last_year_car_change is 0 when no change was recorded, which makes the
# difference equal to the year itself; every gap above 10 (including those)
# is mapped to missing. `np.nan` is used because the `np.NaN` alias was
# removed in NumPy 2.0 and raises AttributeError there.
final_df["years_since_last_car_change"] = (final_df["last_data_year"] - final_df["last_year_car_change"]).astype("int").apply(lambda x: x if x <= 10 else np.nan)
# Last number of coverages relative to the highest number seen (0 or negative)
final_df["n_last_vs_peak"] = final_df["last_nr_coverages"] - final_df["max_nr_coverages"]
# Shift in the main-coverage share of the premium between first and last feature row
final_df["last_vs_first_split"] = final_df["last_split"] - final_df["first_split"]
# 1 unless the control-group label contains the text "no LPA"
final_df["lpa"] = (~final_df["control_group"].str.contains("no LPA")).astype("int")
# Cumulative premium change since the first row, absolute and relative to the first premium
final_df["cum_change_premium_abs"] = final_df["last_premium"] - final_df["first_premium"]
final_df["cum_change_premium_perc"] = final_df["cum_change_premium_abs"] / final_df["first_premium"]

# Keep only the first character of the postcode
final_df['last_postcode'] = final_df['last_postcode'].astype(str).str[0]

# Persist the prepared data set (the DataFrame index is written as the first CSV column)
final_df.to_csv("./data/prepped_data.csv")

Unnamed: 0,policy_nr_hashed,welcome_discount,last_data_year,first_data_year,churn,control_group,first_premium,last_premium,first_split,last_split,...,accident_years,last_year_car_change,last_change_premium_abs,last_change_premium_perc,years_since_last_car_change,n_last_vs_peak,last_vs_first_split,lpa,cum_change_premium_abs,cum_change_premium_perc
247,0WBV29B,1.0,2022.0,2019,0,no WD and no LPA,3666.432,4044.096,0.931085,0.933533,...,0.0,0.0,111.552,0.028366,,0,0.002448,0,377.664,0.103006
248,0WBV2JB,1.0,2022.0,2019,0,WD and LPA,7076.608,8038.016,0.964358,0.964720,...,0.0,0.0,292.096,0.037710,,0,0.000362,1,961.408,0.135857
249,0WBV2R5,1.0,2022.0,2019,0,no WD and LPA,5097.792,5432.448,0.950435,0.953983,...,0.0,0.0,147.840,0.027976,,0,0.003548,1,334.656,0.065647
250,0WBV2nn,1.0,2022.0,2019,0,no WD and LPA,4022.592,3940.608,0.829602,0.809004,...,0.0,0.0,61.824,0.015939,,0,-0.020598,1,-81.984,-0.020381
251,0WBV7Gm,1.0,2020.0,2019,0,no WD and no LPA,3837.120,4678.464,0.827671,0.853490,...,8.0,0.0,841.344,0.219264,,0,0.025820,0,841.344,0.219264
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84674,zPmyxXA,1.0,2022.0,2021,0,WD and no LPA,4284.672,4233.600,0.836888,0.832063,...,0.0,0.0,-51.072,-0.011920,,0,-0.004825,0,-51.072,-0.011920
84675,zPmyyWO,1.0,2022.0,2021,1,no WD and no LPA,8041.152,8215.200,0.917419,0.917832,...,0.0,0.0,174.048,0.021645,,0,0.000414,0,174.048,0.021645
84676,zPmyydl,1.0,2023.0,2021,0,WD and LPA,3186.624,3968.832,1.000000,1.000000,...,0.0,2022.0,-13.440,-0.003375,1.0,0,0.000000,1,782.208,0.245466
84677,zPmyynd,1.0,2021.0,2021,1,no WD and no LPA,17485.440,17485.440,1.000000,1.000000,...,0.0,0.0,,,,0,0.000000,0,0.000,0.000000
