# 0. LDF selection experiment
Experimenting with a median-weighted LDF selection methodology. Unfortunately, the results do not seem to significantly outperform the volume-weighted average selection that is the default in the chainladder package, but the approach is worth exploring further.

In [62]:
import chainladder as cl
import pandas as pd
import os
from dateutil.relativedelta import relativedelta
import numpy as np

# 0. Testing initial methodology on sample data

In [None]:
# Load sample dataset from chainladder
data = cl.load_sample("raa")
# Index into the origin axis used as the back-test cut-off; -1 picks the last origin period.
backtest = -1
print(data)

In [None]:
# Show the link ratios of the full sample triangle as a heatmap.
data.link_ratio.heatmap()

In [None]:
# Training triangle: keep only the cells whose valuation date compares below the
# last origin period (presumably this removes the latest diagonal - TODO confirm).
training_data = data[data.valuation < str(data.origin[-1])]
training_data

In [None]:
# Show the link ratios of the training triangle as a heatmap.
training_data.link_ratio.heatmap()

In [None]:
# Actual latest-diagonal values, restricted to origins that compare below the
# year of the back-test origin; the projections further down are measured against these.
actual = data.latest_diagonal[data.origin < str(data.origin.year[backtest])]
actual

## 0.1 Testing BCL and developments

In [None]:
# Chain ladder on simple-average development factors fitted to the training
# triangle; the completed triangle is re-indexed by valuation date.
ave_simple = cl.Chainladder().fit(cl.Development(average='simple').fit_transform(training_data)).full_triangle_.dev_to_val()
# Keep only the cells valued at the full data's valuation date and label the column 'Expected'.
ave_simple = ave_simple[ave_simple.valuation==data.valuation_date].rename('columns', 'Expected')
ave_simple

In [None]:
# Chain ladder on volume-weighted development factors fitted to the training triangle.
bcl = cl.Chainladder().fit(cl.Development(average='volume').fit_transform(training_data))
# Completed triangle re-indexed by valuation date.
ave_volume = bcl.full_triangle_.dev_to_val()
# Keep only the cells valued at the full data's valuation date and label the column 'Expected'.
ave_volume = ave_volume[ave_volume.valuation==data.valuation_date].rename('columns', 'Expected')
ave_volume

In [None]:
# Chain ladder on regression-based development factors fitted to the training triangle.
# Note: rebinds `bcl`, replacing the volume-weighted fit from the previous cell.
bcl = cl.Chainladder().fit(cl.Development(average='regression').fit_transform(training_data))
# Completed triangle re-indexed by valuation date.
ave_regression = bcl.full_triangle_.dev_to_val()
# Keep only the cells valued at the full data's valuation date and label the column 'Expected'.
ave_regression = ave_regression[ave_regression.valuation==data.valuation_date].rename('columns', 'Expected')
ave_regression

## 0.2 Testing new approach

In [None]:
# List the development ages of the sample triangle.
data.development.to_list()

In [None]:
# Prototype of the "basic" selection: for each development age, blend the median
# link ratio with the volume-weighted mean, using the relative gap between the
# simple mean and the median as the blending factor.
# NOTE(review): the ratios are taken from `data` (the full triangle), while the
# resulting pattern is later fitted to `training_data` and compared with `actual`.
# That looks like it lets the held-out diagonal influence the selection - confirm.
import numpy as np

basic_selected_ldf = {}
dev_index = data.development.to_list()

print(dev_index[-2])

# The loop stops two ages before the end; the second-to-last age is filled with NaN below.
for i, j in enumerate(dev_index[:-2]):
   # print(i, j)
   # Slice ages i and i+1, so the link-ratio frame holds the single i -> i+1 column.
   dev_period = data[(data.development >= dev_index[i]) & (data.development <= dev_index[i+1])].link_ratio.to_frame()
   # Claims at age i, used as weights for the volume-weighted mean.
   claims = data[data.development == dev_index[i]].to_frame()

   # The index is overwritten positionally; this assumes both frames list the
   # same origins in the same order - TODO confirm.
   dev_period.index = claims.index
   valid_data = dev_period.dropna()
   valid_claims = claims.loc[valid_data.index]

   # print(dev_period.tail(5))

   volume_mean = np.average(
    valid_data.iloc[:, 0],  # LDF values
    weights=valid_claims.iloc[:, 0]  # Claim values as weights
   )

   mean = dev_period.mean().item()
   median = dev_period.median().item()

   print(j, mean, median, volume_mean)

   # Relative distance between the simple mean and the median.
   volume_scale_factor = abs(mean-median)/mean

   # NOTE(review): here the median gets weight (1 - factor); basic_median_ldf
   # further down gives the median weight `factor` instead - confirm which is intended.
   basic_selected_ldf[j] = median*(1-volume_scale_factor) + volume_mean*(volume_scale_factor)

# No selection is made for the second-to-last development age.
basic_selected_ldf[dev_index[-2]] = np.nan

In [None]:
# Prototype of the CV-based selection: for each development age, blend the median
# link ratio with the volume-weighted mean, with weight 1/(1+CV) on the mean.
# NOTE(review): as in the previous cell, the ratios come from `data`, not `training_data`.
selected_ldf = {}
dev_index = data.development.to_list()

for i, j in enumerate(dev_index[:-1]):
   # print(i, j)
   # Only the last 4+i rows of each frame are kept, so the window grows by one row
   # per development step (presumably to offset rows not yet observed - confirm).
   dev_period = data[(data.development >= dev_index[i]) & (data.development <= dev_index[i+1])].link_ratio.to_frame().tail(4+i)
   claims = data[data.development == dev_index[i]].to_frame().tail(4+i)

   # The index is overwritten positionally; this assumes both frames list the
   # same origins in the same order - TODO confirm.
   dev_period.index = claims.index
   valid_data = dev_period.dropna()
   valid_claims = claims.loc[valid_data.index]

   # print(dev_period.tail(5))

   volume_mean = np.average(
    valid_data.iloc[:, 0],  # LDF values
    weights=valid_claims.iloc[:, 0]  # Claim values as weights
   )

   mean = dev_period.mean().item()
   median = dev_period.median().item()
   std_dev = dev_period.std().item()

   # Coefficient of variation measured against the volume-weighted and the simple mean.
   volume_cv = std_dev/volume_mean
   cv = std_dev/mean

   # Weight on the mean shrinks as the dispersion grows.
   # `cv` and `scale_factor` are computed but not used in the selection below.
   volume_scale_factor = 1/(1+volume_cv)
   scale_factor = 1/(1+cv)

   selected_ldf[j] = median*(1-volume_scale_factor) + volume_mean*(volume_scale_factor)


In [None]:
# Fit the "basic" selected LDF pattern to the training triangle and show the resulting CDFs.
cl.DevelopmentConstant(patterns=basic_selected_ldf, style='ldf').fit(training_data).cdf_

In [None]:
# Fit the CV-based selected LDF pattern to the training triangle and show the resulting CDFs.
cl.DevelopmentConstant(patterns=selected_ldf, style='ldf').fit(training_data).cdf_

In [None]:
# Chain ladder on the training triangle using the "basic" selected LDFs as a fixed pattern.
bcl2 = cl.Chainladder().fit(cl.DevelopmentConstant(patterns=basic_selected_ldf, style='ldf').fit_transform(training_data))
# Completed triangle re-indexed by valuation date.
ave_selected2 = bcl2.full_triangle_.dev_to_val()
# Keep only the cells valued at the full data's valuation date.
ave_selected2 = ave_selected2[ave_selected2.valuation==data.valuation_date]
ave_selected2

In [None]:
# Chain ladder on the training triangle using the CV-based selected LDFs as a fixed pattern.
# Note: rebinds `bcl` again.
bcl = cl.Chainladder().fit(cl.DevelopmentConstant(patterns=selected_ldf, style='ldf').fit_transform(training_data))
# Completed triangle re-indexed by valuation date.
ave_selected = bcl.full_triangle_.dev_to_val()
# Keep only the cells valued at the full data's valuation date.
ave_selected = ave_selected[ave_selected.valuation==data.valuation_date]
ave_selected

In [None]:
# Actual-vs-expected score per method: sum of absolute differences between the
# actual values and each method's projection.
print(f"volume: {abs(actual-ave_volume).sum()}")
print(f"simple: {abs(actual-ave_simple).sum()}")
print(f"regression: {abs(actual-ave_regression).sum()}")
print(f"selected: {abs(actual-ave_selected).sum()}")
print(f"selected2: {abs(actual-ave_selected2).sum()}")

# 1. Testing approach against triangles

In [43]:
def median_weighted_ldf(triangle, volume=False, n_periods=0):
    '''
    Credibility based approach with more weight on the median LDF as coefficient of variation increases.

    For every development age the selected LDF is median*(1-z) + mean*z, where
    z = 1/(1+cv) and cv = std/mean of the observed link ratios for that age.

    :param: triangle: chainladder Triangle -> triangle to select LDFs for
    :param: volume: bool -> True to use the volume weighted mean (weights are the claims at the start of the period) instead of the simple mean
    :param: n_periods: int -> if > 0, only the last n_periods+i rows are used for the i-th development age; 0 uses all rows

    Returns a dictionary of selected LDFs keyed by development age (the last age has no entry)
    '''

    selected_ldf = {}
    development = triangle.development
    dev_index = development.to_list()

    for i, j in enumerate(dev_index[:-1]):

        # Slice ages i and i+1 so the link-ratio frame holds the single i -> i+1 column.
        age_pair = (development >= dev_index[i]) & (development <= dev_index[i+1])
        dev_period = triangle[age_pair].link_ratio.to_frame()
        # Claims at age i: the weights of the volume weighted mean.
        claims = triangle[development == dev_index[i]].to_frame()

        if n_periods > 0:
            # The window grows by one row per development step, as in the original
            # prototype (presumably to offset rows not yet observed - TODO confirm).
            dev_period = dev_period.tail(n_periods+i)
            claims = claims.tail(n_periods+i)

        median = dev_period.median().item()
        std_dev = dev_period.std().item()

        if volume:
            # Positional re-indexing: assumes both frames list the same origins
            # in the same order - TODO confirm against chainladder's to_frame().
            dev_period.index = claims.index
            valid_data = dev_period.dropna()
            valid_claims = claims.loc[valid_data.index]

            mean = np.average(
                valid_data.iloc[:, 0],  # LDF values
                weights=valid_claims.iloc[:, 0]  # Claim values as weights
            )
        else:
            mean = dev_period.mean().item()

        # With fewer than two observations the sample std is NaN. A single ratio is
        # its own mean and median, so treat the dispersion as zero instead of
        # propagating NaN into the selected pattern.
        cv = 0.0 if np.isnan(std_dev) else std_dev/mean

        scale_factor = 1/(1+cv)

        # Blend with the mean computed in this function (simple or volume weighted).
        selected_ldf[j] = median*(1-scale_factor) + mean*(scale_factor)

    return selected_ldf

def basic_median_ldf(triangle, volume=False, n_periods=0):
    '''
    Credibility based approach with more weight on the median LDF as relative distance from mean to median increases.

    For every development age the selected LDF is median*z + mean*(1-z), where
    z = abs(mean-median)/mean.

    :param: triangle: chainladder Triangle -> triangle to select LDFs for
    :param: volume: bool -> True to use the volume weighted mean (weights are the claims at the start of the period) instead of the simple mean
    :param: n_periods: int -> if > 0, only the last n_periods+i rows are used for the i-th development age; 0 uses all rows

    Returns a dictionary of selected LDFs keyed by development age (the last age has no entry)
    '''

    selected_ldf = {}
    development = triangle.development
    dev_index = development.to_list()

    for i, j in enumerate(dev_index[:-1]):

        # Slice ages i and i+1 so the link-ratio frame holds the single i -> i+1 column.
        age_pair = (development >= dev_index[i]) & (development <= dev_index[i+1])
        dev_period = triangle[age_pair].link_ratio.to_frame()
        # Claims at age i: the weights of the volume weighted mean.
        claims = triangle[development == dev_index[i]].to_frame()

        if n_periods > 0:
            # The window grows by one row per development step, as in the original
            # prototype (presumably to offset rows not yet observed - TODO confirm).
            dev_period = dev_period.tail(n_periods+i)
            claims = claims.tail(n_periods+i)

        median = dev_period.median().item()

        if volume:
            # Positional re-indexing: assumes both frames list the same origins
            # in the same order - TODO confirm against chainladder's to_frame().
            dev_period.index = claims.index
            valid_data = dev_period.dropna()
            valid_claims = claims.loc[valid_data.index]

            mean = np.average(
                valid_data.iloc[:, 0],  # LDF values
                weights=valid_claims.iloc[:, 0]  # Claim values as weights
            )
        else:
            mean = dev_period.mean().item()

        # Relative distance between the mean and the median.
        scale_factor = abs(mean-median)/mean

        # Blend with the mean computed in this function (simple or volume weighted).
        selected_ldf[j] = median*(scale_factor) + mean*(1-scale_factor)

    return selected_ldf

def quarter_to_date(quarter_str):
    '''
    Convert a "<year> <quarter label>" string into the last day of that quarter.
    :param: quarter_str: str -> two whitespace-separated parts; the last character of the second part is the quarter number

    Returns the date as a "YYYY/MM/DD" string
    '''
    year_part, quarter_part = quarter_str.split()
    quarter_number = int(quarter_part[-1])
    closing_month = 3 * quarter_number  # Last month of the quarter
    # March and December have 31 days; every other month is given 30.
    if closing_month in [3, 12]:
        closing_day = "31"
    else:
        closing_day = "30"
    return "{}/{:02d}/".format(year_part, closing_month) + closing_day

def dev_to_date(date_str, months_str, max_date, m_flag=True):
    '''
    Turn an accident date plus a development label into a valuation date string.
    :param: date_str: str -> accident date, parsed with pd.to_datetime
    :param: months_str: str -> development age in months; carries a one-character suffix (e.g. 'm') when m_flag is True
    :param: max_date: datetime-like -> cap applied to the result
    :param: m_flag: bool -> True if the last character of months_str must be stripped before conversion

    Returns the earlier of the shifted date and max_date as a "YYYY/MM/DD" string
    '''
    # Drop the trailing unit character when the label carries one.
    age_text = months_str[:-1] if m_flag else months_str
    # A label of 3 maps onto the accident date itself.
    shift_months = int(age_text) - 3

    start = pd.to_datetime(date_str)
    lands_on_march_or_december = (start.month + shift_months) % 12 in [0, 3]

    # A start on the 30th that lands in March or December is pushed one extra
    # day, to the 31st.
    if lands_on_march_or_december and start.day == 30:
        step = relativedelta(months=shift_months, days=1)
    else:
        step = relativedelta(months=shift_months)

    shifted = start + step
    return min(shifted, max_date).strftime('%Y/%m/%d')

In [None]:
# Back-test every CSV triangle in `claims_path`: project the held-out diagonal
# with each LDF selection method and record the absolute actual-vs-expected error.
# The folder path is blank in this copy and has to be filled in before running.
claims_path = r""
# Index into the origin axis used as the back-test cut-off; -1 picks the last origin period.
backtest = -1

# One entry per file is appended to each list inside the loop.
# AvE_med_vol and AvE_basic_med_vol are initialised but never filled in this cell.
triangle_name = []
AvE_simple = []
AvE_regression = []
AvE_volume = []
AvE_median = []
AvE_med_vol = []
AvE_basic_median = []
AvE_basic_med_vol = []

for file in os.listdir(claims_path):
    claims_data = pd.read_csv(os.path.join(claims_path, file))
    
    # Wide to long: one row per (accident quarter, development column) holding the paid amount.
    flat_claims_data = pd.melt(
        claims_data, 
        id_vars=['Accident Quarter'],
        value_vars=list(claims_data.columns)[1:],
        var_name='Development',
        value_name='Paid'
    ).dropna(subset=['Accident Quarter']).sort_values(by=["Accident Quarter"])

    # Replace the quarter labels with quarter-end date strings.
    flat_claims_data["Accident Quarter"] = flat_claims_data["Accident Quarter"].apply(quarter_to_date)
    max_date = pd.to_datetime(flat_claims_data['Accident Quarter']).max() # prevents development dates for future periods

    # Replace the development labels with valuation date strings, capped at max_date.
    flat_claims_data["Development"] = flat_claims_data.apply(
        lambda row: dev_to_date(
            row["Accident Quarter"], 
            row["Development"],
            max_date
        ),
        axis=1
    )

    # Non-numeric paid values become NaN.
    flat_claims_data["Paid"] = pd.to_numeric(flat_claims_data["Paid"], errors="coerce")

    triangle = cl.Triangle(
        data=flat_claims_data,
        origin="Accident Quarter",
        development="Development",
        columns="Paid",
        cumulative=True
    )

    # Training triangle: cells whose valuation date compares below the back-test origin.
    training_data = triangle[triangle.valuation < str(triangle.origin[backtest])]

    # Actuals: earlier origins, restricted to cells valued after the cut-off and summed over development.
    actual = triangle[(triangle.origin < str(triangle.origin[backtest]))]
    actual = actual[(actual.valuation > str(triangle.origin[backtest]))].sum("development")

    # For each method: complete the training triangle, then keep the cells
    # valued at the full triangle's valuation date.
    ave_simple = cl.Chainladder().fit(cl.Development(average='simple').fit_transform(training_data)).full_triangle_.dev_to_val()
    ave_simple = ave_simple[ave_simple.valuation==triangle.valuation_date]

    ave_reg = cl.Chainladder().fit(cl.Development(average='regression').fit_transform(training_data)).full_triangle_.dev_to_val()
    ave_reg = ave_reg[ave_reg.valuation==triangle.valuation_date]
    
    ave_volume = cl.Chainladder().fit(cl.Development(average='volume').fit_transform(training_data)).full_triangle_.dev_to_val()
    ave_volume = ave_volume[ave_volume.valuation==triangle.valuation_date]

    ave_med = cl.Chainladder().fit(cl.DevelopmentConstant(patterns=median_weighted_ldf(training_data), style='ldf').fit_transform(training_data)).full_triangle_.dev_to_val()
    ave_med = ave_med[ave_med.valuation==triangle.valuation_date]
    
    ave_basic_med = cl.Chainladder().fit(cl.DevelopmentConstant(patterns=basic_median_ldf(training_data), style='ldf').fit_transform(training_data)).full_triangle_.dev_to_val()
    ave_basic_med = ave_basic_med[ave_basic_med.valuation==triangle.valuation_date]

    # Record the file name (without ".csv") and the summed absolute error per method.
    triangle_name.append(file.replace(".csv", ""))
    AvE_simple.append(abs(actual-ave_simple).sum())
    AvE_regression.append(abs(actual-ave_reg).sum())
    AvE_volume.append(abs(actual-ave_volume).sum())
    AvE_median.append(abs(actual-ave_med).sum())
    AvE_basic_median.append(abs(actual-ave_basic_med).sum())
    
# One row per triangle, one column per method.
ave_df = pd.DataFrame({
    "triangle_name": triangle_name,
    "AvE simple": AvE_simple,
    "AvE regression": AvE_regression,
    "AvE volume": AvE_volume,
    "AvE median": AvE_median,
    "AvE basic median": AvE_basic_median,
})

# Written relative to the current working directory.
ave_df.to_excel("ave_results_omar_1q.xlsx", index=False)
ave_df

In [None]:
# Back-test every Excel triangle in `claims_path`, restricting every method to
# n_periods=8, and record the absolute actual-vs-expected error per method.
# The folder path is blank in this copy and has to be filled in before running.
claims_path = r""
# Index into the origin axis used as the back-test cut-off; -1 picks the last origin period.
backtest = -1

# One entry per file is appended to each list inside the loop.
triangle_name = []
AvE_simple = []
AvE_regression = []
AvE_volume = []
AvE_median = []
AvE_med_vol = []
AvE_basic_median = []
AvE_basic_med_vol = []

for file in os.listdir(claims_path):
    claims_data = pd.read_excel(os.path.join(claims_path, file))
    
    # Wide to long: one row per (accident quarter, development column) holding the paid amount.
    flat_claims_data = pd.melt(
        claims_data, 
        id_vars=['Accident Quarter'],
        value_vars=list(claims_data.columns)[1:],
        var_name='Development',
        value_name='Paid'
    ).dropna(subset=['Accident Quarter']).sort_values(by=["Accident Quarter"])

    # Replace the quarter labels with quarter-end date strings.
    flat_claims_data["Accident Quarter"] = flat_claims_data["Accident Quarter"].apply(quarter_to_date)
    max_date = pd.to_datetime(flat_claims_data['Accident Quarter']).max() # prevents development dates for future periods

    # Replace the development labels with valuation date strings, capped at max_date.
    flat_claims_data["Development"] = flat_claims_data.apply(
        lambda row: dev_to_date(
            row["Accident Quarter"], 
            row["Development"],
            max_date
        ),
        axis=1
    )

    # Non-numeric paid values become NaN.
    flat_claims_data["Paid"] = pd.to_numeric(flat_claims_data["Paid"], errors="coerce")

    triangle = cl.Triangle(
        data=flat_claims_data,
        origin="Accident Quarter",
        development="Development",
        columns="Paid",
        cumulative=True
    )

    # Training triangle: cells whose valuation date compares below the back-test origin.
    training_data = triangle[triangle.valuation < str(triangle.origin[backtest])]

    # Actuals: earlier origins, restricted to cells valued after the cut-off and summed over development.
    actual = triangle[(triangle.origin < str(triangle.origin[backtest]))]
    actual = actual[(actual.valuation > str(triangle.origin[backtest]))].sum("development")

    # For each method: complete the training triangle, then keep the cells
    # valued at the full triangle's valuation date.
    ave_simple = cl.Chainladder().fit(cl.Development(average='simple', n_periods=8).fit_transform(training_data)).full_triangle_.dev_to_val()
    ave_simple = ave_simple[ave_simple.valuation==triangle.valuation_date]

    ave_reg = cl.Chainladder().fit(cl.Development(average='regression', n_periods=8).fit_transform(training_data)).full_triangle_.dev_to_val()
    ave_reg = ave_reg[ave_reg.valuation==triangle.valuation_date]
    
    ave_volume = cl.Chainladder().fit(cl.Development(average='volume', n_periods=8).fit_transform(training_data)).full_triangle_.dev_to_val()
    ave_volume = ave_volume[ave_volume.valuation==triangle.valuation_date]

    ave_med = cl.Chainladder().fit(cl.DevelopmentConstant(patterns=median_weighted_ldf(training_data, n_periods=8), style='ldf').fit_transform(training_data)).full_triangle_.dev_to_val()
    ave_med = ave_med[ave_med.valuation==triangle.valuation_date]
    
    ave_med_vol = cl.Chainladder().fit(cl.DevelopmentConstant(patterns=median_weighted_ldf(training_data, volume=True, n_periods=8), style='ldf').fit_transform(training_data)).full_triangle_.dev_to_val()
    ave_med_vol = ave_med_vol[ave_med_vol.valuation==triangle.valuation_date]

    ave_basic_med = cl.Chainladder().fit(cl.DevelopmentConstant(patterns=basic_median_ldf(training_data, n_periods=8), style='ldf').fit_transform(training_data)).full_triangle_.dev_to_val()
    ave_basic_med = ave_basic_med[ave_basic_med.valuation==triangle.valuation_date]
    
    ave_basic_med_vol = cl.Chainladder().fit(cl.DevelopmentConstant(patterns=basic_median_ldf(training_data, volume=True, n_periods=8), style='ldf').fit_transform(training_data)).full_triangle_.dev_to_val()
    ave_basic_med_vol = ave_basic_med_vol[ave_basic_med_vol.valuation==triangle.valuation_date]

    # Record the file name (without ".xlsx") and the summed absolute error per method.
    triangle_name.append(file.replace(".xlsx", ""))
    AvE_simple.append(abs(actual-ave_simple).sum())
    AvE_regression.append(abs(actual-ave_reg).sum())
    AvE_volume.append(abs(actual-ave_volume).sum())
    AvE_median.append(abs(actual-ave_med).sum())
    AvE_med_vol.append(abs(actual-ave_med_vol).sum())
    AvE_basic_median.append(abs(actual-ave_basic_med).sum())
    AvE_basic_med_vol.append(abs(actual-ave_basic_med_vol).sum())
    
# One row per triangle, one column per method.
ave_df = pd.DataFrame({
    "triangle_name": triangle_name,
    "AvE simple": AvE_simple,
    "AvE regression": AvE_regression,
    "AvE volume": AvE_volume,
    "AvE median": AvE_median,
    "AvE median volume": AvE_med_vol,
    "AvE basic median": AvE_basic_median,
    "AvE basic median volume": AvE_basic_med_vol
})

# Written relative to the current working directory.
ave_df.to_excel("ave_results_omi_n8_q1.xlsx", index=False)
ave_df

In [None]:
# Build a triangle from the first file listed in the claims folder and show
# its link ratios as a heatmap.
claims_path = r"K:\Shaazia\OMI Reserve Investigations\2023-09\ResQ Macros\Claims Data"
# Index into the origin axis used as the back-test cut-off; not read again within this cell.
backtest = -1

# os.listdir does not guarantee an order, so "first" is whatever the OS returns first.
claims_data = pd.read_excel(os.path.join(claims_path, os.listdir(claims_path)[0]))

# Wide to long: one row per (accident quarter, development column) holding the paid amount.
flat_claims_data = pd.melt(
    claims_data, 
    id_vars=['Accident Quarter'],
    value_vars=list(claims_data.columns)[1:],
    var_name='Development',
    value_name='Paid'
).dropna(subset=['Accident Quarter']).sort_values(by=["Accident Quarter"])

# Replace the quarter labels with quarter-end date strings.
flat_claims_data["Accident Quarter"] = flat_claims_data["Accident Quarter"].apply(quarter_to_date)
max_date = pd.to_datetime(flat_claims_data['Accident Quarter']).max() # prevents development dates for future periods

# Replace the development labels with valuation date strings, capped at max_date.
flat_claims_data["Development"] = flat_claims_data.apply(
    lambda row: dev_to_date(
        row["Accident Quarter"], 
        row["Development"],
        max_date
    ),
    axis=1
)

# Non-numeric paid values become NaN.
flat_claims_data["Paid"] = pd.to_numeric(flat_claims_data["Paid"], errors="coerce")

triangle = cl.Triangle(
    data=flat_claims_data,
    origin="Accident Quarter",
    development="Development",
    columns="Paid",
    cumulative=True
)
triangle.link_ratio.heatmap()

In [None]:
# Training subset of the triangle built above: cells whose valuation date compares
# below the last origin period; its link ratios are shown as a heatmap.
training_data = triangle[triangle.valuation < str(triangle.origin[-1])]
training_data.link_ratio.heatmap()

# 99. Storing other pieces of code

In [None]:
## R-code Correlation

# library(data.table)
# library(dplyr)
# library(ggplot2)
# library(stringr)
# library(lubridate)
# library(corrplot)

# # valid_dates = rbind(expand.grid(V1 = 2014:2023, V2 = paste0("Q",1:4)), 
# #                    expand.grid(V1 = 2024, V2 = paste0("Q", 1:2))) %>% data.table()

# valid_dates = expand.grid(V1 = 2019:2023, V2 = paste0("Q",1:4)) %>% data.table()

# valid_dates[, accident_period := paste0(V1," ", V2)]
# valid_dates %>% setkey(accident_period)
# valid_dates = valid_dates[-1]
# #valid_dates = valid_dates[-1]

# dat = fread("")

# dat_sub = dat[country != "Arsenal" & basis == "gross" & accident_period %in% valid_dates$accident_period]

# pivot = dat_sub %>% dcast.data.table(accident_period ~ country, value.var = "ulr_residual")
# corrs = pivot[, c(-1)] %>% as.matrix()  %>% cor

# corrs %>% corrplot::corrplot()

# tests = corrs %>% corrplot::cor.mtest()
# pvals = data.table(names = tests$p %>% rownames(), tests$p) %>% melt.data.table()


# corrs = data.table(names = corrs %>% rownames(), corrs) %>% melt.data.table()
# corrs[, pair := paste0(names, variable)]
# pvals[, pair := paste0(names, variable)]

# corrs[pair %in% pvals[value >0 & value < 0.05][order(names)]$pair]

In [None]:
## R-code ISP

# library(data.table)
# library(dplyr)
# library(ggplot2)
# library(ggpubr)
# library(keras)
# library(tensorflow)
# library(stringr)
# library(lubridate)
# require(janitor)
# library(ChainLadder)


# dat = fread("", header = T) %>% melt.data.table(id.vars = "V1") %>% clean_names()
# dat[, AY := v1]
# dat[, v1 := NULL]
# dat[, DY := variable]
# dat[, variable := NULL]
# dat[, value := as.numeric(value)]

# triang = dat %>% as.triangle(., origin = "AY", dev = "DY", value = "value")
# triang %>% MackChainLadder()

# exp(qnorm(0.8)*0.2)*50000000-50000000

# ### CDR

# triang %>% MackChainLadder() %>% CDR()
# cdrs = triang %>% MackChainLadder() %>% CDR(dev = "all")

# ibnr = cdrs$IBNR[21]

# cdrs = (cdrs$`CDR(1)S.E.`[21]^2+
# cdrs$`CDR(2)S.E.`[21]^2+
# cdrs$`CDR(3)S.E.`[21]^2+
# cdrs$`CDR(4)S.E.`[21]^2)^0.5

# CoV = cdrs/ibnr

# ADC = exp(qnorm(0.65)*CoV)*ibnr-ibnr

# ## lognormal

# mean = ibnr
# sd = cdrs
# empirical_cov = sd/mean

# sigma = (log(empirical_cov^2+1))^0.5
# mu = log(mean)-sigma^2/2

# qlnorm(0.8, meanlog = mu, sdlog = sigma) - mean