In [1]:
# 导入库
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm

In [2]:
# 导入包含特征、赔付概率和赔付金额的数据集（x, y_prob, y_amount）
data = pd.read_csv('C:/Users/taowu/Desktop/项目/保险赔付金额预测（nn-qrnn）/数据/my_data1.csv')  # 导入数据

# 创建LabelEncoder对象
label_encoder = LabelEncoder()

# 对指定列进行标签编码
data['veh_body_encoded'] = label_encoder.fit_transform(data['veh_body'])
data['area_encoded'] = label_encoder.fit_transform(data['area'])
data['amount_per_exposure'] = data['claimcst0'] / data['exposure']

 
# 输入特征和真实y值
x = data[['veh_value', 'veh_age', 'agecat', 'gender', 'veh_body_encoded', 'area_encoded']].values
y_prob = data[['clm']].values
y_amount = data[['amount_per_exposure']].values

# 划分训练集和测试集
x_train, x_test, y_prob_train, y_prob_test, y_amount_train, y_amount_test = train_test_split(x, y_prob, y_amount,
                                                                                             test_size=0.2,
                                                                                             random_state=42)

In [3]:
# 定义函数 t(p)
def t_function(p, T):
    return (T - p) / (1 - p)

# 定义 T
T = 0.95

## 第一步逻辑回归

In [4]:
# 将类别标签反转
y_prob_train_inverted = 1 - y_prob_train
# 使用逻辑回归模型
logistic_model = LogisticRegression()
# 训练逻辑回归模型
logistic_model.fit(x_train, y_prob_train_inverted.squeeze())
# 获取训练集逻辑回归模型的预测概率
predictions_A_train = logistic_model.predict_proba(x_train)[:, 1]

# 计算训练集的 tau
t_p_train = t_function(predictions_A_train, T)

# 计算测试集的 tau 
# 从训练集中选择与测试集大小相同的样本计算tau而后用训练集的τ计算yhat
random_indices = np.random.choice(len(x_train), size=len(x_test), replace=False)
x_train_subset = x_train[random_indices] 
predictions_A_test = logistic_model.predict_proba(x_train_subset)[:, 1]

t_p_test = t_function(predictions_A_test, T)
# 打印predictions_A、t_p、predictions_A_test、t_p_test
print("predictions_A:", predictions_A_train)
print("t_p_train:",t_p_train)
print("predictions_A_test:", predictions_A_test)
print("t_p_test:", t_p_test)

predictions_A: [0.90717235 0.93355933 0.92080237 ... 0.93550523 0.93560693 0.91975326]
t_p_train: [0.46136739 0.24744886 0.36866801 ... 0.22474336 0.22351898 0.37692175]
predictions_A_test: [0.93793721 0.93701997 0.91990481 ... 0.93656565 0.92579991 0.92506096]
t_p_test: [0.19436435 0.20609756 0.37574276 ... 0.21178353 0.32614636 0.332791  ]


## 第一第二步衔接

In [5]:
# 仅选择赔付额大于0的保单进行第二步分位数回归
mask_train = y_amount_train.squeeze() > 0
mask_test = y_amount_test.squeeze() > 0

# 使用赔付额大于0的保单的 t_p 作为分位数水平
t_p_train_selected = t_p_train[mask_train]
t_p_test_selected = t_p_test[mask_test]

# 选取赔付额大于0的保单的特征和赔付额
x_train_selected = x_train[mask_train]
y_amount_train_selected = y_amount_train[mask_train]

x_test_selected = x_test[mask_test]
y_amount_test_selected = y_amount_test[mask_test]

# 将 t_p 转为二维列向量 
t_p_train_selected = t_p_train_selected.reshape(-1, 1) 
t_p_test_selected = t_p_test_selected.reshape(-1, 1)

# 在特征矩阵中添加常数列
x_train_selected_with_const = sm.add_constant(x_train_selected) 
x_test_selected_with_const = sm.add_constant(x_test_selected)

In [6]:
# 检查
# 打印维度
print("x_train_selected.shape:", x_train_selected.shape)
print("y_amount_train_selected.shape:",y_amount_train_selected.shape)

print("x_test_selected.shape:", x_test_selected.shape)
print("y_amount_test_selected.shape:", y_amount_test_selected.shape)

print("t_p_train_selected.shape:",t_p_train_selected.shape)
print("t_p_test_selected.shape:",t_p_test_selected.shape)

print("x_train_selected_with_const.shape:",x_train_selected_with_const.shape)
print("x_test_selected_with_const.shape:",x_test_selected_with_const.shape)

# 打印仅赔付的保单t_p范围
print("t_p_train_selected range:", t_p_train_selected.min(), t_p_train_selected.max())
print("t_p_test_selected range:", t_p_test_selected.min(), t_p_test_selected.max())

# 打印仅赔付保单t_p的0和负值个数
zero_count = len(t_p_train_selected) - np.count_nonzero(t_p_train_selected)
print("Number of zeros in t_p_train_selected:", zero_count)

zero_count = len(t_p_test_selected) - np.count_nonzero(t_p_test_selected)
print("Number of zeros in t_p_test_selected:", zero_count)

negative_count = np.sum(t_p_train_selected < 0)
print("Number of negative values in t_p_train_selected:", negative_count)

negative_count = np.sum(t_p_test_selected < 0)
print("Number of negative values in t_p_test_selected:", negative_count)

x_train_selected.shape: (3728, 6)
y_amount_train_selected.shape: (3728, 1)
x_test_selected.shape: (896, 6)
y_amount_test_selected.shape: (896, 1)
t_p_train_selected.shape: (3728, 1)
t_p_test_selected.shape: (896, 1)
x_train_selected_with_const.shape: (3728, 7)
x_test_selected_with_const.shape: (896, 7)
t_p_train_selected range: -0.006284358755090148 0.6363843926692928
t_p_test_selected range: -0.021635051590136668 0.5323957353668567
Number of zeros in t_p_train_selected: 0
Number of zeros in t_p_test_selected: 0
Number of negative values in t_p_train_selected: 2
Number of negative values in t_p_test_selected: 1


## 第二步分位数回归

In [7]:
# 初始化结果存储
y_amount_pred_train = np.zeros_like(y_amount_train_selected)
y_amount_pred_test = np.zeros_like(y_amount_test_selected)

# 初始化损失存储
loss_train = 0.0
loss_test = 0.0

In [8]:
# 循环处理每个保单
# 训练集
for i in range(len(t_p_train_selected)):
    # 将一维数组转换为二维数组 
    x_train_i = x_train_selected_with_const[i:i+1, :] 
    # t_p_train_selected[i] > 0进行分位数回归，否则直接令 y_amount_pred_train为0
    if t_p_train_selected[i] > 0:
        # 使用分位数回归模型
        quantreg = sm.QuantReg(y_amount_train_selected[i: i+1], sm.add_constant(x_train_i))
        quantreg_result = quantreg.fit(q=t_p_train_selected[i])
        # 预测训练集的赔付金额
        y_amount_pred_train[i: i+1] = quantreg_result.predict(sm.add_constant(x_train_i)).reshape(1, -1) 
    else:
        y_amount_pred_train[i: i+1] = np.zeros_like(y_amount_pred_train[i: i+1])  
             
#测试集
for i in range(len(t_p_test_selected)):
    # 将一维数组转换为二维数组
    x_test_i = x_test_selected_with_const[i:i+1, :]
    # t_p_test_selected[i] > 0进行分位数回归，否则直接令 y_amount_pred_test为0
    if t_p_test_selected[i] > 0:
        # 使用分位数回归模型
        quantreg = sm.QuantReg(y_amount_test_selected[i: i+1], sm.add_constant(x_test_i))
        quantreg_result = quantreg.fit(q=t_p_test_selected[i])
        # 预测测试集的赔付金额
        y_amount_pred_test[i: i+1] = quantreg_result.predict(sm.add_constant(x_test_i)).reshape(1, -1)
    else:
        y_amount_pred_test[i: i+1] = np.zeros_like(y_amount_pred_test[i: i+1])  

In [11]:
# 计算测试集mse、rmse
test_mse = np.mean((y_amount_pred_test - y_amount_test_selected.squeeze())** 2)
test_rmse = np.sqrt(test_mse)

# 打印结果
print("Test_mse:", test_mse)
print("Test_rmse:", test_rmse)

Test_mse: 12775153134.805859
Test_rmse: 113027.22298104054
