In [1]:
# 导入库
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm

In [2]:
# 导入包含特征、赔付概率和赔付金额的数据集（x, y_prob, y_amount）
data = pd.read_csv('C:/Users/taowu/Desktop/项目/保险赔付金额预测（nn-qrnn）/数据/my_data1.csv')  # 导入数据

# 创建LabelEncoder对象
label_encoder = LabelEncoder()

# 对指定列进行标签编码
data['veh_body_encoded'] = label_encoder.fit_transform(data['veh_body'])
data['area_encoded'] = label_encoder.fit_transform(data['area'])
data['amount_per_exposure'] = data['claimcst0'] / data['exposure']

 
# 输入特征和真实y值
x = data[['veh_value', 'veh_age', 'agecat', 'gender', 'veh_body_encoded', 'area_encoded']].values
y_prob = data[['clm']].values
y_amount = data[['amount_per_exposure']].values
e = data[['exposure']]
# 划分训练集和测试集
x_train, x_test, y_prob_train, y_prob_test, y_amount_train, y_amount_test, e_train, e_test = train_test_split(x, y_prob, y_amount,e,
                                                                                             test_size=0.2,random_state=42)
print(e_train) 
print(e_test) 
e_train = e_train['exposure'].values
e_test = e_test['exposure'].values

       exposure
16541  0.908966
60689  0.626968
67668  0.999316
20568  0.676249
36708  0.062971
...         ...
37194  0.095825
6265   0.884326
54886  0.676249
860    0.898015
15795  0.145106

[54284 rows x 1 columns]
       exposure
30347  0.046543
44050  0.473648
45163  0.013689
13363  0.856947
8875   0.895277
...         ...
36034  0.418891
23668  0.673511
2476   0.629706
13912  0.435318
23899  0.481862

[13572 rows x 1 columns]


In [3]:
# 定义函数 t(p)
def t_function(p, T):
    return (T - p) / (1 - p)

# 定义 T
T = 0.95

In [4]:
# 将类别标签反转
y_prob_train_inverted = 1 - y_prob_train
# 使用逻辑回归模型
logistic_model = LogisticRegression()
# 训练逻辑回归模型
logistic_model.fit(x_train, y_prob_train_inverted.squeeze())
# 获取训练集逻辑回归模型的预测概率
predictions_A_train = logistic_model.predict_proba(x_train)[:, 1]
print("predictions_A_train :", predictions_A_train)
predictions_A_train_star = 1 - (1 - predictions_A_train) / e_train
# 计算训练集的 tau
t_p_train = t_function(predictions_A_train_star, T)

# 计算测试集的 tau 
# 从训练集中选择与测试集大小相同的样本计算tau而后用训练集的τ计算yhat
random_indices = np.random.choice(len(x_train), size=len(x_test), replace=False)
x_train_subset = x_train[random_indices] 
predictions_A_test = logistic_model.predict_proba(x_train_subset)[:, 1]
print("predictions_A_test :", predictions_A_test)
predictions_A_test_star = 1 - (1 - predictions_A_test) / e_test
t_p_test = t_function(predictions_A_test_star, T)

# 打印predictions_A、t_p、predictions_A_test、t_p_test
print("predictions_A_train_star:", predictions_A_train_star)
print("t_p_train:",t_p_train)
print("predictions_A_test_star:", predictions_A_test_star)
print("t_p_test:", t_p_test)

#检查修正后的p小于0的个数
negative_count = np.sum(predictions_A_train_star < 0)
print("Number of negative values in predictions_A_train_star:", negative_count)
negative_count = np.sum(predictions_A_test_star < 0)
print("Number of negative values in predictions_A_test_star:", negative_count)

negative_count = np.sum(t_p_train < 0)
print("Number of negative values in t_p_train:", negative_count)

negative_count = np.sum(t_p_test < 0)
print("Number of negative values in t_p_test:", negative_count)

predictions_A_train : [0.90717235 0.93355933 0.92080237 ... 0.93550523 0.93560693 0.91975326]
predictions_A_test : [0.93734519 0.91570819 0.93272493 ... 0.92987821 0.94924804 0.9367402 ]
predictions_A_train_star: [0.8978756  0.89402859 0.92074812 ... 0.90462868 0.92829399 0.44697882]
t_p_train: [0.51040102 0.52817464 0.36910013 ... 0.47573336 0.30270835 0.90958755]
predictions_A_test_star: [-0.34615708  0.82203709 -3.91444351 ...  0.88864354  0.88341413
  0.86871794]
t_p_test: [0.96285723 0.71904258 0.98982591 ... 0.55099146 0.57113158 0.61914066]
Number of negative values in predictions_A_train_star: 4365
Number of negative values in predictions_A_test_star: 1075
Number of negative values in t_p_train: 2
Number of negative values in t_p_test: 0


In [5]:
# 仅选择赔付额大于0的保单进行第二步分位数回归
mask_train = y_amount_train.squeeze() > 0
mask_test = y_amount_test.squeeze() > 0

# 使用赔付额大于0的保单的 t_p 作为分位数水平
t_p_train_selected = t_p_train[mask_train]
t_p_test_selected = t_p_test[mask_test]

# 选取赔付额大于0的保单的特征和赔付额
x_train_selected = x_train[mask_train]
y_amount_train_selected = y_amount_train[mask_train]

x_test_selected = x_test[mask_test]
y_amount_test_selected = y_amount_test[mask_test]

# 将 t_p 转为二维列向量 
t_p_train_selected = t_p_train_selected.reshape(-1, 1) 
t_p_test_selected = t_p_test_selected.reshape(-1, 1)

# 在特征矩阵中添加常数列
x_train_selected_with_const = sm.add_constant(x_train_selected) 
x_test_selected_with_const = sm.add_constant(x_test_selected)


# 打印仅赔付保单p的范围以及负值个数
predictions_A_train_star_selected = predictions_A_train_star[mask_train]
predictions_A_test_star_selected = predictions_A_test_star[mask_test]

print("predictions_A_train_star_selected size:", predictions_A_train_star_selected.shape[0])
print("predictions_A_test_star_selected size:", predictions_A_test_star_selected.shape[0])

print("predictions_A_train_star_selected range:", predictions_A_train_star_selected.min(), predictions_A_train_star_selected.max())
print("predictions_A_test_star_selected range:", predictions_A_test_star_selected.min(), predictions_A_test_star_selected.max())

negative_count = np.sum(predictions_A_train_star_selected < 0)
print("Number of negative values in predictions_A_train_star_selected:", negative_count)
negative_count = np.sum(predictions_A_test_star_selected < 0)
print("Number of negative values in predictions_A_test_star_selected:", negative_count)

# 打印仅赔付保单t_p的负值个数
negative_count = np.sum(t_p_train_selected < 0)
print("Number of negative values in t_p_train_selected:", negative_count)
negative_count = np.sum(t_p_test_selected < 0)
print("Number of negative values in t_p_test_selected:", negative_count)

# 打印仅赔付的保单t_p范围
print("t_p_train_selected range:", t_p_train_selected.min(), t_p_train_selected.max())
print("t_p_test_selected range:", t_p_test_selected.min(), t_p_test_selected.max())

predictions_A_train_star_selected size: 3728
predictions_A_test_star_selected size: 896
predictions_A_train_star_selected range: -27.34467942998271 0.9492911531005811
predictions_A_test_star_selected range: -9.843593076671844 0.948104944145016
Number of negative values in predictions_A_train_star_selected: 79
Number of negative values in predictions_A_test_star_selected: 16
Number of negative values in t_p_train_selected: 0
Number of negative values in t_p_test_selected: 0
t_p_train_selected range: 0.01397876194709884 0.9982360005120711
t_p_test_selected range: 0.03651707901191035 0.9953889822638616


In [6]:
# 筛去p小于0情况
new_train = predictions_A_train_star_selected.squeeze() > 0
new_test = predictions_A_test_star_selected.squeeze() > 0

predictions_A_train_star_selected_second = predictions_A_train_star_selected[new_train]
predictions_A_test_star_selected_second = predictions_A_test_star_selected[new_test]

t_p_train_selected_second = t_p_train_selected[new_train]
t_p_test_selected_second = t_p_test_selected[new_test]

# 打印筛去小于0后的p范围
print("predictions_A_train_star_selected_second size:", predictions_A_train_star_selected_second.shape[0])
print("predictions_A_test_star_selected_second size:", predictions_A_test_star_selected_second.shape[0])
print("predictions_A_train_star_selected_second range:", predictions_A_train_star_selected_second.min(), predictions_A_train_star_selected_second.max())
print("predictions_A_test_star_selected_second range:", predictions_A_test_star_selected_second.min(), predictions_A_test_star_selected_second.max())


# 打印仅赔付保单t_p的负值个数
negative_count = np.sum(t_p_train_selected_second < 0)
print("Number of negative values in t_p_train_selected_second:", negative_count)
negative_count = np.sum(t_p_test_selected_second < 0)
print("Number of negative values in t_p_test_selected_second:", negative_count)

# 打印仅赔付的保单t_p范围
print("t_p_train_selected_second range:", t_p_train_selected_second.min(), t_p_train_selected_second.max())
print("t_p_test_selected_second range:", t_p_test_selected_second.min(), t_p_test_selected_second.max())

predictions_A_train_star_selected_second size: 3649
predictions_A_test_star_selected_second size: 880
predictions_A_train_star_selected_second range: 0.011550132835399185 0.9492911531005811
predictions_A_test_star_selected_second range: 0.011140264801184907 0.948104944145016
Number of negative values in t_p_train_selected_second: 0
Number of negative values in t_p_test_selected_second: 0
t_p_train_selected_second range: 0.01397876194709884 0.949415745136952
t_p_test_selected_second range: 0.03651707901191035 0.9494367115777574


In [7]:
# 选取p大于0的保单的特征和赔付额
x_train_selected_second = x_train_selected[new_train]
y_amount_train_selected_second = y_amount_train_selected[new_train]

x_test_selected_second = x_test_selected[new_test]
y_amount_test_selected_second = y_amount_test_selected[new_test]

# 在特征矩阵中添加常数列
x_train_selected_with_const = sm.add_constant(x_train_selected_second) 
x_test_selected_with_const = sm.add_constant(x_test_selected_second)

# 初始化结果存储
y_amount_pred_train = np.zeros_like(y_amount_train_selected_second)
y_amount_pred_test = np.zeros_like(y_amount_test_selected_second)

# 初始化损失存储
loss_train = 0.0
loss_test = 0.0

In [8]:
# 循环处理每个保单
# 训练集
for i in range(len(t_p_train_selected_second)):
    # 将一维数组转换为二维数组 
    x_train_i = x_train_selected_with_const[i:i+1, :] 
    # t_p_train_selected_second[i] > 0进行分位数回归，否则直接令 y_amount_pred_train为0
    if t_p_train_selected_second[i] > 0:
        # 使用分位数回归模型
        quantreg = sm.QuantReg(y_amount_train_selected_second[i: i+1], sm.add_constant(x_train_i))
        quantreg_result = quantreg.fit(q=t_p_train_selected_second[i])
        # 预测训练集的赔付金额
        y_amount_pred_train[i: i+1] = quantreg_result.predict(sm.add_constant(x_train_i)).reshape(1, -1) 
    else:
        y_amount_pred_train[i: i+1] = np.zeros_like(y_amount_pred_train[i: i+1])  
             
#测试集
for i in range(len(t_p_test_selected_second)):
    # 将一维数组转换为二维数组
    x_test_i = x_test_selected_with_const[i:i+1, :]
    # t_p_test_selected[i]_second > 0进行分位数回归，否则直接令 y_amount_pred_test为0
    if t_p_test_selected_second[i] > 0:
        # 使用分位数回归模型
        quantreg = sm.QuantReg(y_amount_test_selected_second[i: i+1], sm.add_constant(x_test_i))
        quantreg_result = quantreg.fit(q=t_p_test_selected_second[i])
        # 预测测试集的赔付金额
        y_amount_pred_test[i: i+1] = quantreg_result.predict(sm.add_constant(x_test_i)).reshape(1, -1)
    else:
        y_amount_pred_test[i: i+1] = np.zeros_like(y_amount_pred_test[i: i+1]) 

In [9]:
# 计算测试集mse、rmse
test_mse = np.mean((y_amount_pred_test - y_amount_test_selected.squeeze())** 2)
test_rmse = np.sqrt(test_mse)

# 打印结果
print("Test_mse:", test_mse)
print("Test_rmse:", test_rmse)

Test_mse: 6663989207.170865
Test_rmse: 81633.26042227435
