In [15]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge, RidgeCV, LassoCV
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from math import sqrt

# LGB
import lightgbm as lgb

#Read multiple files
import glob

In [16]:
# Read training data
df_train = pd.read_csv('Balance_training.csv')
df_train.shape

(339399, 65)

In [17]:
# Define categorical columns
categorical_columns = ['Vehicle_Performance',
                       'Vehicle_Passive_Restraint',
                       'Policy_Company','Policy_Billing_Code',
                       'Policy_Method_Of_Payment',
                       'Policy_Reinstatement_Fee_Indicator',
                    'Vehicle_Comprehensive_Coverage_Indicator',
                       'Vehicle_Collision_Coverage_Indicator',
                        'Vehicle_Youthful_Driver_Indicator',
                     'Vehicle_Youthful_Good_Student_Code',
                       'Vehicle_Youthful_Driver_Training_Code',
                     'Vehicle_Safe_Driver_Discount_Indicator',
                     'EEA_Liability_Coverage_Only_Indicator',
                     'EEA_Multi_Auto_Policies_Indicator',
                     'EEA_Packaged_Policy_Indicator',
                     'EEA_Full_Coverage_Indicator',
                       'EEA_Agency_Type',
                     'SYS_Renewed',
                     'SYS_New_Business',
                      'Vehicle_Usage',
                      'Vehicle_Anti_Theft_Device','Vehicle_Make_Description']

In [18]:
# Get dummy columns for training dataset
df_train = pd.get_dummies(df_train, columns=categorical_columns, prefix_sep="_")

#Droping columns after feature selection
df_train.drop(['Driver_Total_Female','Driver_Total_Related_To_Insured_Spouse','Vehicle_Performance_Intermediate'],axis=1, inplace=True)

In [19]:
df_train.shape

(339399, 146)

In [20]:
# Devide dataset into training and testing dataset
X = df_train.drop(['Loss_Amount','Frequency','Severity','Loss_Ratio'],axis=1)
Y = df_train['Loss_Amount']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33)

In [21]:
# scale input attributes for train dataset
scaler = preprocessing.StandardScaler()
scaler= scaler.fit(X_train)

# Scale input attributes for Train dataset
X_trainscaled = scaler.transform(X_train)

# Scale input attributes for Test dataset
X_testscaled = scaler.transform(X_test)

In [22]:
lgb_train = lgb.Dataset(X_trainscaled, label=Y_train)
params = {}
params['learning_rate'] = 0.1
params['boosting_type'] = 'gbdt'
params['objective'] = 'regression'
params['metric'] = 'rmse'
params['sub_feature'] = 0.9
params['num_leaves'] = 10
params['min_data'] = 100
params['max_depth'] = 10
lgb_tr = lgb.train(params, lgb_train, 1000)

In [23]:

y_pred=lgb_tr.predict(X_trainscaled)
rms = sqrt(mean_squared_error(y_pred, Y_train))
print('RMSE for training dataset for LGB model is :: ',rms)

RMSE for training dataset for LGB model is ::  2458.179830048121


In [24]:
ytest_pred=lgb_tr.predict(X_testscaled)
t_rms = sqrt(mean_squared_error(ytest_pred, Y_test))
print('RMSE for testing dataset for LGB model is :: ',t_rms)

RMSE for testing dataset for LGB model is ::  2122.885620663153


In [25]:
# Copy data in tabular format
dfObj = pd.DataFrame(columns=['FileName', 'PredictedLogRatio', 'ActualLogRatio','AbsoluteErrorLogRatio','RMSE_Portfolio'])

In [26]:
# Assigning value to table

## Applying predective model on test data
path = r'.\Trainportfolio' # use your path
all_files = glob.glob(path + "/*.csv")


for filename in all_files:
    port_test = pd.read_csv(filename)
    port_test = pd.get_dummies(port_test, columns=categorical_columns, prefix_sep="_")
    
    port_test.drop(['Driver_Total_Female','Driver_Total_Related_To_Insured_Spouse'],axis=1, inplace=True)

    # Align columns for both test and train to avoid mismatch of columns
    df_train, port_test = df_train.align(port_test, join='left', axis=1, fill_value=0)
    
    X_port = port_test.drop(['Loss_Amount','Frequency','Severity','Loss_Ratio'],axis=1)
    Y_port = port_test['Loss_Amount']

    X_portscaled = scaler.transform(X_port)
    
    port_test['Pred_lossamount']=lgb_tr.predict(X_portscaled)
    port_test['Pred_lossratio']=port_test['Pred_lossamount']/port_test['Annual_Premium']
    por_rms = sqrt(mean_squared_error(port_test['Pred_lossamount'], Y_port))
    

    port_actual_loss=port_test['Loss_Amount'].sum()
    port_actual_lossratio=port_test['Loss_Ratio'].mean()
    actual_loss_log = np.log(port_actual_lossratio)
    port_predict_loss_ratio=port_test['Pred_lossratio'].mean()

    predict_loss_log = np.log(port_predict_loss_ratio)
    abs_error = port_predict_loss_ratio-port_actual_lossratio
    
    dfObj = dfObj.append({'FileName': filename[filename.rfind('\\')+1:], 'PredictedLogRatio': predict_loss_log, 'ActualLogRatio': actual_loss_log, 
                          'AbsoluteErrorLogRatio': abs_error, 'RMSE_Portfolio': por_rms}, ignore_index=True)



In [27]:
dfObj.to_csv('Results-LGB.csv',index = False)

In [28]:
dfObj.head()

Unnamed: 0,FileName,PredictedLogRatio,ActualLogRatio,AbsoluteErrorLogRatio,RMSE_Portfolio
0,Test400.csv,0.162416,0.284192,-0.152337,1693.732348
1,Test401.csv,0.073311,0.180583,-0.121851,1577.048452
2,Test402.csv,-0.24014,-0.211575,-0.022791,2132.902485
3,Test403.csv,-0.002518,0.024458,-0.027274,1621.217024
4,Test404.csv,0.231507,-0.322801,0.53638,1545.696042
