### Statistical Modeling

In [1]:
# Import standard python libraries
import os,sys

# Resolve the parent of the current working directory and put it at the front of
# the import search path (only once), so packages located there can be imported.
rpath = os.path.abspath(os.pardir)
if sys.path.count(rpath) == 0:
    sys.path.insert(0, rpath)


In [3]:
# Loading modules from the script directory
from scripts.statistical_modeling import *

In [4]:
# Load the cleaned dataset (path is relative to the working directory).
#   index_col=False  -> no CSV column is used as the row index
#   low_memory=False -> read the file in a single pass instead of in chunks
df = pd.read_csv(
    '../data/cleaned_data.csv',
    index_col=False,
    low_memory=False,
)

In [24]:
# Wrap the loaded DataFrame in the project's modelling helper; later cells call
# its split_data() and model_testing() methods.
# NOTE(review): what the constructor does with df is not visible here — confirm in scripts.statistical_modeling.
models = Statistical_Modelling(df)

In [26]:
# Columns used as predictors, grouped by how they are treated downstream.
# NOTE(review): 'PostalCode' and 'RegistrationYear' appear in BOTH lists, so they
# are label-encoded together with the categorical columns — confirm this is intended.
numeric_features = [
    'CalculatedPremiumPerTerm',
    'PostalCode',
    'RegistrationYear',
    'SumInsured',
]
categorical_features = [
    'Gender',
    'RegistrationYear',
    'CoverType',
    'CoverCategory',
    'VehicleType',
    'PostalCode',
    'MaritalStatus',
    'Province',
    'make',
    'Citizenship',
    'Model',
    'NewVehicle',
]

In [27]:
# Features: every declared predictor exactly once, with the two regression
# targets excluded.
#
# Two fixes over the previous `list(set(a) | set(b) - set(targets))`:
#   * `-` binds tighter than `|`, so that expression parsed as
#     `set(a) | (set(b) - set(targets))` and the targets were only removed
#     from the categorical columns, never from the numeric ones.
#   * Iterating a set of strings gives an order that depends on string hashing,
#     which can differ between kernel sessions; dict.fromkeys() de-duplicates
#     while keeping declaration order, so the column order of X is stable.
target_columns = {'TotalPremium', 'TotalClaims'}
features = [
    col
    for col in dict.fromkeys(numeric_features + categorical_features)
    if col not in target_columns
]

In [28]:
# Label-encode every categorical column, overwriting the column in df with its
# integer codes. Each fitted encoder is kept in `label_encoders` (keyed by column
# name) so the codes can later be mapped back with inverse_transform() or the
# same mapping applied to new data; previously the encoders were discarded.
# NOTE: because df is modified in place, re-running this cell fits the encoders
# on the already-encoded integers — reload df first if the original categories
# are needed.
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [29]:
# Feature matrix and the two regression targets.
X = df[features]
y_premium = df['TotalPremium']
y_claims = df['TotalClaims']

# Split ONCE and derive the second target from the same rows.
# Previously split_data() was called twice and the second call overwrote
# X_train / X_test, so y_claims_train / y_claims_test only matched the feature
# rows if split_data() happened to produce an identical split both times.
# Looking the claims target up by row label guarantees both targets are aligned
# with the same X_train / X_test.
# NOTE(review): assumes split_data() returns pandas objects that keep df's row
# labels (as sklearn's train_test_split does for DataFrame input) — confirm
# against scripts.statistical_modeling.
X_train, X_test, y_premium_train, y_premium_test = models.split_data(X, y_premium)
y_claims_train = y_claims.loc[X_train.index]
y_claims_test = y_claims.loc[X_test.index]

In [30]:
# Baseline: one linear regression per target, both fitted on the same training features.
model_claim = LinearRegression().fit(X_train, y_claims_train)

# The premium model's fit() stays the last bare expression so the cell still echoes it.
model_premium = LinearRegression()
model_premium.fit(X_train, y_premium_train)


In [31]:
# Report the baseline linear-regression metrics on the held-out set, one target
# after the other, separated by an empty line.
linear_results = (
    ('Totalclaims Linear Regression result', model_claim, y_claims_test),
    ('Total Premium Linear Regression Result', model_premium, y_premium_test),
)
for position, (heading, fitted, y_true) in enumerate(linear_results):
    if position:
        print('')
    print(heading)
    models.model_testing(fitted, X_test, y_true)

Totalclaims Linear Regression result
Mean Absolute Error (MAE): 136.16616913308886
Mean Squared Error (MSE): 4869187.531346977
Root Mean Squared Error (RMSE): 2206.623559048298
R-squared: 0.0035198920419504676

Total Premium Linear Regression Result
Mean Absolute Error (MAE): 58.183567017681376
Mean Squared Error (MSE): 16810.459647369207
Root Mean Squared Error (RMSE): 129.65515665552684
R-squared: 0.3906644125958112


Model Training


In [32]:
# Candidate regressors for the TotalClaims target, all fitted on the same training split.
# Seeds are fixed on the estimators that accept one so reruns give the same models.
Linear_Regression = LinearRegression().fit(X_train, y_claims_train)
Decision_Tree = DecisionTreeRegressor(random_state=42).fit(X_train, y_claims_train)
Random_Forest = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_claims_train)

# Kept as a separate final expression so the cell still echoes the fitted estimator.
XGBoost = XGBRegressor(n_estimators=100, random_state=42)
XGBoost.fit(X_train, y_claims_train)

In [33]:
# Display label for each fitted estimator (the estimator objects are the keys).
model_names = {
    Linear_Regression: "Linear Regression",
    Decision_Tree: "Decision Tree",
    Random_Forest: "Random Forest",
    XGBoost: "XGBoost",
}

# Dicts iterate in insertion order, so the models are reported in the order listed above.
for model, label in model_names.items():
    print(f"{label} Model result for TotalClaims")
    # Evaluate on the held-out TotalClaims data
    models.model_testing(model, X_test, y_claims_test)
    print()

Linear Regression Model result for TotalClaims
Mean Absolute Error (MAE): 136.16616913308886
Mean Squared Error (MSE): 4869187.531346977
Root Mean Squared Error (RMSE): 2206.623559048298
R-squared: 0.0035198920419504676

Decision Tree Model result for TotalClaims
Mean Absolute Error (MAE): 133.95722784767696
Mean Squared Error (MSE): 5955243.577539956
Root Mean Squared Error (RMSE): 2440.33677543489
R-squared: -0.21874167401843314

Random Forest Model result for TotalClaims
Mean Absolute Error (MAE): 133.67135441225912
Mean Squared Error (MSE): 5911699.291171213
Root Mean Squared Error (RMSE): 2431.3986286027252
R-squared: -0.20983032794635492

XGBoost Model result for TotalClaims
Mean Absolute Error (MAE): 140.80403385254138
Mean Squared Error (MSE): 5365061.01708841
Root Mean Squared Error (RMSE): 2316.2601358846573
R-squared: -0.09796070639958598



In [34]:
# for TotalPremium
# The same four regressors are refitted on the premium target. The variable names
# are reused, so the TotalClaims models fitted earlier are no longer reachable
# through them after this cell runs.
Linear_Regression = LinearRegression().fit(X_train, y_premium_train)
Decision_Tree = DecisionTreeRegressor(random_state=42).fit(X_train, y_premium_train)
Random_Forest = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_premium_train)

# Kept as a separate final expression so the cell still echoes the fitted estimator.
XGBoost = XGBRegressor(n_estimators=100, random_state=42)
XGBoost.fit(X_train, y_premium_train)

In [35]:
# Display label for each fitted estimator (the estimator objects are the keys).
model_names = {
    Linear_Regression: "Linear Regression",
    Decision_Tree: "Decision Tree",
    Random_Forest: "Random Forest",
    XGBoost: "XGBoost",
}

# Dicts iterate in insertion order, so the models are reported in the order listed above.
for model, label in model_names.items():
    print(f"{label} Model result for Totalpremium")
    print('----------------------------------------------------')
    # Evaluate on the held-out TotalPremium data
    models.model_testing(model, X_test, y_premium_test)
    print()

Linear Regression Model result for Totalpremium
----------------------------------------------------
Mean Absolute Error (MAE): 58.183567017681376
Mean Squared Error (MSE): 16810.459647369207
Root Mean Squared Error (RMSE): 129.65515665552684
R-squared: 0.3906644125958112

Decision Tree Model result for Totalpremium
----------------------------------------------------
Mean Absolute Error (MAE): 23.686492675018297
Mean Squared Error (MSE): 7122.370528912975
Root Mean Squared Error (RMSE): 84.39413800088828
R-squared: 0.74183253040171

Random Forest Model result for Totalpremium
----------------------------------------------------
Mean Absolute Error (MAE): 23.898141307877182
Mean Squared Error (MSE): 7057.029048736476
Root Mean Squared Error (RMSE): 84.00612506678591
R-squared: 0.744200989684823

XGBoost Model result for Totalpremium
----------------------------------------------------
Mean Absolute Error (MAE): 41.37340255430723
Mean Squared Error (MSE): 10106.438196962363
Root Mean Sq