In [1]:
# Taarak Shah
# Stat 8051 project

# imports
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
import numpy as np

# allow plots to appear directly in the notebook
%matplotlib inline

In [2]:
# calculate gini
def Gini(y_true, y_pred):
    # check and get number of samples
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]
    # sort rows on prediction column
    # (from largest to smalwrlest)
    arr = np.array([y_true, y_pred]).transpose()
    true_order = arr[arr[:,0].argsort()][::-1,0]
    pred_order = arr[arr[:,1].argsort()][::-1,0]
    # get Lorenz curves
    L_true = np.cumsum(true_order) / np.sum(true_order)
    L_pred = np.cumsum(pred_order) / np.sum(pred_order)
    L_ones = np.linspace(1/n_samples, 1, n_samples)
    # get Gini coefficients (area between curves)
    G_true = np.sum(L_ones - L_true)
    G_pred = np.sum(L_ones - L_pred)
    # normalize to true Gini coefficient
    return G_pred/G_true

In [3]:
# read in with pandas
train = pd.read_csv('InsNova_train.csv')
train.head()

Unnamed: 0,id,veh_value,exposure,veh_body,veh_age,gender,area,dr_age,claim_ind,claim_count,claim_cost
0,3,6.43,0.241898,STNWG,1,M,A,3,0,0,0.0
1,6,4.46,0.856523,STNWG,1,M,A,3,0,0,0.0
2,20,1.7,0.417517,HBACK,1,M,A,4,0,0,0.0
3,21,0.48,0.626975,SEDAN,4,F,A,6,0,0,0.0
4,28,1.96,0.08977,HBACK,1,F,A,2,0,0,0.0


In [4]:
test = pd.read_csv('InsNova_test.csv')
test.head()

Unnamed: 0,id,veh_value,exposure,veh_body,veh_age,gender,area,dr_age
0,1,2.52,0.289392,SEDAN,1,F,A,5
1,2,1.04,0.54006,SEDAN,4,F,A,3
2,7,1.35,0.938813,HBACK,2,F,A,4
3,8,1.15,0.226887,STNWG,4,M,A,6
4,9,1.29,0.657806,MIBUS,4,F,A,4


# So linreg was not promising.

Kaggle scored the basic regression the best. Even when calculating on all of the train data in R, the results were worse than the 60/40 initial split. So linear regression does not look helpful.

Here's my outline of things to do for the day:

-Build logistic model to predict 0, 1 for claim_ind;

-Train a lin reg model on only those with positive claim cost;

-Test predictions on Kaggle and see what sticks.

If this does not all work, try the Tweedie example.


Now, we create a logistic regression model.

In [5]:
# split training set into train and validation
np.random.seed(8051)
n = len(train)
index = np.random.rand(n) < 0.7
df_train = train[index]
df_valid = train[~index]

In [6]:
def prep_data(df):
    #Prepare the data
    df_dummy = df[['id','veh_value','exposure','claim_count','claim_cost']].copy()
    #Make dummy variables using Pandas
    df_dummy = pd.concat([df_dummy,pd.get_dummies(df['veh_age'],drop_first=True,prefix="veh_age")],axis=1)
    df_dummy = pd.concat([df_dummy,pd.get_dummies(df['dr_age'],drop_first=True,prefix="dr_age")],axis=1)
    df_dummy = pd.concat([df_dummy,pd.get_dummies(df['veh_body'],drop_first=True)],axis=1)
    df_dummy = pd.concat([df_dummy,pd.get_dummies(df['gender'],drop_first=True)],axis=1)
    df_dummy = pd.concat([df_dummy,pd.get_dummies(df['area'],drop_first=True)],axis=1)
    #Normalize the Vehicle Value
    df_dummy['veh_value'] = df_dummy['veh_value']/df_dummy['veh_value'].max()
    
    #We are trying to predict whether there is claim or no claim
    df_dummy['claim_count'] = df_dummy['claim_count'].apply(lambda x: 1 if x>0 else 0)
    return(df_dummy)

In [7]:
#We create a separate X_train and X_test dataset because we would require the
#categorical variables in the original dataset later 
#X_train and X_test dataframes have only dummy variables
X_train = prep_data(df_train)
X_test = prep_data(df_valid)

#Create two additional columns for the individual probabilities of claim (prob1) and no-claim (prob0)
X_train['prob0'] = np.zeros(len(X_train))
X_train['prob1'] = np.zeros(len(X_train))

X_test['prob0'] = np.zeros(len(X_test))
X_test['prob1'] = np.zeros(len(X_test))

In [8]:
#In the first part, we use Logistic Regression to get the probabilities of claim vs no-claim
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression(C=1000, class_weight='balanced', solver='lbfgs')

n=10

for i in range (0,n):
    
    #Choose 3000 Negative Classes
    X_train_temp = X_train.loc[np.random.choice(X_train[X_train['claim_count']==0].index, 3000, replace = False)]
    #Choose 3000 Positive Classes
    X_train_pos = X_train.loc[np.random.choice(X_train[X_train['claim_count']==1].index, 3000, replace = True)]
    #Append the positive classes 
    X_train_temp = X_train_temp.append(X_train_pos, ignore_index=True)

    #Separate into X and Y to train the model
    y_train_temp = X_train_temp['claim_count']
    X_train_temp.drop(['id','claim_count','claim_cost','prob0','prob1'], axis=1,inplace=True)

    
    #Fit the Logistic Regression Model
    logmodel.fit(X_train_temp,y_train_temp)
    X_train[['prob0','prob1']] = X_train[['prob0','prob1']] + logmodel.predict_proba(X_train.drop(['id','claim_count','claim_cost','prob0','prob1'],axis=1))
    X_test[['prob0','prob1']] = X_test[['prob0','prob1']] + logmodel.predict_proba(X_test.drop(['id','claim_count','claim_cost','prob0','prob1'],axis=1))
    
#Divide the log_proba and log_probb values by 10 to get the average log probabilities
X_train['prob0']=X_train['prob0']/n
X_train['prob1']=X_train['prob1']/n

X_test['prob0']=X_test['prob0']/n
X_test['prob1']=X_test['prob1']/n
    
#Check the metrics on training and test data
logProb = 1
temp = logProb > (X_test['prob0']/X_test['prob1'])
X_test['predicted_claim_count']= [1 if (p==True) else 0 for p in temp]

#Metrics on Test set
from sklearn.metrics import classification_report
print(classification_report(X_test['claim_count'],X_test['predicted_claim_count']))
X_test.drop('predicted_claim_count',axis=1,inplace=True)

#Calculate Claim Frequency for Test Set
X_test['predicted_freq']= X_test['prob1'].divide(X_test['prob0'])
X_test['predicted_freq']= X_test['predicted_freq'].apply(lambda x: x**3.65)



              precision    recall  f1-score   support

           0       0.96      0.62      0.75      6296
           1       0.11      0.64      0.19       470

    accuracy                           0.62      6766
   macro avg       0.53      0.63      0.47      6766
weighted avg       0.90      0.62      0.71      6766





In [9]:
#Part 2: Use Regression for predicting Claim Severity

#Train only on the subset of positive claim counts
X_train_regress = X_train[X_train['claim_count']>0].copy()
y_train_regress = np.log(X_train[X_train['claim_count']>0]['claim_cost'])

In [10]:
#Implement GLM for regression
lm1 = LinearRegression()
X_train_regress = X_train_regress.drop(['id','exposure','claim_count','claim_cost','prob0','prob1'],axis=1)
lm1.fit(X_train_regress,y_train_regress)

predictions = lm1.predict(X_test.drop(['id','exposure','claim_count','claim_cost','prob0','prob1','predicted_freq'],axis=1))
X_test['predicted_claim_cost']=[np.exp(p) for p in predictions]
X_test['predicted_claim_cost']=X_test['predicted_claim_cost'].multiply(X_test['predicted_freq'])

In [11]:
X_test

Unnamed: 0,id,veh_value,exposure,claim_count,claim_cost,veh_age_2,veh_age_3,veh_age_4,dr_age_2,dr_age_3,...,M,B,C,D,E,F,prob0,prob1,predicted_freq,predicted_claim_cost
0,3,0.308986,0.241898,0,0.000000,0,0,0,0,1,...,1,0,0,0,0,0,0.653527,0.346473,0.098646,74.479438
4,28,0.094185,0.089770,0,0.000000,0,0,0,1,0,...,0,0,0,0,0,0,0.713296,0.286704,0.035909,26.304674
8,36,0.095147,0.157753,0,0.000000,1,0,0,0,0,...,1,0,0,0,0,0,0.682195,0.317805,0.061535,47.925701
10,45,0.052859,0.351635,0,0.000000,0,0,1,0,0,...,1,0,0,0,0,0,0.627086,0.372914,0.150013,161.608455
12,49,0.066314,0.654490,0,0.000000,0,0,1,0,0,...,1,0,0,0,0,0,0.440792,0.559208,2.383357,2509.850126
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22576,67749,0.117732,0.825829,1,1075.481830,0,1,0,0,0,...,0,0,0,0,0,1,0.410798,0.589202,3.730128,4459.116719
22593,67802,0.230178,0.731370,1,286.749300,0,0,0,0,0,...,1,0,0,0,0,1,0.398039,0.601961,4.525809,5025.910282
22597,67814,0.203748,0.887322,1,841.053908,0,0,0,0,0,...,0,0,0,0,0,1,0.229370,0.770630,83.373729,107924.114442
22599,67821,0.169149,0.915025,1,1365.945399,1,0,0,0,0,...,1,0,0,0,0,1,0.242760,0.757240,63.577694,75152.785700


In [12]:
Gini(X_test.claim_cost, X_test.predicted_claim_cost)

0.17039448614841946

# Solid Gini value. Now, let's re-do analysis but on full training, predict on test and upload.

In [13]:
train = pd.read_csv('InsNova_train.csv')
test = pd.read_csv('InsNova_test.csv')

def prep_data_test(df):
    #Prepare the data
    df_dummy = df[['id','veh_value','exposure']].copy()
    #Make dummy variables using Pandas
    df_dummy = pd.concat([df_dummy,pd.get_dummies(df['veh_age'],drop_first=True,prefix="veh_age")],axis=1)
    df_dummy = pd.concat([df_dummy,pd.get_dummies(df['dr_age'],drop_first=True,prefix="dr_age")],axis=1)
    df_dummy = pd.concat([df_dummy,pd.get_dummies(df['veh_body'],drop_first=True)],axis=1)
    df_dummy = pd.concat([df_dummy,pd.get_dummies(df['gender'],drop_first=True)],axis=1)
    df_dummy = pd.concat([df_dummy,pd.get_dummies(df['area'],drop_first=True)],axis=1)
    #Normalize the Vehicle Value
    df_dummy['veh_value'] = df_dummy['veh_value']/df_dummy['veh_value'].max()
    
    #We are trying to predict whether there is claim or no claim
    #df_dummy['claim_count'] = df_dummy['claim_count'].apply(lambda x: 1 if x>0 else 0)
    return(df_dummy)

#We create a separate X_train and X_test dataset because we would require the
#categorical variables in the original dataset later 
#X_train and X_test dataframes have only dummy variables
X_train = prep_data(train)
X_test = prep_data_test(test)

#Create two additional columns for the individual probabilities of claim (prob1) and no-claim (prob0)
X_train['prob0'] = np.zeros(len(X_train))
X_train['prob1'] = np.zeros(len(X_train))

X_test['prob0'] = np.zeros(len(X_test))
X_test['prob1'] = np.zeros(len(X_test))

In [14]:
#In the first part, we use Logistic Regression to get the probabilities of claim vs no-claim
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression(C=1000, class_weight='balanced', solver='lbfgs')

n=10

for i in range (0,n):
    
    #Choose 3000 Negative Classes
    X_train_temp = X_train.loc[np.random.choice(X_train[X_train['claim_count']==0].index, 3000, replace = False)]
    #Choose 3000 Positive Classes
    X_train_pos = X_train.loc[np.random.choice(X_train[X_train['claim_count']==1].index, 3000, replace = True)]
    #Append the positive classes 
    X_train_temp = X_train_temp.append(X_train_pos, ignore_index=True)

    #Separate into X and Y to train the model
    y_train_temp = X_train_temp['claim_count']
    X_train_temp.drop(['id','claim_count','claim_cost','prob0','prob1'], axis=1,inplace=True)

    
    #Fit the Logistic Regression Model
    logmodel.fit(X_train_temp,y_train_temp)
    X_train[['prob0','prob1']] = X_train[['prob0','prob1']] + logmodel.predict_proba(X_train.drop(['id','claim_count','claim_cost','prob0','prob1'],axis=1))
    X_test[['prob0','prob1']] = X_test[['prob0','prob1']] + logmodel.predict_proba(X_test.drop(['id','prob0','prob1'],axis=1))
    
#Divide the log_proba and log_probb values by 10 to get the average log probabilities
X_train['prob0']=X_train['prob0']/n
X_train['prob1']=X_train['prob1']/n

X_test['prob0']=X_test['prob0']/n
X_test['prob1']=X_test['prob1']/n
    
#Check the metrics on training and test data
logProb = 1
temp = logProb > (X_test['prob0']/X_test['prob1'])
X_test['predicted_claim_count']= [1 if (p==True) else 0 for p in temp]

#Metrics on Test set
#from sklearn.metrics import classification_report
#print(classification_report(X_test['claim_count'],X_test['predicted_claim_count']))
#X_test.drop('predicted_claim_count',axis=1,inplace=True)

#Calculate Claim Frequency for Test Set
X_test['predicted_freq']= X_test['prob1'].divide(X_test['prob0'])
X_test['predicted_freq']= X_test['predicted_freq'].apply(lambda x: x**3.65)



In [15]:
#Part 2: Use Regression for predicting Claim Severity

#Train only on the subset of positive claim counts
X_train_regress = X_train[X_train['claim_count']>0].copy()
y_train_regress = np.log(X_train[X_train['claim_count']>0]['claim_cost'])
X_train_regress

Unnamed: 0,id,veh_value,exposure,claim_count,claim_cost,veh_age_2,veh_age_3,veh_age_4,dr_age_2,dr_age_3,...,TRUCK,UTE,M,B,C,D,E,F,prob0,prob1
5075,15231,0.050781,0.895292,1,1140.515620,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0.382263,0.617737
5076,15232,0.046484,0.307267,1,3436.550152,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0.674718,0.325282
5077,15237,0.041797,0.501639,1,366.525669,0,1,0,0,1,...,0,0,1,0,0,0,0,0,0.527401,0.472599
5078,15238,0.097266,0.740908,1,1724.475407,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0.383235,0.616765
5079,15242,0.081250,0.781441,1,1399.084569,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0.388318,0.611682
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22605,67847,0.089844,0.909445,1,5163.224416,0,1,0,1,0,...,0,1,0,0,0,0,0,1,0.371815,0.628185
22606,67848,0.058203,0.999321,1,5230.467325,1,0,0,1,0,...,0,0,0,0,0,0,0,1,0.246610,0.753390
22607,67849,0.064453,0.783724,1,1502.970766,1,0,0,0,1,...,0,0,0,0,0,0,0,1,0.329306,0.670694
22608,67851,0.093359,0.841333,1,1169.773526,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0.376309,0.623691


In [16]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(interaction_only=True, include_bias=False)
X_train_regress = X_train_regress.drop(['id','exposure','claim_count','claim_cost','prob0','prob1'],axis=1)
poly.fit_transform(X_train_regress)
X_train_regress

Unnamed: 0,veh_value,veh_age_2,veh_age_3,veh_age_4,dr_age_2,dr_age_3,dr_age_4,dr_age_5,dr_age_6,CONVT,...,SEDAN,STNWG,TRUCK,UTE,M,B,C,D,E,F
5075,0.050781,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5076,0.046484,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5077,0.041797,0,1,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
5078,0.097266,0,1,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
5079,0.081250,1,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22605,0.089844,0,1,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
22606,0.058203,1,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
22607,0.064453,1,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
22608,0.093359,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,1


In [17]:
#Implement GLM for regression
lm1 = LinearRegression()
lm1.fit(X_train_regress,np.log(y_train_regress+1))

predictions = lm1.predict(X_test.drop(['id','exposure','prob0','prob1','predicted_claim_count','predicted_freq'],axis=1))
X_test['predicted_claim_cost']=[np.exp(p) for p in predictions]
X_test['predicted_claim_cost']=X_test['predicted_claim_cost'].multiply(X_test['predicted_freq'])

In [18]:
X_test

Unnamed: 0,id,veh_value,exposure,veh_age_2,veh_age_3,veh_age_4,dr_age_2,dr_age_3,dr_age_4,dr_age_5,...,B,C,D,E,F,prob0,prob1,predicted_claim_count,predicted_freq,predicted_claim_cost
0,1,0.102815,0.289392,0,0,0,0,0,0,1,...,0,0,0,0,0,0.707841,0.292159,0,0.039559,0.291274
1,2,0.042432,0.540060,0,0,1,0,1,0,0,...,0,0,0,0,0,0.482874,0.517126,1,1.284203,9.984130
2,7,0.055080,0.938813,1,0,0,0,0,1,0,...,0,0,0,0,0,0.308844,0.691156,1,18.919404,141.628950
3,8,0.046920,0.226887,0,0,1,0,0,0,0,...,0,0,0,0,0,0.685295,0.314705,0,0.058398,0.455757
4,9,0.052632,0.657806,0,0,1,0,0,1,0,...,0,0,0,0,0,0.538170,0.461830,0,0.572144,4.677731
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22624,67841,0.252142,0.349599,0,0,0,1,0,0,0,...,0,0,0,0,1,0.595615,0.404385,0,0.243319,1.917321
22625,67845,0.195022,0.897175,1,0,0,0,0,0,0,...,0,0,0,0,1,0.225063,0.774937,1,91.181822,732.498208
22626,67846,0.166055,0.495885,1,0,0,1,0,0,0,...,0,0,0,0,1,0.475001,0.524999,1,1.440936,11.491966
22627,67850,0.157079,0.270942,1,0,0,1,0,0,0,...,0,0,0,0,1,0.584707,0.415293,0,0.286860,2.285898


In [19]:
output = pd.DataFrame()
output['claim_cost'] = X_test.predicted_claim_cost
output.index.name = 'id'
output.index += 1
output.to_csv('test_bag.csv',index=True)
output

Unnamed: 0_level_0,claim_cost
id,Unnamed: 1_level_1
1,0.291274
2,9.984130
3,141.628950
4,0.455757
5,4.677731
...,...
22625,1.917321
22626,732.498208
22627,11.491966
22628,2.285898


# We obtained a Gini index of 0.12636 on this data. Better, but room for improvement.

We used 100 iterations.