# AllStateClaim Severity - predicting loss using Regression

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder 
from collections import defaultdict
from sklearn import linear_model


## Reading the training data

In [2]:
train_data = pd.read_csv('train.csv')
print(type(train_data))
train_data.drop('id',axis=1,inplace=True)

<class 'pandas.core.frame.DataFrame'>


In [3]:
#taking a small portion of the training data instead of the whole data 
#train_data = train_data.head()


## Converting categorical variables to numerical values
### Create a dataframe with only cat variables to convert them to numerical using hot-code-encoder

## select columns starting with cat and form a list

In [4]:
def splitFeaturesIntoCatAndCont(features):

    cat_col_list = []
    cont_col_list = []
    for i in features:
        if i.startswith("cat"):
            cat_col_list.append(i)
        else:
            #id column will also be appended to the cont_col_list
            cont_col_list.append(i)

    
    return cat_col_list,cont_col_list


### convert list to dataframe of cat and col

In [5]:
def split_DataFrame_intoCatAndCont(data_frame, train_cat_col_list,train_cont_col_list):
    cat_col_df = pd.DataFrame(data_frame, columns = train_cat_col_list)
    cont_col_df = pd.DataFrame(data_frame, columns = train_cont_col_list)
    return cat_col_df,cont_col_df
    


## Label Encoding of all data

In [59]:
def labelEncode(categoricalData, continuousData):
    #Categorical features analysis
    from sklearn.preprocessing import LabelEncoder
    catFeatures = []
    for colName in categoricalData.columns:
        le = LabelEncoder()
        le.fit(categoricalData[colName].unique())
        categoricalData[colName] = le.transform(categoricalData[colName])

    encoded_train_data = categoricalData.join(continuousData)
    #encoded_train_data = pd.concat(categoricalData, continuousData, axis = 1, ignore_index= True)
    return encoded_train_data



### 10-fold cross validation

In [1]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.cross_validation import KFold


def cross_validate(total_samples, n_folds, X,Y,clf):
    accuracy = []
    kf = KFold(total_samples,n_folds, True)# shuffle=True,
                               
    for train_index, test_index in kf:
        #print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        
        clf.fit(X_train,y_train)
        predictions = clf.predict(X_test)
        accuracy.append(mean_squared_error(y_test,predictions))
        print(accuracy)
        
    return clf,np.array(accuracy)


    

### Scaling the features (except response feature)

In [8]:
from sklearn.preprocessing import scale

def normalize(X):
    scale(X, axis=0, with_mean=True, with_std=True, copy=True)

    return X

### Recursive Feature Elimination

In [None]:
#

#def recursiveFeatureElim(X):
    

### list of commands to execute for training data

In [58]:
features = list(train_data.columns.values)

#cat_col_list,cont_col_list

train_cat_col_list,train_cont_col_list = splitFeaturesIntoCatAndCont(features)



train_cat_col_df,train_cont_col_df = split_DataFrame_intoCatAndCont(train_data, train_cat_col_list,train_cont_col_list )

train_feature_df = labelEncode(train_cat_col_df, train_cont_col_df)



train_feature = np.array(train_feature_df)
r, c = train_feature.shape


#create an array which has indexes of columns
i_cols = []
for i in range(0,c-1):
    i_cols.append(i)

#Y is the target column, X has the rest
X_train = train_feature[:,0:(c-1)]
y_train = train_feature[:,(c-1)]

# normalization of features
X_train = normalize(X_train)



print("Shape of X")
print(X_train.shape)
print(type(X_train))
print(type(train_feature_df))

Shape of X
(188318, 130)
<class 'numpy.ndarray'>
<class 'pandas.core.frame.DataFrame'>


## Linear Regression

In [28]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.cross_validation import train_test_split
from sklearn.feature_selection import RFE




#Linear Regression
clf = LinearRegression(fit_intercept = True, normalize= True)


# RFE 
selector = RFE(clf, n_features_to_select = 20, step=1)
selector = selector.fit(X_train, y_train)










    



array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True,  True,  True,
        True,  True,  True,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False, False,  True, False, False, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [43]:
print(np.where(selector.support_ == True)[0])

featureNames  = train_cat_col_list+train_cont_col_list
#print(featureNames)

finalFeatures = []

for num in np.where(selector.support_ == True)[0]:
    finalFeatures.append(featureNames[num])

print(finalFeatures)
print(type(finalFeatures))

[ 15  16  17  18  19  20  21  35  36  37  38  40  41  42  43  44  47  56
  89 102]
['cat16', 'cat17', 'cat18', 'cat19', 'cat20', 'cat21', 'cat22', 'cat36', 'cat37', 'cat38', 'cat39', 'cat41', 'cat42', 'cat43', 'cat44', 'cat45', 'cat48', 'cat57', 'cat90', 'cat103']
<class 'list'>


In [60]:
X_finalFeatures_df = pd.DataFrame(train_feature_df, columns = finalFeatures)
X_finalFeatures_df.head()

Unnamed: 0,cat16,cat17,cat18,cat19,cat20,cat21,cat22,cat36,cat37,cat38,cat39,cat41,cat42,cat43,cat44,cat45,cat48,cat57,cat90,cat103
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Linear Regression

In [2]:
#support vector regressor with linear kernel
#clf = SVR(kernel = "rbf", C=1.0, epsilon=0.2)

#
#cross_validation with 1000 samples, 10 fold
clf = LinearRegression(fit_intercept = True, normalize= True)
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_finalFeatures_df, y_train, test_size=0.33, random_state=42)
clf.fit(X_train1,y_train1)

predictions = clf.predict(X_test1)
print(mean_absolute_error(y_test1,predictions))

#rint(X.shape)
#print(Y.shape)
#clf, accuracy =  cross_validate(188318,10,X_train,y_train,clf)
#plt.plot(x, y)

#print("mean_accuracy")
#print(np.mean(accuracy))

NameError: name 'LinearRegression' is not defined

### ridge ression

In [64]:
from sklearn import linear_model

reg = linear_model.Ridge (alpha = .5)
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_finalFeatures_df, y_train, test_size=0.33, random_state=42)
reg.fit(X_train1,y_train1)

predictions = reg.predict(X_test1)
print(mean_squared_error(y_test1,predictions))

6950416.26625


### lasso

In [65]:

clf = linear_model.Lasso(alpha = 0.1)
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_finalFeatures_df, y_train, test_size=0.33, random_state=42)
clf.fit(X_train1,y_train1)

predictions = clf.predict(X_test1)
print(mean_squared_error(y_test1,predictions))

6950718.22038


### SVM Linear

In [67]:
from sklearn import svm

clf = svm.SVC(kernel='linear')
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_finalFeatures_df, y_train, test_size=0.33, random_state=42)
clf.fit(X_train1,y_train1)

predictions = clf.predict(X_test1)
print(mean_squared_error(y_test1,predictions))

ValueError: Unknown label type: array([  840.33,  1303.28,  8485.27, ...,  2622.53,   574.45,  3615.77])

### SVM rbf

In [None]:


clf = svm.SVC(kernel='rbf')
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_finalFeatures_df, y_train, test_size=0.33, random_state=42)
clf.fit(X_train1,y_train1)

predictions = clf.predict(X_test1)
print(mean_squared_error(y_test1,predictions))

### decision tree regressor

In [69]:

from sklearn.tree import DecisionTreeRegressor


regressor = DecisionTreeRegressor(random_state=0)



X_train1, X_test1, y_train1, y_test1 = train_test_split(X_finalFeatures_df, y_train, test_size=0.33, random_state=42)
regressor.fit(X_train1,y_train1)

predictions = regressor.predict(X_test1)
print(mean_squared_error(y_test1,predictions))

7014446.13671


### xgb regressor

In [71]:
import xgboost as xgb

gbm = xgb.train(dtrain=T_train_xgb,params=params)

X_train1, X_test1, y_train1, y_test1 = train_test_split(X_finalFeatures_df, y_train, test_size=0.33, random_state=42)
clf.fit(X_train1,y_train1)

predictions = clf.predict(X_test1)
print(mean_squared_error(y_test1,predictions))

ImportError: No module named 'xgboost'

## reading test data

In [12]:
test_data = pd.read_csv('test.csv')
#test_data = pd.read_csv('/test.csv')
print(type(test_data))
test_data.drop('id',axis=1,inplace=True)

<class 'pandas.core.frame.DataFrame'>


## list of commands to execute for testing data

In [13]:
features = list(test_data.columns.values)

test_cat_col_list,test_cont_colList = splitFeaturesIntoCatAndCont(features)

test_cat_col_df, test_cont_col_df = split_DataFrame_intoCatAndCont(test_data, test_cat_col_list,test_cont_colList)

test_feature = labelEncode(test_cat_col_df, test_cont_col_df)

test_feature = np.array(test_feature)

r, c = test_feature.shape

#create an array which has indexes of columns
i_cols = []
for i in range(0,c-1):
    i_cols.append(i)

#Y is the target column, X has the rest
#X_test = test_feature[:,0:(c-1)]
#y_test = test_feature[:,(c-1)]

X_test = np.array(test_feature)

# normalization of features
X_test = normalize(X_test)

print("Shape of X_test")
print(X_test.shape)

Shape of X_test
(125546, 130)


### predicting the response variable in test data

In [14]:
prediction = clf.predict(X_test)

In [15]:
print(prediction)

[  1106.22813814   1946.13223877  11219.34941277 ...,   2831.03603756
   1072.21286394   4633.57977601]
