In [None]:
from AnalysisModule import data_analysis
from TransformationModule import data_transformation
import SettingsModule as sm
import pandas as pd
from time import strftime, localtime
import logging,time,json
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import numpy as np
from scipy.stats import chi2_contingency,chisquare
import pandas, copy, pickle
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import IsolationForest
from sklearn import preprocessing
from keras.models import Sequential
from keras.layers import Dense
from keras.models import model_from_json
from keras.optimizers import SGD

def main():
    """Run the batch pipeline: load data, score each row, train the premium model.

    Steps, in order:
      1. Load the data file named in ``SettingsModule`` and log its size.
      2. Write the column-details report as CSV under ``sm.DOCUMENT_FOLDER``.
      3. Impute missing values, then apply the pickled scaler, PCA and
         per-column label encoders.
      4. Derive a ``Score`` per row from the pickled classifier's class
         probabilities.
      5. Flag outliers with an ``IsolationForest``, train the Keras regressor
         on the rows predicted as inliers, save it under ``sm.PKL_PATH`` and
         reload it to produce predictions.

    Returns:
        pandas.DataFrame: the inlier feature columns, the actual
        ``Coverage Premium_l`` values and a ``Predicted`` column, side by side.
    """
    # Artifact folders, kept exactly as originally hard-coded.
    # NOTE(review): main_1 reads the same file names from sm.ENC_PATH / sm.PKL_PATH;
    # confirm those settings point at these folders and switch to them.
    enc_dir = r'C:\Users\shraddha.sharma\Projects\Premium advise\pickle\encoder'
    model_dir = r'C:\Users\shraddha.sharma\Projects\Premium advise\pickle\model'

    def _now():
        # NOTE(review): "+0000" is a literal suffix while the time is local time;
        # the stamp is only a true UTC offset if the host runs in UTC -- confirm.
        return strftime("%a, %d %b %Y %H:%M:%S +0000", localtime())

    def _load_pickle(path):
        # Use a context manager so the handle is closed instead of leaked.
        with open(path, "rb") as fh:
            return pickle.load(fh)

    logging.basicConfig(filename=sm.LOG_FILE, level=logging.DEBUG)
    da = data_analysis()

    data_path = sm.DATA_FILE_PATH + sm.DATA_FILE_NAME
    logging.info(_now() + '\tInfo: Loading data from location : ' + data_path)
    da.get_data(data_path)
    logging.info(_now() + '\tInfo: Data Loading complete.')
    # bytes / 1,000,000 is megabytes (the label previously said KB).
    logging.info(_now() + '\tInfo: Memory Usage : '
                 + str(da.data.memory_usage(index=True, deep=True).sum() / 1000000) + 'MB')
    logging.info(_now() + '\tInfo: Data Description below')
    # shape is (rows, columns); the two labels were previously swapped.
    logging.info(_now() + '\tInfo: Columns : ' + str(da.data.shape[1]))
    logging.info(_now() + '\tInfo: Rows : ' + str(da.data.shape[0]))

    da.get_column_details().to_csv(sm.DOCUMENT_FOLDER + r'\Data_Availability_Report.csv')
    print('Data_Availability_Report created')

    datatypes_json = sm.DATATYPE
    split_data = da.split_data_on_datatypes(datatypes_json)

    dt = data_transformation()

    # Impute: 0 for numerical columns, 'unknown' for categorical columns.
    for each in split_data['numerical']:
        split_data['numerical'][each] = split_data['numerical'][each].apply(lambda x: dt.impute(x, 0))

    for each in split_data['categorical']:
        split_data['categorical'][each] = split_data['categorical'][each].apply(lambda x: dt.impute(x, 'unknown'))

    features = ['Layer Aggregate_l', 'Total Layer Limit_l', 'Per Occurrence limit_l', 'Aggregate_l']

    # Separating out the features
    df = split_data['numerical']
    x = df[features]

    # Standardizing the features with the previously pickled scaler
    # (it was produced by StandardScaler().fit(x) in an earlier run).
    std_scale = _load_pickle(enc_dir + r'\std_scaler.pkl')
    x = std_scale.transform(x)

    # PCA: the pickled object was produced by PCA(n_components=1).fit(x) in an earlier run.
    principalComponent = _load_pickle(enc_dir + r'\pca.pkl')
    principalComponents = principalComponent.transform(x)
    principalDf = pd.DataFrame(data=principalComponents, columns=['principal component 1'])

    # merging PCA and other numerical columns
    # (drop is in place, so split_data['numerical'] loses these columns too)
    df.drop(features, axis=1, inplace=True)
    num_data = pd.concat([principalDf, df], axis=1)

    y_cat = split_data['categorical']['status']

    # Label-encode every categorical column with its own pickled encoder;
    # '/' in a column name is replaced by '_' to form the file name.
    df = pd.DataFrame()
    for each in split_data['categorical'].columns:
        encoder = _load_pickle(enc_dir + r'\encoder_' + str(each.replace('/', '_')) + '.pkl')
        enc = encoder.transform(split_data['categorical'][each])
        df = pd.concat([df, pd.DataFrame(enc)], axis=1)
    df.columns = split_data['categorical'].columns
    y_num = df['status']
    df.drop('Line of Business', axis=1, inplace=True)  # removing Line of Business because there is only one Line of Business

    # Win score

    def f1(x, y):
        return (x - y).days

    duration = split_data['datetime'].apply(lambda x: f1(x['ExpiryDate'], x['InceptionDate']), axis=1)
    cols = list(num_data.columns.values)
    cols.extend(split_data['categorical'].columns.values)
    cols.remove('Line of Business')
    cols.append('duration')
    cols.remove('status')
    df1 = pd.concat([num_data, df, duration, y_num], axis=1)
    df1['status_cat'] = y_cat
    X = df1[df1['Coverage Premium_l'] > 0]
    y = X['status_cat']
    # 'status' appears twice in df1 (from df and from y_num); this drops both.
    X = X.drop('status', axis=1)
    X = X.drop('status_cat', axis=1)
    X.columns = cols

    # feature importance and top imp features
    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(X, y)
    temp = pd.DataFrame([X.columns.values, clf.feature_importances_], index=['columns', 'imp']).T
    imp_col = list(temp[temp['imp'] > 0.009]['columns'])
    print(imp_col)

    # Pickled classifier (an earlier run fitted GradientBoostingClassifier on X[imp_col]).
    model = _load_pickle(model_dir + r'\model.pkl')

    y_pred_prob = pd.DataFrame(model.predict_proba(X[imp_col]), columns=model.classes_)

    def get_score(prob):
        # Difference of the two probability groups lies in [-1, 1];
        # it is rescaled linearly to the range [5, 95].
        positive = prob['Bound'] + prob['Issued'] + prob['Cleared'] + prob['Outstanding Quote']
        negative = prob['Cancelled'] + prob['Dead'] + prob['Declined'] + prob['Terminated'] + prob['Void']
        return ((((positive - negative) + 1) / 2) * 0.9 + 0.05) * 100

    print('Total rows: ', len(y_pred_prob))
    # One vectorised assignment replaces the former row-by-row chained
    # assignment, which raised SettingWithCopyWarning and left an object column.
    y_pred_prob['Score'] = get_score(y_pred_prob)

    # NOTE(review): the 36020 cap is hard-coded, and y_pred_prob was built from the
    # rows with 'Coverage Premium_l' > 0 while df/num_data/duration are unfiltered;
    # confirm the rows still line up by index.
    row_cap = 36020
    df_prem = pd.concat([df[:row_cap], num_data[:row_cap], duration[:row_cap],
                         y_pred_prob['Score'][:row_cap]], axis=1)

    # outlier analysis
    PREDICTED_COL = 'Coverage Premium_l'

    X_col = list(df_prem.columns.values)
    X_col.remove(PREDICTED_COL)

    isof = IsolationForest()
    isof.fit(df_prem)
    df_prem['Outlier'] = isof.predict(df_prem)

    # Keep only the rows the forest labelled 1.
    df_prem_no_outlier = df_prem[df_prem['Outlier'] == 1]
    X = df_prem_no_outlier[X_col]
    y = df_prem_no_outlier[PREDICTED_COL]

    def train_model(X, y, epochs=2):
        """Build, compile and fit the three-layer dense regressor; return it."""
        input_dim = X.shape[1]

        net = Sequential()
        net.add(Dense(input_dim, input_dim=input_dim, kernel_initializer='normal', activation='relu'))
        # NOTE(review): input_dim=16 on a non-first layer looks unintended -- confirm and remove.
        net.add(Dense(32, input_dim=16, kernel_initializer='normal', activation='relu'))
        net.add(Dense(1, kernel_initializer='normal'))

        # Compile model
        net.compile(loss='mean_squared_error', optimizer='adam')

        # 'epochs' is the Keras 2 name of the former 'nb_epoch' argument.
        net.fit(X, y, epochs=epochs)
        return net

    prem_adv_model = train_model(X, y, epochs=20)

    # serialize model to JSON, weights to HDF5
    model_json = prem_adv_model.to_json()
    with open(sm.PKL_PATH + r'\premium_advice.json', "w") as json_file:
        json_file.write(model_json)
    prem_adv_model.save_weights(sm.PKL_PATH + r'\premium_advice_weights.h5')

    # Reload the saved model and predict with the reloaded copy.
    with open(sm.PKL_PATH + r'\premium_advice.json', 'r') as json_file:
        loaded_model_json = json_file.read()
    pa_model = model_from_json(loaded_model_json)
    pa_model.load_weights(sm.PKL_PATH + r'\premium_advice_weights.h5')
    y_pred = pa_model.predict(X)
    result = pd.concat([X.reset_index(drop=True),
                        y.reset_index(drop=True),
                        pd.DataFrame([i[0] for i in y_pred], columns=['Predicted'])],
                       axis=1)

    logging.debug('debug: ' + _now())
    logging.info('info: ' + _now())
    logging.warning('warning: ' + _now())

    return result

if __name__ == "__main__":
    # Entry point: run the pipeline once and print the table it returns.
    pipeline_result = main()
    print(pipeline_result)

Using TensorFlow backend.


Data Analysis object initialised
Data_Availability_Report created


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


data transformation object initialised
['principal component 1', 'General Liability Allocation_l', 'Coverage Premium_l', 'Attachment Amount_l', 'Terrorism_l', 'Auto Liability Allocation_l', 'BusinessClassification', 'InsuredState', 'duration']
Total rows:  36021


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [None]:
# Sample input record: one value per column name, passed to main_1 below.
# Key order is significant for the one-row DataFrame built from this dict,
# so it is kept exactly as written.
var = {
    # text / date / flag fields
    "Line of Business": "Excess & Umbrella",
    "BusinessClassification": "unknown",
    "InsuredState": "NJ",
    "InceptionDate": "7/1/2015",
    "ExpiryDate": "7/1/2016",
    "IsAdmitted": 1,
    "Type": "Renewal",
    "status": "Issued",
    # per-segment fields
    "Lead Excess-All Other_c": 0,
    "Lead Excess-Real Estate/Property Mgmt/Office Building_c": 0,
    "Lead Excess-Contracting/Construction_c": 0,
    "Excess-Real Estate/Property Mgmt/Office Building_c": 0,
    "Excess-Manufacturing_c": 0,
    "Lead Excess-Retail/Wholesale_c": 0,
    "Lead Excess-Municipalities/Hospitals/Political_c": 0,
    "Lead Excess-Installation/Service or Repair_c": 0,
    "Excess-Installation/Service or Repair_c": 0,
    "Excess-Entertainment/Recreation_c": 0,
    "Excess-Retail/Wholesale_c": 0,
    "Excess-Entertainment/Recreation_l": 0,
    "Excess-Municipalities/Hospitals/Political_c": 1,
    "Lead Excess-Manufacturing_c": 0,
    "Excess-Manufacturing_l": 0,
    "Excess-Contracting/Construction_c": 0,
    "Lead Excess-All Other_l": 0,
    "Lead Excess-Real Estate/Property Mgmt/Office Building_l": 0,
    "Excess-All Other_c": 0,
    "Umbrella-Contracting/Construction": 0,
    "Lead Excess-Entertainment/Recreation_c": 0,
    "Excess-Contracting/Construction_l": 0,
    "Excess-Installation/Service or Repair_l": 0,
    "Lead Excess-Installation/Service or Repair_l": 0,
    "Excess-Real Estate/Property Mgmt/Office Building_l": 0,
    "Excess-All Other_l": 0,
    "Excess-Retail/Wholesale_l": 0,
    "Umbrella-All Other": 0,
    "Lead Excess-Contracting/Construction_l": 0,
    "Lead Excess-Entertainment/Recreation_l": 0,
    "Lead Excess-Municipalities/Hospitals/Political_l": 0,
    "Lead Excess-Manufacturing_l": 0,
    "Excess-Municipalities/Hospitals/Political_l": 0,
    "Lead Excess-Retail/Wholesale_l": 0,
    # "_c" fields for the coverage columns
    "Total Layer Limit_c": 1,
    "General Liability Allocation_c": 1,
    "Coverage Premium_c": 1,
    "Uninsured/Under Insured Motorist Premium_c": 0,
    "Attachment Amount_c": 1,
    "Per Occurrence limit_c": 1,
    "Layer Aggregate_c": 1,
    "Terrorism_c": 1,
    "Aggregate_c": 1,
    "Auto Liability Allocation_c": 1,
    # "_l" fields for the coverage columns
    "Total Layer Limit_l": 6000000,
    "General Liability Allocation_l": 1866,
    "Coverage Premium_l": 19658,
    "Uninsured/Under Insured Motorist Premium_l": 0,
    "Attachment Amount_l": 15000000,
    "Per Occurrence limit_l": 6000000,
    "Layer Aggregate_l": 6000000,
    "Terrorism_l": 3034,
    "Aggregate_l": 6000000,
    "Auto Liability Allocation_l": 17724,
}

In [None]:
import pandas as pd
import pickle,copy
import sys
from AnalysisModule import data_analysis
from TransformationModule import data_transformation
import SettingsModule as sm

from keras.models import Sequential
from keras.layers import Dense
from keras.models import model_from_json
from keras.optimizers import SGD

def main_1(var):
    """Predict the coverage premium for a single record.

    The record goes through the same transformation chain as the batch
    pipeline, using only previously saved artifacts: the scaler, PCA and
    per-column label encoders under ``sm.ENC_PATH``, and the classifier and
    Keras model under ``sm.PKL_PATH``. Nothing is trained or written here.

    Args:
        var: dict mapping column name to value for one record; it is wrapped
            into a one-row DataFrame.

    Returns:
        The first element of the first prediction row (``y_pred[0][0]``)
        produced by the reloaded Keras model.
    """

    def _load_pickle(path):
        # Use a context manager so the handle is closed instead of leaked.
        with open(path, "rb") as fh:
            return pickle.load(fh)

    df = pd.DataFrame([var])  # convert dict to dataframe
    da = data_analysis()
    da.set_data(df)
    datatypes_json = sm.DATATYPE
    split_data = da.split_data_on_datatypes(datatypes_json)

    # Impute: 0 for numerical columns, 'unknown' for categorical columns.
    dt = data_transformation()
    for each in split_data['numerical']:
        split_data['numerical'][each] = split_data['numerical'][each].apply(lambda x: dt.impute(x, 0))
    for each in split_data['categorical']:
        split_data['categorical'][each] = split_data['categorical'][each].apply(lambda x: dt.impute(x, 'unknown'))

    features = ['Layer Aggregate_l', 'Total Layer Limit_l', 'Per Occurrence limit_l', 'Aggregate_l']

    # Separating out the features
    df = split_data['numerical']
    x = df[features]

    # Standardizing the features
    std_scale = _load_pickle(sm.ENC_PATH + r'\std_scaler.pkl')
    x = std_scale.transform(x)

    # PCA
    pc = _load_pickle(sm.ENC_PATH + r'\pca.pkl')
    principalComponent = pc.transform(x)
    principalDf = pd.DataFrame(data=principalComponent, columns=['principal component 1'])

    # merging PCA and other numerical columns
    df.drop(features, axis=1, inplace=True)
    num_data = pd.concat([principalDf, df], axis=1)

    y_cat = split_data['categorical']['status']

    # Label-encode every categorical column with its own pickled encoder;
    # '/' in a column name is replaced by '_' to form the file name.
    df = pd.DataFrame()
    for each in split_data['categorical'].columns:
        encoder = _load_pickle(sm.ENC_PATH + r'\encoder_' + str(each.replace('/', '_')) + '.pkl')
        enc = encoder.transform(split_data['categorical'][each])
        df = pd.concat([df, pd.DataFrame(enc)], axis=1)
    df.columns = split_data['categorical'].columns
    y_num = df['status']
    df.drop('Line of Business', axis=1, inplace=True)  # removing Line of Business because there is only one Line of Business

    # Win score

    def f1(x, y):
        # Both values pass through dt.datetime_transformation before subtraction.
        x = dt.datetime_transformation(x)
        y = dt.datetime_transformation(y)
        return (x - y).days

    duration = split_data['datetime'].apply(lambda x: f1(x['ExpiryDate'], x['InceptionDate']), axis=1)

    cols = list(num_data.columns.values)
    cols.extend(split_data['categorical'].columns.values)
    cols.remove('Line of Business')
    cols.remove('status')
    cols.append('duration')

    df1 = pd.concat([num_data, df, duration, y_num], axis=1)
    df1['status_cat'] = y_cat

    # NOTE(review): a record whose 'Coverage Premium_l' is not > 0 leaves X empty,
    # so the steps below have no row to score -- confirm how callers should be told.
    X = df1[df1['Coverage Premium_l'] > 0]
    # 'status' appears twice in df1 (from df and from y_num); this drops both.
    X = X.drop('status', axis=1)
    X = X.drop('status_cat', axis=1)
    X.columns = cols

    # Fixed list of feature columns fed to the pickled classifier.
    imp_col = ['principal component 1', 'General Liability Allocation_l', 'Coverage Premium_l', 'Attachment Amount_l'
               , 'Terrorism_l', 'Auto Liability Allocation_l', 'BusinessClassification', 'InsuredState', 'duration']
    model = _load_pickle(sm.PKL_PATH + r'\model.pkl')
    y_pred_prob = pd.DataFrame(model.predict_proba(X[imp_col]), columns=model.classes_)
    y_pred_prob['Score'] = dt.get_score(y_pred_prob)

    df_prem = pd.concat([df, num_data, duration, y_pred_prob['Score']], axis=1)

    PREDICTED_COL = 'Coverage Premium_l'

    # Every column except the predicted one is a model input.
    X_col = list(df_prem.columns.values)
    X_col.remove(PREDICTED_COL)
    X = df_prem[X_col]

    ## Predicting coverage
    # Rebuild the network from its saved JSON architecture, then load the saved weights.
    with open(sm.PKL_PATH + r'\premium_advice.json', 'r') as json_file:
        loaded_model_json = json_file.read()
    pa_model = model_from_json(loaded_model_json)
    pa_model.load_weights(sm.PKL_PATH + r'\premium_advice_weights.h5')
    y_pred = pa_model.predict(X)
    return y_pred[0][0]

# Score the sample record `var`; as the last expression of a notebook cell,
# the returned prediction is what the cell displays.
main_1(var)