In [35]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

pd.options.mode.chained_assignment = None

In [36]:
from sklearn.model_selection import train_test_split

In [37]:
train = pd.read_csv('train_ZoGVYWq.csv',index_col=0)
test = pd.read_csv('test_66516Ee.csv',index_col=0)

In [38]:
train.columns

Index(['perc_premium_paid_by_cash_credit', 'age_in_days', 'Income',
       'Count_3-6_months_late', 'Count_6-12_months_late',
       'Count_more_than_12_months_late', 'application_underwriting_score',
       'no_of_premiums_paid', 'sourcing_channel', 'residence_area_type',
       'premium', 'renewal'],
      dtype='object')

In [39]:
train['Income_per_age'] = train['Income']/train['age_in_days']
test['Income_per_age'] = test['Income']/test['age_in_days']

train['total_premium'] = train['premium']*train['no_of_premiums_paid']
test['total_premium'] = test['premium']*test['no_of_premiums_paid']

train['premium_cash_credit'] = train['total_premium']*train['perc_premium_paid_by_cash_credit']
test['premium_cash_credit'] = test['total_premium']*test['perc_premium_paid_by_cash_credit']

In [40]:
train.head(2)

Unnamed: 0_level_0,perc_premium_paid_by_cash_credit,age_in_days,Income,Count_3-6_months_late,Count_6-12_months_late,Count_more_than_12_months_late,application_underwriting_score,no_of_premiums_paid,sourcing_channel,residence_area_type,premium,renewal,Income_per_age,total_premium,premium_cash_credit
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
110936,0.429,12058,355060,0.0,0.0,0.0,99.02,13,C,Urban,3300,1,29.446011,42900,18404.1
41492,0.01,21546,315150,0.0,0.0,0.0,99.89,21,A,Urban,18000,1,14.626845,378000,3780.0


In [41]:
desc = train.describe().loc[['mean','std']]
desc.drop('renewal',inplace=True,axis=1)

In [42]:
#train
for col in desc.columns:
    mean = desc[col]['mean']
    std = desc[col]['std']
    train[col] = (train[col]-mean)/std
#test
for col in desc.columns:
    mean = desc[col]['mean']
    std = desc[col]['std']
    test[col] = (test[col]-mean)/std

In [43]:
unique_channel = list(sorted(set(train.sourcing_channel)))
unique_res = list(sorted(set(train.residence_area_type)))

In [44]:
def encode_feature(feature_values,unique_list,column):
    encoded_list = []
    index = feature_values.index
    feature_values = feature_values.values
    for element in feature_values:
        encoded_vec = [0]*len(unique_list)
        encoded_vec[unique_list.index(element)] = 1
        encoded_list.append(encoded_vec)
    return pd.DataFrame(encoded_list,index=index,columns=[column + element for element in unique_list])

In [45]:
#train
temp_channel = encode_feature(train.sourcing_channel,unique_channel,'sourcing_channel')
temp_res = encode_feature(train.residence_area_type,unique_res,'residence_area_type')
#test
test_temp_channel = encode_feature(test.sourcing_channel,unique_channel,'sourcing_channel')
test_temp_res = encode_feature(test.residence_area_type,unique_res,'residence_area_type')

In [46]:
#train
train.drop(['sourcing_channel','residence_area_type'],inplace=True,axis=1)
#test
test.drop(['sourcing_channel','residence_area_type'],inplace=True,axis=1)

In [47]:
#train
train = pd.concat([train,temp_channel,temp_res],axis=1)
del temp_channel,temp_res
#test
test = pd.concat([test,test_temp_channel,test_temp_res],axis=1)
del test_temp_channel,test_temp_res

In [48]:
train_nan = train[(train.isnull().sum(axis=1)>0).values]
#test
test_nan = test[(test.isnull().sum(axis=1)>0).values]

In [49]:
assert train.shape[0] == train_nan.shape[0] + train.dropna().shape[0]
assert test.shape[0] == test_nan.shape[0] + test.dropna().shape[0]

In [50]:
train.dropna(inplace=True)
test.dropna(inplace=True)

In [51]:
def impute(i,train,data_nan):
    row = data_nan.loc[i]
    temp_col = row[pd.isnull(row)==False].index
    fill_col = row[pd.isnull(row)==True].index
    mag = np.linalg.norm(train[temp_col].values - row[temp_col].values,axis=1)
    index = np.where(mag == mag.min())[0][0]
    id_ = train.iloc[index].name
    return fill_col,id_

In [52]:
#train
for i in train_nan.index:
    fill_col,id_ = impute(i,train,train_nan)
    train_nan.loc[i,fill_col] = train.loc[id_,fill_col]
#test
for i in test_nan.index:
    fill_col,id_ = impute(i,train,test_nan)
    test_nan.loc[i,fill_col] = train.loc[id_,fill_col]

In [53]:
train = pd.concat([train,train_nan],axis=0)
del train_nan
#test
test = pd.concat([test,test_nan],axis=0)
del test_nan

In [54]:
y = train['renewal']
x = train.drop('renewal',axis=1)

In [55]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20)

In [57]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import confusion_matrix,roc_auc_score,roc_curve,precision_score,recall_score,f1_score
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,BaggingClassifier
from imblearn import over_sampling,under_sampling,combine
from sklearn.svm import SVC

In [58]:
ros = over_sampling.ADASYN()
rus = under_sampling.NearMiss()
rcs = combine.SMOTEENN()       
rcs2 = combine.SMOTETomek()

In [59]:
from xgboost import XGBClassifier

In [60]:
x_rs,y_rs = ros.fit_sample(x_train,y_train)

In [82]:
import pickle

In [80]:
xgb = XGBClassifier(n_estimators=300,subsample=0.7,max_depth=3,learning_rate=0.01).fit(x_rs,y_rs)
roc_auc_score(y_test,xgb.predict(x_test.values))

  if diff:


0.7610138332037997

In [83]:
filename = 'C:/Users/cheekati/Desktop/ml/AV Mck/xgb.pkl'
f =  open(filename, 'wb')
pickle.dump(xgb, f)
print('model complete')


model complete


In [None]:
log = BaggingClassifier(LogisticRegressionCV(Cs=6))
rf = BaggingClassifier(RandomForestClassifier())
gbc = BaggingClassifier(GradientBoostingClassifier(n_estimators=250,learning_rate=0.01))
sv = SVC(C=0.8,probability=True)
for sample in [rcs,rcs2]:
    x_rs,y_rs = sample.fit_sample(x_train,y_train)
    for model in [log,rf,gbc]:
        model.fit(x_rs,y_rs)
        print('roc : ',roc_auc_score(y_test,model.predict_proba(x_test)[:,1]))

In [None]:
log = BaggingClassifier(LogisticRegressionCV(Cs=6)).fit(x_rs,y_rs)
print('roc : ',roc_auc_score(y_test,log.predict(x_test)))
print('Precision : ', precision_score(y_test,log.predict(x_test)))
print('Recall : ', recall_score(y_test,log.predict(x_test)))
print('f1 : ', f1_score(y_test,log.predict(x_test)))
confusion_matrix(y_test,log.predict(x_test))

In [None]:
rf = BaggingClassifier(RandomForestClassifier()).fit(x_rs,y_rs)
print('roc : ',roc_auc_score(y_test,rf.predict(x_test)))
print('Precision : ', precision_score(y_test,rf.predict(x_test)))
print('Recall : ', recall_score(y_test,rf.predict(x_test)))
print('f1 : ', f1_score(y_test,rf.predict(x_test)))
confusion_matrix(y_test,rf.predict(x_test))

In [None]:
gbc = BaggingClassifier(GradientBoostingClassifier(n_estimators=250,learning_rate=0.01)).fit(x_rs,y_rs)
print('roc : ',roc_auc_score(y_test,gbc.predict(x_test)))
print('Precision : ', precision_score(y_test,gbc.predict(x_test)))
print('Recall : ', recall_score(y_test,gbc.predict(x_test)))
print('f1 : ', f1_score(y_test,gbc.predict(x_test)))
confusion_matrix(y_test,gbc.predict(x_test))

In [None]:
sv = SVC(C=10).fit(x_rs,y_rs)
print('roc : ',roc_auc_score(y_test,sv.predict(x_test)))
# print('Precision : ', precision_score(y_test,sv.predict(x_test)))
# print('Recall : ', recall_score(y_test,sv.predict(x_test)))
# print('f1 : ', f1_score(y_test,sv.predict(x_test)))
# confusion_matrix(y_test,log.predict(x_test))

In [None]:
fpr,tpr,_ = roc_curve(y_test.values,gbc.predict_proba(x_test)[:,1])
plt.plot(fpr,tpr)
fpr,tpr,_ = roc_curve(y_test.values,log.predict_proba(x_test)[:,1])
plt.plot(fpr,tpr)

In [None]:
(y_test == 0).sum()

In [None]:
y_pred = 1
premium = 3300
def revenue(incen,y_pred=1,premium=1200):
    effort = 10*(1-np.exp(-incen/400))
    delp = 20*(1-np.exp(-effort/5))
    revenue = ((y_pred + delp)*premium) - incen
    return revenue

In [None]:
x = []
for i in range(10000):
    x.append(revenue(i))

In [None]:
x.index(max(x))

In [None]:
plt.plot(x)

In [None]:
train = pd.read_csv('train_ZoGVYWq.csv',index_col=0)
train.head(2)

In [None]:
incen = list(range(5000))
from scipy.optimize import fmin

In [None]:
fmin(revenue,incen)