In [None]:
import lightgbm as lgbm
from hyperopt import hp, tpe, Trials

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from utils.preprocessing import preprocessing 
from utils.utils import feature_importances,initial_hyperparam_search,cast_params_to_proper_types,lgb_f1_score,feature_imp_lgbm
import pickle

In [None]:
# Read both raw tables through the project's preprocessing helper and strip the
# "Unnamed: 0" column from each (presumably a CSV-exported index -- TODO confirm).
prepos = preprocessing()
df_credit_application = prepos.read_data("credit_applications.csv").drop(columns="Unnamed: 0")
df_customers = prepos.read_data("customers.csv").drop(columns="Unnamed: 0")

In [None]:
# Inner join on (client_nr, yearmonth): only client-months present in both tables survive.
join_keys = ["client_nr", "yearmonth"]
df_complete_data = pd.merge(df_customers, df_credit_application, how="inner", on=join_keys)
print(df_complete_data.shape)

##### EDA And feature importances

In [None]:
# Summary statistics of the merged frame.
df_complete_data.describe()

In [None]:
# Bar chart of the target's class balance.
sns.countplot(x=df_complete_data["credit_application"])

# Recorded class counts (presumably from an earlier run):
# 0    27971
# 1     2025

In [None]:
# Missing-value report from the project helper (output format defined in utils.preprocessing).
prepos.missing_values_intable(df_complete_data)

In [None]:
# plt.rcParams["figure.figsize"] = [5,5]  #set the graph to a smaller size
# New feature: debit volume divided by credit volume for the same client-month.
# A zero credit volume yields inf (or NaN when both volumes are zero).
debit_volume = df_complete_data["volume_debit_trx"]
credit_volume = df_complete_data["volume_credit_trx"]
df_complete_data["debit_credit_ratio"] = debit_volume.div(credit_volume)

In [None]:
# Where the ratio is +inf, fall back to the raw debit volume.
# Only +inf is replaced; NaN and -inf values are left as they are.
raw_ratio = df_complete_data["debit_credit_ratio"]
df_complete_data["debit_credit_ratio"] = raw_ratio.mask(
    raw_ratio == np.inf, df_complete_data["volume_debit_trx"]
)

In [None]:
# Pairwise correlation matrix; the displayed column ranks every feature by its
# correlation with the target, strongest positive first.
corl= df_complete_data.corr()
corl["credit_application"].sort_values(ascending=False)

In [None]:
# Statistical feature screening against the target, run after removing
# `nr_credit_applications` (presumably because it is derived from the target
# itself -- TODO confirm). An earlier variant of this cell ran the same
# screening without dropping that column.
df_complete_data = df_complete_data.drop(columns=["nr_credit_applications"])
fimp = feature_importances()
(
    X_train_feature_selection,
    accepted_columns,
    assoc_result,
    disallowed_columns,
) = fimp.filter_out_features_based_on_statistical_approach(
    df_complete_data, [], "credit_application"
)
print(accepted_columns)
print(assoc_result["credit_application"].sort_values(ascending=False))

For the time being, delete rows where CRG is not present.

In [None]:
# Target distribution among the rows whose CRG is missing.
crg_missing = df_complete_data["CRG"].isna()
df_complete_data.loc[crg_missing, "credit_application"].value_counts()
# 0    5395
# 1     142

In [None]:
# Keep only rows with a known CRG. The explicit .copy() makes the result an
# independent frame, so the column assignment in a later cell
# (`credit_applied_before_ratio`) writes to this frame directly instead of to a
# filtered slice, which is what triggers pandas' SettingWithCopyWarning.
df_complete_data = df_complete_data[df_complete_data["CRG"].notna()].copy()

In [None]:
# Row/column count after removing the rows without a CRG.
df_complete_data.shape

#### To do : Missing value treatment for CRG

In [None]:
# Number of non-null `total_nr_trx` values per (client_nr, yearmonth, CRG) combination.
df_complete_data.groupby(["client_nr","yearmonth","CRG"])["total_nr_trx"].count().reset_index()

###### New features ideas
For each client-month, calculate the share of that client's earlier records in which a credit application was made.

In [None]:
def calculate_nr_credit_in_past(current_index):
    """Share of a client's earlier rows that carry a credit application.

    Reads the module-level ``df_complete_data``. "Earlier" means rows of the
    same ``client_nr`` whose index label is strictly smaller than
    ``current_index``, so the result relies on the frame's index order.

    Parameters
    ----------
    current_index
        Index label of the row to compute the feature for.

    Returns
    -------
    0 when the client has no earlier rows, otherwise the sum of
    ``credit_application`` over those rows divided by their non-null count.
    """
    owner = df_complete_data.loc[[current_index]]["client_nr"].values[0]
    same_client = df_complete_data["client_nr"] == owner
    earlier_rows = df_complete_data.index < current_index
    history = df_complete_data[same_client & earlier_rows]["credit_application"]

    n_earlier = history.count()
    if n_earlier == 0:
        return 0
    return history.sum() / n_earlier
    
    


In [None]:
# Evaluate calculate_nr_credit_in_past for every index label. Each call filters
# the whole frame, so the cost of this cell grows quadratically with the row count.
df_complete_data["credit_applied_before_ratio"]= df_complete_data.index.map(calculate_nr_credit_in_past)

 ###### Is_debit_more_average
 
 Flag whether volume_debit_trx is above the average volume_debit_trx for that client:
 1 if yes, otherwise 0.

In [None]:
# debit_average_per_client = df_complete_data.groupby(["client_nr"]).mean()["volume_debit_trx"].reset_index()
# df_complete_data["debit_average_per_client"] = df_complete_data["client_nr"].apply(lambda x: debit_average_per_client[debit_average_per_client["client_nr"]==x]["volume_debit_trx"].values[0])
# df_complete_data["Is_debit_more_average"]=np.where(df_complete_data["debit_average_per_client"] < df_complete_data["volume_debit_trx"],1,0)
# df_complete_data.drop(["debit_average_per_client"],axis=1,inplace=True)

#### Split data strategy

As part of the train/test split, 20% of the data is reserved as test data and is not seen by any model during training.
That amounts to approximately 6 months' worth of data.
Instead of splitting train/test data randomly, I decided to hold out the last 6 months as the test set, i.e. March 2016 to August 2016.

In [None]:
# Time-based hold-out: the six most recent months form the test set, everything
# else is training data.
holdout_months = [201608, 201607, 201606, 201605, 201604, 201603]
in_holdout = df_complete_data["yearmonth"].isin(holdout_months)

X_test = df_complete_data[in_holdout].copy()
y_test = X_test["credit_application"]

X_train_org = df_complete_data[~in_holdout].copy()
y_train_org = X_train_org["credit_application"]

# Random, stratified 80/20 split of the training months into fit / validation parts.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_org,
    y_train_org,
    test_size=0.2,
    random_state=42,
    stratify=y_train_org,
)

# Strip the target everywhere. `client_nr` is removed from the training frames
# only; X_test keeps it, and later cells select X_test[X_train.columns] before predicting.
non_feature_columns = ["credit_application", "client_nr"]
X_test = X_test.drop(columns=["credit_application"])
X_train_org = X_train_org.drop(columns=non_feature_columns)
X_train = X_train.drop(columns=non_feature_columns)
X_val = X_val.drop(columns=non_feature_columns)

##### 2. Using Lightgbm feature importance

In [None]:
from lightgbm import plot_importance
from sklearn.model_selection import GridSearchCV
# Hyperopt search space handed to feature_imp_lgbm. hp.quniform draws floats,
# so integer-valued parameters must be cast before they reach LightGBM
# (presumably what cast_params_to_proper_types from utils does -- TODO confirm).
params_scope = {
    'learning_rate': hp.uniform('learning_rate', 0.001, 0.1),
    'num_leaves': hp.quniform('num_leaves', 2, 15, 1),
    'n_estimators': hp.quniform('n_estimators', 10, 500, 10),
    'scale_pos_weight': hp.quniform('scale_pos_weight', 5, 100, 5),
    # The key used to be 'reg_lambda ' (trailing space), which does not match
    # LightGBM's `reg_lambda` parameter name, so the sampled L2 penalty could
    # not be applied under that name.
    'reg_lambda': hp.uniform('reg_lambda', 10.00, 100.0),
    'pos_bagging_fraction': hp.uniform('pos_bagging_fraction', 0.0, 1.0),
    'max_bin': hp.quniform('max_bin', 16, 256, 16)
}


In [None]:
# params_scope = {
# #                 'learning_rate': [x/1000 for x in range(1, 110,10)],
#                 'n_estimators': [x for x in range(10, 500,50)],
#                'num_leaves': [x for x in range(2, 15,2)],
#                'scale_pos_weight':[x for x in range(85, 100,5)],
#                'objective': ['binary'],
# #                'reg_lambda ': [x for x in range(10, 100,10)],
#                }

# import lightgbm as lgb
# model = lgb.LGBMClassifier(**params_scope)
# grid = GridSearchCV(model, param_grid=params_scope, verbose=1, cv=3, n_jobs=-1)
# # Run the grid
# grid.fit(X_train,y_train)

# # Print the best parameters found
# print(grid.best_params_)
# print(grid.best_score_)
# model = grid.best_estimator_
# feature_imp_df = pd.DataFrame(data={'feature_name':model.feature_name_,
#                    'feature_importance':model.feature_importances_},
            
#             )
# feature_imp_df.sort_values(by="feature_importance",ascending=False,inplace=True)

In [None]:
# Tune/fit LightGBM through the project helper, then tabulate the fitted model's
# feature importances, most important first.
model, initial_params = feature_imp_lgbm(X_train, y_train, X_val, y_val, params_scope)
feature_imp_df = pd.DataFrame(
    {
        "feature_name": model.feature_name_,
        "feature_importance": model.feature_importances_,
    }
).sort_values(by="feature_importance", ascending=False)

In [None]:
# Parameters returned by feature_imp_lgbm, followed by the sorted importance table.
print(initial_params)
feature_imp_df

In [None]:
import shap
# SHAP values of the fitted tree model on the validation rows, shown as a bar summary.
X_val_features = X_val[X_train.columns]
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_val_features)
shap.summary_plot(shap_values, X_val_features, plot_type="bar")

In [None]:
# import matplotlib.pyplot as plt
# from sklearn.model_selection import StratifiedKFold
# from sklearn.metrics import roc_auc_score,f1_score,confusion_matrix,precision_score,recall_score,roc_curve

# kfolds = StratifiedKFold(5)
# current_f1_scores=[]
# roc_auc_scores_val=[]
# precision_score_val=[]
# recall_score_val=[]
# fpr_scores=[]
# tpr_scores=[]
# mean_roc_auc_scores_val = []
# mean_precision_scores_val = []
# mean_recall_scores_val = []
# mean_f1_scores_val = []

# for train_idx ,val_index in kfolds.split(X_train_org,y_train_org):
#     model.fit(
#                 X_train_org.iloc[train_idx],
#                 y_train_org.iloc[train_idx],
#                eval_metric=lgb_f1_score
#             )
#     y_pred=np.where(model.predict_proba(X_train_org.iloc[val_index])[:,1] >0.4,1,0)
#     current_f1_scores.append(f1_score(y_train_org.iloc[val_index], y_pred))
#     tn, fp, fn, tp = confusion_matrix(y_train_org.iloc[val_index],y_pred).ravel()
#     print(confusion_matrix(y_train_org.iloc[val_index],y_pred))
#     roc_auc_scores_val.append(roc_auc_score(y_true=y_train_org.iloc[val_index], y_score=y_pred))
#     precision_score_val.append(precision_score(y_true=y_train_org.iloc[val_index],y_pred=y_pred))
#     recall_score_val.append(recall_score(y_true=y_train_org.iloc[val_index],y_pred=y_pred))
#     fpr_val, tpr_val, _ = roc_curve(y_true=y_train_org.iloc[val_index], y_score=y_pred)
#     fpr_scores.append(fpr_val)
#     tpr_scores.append(tpr_val)
    
    

In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score,f1_score,confusion_matrix,precision_score,recall_score,roc_curve

def cross_validation(model,X_train_org,y_train_org,threshold):
    """Evaluate ``model`` with 5-fold stratified cross-validation.

    For every fold the model is fitted on the training part and scored on the
    held-out part. F1, precision and recall use the hard labels obtained by
    comparing the positive-class probability with ``threshold``. ROC AUC and
    the ROC curve use the probabilities themselves: computing them from 0/1
    labels collapses the curve to a single operating point and makes the AUC
    depend on the threshold.

    NOTE: ``model.fit`` is called on the object passed in, so when this
    function returns, ``model`` holds the fit from the last fold only.

    Parameters
    ----------
    model
        Classifier exposing ``fit`` and ``predict_proba``.
    X_train_org, y_train_org
        Features and binary target; both are indexed positionally via ``.iloc``.
    threshold
        Probability above which a row is labelled positive.

    Returns
    -------
    tuple of six lists with one entry per fold, in this order: F1 scores,
    ROC AUC scores, precision scores, recall scores, false-positive-rate
    arrays, true-positive-rate arrays.
    """
    kfolds = StratifiedKFold(5)
    current_f1_scores=[]
    roc_auc_scores_val=[]
    precision_score_val=[]
    recall_score_val=[]
    fpr_scores=[]
    tpr_scores=[]

    for train_idx, val_index in kfolds.split(X_train_org, y_train_org):
        model.fit(X_train_org.iloc[train_idx], y_train_org.iloc[train_idx])

        y_true = y_train_org.iloc[val_index]
        y_score = model.predict_proba(X_train_org.iloc[val_index])[:, 1]
        y_pred = np.where(y_score > threshold, 1, 0)

        # Threshold-dependent metrics are computed from the hard labels.
        print(confusion_matrix(y_true, y_pred))
        current_f1_scores.append(f1_score(y_true, y_pred))
        precision_score_val.append(precision_score(y_true=y_true, y_pred=y_pred))
        recall_score_val.append(recall_score(y_true=y_true, y_pred=y_pred))

        # Ranking metrics are computed from the continuous scores.
        roc_auc_scores_val.append(roc_auc_score(y_true, y_score))
        fpr_val, tpr_val, _ = roc_curve(y_true=y_true, y_score=y_score)
        fpr_scores.append(fpr_val)
        tpr_scores.append(tpr_val)

    return current_f1_scores,roc_auc_scores_val,precision_score_val,recall_score_val,fpr_scores,tpr_scores

    

def plot_roc(fpr,tpr,roc_auc,color_ip):
    """Print an AUC value and draw one ROC curve on the current matplotlib axes.

    Parameters
    ----------
    fpr, tpr
        False- and true-positive rates of the curve.
    roc_auc
        AUC value, printed and used as the legend label.
    color_ip
        Matplotlib colour of the curve.

    The function does not call ``plt.show()``; it also draws the dashed
    chance diagonal and fixes the axis limits on every call.
    """
    print('ROC AUC=%0.2f'%roc_auc)

    axes = plt.gca()
    axes.plot(fpr, tpr, label='AUC=%0.2f'%roc_auc, color=color_ip)
    axes.legend(loc='lower right')
    axes.plot([0,1], [0,1], 'b--')  # chance line
    axes.set_xlim([0,1])
    axes.set_ylim([0,1.05])
    axes.set_xlabel('False Positive Rate')
    axes.set_ylabel('True Positive Rate')
    axes.grid(True)

In [None]:
# 5-fold CV of the LightGBM model with a 0.4 probability threshold.
# cross_validation calls model.fit on its argument, so `model` is refit in place by this cell.
current_f1_scores,roc_auc_scores_val,precision_score_val,recall_score_val,fpr_scores,tpr_scores=cross_validation(model,X_train_org,y_train_org,0.4)

In [None]:
# One ROC curve per CV fold.
color_list=["green","red","black","yellow","orange"]
# The loop variable must not be called `roc_auc_score`: that would rebind the
# function imported from sklearn.metrics to a float, and the next call to
# cross_validation (which looks the name up at call time) would fail with
# "object is not callable".
for fpr_val,tpr_val,fold_auc,color in zip(fpr_scores,tpr_scores,roc_auc_scores_val,color_list):
    plot_roc(fpr_val,tpr_val,fold_auc,color)


print('Mean ROC AUC score on validation data =%0.2f'%np.mean(roc_auc_scores_val))

In [None]:
def print_metrics(f1_scores,recall_scores,precision_scores,dataset="validation"):
    """Print per-fold F1 and recall scores followed by the mean F1, recall and precision.

    Parameters
    ----------
    f1_scores, recall_scores, precision_scores
        Sequences of per-fold scores.
    dataset
        Name inserted into every heading ("... on <dataset> data:").

    Sections are separated by a dashed rule; nothing is returned.
    """
    rule = "-----------------------------------------------------"
    sections = [
        ("F1 scores on ", f1_scores),
        ("Recall scores on ", recall_scores),
        ("Mean f1 score on ", np.mean(f1_scores)),
        ("Mean recall score on ", np.mean(recall_scores)),
        ("Mean precision score on ", np.mean(precision_scores)),
    ]
    for position, (heading, value) in enumerate(sections):
        if position > 0:
            print(rule)
        print(heading + dataset + " data:")
        print(value)

In [None]:
# Per-fold and mean CV metrics from the most recent cross_validation call.
print_metrics(current_f1_scores,recall_score_val,precision_score_val,"validation")

###### On test data

In [None]:
# Sweep the decision threshold for the LightGBM model on the held-out months.
# NOTE(review): picking the operating threshold from these test-set numbers
# leaks test information into model selection; consider sweeping on validation
# folds instead.

# The probabilities do not depend on the threshold, so score the test set once.
lgbm_test_proba = model.predict_proba(X_test[X_train.columns])[:,1]

# Integer steps give an exact 0.30 ... 0.60 grid in 0.05 increments. The former
# np.arange(0.3, 0.65, 0.051) drifted off that grid (its last threshold was 0.61).
for threshold in np.arange(30, 65, 5) / 100:
    threshold = round(float(threshold), 2)
    print("---------------THRESHOLD ="+str(threshold)+" ----------------------")

    y_test_pred=np.where(lgbm_test_proba > threshold,1,0)

    print(confusion_matrix(y_test,y_test_pred))
    print("F1 score on Test data:")
    print(f1_score(y_test,y_test_pred))
    print("-----------------------------------------------------")
    print("Recall score on Test data:")
    print(recall_score(y_test,y_test_pred))
    print("-----------------------------------------------------")
    print("Precision score on Test data:")
    print(precision_score(y_test,y_test_pred))

In [None]:
# LightGBM on the held-out months at a 0.4 probability threshold:
# confusion matrix, then F1 / recall / precision separated by dashed rules.
lgbm_positive_proba = model.predict_proba(X_test[X_train.columns])[:, 1]
y_test_pred = np.where(lgbm_positive_proba > 0.4, 1, 0)

print(confusion_matrix(y_test, y_test_pred))
test_metrics = [
    ("F1 score on Test data:", f1_score),
    ("Recall score on Test data:", recall_score),
    ("Precision score on Test data:", precision_score),
]
for position, (heading, metric) in enumerate(test_metrics):
    if position > 0:
        print("-----------------------------------------------------")
    print(heading)
    print(metric(y_test, y_test_pred))

In [None]:
### Save preds to csv file

# Attach the true labels and the current `y_test_pred` (the LightGBM predictions
# at threshold 0.4 when cells run in order) to X_test. These extra columns are
# why later cells select X_test[X_train.columns] before predicting.
X_test["ground_truth"] = y_test
X_test["pred_lgbm"]=y_test_pred
# X_test.to_csv("predictions_on_test_set_lgbm.csv")


###### SVM 

In [None]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler

In [None]:
# Fit the scaler on the training months only and standardise them.
# `std` keeps the fitted statistics; fit_transform returns a plain array.
std=StandardScaler()
Xformed_data = std.fit_transform(X_train_org)

In [None]:
# Restore the column names on the scaled array. The result gets a fresh default
# index, so it lines up with y_train_org by position only.
Xformed_data = pd.DataFrame(Xformed_data,columns=X_train_org.columns)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
def svm_hyper(X_train_transformed, y_train, n_iter=5, cv=3, random_state=42):
    """Tune an SVC with a randomized search scored by F1.

    Parameters
    ----------
    X_train_transformed
        Training features (the notebook passes standardised features).
    y_train
        Binary target.
    n_iter
        Number of parameter settings sampled from the grid (default 5, as before).
    cv
        Number of cross-validation folds (default 3, as before).
    random_state
        Seed for the parameter sampling, so repeated runs try the same
        candidates. Pass None for the former unseeded behaviour.

    Returns
    -------
    (best_estimator, fitted_search)
        The SVC refit on all of the supplied data with the best parameters, and
        the fitted RandomizedSearchCV object.
    """
    param_grid = {
        "C": [0.001, 0.1, 1, 10],
        "kernel": ['rbf', 'poly'],
    }
    print(param_grid)

    random_search = RandomizedSearchCV(
        estimator=SVC(),
        param_distributions=param_grid,
        n_iter=n_iter,
        scoring=['f1'],
        refit="f1",  # required because `scoring` is a list
        n_jobs=-1,
        cv=cv,
        random_state=random_state,
    )
    random_search.fit(X_train_transformed, y_train)
    return random_search.best_estimator_, random_search

In [None]:
# Tune the SVM on the standardised training months.
# NOTE: this rebinds `model`, which until now referred to the LightGBM model;
# any cell that uses `model` after this one gets the SVM.
model,random_search = svm_hyper(Xformed_data,y_train_org)

In [None]:
import pickle
# pickle.dump(model,open("output files/model_svm.pkl","wb"))
# Load the previously saved SVM. The `with` block closes the file handle, which
# the former bare open(...) never did.
# NOTE: pickle.load can execute arbitrary code -- only load files you created yourself.
with open("output files/model_svm.pkl","rb") as svm_file:
    model_svm = pickle.load(svm_file)


In [None]:
# The SVM search was run on StandardScaler-transformed features (`Xformed_data`),
# so the test features have to pass through the same fitted scaler (`std`) before
# prediction; raw features are on a different scale than the model was trained on.
# (Assumes the pickled `model_svm` is the model produced by that search -- TODO confirm.)
X_test_scaled = pd.DataFrame(
    std.transform(X_test[X_train_org.columns]),
    columns=X_train_org.columns,
    index=X_test.index,
)
y_test_pred = model_svm.predict(X_test_scaled)
# scikit-learn metrics take (y_true, y_pred); the swapped order used before
# transposed the confusion matrix relative to every other one in this notebook.
print(confusion_matrix(y_test, y_test_pred))
print(f1_score(y_test, y_test_pred))

###### Random forest 

In [None]:
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import cross_val_score,RandomizedSearchCV


In [None]:
# from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
# Search space for the randomized random-forest search.
# Number of trees: 100, 200, ..., 1000.
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]

# 'auto' was removed from scikit-learn's forest estimators and was identical to
# 'sqrt' for classifiers, so the old ['auto', 'sqrt'] list held one effective
# value; 'log2' gives the search a real alternative.
max_features = ['sqrt', 'log2']
# Tree depth: 10, 20, ..., 110, plus None for unlimited depth.
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4,10]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

random_grid

In [None]:
import warnings
# Randomized search for the balanced random forest, with all warnings silenced
# while it runs.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    # Seed the forest itself so a rerun rebuilds the same trees.
    model_rf = BalancedRandomForestClassifier(random_state=42)
    # scoring="f1": the default scorer is accuracy, which a model predicting
    # only the majority class already maximises on this imbalanced target.
    # F1 is also what the SVM search and the reported metrics use.
    rf_random = RandomizedSearchCV(
        estimator = model_rf,
        param_distributions = random_grid,
        n_iter = 10,
        scoring = "f1",
        cv = 3,
        verbose=2,
        random_state=42,
        n_jobs = -1,
    )
    rf_random.fit(X_train_org, y_train_org)


In [None]:
# Replace the untuned forest with the best estimator found by the search.
model_rf=rf_random.best_estimator_

In [None]:
# 5-fold CV of the tuned random forest with a 0.45 probability threshold, warnings silenced.
# cross_validation calls model.fit on its argument, so `model_rf` is refit in place here.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
        
    current_f1_scores,roc_auc_scores_val,precision_score_val,recall_score_val,fpr_scores,tpr_scores=cross_validation(model_rf,X_train_org,y_train_org,0.45)

In [None]:
# One ROC curve per CV fold of the random forest.
color_list=["green","red","black","yellow","orange"]
# The loop variable must not be called `roc_auc_score`: that would rebind the
# function imported from sklearn.metrics to a float and break any later call to
# cross_validation, which looks the name up at call time.
for fpr_val,tpr_val,fold_auc,color in zip(fpr_scores,tpr_scores,roc_auc_scores_val,color_list):
    plot_roc(fpr_val,tpr_val,fold_auc,color)


print('Mean ROC AUC score on validation data =%0.2f'%np.mean(roc_auc_scores_val))

In [None]:
# Per-fold and mean CV metrics from the most recent cross_validation call.
print_metrics(current_f1_scores,recall_score_val,precision_score_val,"validation")

In [None]:
# Sweep the decision threshold for the random forest on the held-out months.
# NOTE(review): picking the operating threshold from these test-set numbers
# leaks test information into model selection; consider sweeping on validation
# folds instead.

# The probabilities do not depend on the threshold, so score the test set once.
rf_test_proba = model_rf.predict_proba(X_test[X_train.columns])[:,1]

# Integer steps give an exact 0.30 ... 0.60 grid in 0.05 increments. The former
# np.arange(0.3, 0.65, 0.051) drifted off that grid (its last threshold was 0.61).
for threshold in np.arange(30, 65, 5) / 100:
    threshold = round(float(threshold), 2)
    print("---------------THRESHOLD ="+str(threshold)+" ----------------------")

    y_test_pred=np.where(rf_test_proba > threshold,1,0)

    print(confusion_matrix(y_test,y_test_pred))
    print("F1 score on Test data:")
    print(f1_score(y_test,y_test_pred))
    print("-----------------------------------------------------")
    print("Recall score on Test data:")
    print(recall_score(y_test,y_test_pred))
    print("-----------------------------------------------------")
    print("Precision score on Test data:")
    print(precision_score(y_test,y_test_pred))

In [None]:

# Random forest on the held-out months at a 0.5 probability threshold:
# confusion matrix, then F1 / recall / precision separated by dashed rules.
rf_positive_proba = model_rf.predict_proba(X_test[X_train.columns])[:, 1]
y_test_pred = np.where(rf_positive_proba > 0.5, 1, 0)

print(confusion_matrix(y_test, y_test_pred))
test_metrics = [
    ("F1 score on Test data:", f1_score),
    ("Recall score on Test data:", recall_score),
    ("Precision score on Test data:", precision_score),
]
for position, (heading, metric) in enumerate(test_metrics):
    if position > 0:
        print("-----------------------------------------------------")
    print(heading)
    print(metric(y_test, y_test_pred))

In [None]:
### Save preds to csv file

# X_test["ground_truth"] = y_test
# Store the random-forest predictions (threshold 0.5 when cells run in order)
# next to the columns added earlier and write the whole frame to disk.
X_test["pred_rf"]=y_test_pred
X_test.to_csv("output files/predictions_on_test_set_rf_threshold_0.5.csv")


# Persist the random forest. This used to dump `model`, which at this point is
# the SVM returned by svm_hyper, so model_rf.pkl did not contain the forest.
# The `with` block also makes sure the file is flushed and closed.
with open("output files/model_rf.pkl","wb") as rf_file:
    pickle.dump(model_rf, rf_file)

##### Unsupervised (Isolation forest)

In [None]:

from sklearn.ensemble import IsolationForest
from sklearn.model_selection import ParameterSampler
from sklearn.metrics import f1_score,roc_auc_score,confusion_matrix,recall_score,precision_score

In [None]:
# Unsupervised outlier detector fitted on the training months; the target is
# not used. random_state fixes the random feature/sample draws so the
# predictions, and the ensemble vote built on them, are reproducible.
model_iso = IsolationForest(
    contamination=0.03,
    max_features=0.1,
    max_samples=0.1,
    n_estimators=230,
    random_state=42,
)
model_iso.fit(X_train_org)

In [None]:
# Score the test months once; the former extra predict call computed the same
# labels and threw them away.
iso_labels = model_iso.predict(X_test[X_train.columns])
# IsolationForest labels outliers -1 and inliers +1; treat outliers as the positive class.
y_test_pred = [1 if x==-1 else 0 for x in iso_labels]
score = f1_score(y_test, y_test_pred)

In [None]:
# F1 (computed in the previous cell) and recall of the isolation-forest labels on the test months.
print(score)
print(recall_score(y_test, y_test_pred))

In [None]:
# Keep the isolation-forest labels next to the other models' predictions.
X_test["iso_pred"]=y_test_pred

In [None]:
# Majority vote: positive when at least two of the three models predict 1.
X_test["final_pred"]=np.where(X_test["pred_lgbm"]+X_test["pred_rf"]+X_test["iso_pred"]>1,1,0)
# Only the last expression of a cell is displayed, so the confusion matrix is
# printed explicitly; as a bare expression it was computed and discarded.
print(confusion_matrix(y_test,X_test["final_pred"]))
recall_score(y_test,X_test["final_pred"])

In [None]:
# F1, recall and precision of the majority-vote prediction, then its confusion matrix.
print(f1_score(X_test["ground_truth"],X_test["final_pred"]))
print(recall_score(X_test["ground_truth"],X_test["final_pred"]))
print(precision_score(X_test["ground_truth"],X_test["final_pred"]))
confusion_matrix(X_test["ground_truth"],X_test["final_pred"])

In [None]:
# Same metrics for the stored LightGBM predictions, for comparison with the ensemble vote.
print(f1_score(X_test["ground_truth"],X_test["pred_lgbm"]))
print(recall_score(X_test["ground_truth"],X_test["pred_lgbm"]))
print(precision_score(X_test["ground_truth"],X_test["pred_lgbm"]))
confusion_matrix(X_test["ground_truth"],X_test["pred_lgbm"])

In [None]:
# Write the test frame with every model's predictions and the ensemble vote.
X_test.to_csv("output files/ensemble_voting.csv")
# Persist the isolation forest; the `with` block makes sure the file is flushed
# and closed, which the former bare open(...) never did.
with open('output files/model_iso.pkl','wb') as iso_file:
    pickle.dump(model_iso, iso_file)