# Data inputs and Display Libraries



In [None]:

import pandas as pd
import numpy as np

pd.set_option('display.float_format', lambda x: '%.5f' % x)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'


In [None]:
#Installing Libraries
!pip install sweetviz
!pip install shap
!pip install unrar

# EDA Libraries

In [None]:
import sweetviz as sv
#!pip install sweetviz

# Data Preprocessing Libraries

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn.preprocessing import OrdinalEncoder


# Feature Selection & Modelling Libraries

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import pickle

# Metrics Libraries

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import f1_score
from matplotlib import pyplot
import matplotlib.pyplot as plt
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve


### Model Explanation

In [None]:
import shap

In [None]:
!git clone https://github.com/univai-ghf/Classificationworkshop

In [None]:
#! unzip -q 'workshop_classification/prep_file.rar' -d 'workshop_classification/prep_file.csv'
!unrar x 'Classificationworkshop/prep_file.rar'  'Classificationworkshop'

In [None]:
### Run till here

In [None]:
t1 = pd.read_csv("Classificationworkshop/prep_file.csv",sep=",")

# Step 1-  Look at the data

In [None]:
t1.head()

In [None]:
#![](./presentation/Capture1.png)


# Look at data - now to get the target variable distribution

In [None]:
t1["risk_flag"].value_counts()

In [None]:
t1["risk_flag"].value_counts(normalize=True)

# Look at data - listing string and numeric columns

In [None]:
# Names of the string (categorical) feature columns, read from str_cols.csv.
str_col_name_df = (
    pd.read_csv("Classificationworkshop/str_cols.csv")
    .set_axis(["index", "col_name"], axis=1)
)
str_col_name_list = str_col_name_df["col_name"].tolist()

# Names of the numeric feature columns, read from num_cols.csv.
num_col_name_df = (
    pd.read_csv("Classificationworkshop/num_cols.csv")
    .set_axis(["index", "col_name"], axis=1)
    .reset_index()
)
num_col_name_list = num_col_name_df["col_name"].tolist()

In [None]:
print (str_col_name_list)
print (num_col_name_list)

In [None]:
####Back to Slide

# EDA

In [None]:
sweet_report = sv.analyze([t1,"full_data"],target_feat="risk_flag")

In [None]:
sweet_report.show_html('sweet_report.html')

In [None]:
#5 mins to run the code

In [None]:
# Keep an untouched copy of the raw frame before normalising the text columns.
t0 = t1.copy()
for col in str_col_name_list:
    # Lower-case, trim surrounding whitespace, then drop every character that
    # is not a lowercase letter or whitespace.
    # regex=True is required: since pandas 2.0 `str.replace` treats the pattern
    # as a literal string by default, which would make this clean-up a no-op.
    # The raw string avoids the invalid "\s" escape-sequence warning.
    t1[col] = (
        t1[col]
        .str.lower()
        .str.strip()
        .str.replace(r"[^a-z\s]+", "", regex=True)
    )


In [None]:
####back to slide

# Train test split
### Before we do any preprocessing we want to keep train and test separate

In [None]:
# Hold out 33% of the rows for testing; random_state pins the shuffle so the split is reproducible.
# NOTE: the full frame t1 (which still contains the "risk_flag" target column) is passed as the
# feature set, so only explicitly selected feature columns of x_train / x_test should be fed to a model.
x_train, x_test, y_train, y_test = train_test_split(t1, t1["risk_flag"], test_size=0.33, random_state=42)

In [None]:
# Give both splits a fresh 0..n-1 row index; reset_index() keeps the previous
# row labels as an extra "index" column.
x_train0 = x_train.reset_index()
x_test0 = x_test.reset_index()

In [None]:
x_train0.shape, x_test0.shape

# Label encoding string variables -- baseline categorical approach

In [None]:


# Ordinal-encode the categorical columns. The encoder is fitted on the training
# split only and then applied unchanged to the test split.
# Categories that appear only in the test split are mapped to -1 instead of
# making `transform` raise a ValueError (the default handle_unknown="error").
enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

x_train_str= pd.DataFrame(enc.fit_transform(x_train0[str_col_name_list]))
x_test_str = pd.DataFrame(enc.transform(x_test0[str_col_name_list]))

In [None]:
viz1 = x_train_str.head()
viz1 = viz1.astype(int)
viz1.columns = str_col_name_list
viz1

In [None]:
x_train_str.shape

In [None]:
enc.categories_

# Concatenating Numeric and categorical

In [None]:
# Column-wise concat (axis=1) aligns rows on the index: the encoded frames were
# built from plain arrays and the numeric columns come from the reset-index
# frames, so both sides carry a 0..n-1 index.
df_all_train1 = pd.concat([x_train_str,x_train0[num_col_name_list]],axis=1)
df_all_test1 = pd.concat([x_test_str,x_test0[num_col_name_list]],axis=1)

In [None]:
sel_cols = str_col_name_list + num_col_name_list

In [None]:
##back to presentation

# Building Model

In [None]:

le = preprocessing.LabelEncoder()
y_train1 = le.fit_transform(y_train)
y_test1 = le.transform(y_test)

In [None]:
# Baseline gradient-boosted tree classifier.
# Class imbalance is handled through `scale_pos_weight` (weight applied to the
# positive class). The former `class_weights=[0.1, 0.9]` keyword was dropped:
# it is not an XGBClassifier parameter, so it had no effect on training and
# only produced an "unused parameter" warning.
xgb = XGBClassifier(n_estimators=300,max_depth= 5,subsample= 0.2,scale_pos_weight=6,
                    colsample_bytree= 0.3)
xgb.fit(df_all_train1,y_train1)


In [None]:
##back to presentation

# Measure

In [None]:
def cf_mat_conv(cf_mat):
    """Turn a raw confusion matrix into a labelled DataFrame.

    Each column is renamed to ``pred_<label>`` and an extra ``vals`` column
    holds the matching ``actual_<label>`` name for every row.
    """
    labelled = pd.DataFrame(cf_mat)

    # Both label sets are derived from the frame's original column labels.
    class_ids = [str(label) for label in labelled.columns]
    labelled.columns = ["pred_" + cid for cid in class_ids]
    labelled["vals"] = ["actual_" + cid for cid in class_ids]
    return labelled

In [None]:
def get_metrics1(mod1,test_set,actual1,fg):
    """Score a fitted classifier on a hold-out set and optionally plot diagnostics.

    Parameters
    ----------
    mod1 : str or fitted estimator
        Either the estimator itself or, for backward compatibility, the name
        of a variable holding it (e.g. ``"xgb"``). The estimator must expose
        ``predict`` and ``predict_proba``.
    test_set : features to predict on.
    actual1 : true labels for ``test_set``.
    fg : int
        When equal to 1, also draw the ROC curve, the precision-recall curve
        and the confusion matrix.

    Returns
    -------
    tuple
        ``(accuracy, labelled confusion-matrix DataFrame, ROC AUC,
        macro F1, average precision)``.
    """
    # NOTE(review): eval() executes arbitrary code - only pass trusted variable
    # names. Prefer passing the estimator object itself.
    mod = eval(mod1) if isinstance(mod1, str) else mod1

    pred = mod.predict(test_set)
    # Scores for the class at index 1 (second column of predict_proba).
    pred1 = mod.predict_proba(test_set)[:,1]

    ac1 = accuracy_score(actual1, pred)
    cf_mat_orig = confusion_matrix(actual1, pred, labels=None, sample_weight=None)
    # Convert a copy so the raw matrix stays available for plotting below.
    cf_mat1 = cf_mat_conv(cf_mat_orig.copy())

    fpr, tpr, _ = roc_curve(actual1, pred1)
    auc_pr = average_precision_score(actual1, pred1)
    auc1 = auc(fpr, tpr)
    f1scr = f1_score(actual1, pred, average='macro')

    if fg == 1:
        # ROC curve, with the chance diagonal drawn for reference.
        pyplot.plot([0, 1], [0, 1], linestyle='--')
        pyplot.plot(fpr, tpr, marker='.')
        pyplot.xlabel("False positive rate")
        pyplot.ylabel("True positive rate")
        pyplot.title("ROC curve")
        pyplot.show()

        # Precision-recall curve: recall on the x-axis, precision on the y-axis
        # (the two were previously swapped).
        precision, recall, _ = precision_recall_curve(actual1, pred1)
        pyplot.plot(recall, precision, marker='.')
        pyplot.xlabel("Recall")
        pyplot.ylabel("Precision")
        pyplot.title("Precision-recall curve")
        pyplot.show()

        cmd = ConfusionMatrixDisplay(cf_mat_orig)
        cmd.plot(values_format='')

    return ac1,cf_mat1,auc1,f1scr,auc_pr


In [None]:
print (get_metrics1("xgb",df_all_test1,y_test1,1))

In [None]:
##back to presentation

# Explaining the Model

In [None]:
# Work on an explicit copy so that renaming the columns for the SHAP plots
# cannot leak back into df_all_train1 (on older pandas versions
# pd.DataFrame(df) shares its axes with df, so assigning .columns renamed both).
df1_tr = df_all_train1.copy()
df1_tr.columns =sel_cols
# Compute per-feature SHAP values for the fitted tree model on the training data.
explainer = shap.TreeExplainer(xgb)
shap_values = explainer.shap_values(df1_tr)



In [None]:
shap.summary_plot(shap_values, df1_tr, plot_type="bar")

# Exercises

## One hot encoding

In [None]:
#OneHotEncoding the data
enc = OneHotEncoder(handle_unknown='ignore')

#use fit_transform method of onehotencoder to one hot encode categorical columns of x_train0 and x_test0

df_one_hot_tr = pd.DataFrame(.fit_transform(np.array(_____[str_col_name_list])).todense())
df_one_hot_te = pd.DataFrame(enc.transform(np.array(x_test0[_________])).todense())
colnames = enc.get_feature_names_out()

#assign column names of one hot dataframes to colnames
df_one_hot_tr.columns = ________
df_one_hot_te.columns = ________

In [None]:
# take a look at the one hot coded training dataframe using .head()
df_one_hot_tr.______()

In [None]:
# concatenate one hot encoded columns to the numerical columns of the train and test data [num_col_name_list] along axis =1
df_all_one_hot_train1 = pd.concat([df_one_hot_tr,x_train0[______________]],axis=__)
df_all_one_hot_test1 = pd.concat([df_one_hot_te,x_test0[______________]],axis=___)

In [None]:
#make a list of all selected columns adding list of colnames and num_col_name_list
sel_cols = list(colnames) + ___________

In [None]:
class_weights = [0.1,0.9]


#Create model instance of XGBClassifier with n_estimators =100, max_depth =5, subsample =0.2, 
# class_weights as class_weights we created above, scale_pos_weight =6, colsample_bytree = 0.3
xgb_ohe = XGBClassifier(n_estimators=____,max_depth= ___,subsample= ____,class_weights = class_weights,scale_pos_weight=___,
                    colsample_bytree= ___,n_jobs=6)
# fit the model on train data(df_all_one_hot_train1 and y_train1)
xgb_ohe.fit(________________,______)


In [None]:
print (get_metrics1("xgb_ohe",df_all_one_hot_test1,y_test1,1))

In [None]:
df1_tr_ohe = pd.DataFrame(df_all_one_hot_train1)
df1_tr_ohe.columns =sel_cols

#Lets do some XAI using SHAP input the model xgb_ohe in shap.treeexplainer
explainer_ohe = shap.TreeExplainer(_____)
#input the training dataframe df1_tr_ohe 
shap_values_ohe = explainer_ohe.shap_values(_______)

In [None]:
#Lets make a summary plot of shap_values_ohe, df1_tr_ohe with a "bar" plot
shap.summary_plot(____________, _________, plot_type="___")

## Median impute of missing Values

In [None]:
#### median impute of missing values

In [None]:
#create simpleimputer object for replacing nan values with mean [missing_values=np.nan, strategy='mean']
imp_mean = SimpleImputer(missing_values=______, strategy='_____')
#we can impute numerical columns by this Simple Imputer. So pass numerical columns of x_train0[num_col_name_list]
imp_mean.fit(x_train0[____________])
#transform both test and train data using imp_mean 
x_train_num = pd.DataFrame(imp_mean.transform(x_train0[_________]))
x_test_num = pd.DataFrame(imp_mean.transform(x_test0[____________]))

x_train_num.columns = num_col_name_list
x_test_num.columns = num_col_name_list

In [None]:
#concatenate the string columns and the numerical columns of Train and test data along axis =1
df_all_imp_train1 = pd.concat([x_train_str,_______],axis=___)
df_all_imp_test1 = pd.concat([x_test_str,________],axis=1)

In [None]:
class_weights = [0.1,0.9]

#Create model instance of XGBClassifier with n_estimators =300, max_depth =5, subsample =0.2, 
# class_weights as class_weights we created above, scale_pos_weight =6, colsample_bytree = 0.3

xgb_imp = XGBClassifier(n_estimators=300,max_depth= 5,subsample= 0.2,class_weights = class_weights,scale_pos_weight=6,
                    colsample_bytree= 0.3)
xgb_imp.fit(df_all_imp_train1,y_train1)

In [None]:
print (get_metrics1("xgb_imp",df_all_imp_test1,y_test1,1))

## Putting it all together

In [None]:
#Making a dataframe with the metrics : accuracy, auc, f1, auc_pr for the different models we created using different data sets

#make a list of datasets created : df_all_test1,df_all_one_hot_test1,df_all_imp_test1
df_datasets = [________,________,___________]

# loop over the models we created respectively in the list above: "xgb","xgb_ohe","xgb_imp"

for num,i in enumerate(["___","_______","___________"]):
    print (num)
    ac1,cf_mat1,auc1,f1scr,auc_pr = get_metrics1(i,df_datasets[num],y_test1,0)
    
    # make a data frame of selected metrics (ac1,auc1,f1scr,auc_pr)
    df_met = pd.DataFrame([(__,__,___,____)])
    #name the columns of data frame
    df_met.columns = ["accuracy","auc","f1","auc_pr"]
    df_met["model"] = i
    
    if(num==0):  
        df_met_all = df_met
    else:
        #concatenate the data frames to create one data frame
        df_met_all = pd.concat([df_met,df_met_all],axis=0)

In [None]:
#visualise the metrics data frame using .head()
df_met_all._____()

In [None]:
##back to presentation