# Kel 2: Avengers Team: E Commerce Churn Prediction 

## Stage 2 - Data Pre-Processing & Feature Engineering

In [105]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
# sklearn import for data pre-processing
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer, KNNImputer

# sklearn import for LogisticRegression and RandomForest algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# ignore warnings( not suggested for real-life projects)
import warnings
warnings.filterwarnings('ignore')

In [106]:
from matplotlib import rcParams

# Notebook-wide plotting defaults: wide figures, thick lines, large tick labels.
rcParams.update({
    'figure.figsize': (12, 6),
    'lines.linewidth': 3,
    'xtick.labelsize': 'x-large',
    'ytick.labelsize': 'x-large',
})

- <b>Data Exploration</b>

In [107]:
import os

# Print the full path of every file found under the Kaggle input directory.
for root, _dirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [108]:
pip install openpyxl

In [109]:
# Load the 'E Comm' sheet; all later stages work on a copy so `df` stays raw.
DATA_PATH = '/kaggle/input/ecommerce-customer-churn-analysis-and-prediction/E Commerce Dataset.xlsx'
df = pd.read_excel(DATA_PATH, sheet_name='E Comm')
df_stg_2 = df.copy()
df_stg_2.head()

In [110]:
# Numeric feature columns (used for distribution plots and outlier filtering).
nums = [
    'Tenure', 'CityTier', 'WarehouseToHome', 'HourSpendOnApp',
    'NumberOfDeviceRegistered', 'SatisfactionScore', 'NumberOfAddress',
    'Complain', 'OrderAmountHikeFromlastYear', 'CouponUsed', 'OrderCount',
    'DaySinceLastOrder', 'CashbackAmount',
]
# Categorical (string-valued) feature columns.
cats = [
    'PreferredLoginDevice', 'PreferredPaymentMode', 'Gender',
    'PreferedOrderCat', 'MaritalStatus',
]

In [7]:
# One KDE panel per numeric feature, laid out on a 3x5 grid.
features = nums
plt.figure(figsize=(30, 20))
for position, feature in enumerate(features, start=1):
    plt.subplot(3, 5, position)
    sns.kdeplot(x=df_stg_2[feature], color='salmon')
    plt.xlabel(feature)

In [8]:
# One count plot per categorical feature, laid out on a 3x3 grid.
plt.figure(figsize=(17, 10))
for position, cat_col in enumerate(cats, start=1):
    plt.subplot(3, 3, position)
    sns.countplot(x=df_stg_2[cat_col], color='salmon', orient='v')
    plt.xticks(fontsize=9, rotation=45)
    plt.tight_layout()

+ <b> Feature Target <b>

In [111]:
# Row count per value of the target column 'Churn'.
df_stg_2['Churn'].value_counts()

### <b> 1. Data Cleansing

<b> A. Handle missing values <br>    

In [112]:
# Number of missing values per column, largest first.
df_stg_2.isna().sum().sort_values(ascending=False)

In [113]:
# Collect every column whose unique values contain a blank string " " or a NaN.
missing_value_columns = []

for column_name in df_stg_2.columns:
    unique_values = df_stg_2[column_name].unique().tolist()
    # NaN is detected through its string form because NaN != NaN.
    has_blank = " " in unique_values
    has_nan = any(str(value) == "nan" for value in unique_values)
    if has_blank or has_nan:
        missing_value_columns.append(column_name)

print(missing_value_columns)

In [114]:
# Share of missing values per column, in percent, largest first.
null_counts = df_stg_2.isnull().sum()
percentage_of_null_values = (null_counts * 100 / len(df_stg_2)).sort_values(ascending=False)
percentage_of_null_values

<B> B. Handle duplicated data

In [115]:
# Number of fully duplicated rows.
df_stg_2.duplicated().sum()

Jika data missing <5% sebetulnya bisa kita abaikan saja jika tidak memiliki konteks lain

Strategi dalam imputasi data null values:
1. `Tenure` distribusi datanya right skewed, kita menggunakan median
2. `WarehouseToHome` distribusi datanya juga right skewed, kita menggunakan median
3. `HourSpendOnApp` distribusi datanya berbentuk trimodal, kita menggunakan median
4. `OrderAmountHikeFromlastYear` distribusi datanya right skewed, kita menggunakan median
5. `CouponUsed` distribusi datanya right skewed, kita menggunakan median
6. `OrderCount` distribusi datanya right skewed, kita menggunakan median
7. `DaySinceLastOrder` distribusi datanya right skewed, kita menggunakan median

In [116]:
# Median-impute the numeric columns that contain missing values.
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form operates on a
# temporary object and does not update the DataFrame under pandas Copy-on-Write.
median_impute_cols = [
    'Tenure',
    'WarehouseToHome',
    'HourSpendOnApp',
    'OrderAmountHikeFromlastYear',
    'CouponUsed',
    'OrderCount',
    'DaySinceLastOrder',
]
for col in median_impute_cols:
    df_stg_2[col] = df_stg_2[col].fillna(df_stg_2[col].median())

In [117]:
# Re-check missing values per column after imputation.
df_stg_2.isna().sum()

+ <b>Replace Some Columns

In [118]:
# Inspect the raw payment-mode labels before harmonising them.
df_stg_2['PreferredPaymentMode'].value_counts()

In [119]:
# Expand the abbreviated payment-mode labels to their long form.
payment_aliases = {'COD': 'Cash on Delivery', 'CC': 'Credit Card'}
df_stg_2['PreferredPaymentMode'] = df_stg_2['PreferredPaymentMode'].replace(payment_aliases)
df_stg_2['PreferredPaymentMode'].value_counts()

In [120]:
# Inspect the raw login-device labels before harmonising them.
df_stg_2['PreferredLoginDevice'].value_counts()

In [121]:
# Relabel 'Phone' as 'Mobile Phone' so both labels fall into one category.
login_device = df_stg_2['PreferredLoginDevice']
df_stg_2['PreferredLoginDevice'] = login_device.replace('Phone', 'Mobile Phone')
df_stg_2['PreferredLoginDevice'].value_counts()

In [122]:
# Inspect the raw order-category labels before harmonising them.
df_stg_2['PreferedOrderCat'].value_counts()

In [123]:
# Relabel 'Mobile' as 'Mobile Phone' so both labels fall into one category.
order_cat = df_stg_2['PreferedOrderCat']
df_stg_2['PreferedOrderCat'] = order_cat.replace('Mobile', 'Mobile Phone')
df_stg_2['PreferedOrderCat'].value_counts()

<b> C. Handle outliers <br>

In [95]:
# One vertical boxplot per numeric feature, laid out on a 3x7 grid.
plt.figure(figsize=(30, 20))
features = nums
for position, feature in enumerate(features, start=1):
    plt.subplot(3, 7, position)
    sns.boxplot(y=df_stg_2[feature], color='salmon', orient='v')
    plt.xlabel(feature)

In [96]:
# Summary statistics of the numeric columns.
df_stg_2.describe()

In [124]:
# Check for outliers: a single boxplot over all int64/float64 columns.
numeric_dtypes = ['int64', 'float64']
df_outliers = df_stg_2.select_dtypes(include=numeric_dtypes)

plt.figure(figsize=(12, 5))
sns.boxplot(data=df_outliers, orient='v')
plt.show()

#### <b> Handling 
Kita akan menggunakan Z-score filtering

In [125]:
from scipy import stats

print(f'Jumlah baris sebelum memfilter outlier: {len(df_stg_2)}')

# Start with every row kept, then drop rows whose absolute z-score is 3 or more
# in any of the numeric columns.
keep_mask = np.ones(len(df_stg_2), dtype=bool)
for column in nums:
    abs_z = np.abs(np.asarray(stats.zscore(df_stg_2[column])))
    keep_mask = keep_mask & (abs_z < 3)

filtered_entries = keep_mask
df_stg_2 = df_stg_2[filtered_entries]  # keep only rows with |z| < 3 everywhere

print(f'Jumlah baris setelah memfilter outlier: {len(df_stg_2)}')

<b> D. Feature encoding

+ <b>Strategi encoding </b><br>
1. `PreferredLoginDevice` , `PreferredPaymentMode`, `PreferedOrderCat` & `MaritalStatus` \: One Hot Encoding 
2. `Gender`: label encoding/ordinal encoding
3. `CityTier` dan `SatisfactionScore` sudah berbentuk label encoding/ordinal encoding

In [126]:
# Preview the frame before encoding the categorical columns.
df_stg_2.head()

In [127]:
# One-hot encode the nominal categorical columns.
one_hot_columns = ['PreferredLoginDevice', 'PreferredPaymentMode', 'PreferedOrderCat', 'MaritalStatus']
df_stg_2 = pd.get_dummies(df_stg_2, columns=one_hot_columns)
df_stg_2

In [128]:
# Encode Gender as an integer: Male -> 0, Female -> 1.
mapping_Gender = dict(Male=0, Female=1)
df_stg_2['Gender'] = df_stg_2['Gender'].map(mapping_Gender)


In [129]:
# Column dtypes and non-null counts after encoding.
df_stg_2.info()

<b> E. Feature transformation</b> <br> 
Normalization/Standardization dan log transform

 <b>Strategi transformasi data : </b><br>
1. normalisasi = `Tenure`, `HourSpendOnApp`, `NumberOfAddress`, `CouponUsed`, `DaySinceLastOrder`, `CashbackAmount`
2. log transform = `WarehouseToHome`, `NumberOfDeviceRegistered`, `OrderAmountHikeFromlastYear`, `OrderCount`

In [130]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# (source column, transform) pairs, listed in the order the derived columns are
# appended to the frame: 'norm' -> min-max scaling, 'log' -> natural logarithm.
transform_plan = [
    ('Tenure', 'norm'),
    ('WarehouseToHome', 'log'),
    ('HourSpendOnApp', 'norm'),
    ('NumberOfDeviceRegistered', 'log'),
    ('NumberOfAddress', 'norm'),
    ('OrderAmountHikeFromlastYear', 'log'),
    ('CouponUsed', 'norm'),
    ('OrderCount', 'log'),
    ('DaySinceLastOrder', 'norm'),
    ('CashbackAmount', 'norm'),
]

for source_col, kind in transform_plan:
    if kind == 'norm':
        # MinMaxScaler expects a 2-D array, hence the (n_rows, 1) reshape.
        column_2d = df_stg_2[source_col].values.reshape(len(df_stg_2), 1)
        df_stg_2[f'{source_col}_norm'] = MinMaxScaler().fit_transform(column_2d)
    else:
        df_stg_2[f'{source_col}_log'] = np.log(df_stg_2[source_col])

In [131]:
# Check that the feature transformation worked (new *_norm / *_log columns).
df_stg_2.describe()

In [162]:
# Same summary with one row per column, which is easier to read for a wide frame.
df_stg_2.describe().T

<b>Drop kolom-kolom yang outdated dan tidak digunakan</b>

In [163]:
# Summary statistics before dropping columns (`decsrip` was a misspelling of
# `describe` and raised AttributeError).
df_stg_2.describe()

In [133]:
# Drop the identifier, Gender, and the raw columns that now have a transformed
# (*_norm / *_log) counterpart.
outdated_columns = [
    'CustomerID', 'Gender', 'WarehouseToHome', 'Tenure', 'HourSpendOnApp',
    'NumberOfDeviceRegistered', 'NumberOfAddress', 'OrderAmountHikeFromlastYear',
    'CouponUsed', 'OrderCount', 'DaySinceLastOrder', 'CashbackAmount',
]
df_stg3 = df_stg_2.drop(columns=outdated_columns)
df_stg3.info()

In [134]:
# Summary statistics of the reduced frame.
df_stg3.describe()

In [135]:
# Column dtypes and non-null counts of the reduced frame.
df_stg3.info()

<b> F. Handle class imbalance

In [136]:
# Rebuild df_stg3 from df_stg_2, dropping only the identifier and Gender.
# NOTE(review): this replaces the earlier df_stg3, so the raw columns removed
# there are present again next to their *_norm / *_log versions. Confirm this is intended.
df_stg3 = df_stg_2.drop(columns=['CustomerID', 'Gender'])

In [137]:
# Column dtypes and non-null counts of the rebuilt modelling frame.
df_stg3.info()

In [138]:
# Class counts of the target before any resampling.
df_stg3['Churn'].value_counts()

In [139]:
# Separate features from target: every non-object column except 'Churn' is a feature.
feature_columns = [
    name for name in df_stg3.columns
    if name not in ['Churn'] and str(df_stg3[name].dtype) != 'object'
]
X = df_stg3[feature_columns]
y = df_stg3['Churn'].values
print(X.shape)
print(y.shape)

<b>Catatan:</b> Data test tidak perlu di-SMOTE. Semua dataset mendapat perlakuan pre-processing yang sama, kecuali tahap balancing yang hanya diterapkan pada data train.

In [140]:
from imblearn import under_sampling, over_sampling

# Compare three balancing strategies on the same features/target.
# `sampling_strategy` is passed by keyword because current imbalanced-learn
# releases reject positional constructor arguments; 1.0 requests a 1:1
# minority/majority ratio. A fixed random_state makes the resampling reproducible.
X_under, y_under = under_sampling.RandomUnderSampler(sampling_strategy=1.0, random_state=42).fit_resample(X, y)
X_over, y_over = over_sampling.RandomOverSampler(sampling_strategy=1.0, random_state=42).fit_resample(X, y)
X_over_SMOTE, y_over_SMOTE = over_sampling.SMOTE(sampling_strategy=1.0, random_state=42).fit_resample(X, y)

In [141]:
# Print the class counts of the target under each balancing strategy.
resampled_targets = [
    ('Original', y),
    ('UNDERSAMPLING', y_under),
    ('OVERSAMPLING', y_over),
    ('SMOTE', y_over_SMOTE),
]
for label, target in resampled_targets:
    print(label)
    print(pd.Series(target).value_counts())

<b> Kesimpulan: </b><br>
- Kita akan memakai metode SMOTE dalam menghandle imbalance data yang ada pada target

## Stage 3 - ML MODELING

#### <b> A. Split Train & Test Data

In [142]:
# From here on X / y refer to the SMOTE-resampled data.
X, y = X_over_SMOTE, y_over_SMOTE

In [143]:
from sklearn.metrics import precision_score, recall_score, roc_auc_score

def eval_classification(model, X_train, y_train, X_test, y_test):
    """Print precision, recall and ROC-AUC of a fitted classifier on both splits.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.

    Returns
    -------
    None. The metrics are printed.
    """
    y_pred = model.predict(X_test)
    y_pred_train = model.predict(X_train)

    print("Precision (Train Set): %.4f" % precision_score(y_train, y_pred_train))
    # Compare the labels against the predictions; scoring y_test against itself
    # always reported a precision of 1.0.
    print("Precision (Test Set): %.4f" % precision_score(y_test, y_pred))
    print("Recall (Train Set): %.4f" % recall_score(y_train, y_pred_train))
    print("Recall (Test Set): %.4f" % recall_score(y_test, y_pred))

    # AUC is computed from the predicted probability of the positive class (column 1).
    y_pred_proba = model.predict_proba(X_test)
    y_pred_train_proba = model.predict_proba(X_train)
    print("AUC (Test Set): %.4f" % roc_auc_score(y_test, y_pred_proba[:, 1]))
    print("AUC (Train Set): %.4f" % roc_auc_score(y_train, y_pred_train_proba[:, 1]))
    
def show_feature_importance(model, feature_names=None):
    """Plot the (up to) 35 largest feature importances of a fitted model.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``.
    feature_names : optional sequence of names, one per feature. When omitted,
        the model's own ``feature_names_in_`` is used if it has one; otherwise
        the columns of the notebook-level ``X`` are used, as before.
    """
    if feature_names is None:
        feature_names = getattr(model, 'feature_names_in_', None)
    if feature_names is None:
        feature_names = X.columns

    feat_importances = pd.Series(model.feature_importances_, index=feature_names)
    ax = feat_importances.nlargest(35).plot(kind='barh', figsize=(10, 8))
    # Largest importance at the top of the horizontal bar chart.
    ax.invert_yaxis()

    plt.xlabel('score')
    plt.ylabel('feature')
    plt.title('feature importance score')
    
def show_best_hyperparameter(model, hyperparameter):
    """Print the value the model holds for every key of the search space.

    Parameters
    ----------
    model : estimator exposing ``get_params()``.
    hyperparameter : dict whose keys are the parameter names to report.
    """
    # Iterate the argument that was passed in; the previous version silently
    # read the notebook-level `hyperparameters` variable instead.
    params = model.get_params()
    for key in hyperparameter:
        print('Best' + key + ':', params[key])

In [144]:
# Split first, then balance ONLY the training portion.
# Splitting data that has already been SMOTE-resampled puts synthetic samples
# (interpolated from training-side neighbours) into the test set, which leaks
# information and inflates every test metric.
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Rebuild the un-resampled features/target from df_stg3 with the same column
# selection used when X / y were first created.
feature_columns = [
    col for col in df_stg3.columns
    if (str(df_stg3[col].dtype) != 'object') and col not in ['Churn']
]
X_original = df_stg3[feature_columns]
y_original = df_stg3['Churn'].values

# Stratify so the test set keeps the real churn ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X_original, y_original, test_size=0.3, random_state=42, stratify=y_original
)

# Oversample the minority class in the training data only.
X_train, y_train = SMOTE(sampling_strategy=1.0, random_state=42).fit_resample(X_train, y_train)

### <b>Logistic Regression

In [145]:
from sklearn.linear_model import LogisticRegression

# Baseline logistic regression with default hyperparameters.
lr = LogisticRegression(random_state=42).fit(X_train, y_train)

eval_classification(lr, X_train, y_train, X_test, y_test)

+ <b> Hyperparameter Tuning

In [146]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Hyperparameter search space.
penalty = ['l2', 'l1', 'elasticnet']
# Inverse of regularization strength; smaller values mean stronger regularization.
C = [0.0001, 0.001, 0.002, 0.03, 0.4, 0.9, 2, 4, 5]
hyperparameters = dict(penalty=penalty, C=C)

# The default 'lbfgs' solver only supports the 'l2' penalty, so every 'l1' and
# 'elasticnet' candidate failed to fit and scored NaN. 'saga' supports all three
# penalties; l1_ratio is required by 'elasticnet' and ignored by the others.
# max_iter is raised because saga converges more slowly than lbfgs.
lr = LogisticRegression(random_state=42, solver='saga', l1_ratio=0.5, max_iter=1000)
# Randomized search with 5-fold cross-validation, optimising recall.
lr_tunned = RandomizedSearchCV(lr, hyperparameters, cv=5, random_state=42, scoring='recall')

# Fit and evaluate.
lr_tunned.fit(X_train, y_train)
eval_classification(lr_tunned, X_train, y_train, X_test, y_test)

In [147]:
# Report the tuned values of the searched logistic-regression hyperparameters.
show_best_hyperparameter(lr_tunned.best_estimator_, hyperparameters)

In [148]:
# score() of the search object uses its `scoring` metric (recall).
lr_train_score = lr_tunned.score(X_train, y_train)
print(f'Train score:{lr_train_score}')
lr_test_score = lr_tunned.score(X_test, y_test)
print(f'Test score:{lr_test_score}')

### <b> K-nearest Neighbor

In [49]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline k-nearest-neighbours classifier with default hyperparameters.
knn = KNeighborsClassifier().fit(X_train, y_train)

eval_classification(knn, X_train, y_train, X_test, y_test)

+ <b>Hyperparameter Tuning with RandomSearch

In [50]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from sklearn.neighbors import KNeighborsClassifier

# Search space: neighbourhood size 1..49, Minkowski power (1 or 2),
# and the neighbour-search algorithm.
n_neighbors = [k for k in range(1, 50)]
p = [1, 2]
algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']
hyperparameters = {
    'n_neighbors': n_neighbors,
    'p': p,
    'algorithm': algorithm,
}

# Randomized search with 5-fold cross-validation, optimising recall.
knn = KNeighborsClassifier()
knn_tunned = RandomizedSearchCV(
    knn,
    hyperparameters,
    cv=5,
    random_state=42,
    scoring='recall',
)

# Fit and evaluate.
knn_tunned.fit(X_train, y_train)
eval_classification(knn_tunned, X_train, y_train, X_test, y_test)

In [51]:
# Report the tuned values of the searched KNN hyperparameters.
show_best_hyperparameter(knn_tunned.best_estimator_,hyperparameters)

In [52]:
# score() of the search object uses its `scoring` metric (recall).
knn_train_score = knn_tunned.score(X_train, y_train)
print(f'Train score:{knn_train_score}')
knn_test_score = knn_tunned.score(X_test, y_test)
print(f'Test score:{knn_test_score}')

### <b> Decision Tree

In [53]:
from sklearn.tree import DecisionTreeClassifier

# Baseline decision tree with default hyperparameters.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

eval_classification(dt, X_train, y_train, X_test, y_test)

+ <b>Hyperparameter Tuning with RandomSearch

In [54]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
import numpy as np

# Search space.
max_depth = [int(x) for x in np.linspace(1, 110, num=20)]  # maximum number of levels in the tree
# An integer min_samples_split must be at least 2; the previous range started at 1,
# so any candidate that drew 1 failed to fit.
min_samples_split = list(range(2, 50))  # minimum number of samples required to split a node
min_samples_leaf = list(range(1, 50))   # minimum number of samples required at each leaf node
# 'auto' was an alias of 'sqrt' for classifiers and has been removed from
# scikit-learn, so it is replaced by the explicit options.
max_features = ['sqrt', 'log2']         # number of features considered at every split

hyperparameters = dict(max_depth=max_depth,
                       min_samples_split=min_samples_split,
                       min_samples_leaf=min_samples_leaf,
                       max_features=max_features
                      )

# Randomized search with 5-fold cross-validation, optimising recall.
dt = DecisionTreeClassifier(random_state=42)
dt_tunned = RandomizedSearchCV(dt, hyperparameters, cv=5, random_state=42, scoring='recall')
dt_tunned.fit(X_train, y_train)

# Predict & evaluate.
eval_classification(dt_tunned, X_train, y_train, X_test, y_test)

In [55]:
# Report the tuned values of the searched decision-tree hyperparameters.
show_best_hyperparameter(dt_tunned.best_estimator_,hyperparameters)

In [56]:
# score() of the search object uses its `scoring` metric (recall).
dt_train_score = dt_tunned.score(X_train, y_train)
print(f'Train score: {dt_train_score}')
dt_test_score = dt_tunned.score(X_test, y_test)
print(f'Test score: {dt_test_score}')

### <b> Bagging: Random Forest

In [149]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest with default hyperparameters.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

eval_classification(rf, X_train, y_train, X_test, y_test)

+ <b> Hyperparameter Tuning with RandomSearch

In [58]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Hyperparameter search space.
hyperparameters = dict(
                        n_estimators = [int(x) for x in np.linspace(start = 100, stop= 5000, num = 20)], # number of trees
                        bootstrap = [False], # whether to bootstrap the samples for each tree
                        criterion = ['gini','entropy'],
                        max_depth = [int(x) for x in np.linspace(10,20, num = 11)], # maximum tree depth
                        min_samples_split = [int(x) for x in np.linspace(start = 2, stop = 30, num = 5)], # minimum samples in a node for it to be split
                        min_samples_leaf = [int(x) for x in np.linspace(start = 1, stop = 30, num = 5)], # minimum samples required in a leaf
                        # 'auto' was an alias of 'sqrt' for classifiers and has been removed
                        # from scikit-learn, so only the explicit options are searched.
                        max_features = ['sqrt','log2'], # number of features considered at each split
                        n_jobs = [-1] # cores for parallel computation; -1 uses all of them
                     )

# Randomized search with 5-fold cross-validation, optimising recall.
rf = RandomForestClassifier(random_state=42)
rf_tuned = RandomizedSearchCV(rf,hyperparameters, cv=5, random_state=42, scoring='recall')
rf_tuned.fit(X_train,y_train)

# Predict & evaluate.
eval_classification(rf_tuned, X_train,y_train, X_test,y_test)

In [59]:
# Report the tuned values of the searched random-forest hyperparameters.
show_best_hyperparameter(rf_tuned.best_estimator_,hyperparameters)

In [60]:
# score() of the search object uses its `scoring` metric (recall).
rf_train_score = rf_tuned.score(X_train, y_train)
print(f"Train score :{rf_train_score}")
rf_test_score = rf_tuned.score(X_test, y_test)
print(f"Test score :{rf_test_score}")

In [61]:
# Plot the feature importances of the best random forest found by the search.
show_feature_importance(rf_tuned.best_estimator_)

<b> Confusion Matrix

In [150]:
# Class predictions of the tuned random forest on both splits.
y_pred = rf_tuned.predict(X_test)
y_pred_train = rf_tuned.predict(X_train)

In [151]:
from sklearn.metrics import confusion_matrix

# Confusion matrix on the test split (rows: actual, columns: predicted).
cf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

print(cf_matrix)

In [152]:
# Annotated heatmap of the test-split confusion matrix.
test_matrix = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(test_matrix, annot=True, linewidths=3, cbar=False)
ax.set_title('Confusion Matrix')
ax.set_ylabel('Actual')
ax.set_xlabel('Prediction')
plt.show()

In [153]:
# Confusion matrix on the training split (rows: actual, columns: predicted).
cf_matrix1 = confusion_matrix(y_true=y_train, y_pred=y_pred_train)

print(cf_matrix1)

In [154]:
# Annotated heatmap of the training-split confusion matrix.
train_matrix = confusion_matrix(y_train, y_pred_train)
ax = sns.heatmap(train_matrix, annot=True, linewidths=3, cbar=False)
ax.set_title('Confusion Matrix')
ax.set_ylabel('Actual')
ax.set_xlabel('Prediction')
plt.show()

In [155]:
# Fit the plain `rf` estimator and keep its test predictions as a DataFrame.
rf.fit(X_train, y_train)
y_pred_test = pd.DataFrame(rf.predict(X_test))

In [156]:
# Training-split predictions as a DataFrame. The previous version wrapped
# `y_pred_test` here, so the "train" predictions were actually the test ones.
y_pred_train = rf.predict(X_train)
y_pred_train = pd.DataFrame(y_pred_train)

In [157]:
# Make sure both feature sets are DataFrames before they are concatenated.
X_train=pd.DataFrame(X_train)
X_test=pd.DataFrame(X_test)

In [158]:
# Wrap the targets in DataFrames so they can be concatenated with the features.
# NOTE(review): y_train / y_test stay DataFrames for every model fitted after
# this cell. Confirm that is intended.
y_train=pd.DataFrame(y_train)
y_test=pd.DataFrame(y_test)

In [159]:
# Assemble features, true label and predicted label for each split.
# pd.concat(axis=1) aligns on the index, and the pieces do not share one (the
# split features keep their original row labels while the label frames start
# at 0), so each piece is re-indexed positionally first. The label columns are
# also given explicit names instead of both being called 0.
train_parts = [
    X_train,
    y_train.set_axis(['Churn'], axis=1),
    y_pred_train.set_axis(['Churn_pred'], axis=1),
]
test_parts = [
    X_test,
    y_test.set_axis(['Churn'], axis=1),
    y_pred_test.set_axis(['Churn_pred'], axis=1),
]
df_train = pd.concat([part.reset_index(drop=True) for part in train_parts], axis=1)
df_test = pd.concat([part.reset_index(drop=True) for part in test_parts], axis=1)
# Stack the two splits with a fresh index so row labels stay unique.
df_model = pd.concat([df_train, df_test], axis=0, ignore_index=True)
df_model.info()

### <B>Boosting: AdaBoost

In [160]:
from sklearn.ensemble import AdaBoostClassifier

# Baseline AdaBoost classifier with default hyperparameters.
ab = AdaBoostClassifier(random_state=42).fit(X_train, y_train)

eval_classification(ab, X_train, y_train, X_test, y_test)

+ <B>Hyperparameter Tuning with RandomSearch

In [161]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import numpy as np

# Search space: number of boosting iterations, learning rate, boosting algorithm.
# NOTE(review): 'SAMME.R' may not be accepted by recent scikit-learn releases.
# Verify against the installed version.
n_estimators_grid = np.linspace(start=50, stop=2000, num=2000).astype(int).tolist()
learning_rate_grid = np.linspace(start=0.001, stop=0.1, num=200).tolist()
hyperparameters = {
    'n_estimators': n_estimators_grid,
    'learning_rate': learning_rate_grid,
    'algorithm': ['SAMME', 'SAMME.R'],
}

# Randomized search with 5-fold cross-validation, optimising recall.
ab = AdaBoostClassifier(random_state=42)
ab_tuned = RandomizedSearchCV(
    ab,
    hyperparameters,
    random_state=42,
    cv=5,
    scoring='recall',
)
ab_tuned.fit(X_train, y_train)

# Predict & evaluate.
eval_classification(ab_tuned, X_train, y_train, X_test, y_test)

In [None]:
# Report the tuned values of the searched AdaBoost hyperparameters.
show_best_hyperparameter(ab_tuned.best_estimator_, hyperparameters)

In [None]:
# score() of the search object uses its `scoring` metric (recall).
ab_train_score = ab_tuned.score(X_train, y_train)
print(f'Train score: {ab_train_score}')
ab_test_score = ab_tuned.score(X_test, y_test)
print(f'Test score:{ab_test_score}')

### <b>Boosting: XGBoost

In [164]:
from xgboost import XGBClassifier, XGBRegressor

# Baseline XGBoost classifier with default hyperparameters.
xg = XGBClassifier(random_state=42).fit(X_train, y_train)

eval_classification(xg, X_train, y_train, X_test, y_test)

+ <b> Hyperparameter Tuning with RandomSearch

In [165]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import numpy as np

# Hyperparameter search space as a dictionary.
hyperparameters = {
                    'max_depth' : [int(x) for x in np.linspace(10, 200, num = 9)],
                    'min_child_weight' : [int(x) for x in np.linspace(1, 40, num = 9)],
                    'gamma' : [float(x) for x in np.linspace(0, 1, num = 9)],
                    'tree_method' : ['auto', 'exact', 'approx', 'hist'],

                    # colsample_bytree is documented with range (0, 1]; the leading 0 of
                    # the grid is dropped so no candidate asks for "no columns".
                    'colsample_bytree' : [float(x) for x in np.linspace(0, 1, num = 9)[1:]],
                    # eta = 0 would shrink every boosting step to nothing, so the
                    # leading 0 is dropped here as well.
                    'eta' : [float(x) for x in np.linspace(0, 1, num = 100)[1:]],

                    'lambda' : [float(x) for x in np.linspace(0, 1, num = 9)],
                    'alpha' : [float(x) for x in np.linspace(0, 1, num = 9)]
                    }

# Randomized search with 5-fold cross-validation, optimising recall.
xg = XGBClassifier(random_state=42)
xg_tuned = RandomizedSearchCV(xg, hyperparameters, cv=5, random_state=42, scoring='recall')
xg_tuned.fit(X_train,y_train)

# Predict & evaluate.
eval_classification(xg_tuned, X_train, y_train, X_test, y_test)

In [166]:
# Report the tuned values of the searched XGBoost hyperparameters.
show_best_hyperparameter(xg_tuned.best_estimator_, hyperparameters)

In [167]:
# score() of the search object uses its `scoring` metric (recall).
xg_train_score = xg_tuned.score(X_train, y_train)
print(f'Train score: {xg_train_score}')
xg_test_score = xg_tuned.score(X_test, y_test)
print(f'Test score:{xg_test_score}')

In [168]:
# Plot the feature importances of the best XGBoost model found by the search.
show_feature_importance(xg_tuned.best_estimator_)

<b> Confusion Matrix

In [None]:
# Class predictions of the tuned XGBoost model on both splits.
y_pred = xg_tuned.predict(X_test)
y_pred_train = xg_tuned.predict(X_train)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix on the test split (rows: actual, columns: predicted).
cf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

print(cf_matrix)

In [None]:
# Annotated heatmap of the test-split confusion matrix.
test_matrix = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(test_matrix, annot=True, linewidths=3, cbar=False)
ax.set_title('Confusion Matrix')
ax.set_ylabel('Actual')
ax.set_xlabel('Prediction')
plt.show()

### <b> Feature Selection