In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, auc, multilabel_confusion_matrix
from sklearn.metrics import roc_curve, precision_recall_curve, accuracy_score, cohen_kappa_score
from sklearn.model_selection import train_test_split,GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier

from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from scipy import stats
import statsmodels.api as sm

from yellowbrick.classifier import ROCAUC
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB, CategoricalNB, BernoulliNB, ComplementNB
from xgboost import XGBClassifier

from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn import svm

ModuleNotFoundError: No module named 'yellowbrick'

In [None]:
df = pd.read_csv('train.csv')
df.head(5)

In [None]:
# Skewness is only defined for numeric columns; numeric_only=True avoids a
# TypeError on the string columns under pandas >= 2.0.
df.skew(numeric_only=True)

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.describe()

##### Checking for Missing values

In [None]:
df['avg_frequency_login_days'].value_counts()

In [None]:
# 'Error' entries become missing values. The result is assigned back because
# replace(inplace=True) on a column selection does not update df under pandas
# copy-on-write, and np.nan is used because np.NaN was removed in NumPy 2.0.
df["avg_frequency_login_days"] = df["avg_frequency_login_days"].replace('Error', np.nan)

In [None]:
df['avg_frequency_login_days'] = df['avg_frequency_login_days'].astype('float')

In [None]:
df.isnull().sum() / len(df) * 100

In [None]:
df['region_category'].value_counts()

In [None]:
# Since these variables involve personal information, missing values are
# flagged rather than imputed. Results are assigned back: fillna(inplace=True)
# on a column selection does not update df under pandas copy-on-write.
df["region_category"] = df["region_category"].fillna("Missing")

df["preferred_offer_types"] = df["preferred_offer_types"].fillna("Missing")

# NOTE(review): a missing wallet balance is treated as 0 points — confirm this is intended.
df["points_in_wallet"] = df["points_in_wallet"].fillna(0)

Dropping insignificant variables

The avg_frequency_login_days variable (the number of times a customer has logged in to the website) holds numeric values, so it was converted to float.

The 'Error' value indicates that the website was unable to register avg_frequency_login_days. This could be due to various factors such as software glitches. Also, days_since_last_login and avg_frequency_login_days are redundant in terms of their usage. Hence the avg_frequency_login_days variable is dropped.

customer_id, Name and security_no are unique identifiers. 
referral_id is irrelevant to the analysis.
Hence, the above-mentioned variables are dropped.

In [None]:
df = df.drop(['customer_id','Name','security_no','referral_id','avg_frequency_login_days'],axis=1)
df.info()

In [None]:
df.isnull().sum() / len(df) * 100

In [None]:
#There can be a third gender, hence we are not replacing the unknown
df['gender'].value_counts()

In [None]:
df['region_category'].value_counts()

In [None]:
# Changing the column 'joining_date' datatype as date
df['joining_date']= pd.to_datetime(df['joining_date'], format="%Y-%m-%d")

In [None]:
df['year']= pd.DatetimeIndex(df['joining_date']).year

In [None]:
df['joined_through_referral'].value_counts()

In [None]:
# '?' is relabelled as an explicit "Missing" category; assigned back so the
# change reaches df under pandas copy-on-write.
df["joined_through_referral"] = df["joined_through_referral"].replace('?', "Missing")

In [None]:
df['medium_of_operation'].value_counts()

In [None]:
# '?' is relabelled as an explicit "Missing" category; assigned back so the
# change reaches df under pandas copy-on-write.
df["medium_of_operation"] = df["medium_of_operation"].replace('?', "Missing")

In [None]:
df['days_since_last_login'].value_counts()

It looks like -999 is an error value, i.e., the website did not populate the variable when the data was recorded. Hence we replace it with 31 days so that we can visualize how the data is spread out.

In [None]:
# The -999 sentinel is mapped to 31 days; assigned back so the change reaches
# df under pandas copy-on-write.
df["days_since_last_login"] = df["days_since_last_login"].replace(-999, 31)

In [None]:
df[df['days_since_last_login'] == -999]

In [None]:
df.info()

In [None]:
df.isnull().sum() 

In [None]:
df['churn_risk_score'].value_counts()

### Outlier Check

In [None]:
# Restrict to numeric columns: string and datetime columns make skew() raise
# under pandas >= 2.0.
df.skew(numeric_only=True)

In [None]:
df['churn_risk_score'].value_counts()

In [None]:
df.isnull().sum()

### Analysis

#### Univariate Analysis

In [None]:
#Plot1
plt.figure(figsize=(16,6))
sns.countplot(x="age", data=df)

The plot shows that the Age is widely spread from 10-64 with almost equal weightage.

In [None]:
#plot2
plt.figure(figsize=(6,6))
ax = sns.countplot(x="gender", data=df, palette="Accent")
total = len(df['gender'])
patches = ax.patches
for p in ax.patches:
        ax.annotate('{:.2f}%'.format(p.get_height()/total * 100), (p.get_x()+0.1, p.get_height()+50))
plt.show()

The Gender is almost equally spread in the data except the unknown subclass

In [None]:
#plot3
plt.figure(figsize=(6,6))
ax = sns.countplot(x="region_category", data=df, palette="Set1_r")
total = len(df['region_category'])
patches = ax.patches
for p in ax.patches:
        ax.annotate('{:.2f}%'.format(p.get_height()/total * 100), (p.get_x()+0.1, p.get_height()+50))
plt.show()

The region category Town is having the maximum counts. And the region Village is having the minimum counts. 
The plot shows that the town population is attracted to this particular e-commerce site.

In [None]:
#plot4
plt.figure(figsize=(10,6))
# Passing only y gives horizontal bars, so the contradictory orient='v' is dropped.
ax = sns.countplot(y=df['membership_category'], palette="PiYG_r")
total = len(df['membership_category'])
for p in ax.patches:
        # For horizontal bars the count is the bar width.
        percentage = '{:.1f}%'.format(100 * p.get_width()/total)
        x = p.get_x() + p.get_width() + 0.02
        y = p.get_y() + p.get_height()/2
        ax.annotate(percentage, (x, y))

plt.show()

The plot shows that the Basic Membership and No Membership are having the highest count. With the platinum membership being the lowest in count

In [None]:
#plot5
plt.figure(figsize=(10,10))
ax = sns.countplot(x="preferred_offer_types", data=df, palette="mako")
total = len(df['preferred_offer_types'])
patches = ax.patches
for p in ax.patches:
        ax.annotate('{:.2f}%'.format(p.get_height()/total * 100), (p.get_x()+0.1, p.get_height()+50))
plt.show()

The preferred_offer_types values are almost equally spread in the data, except for the missing values. We do not impute the missing values because this variable relates to personal information.

In [None]:
#plot6
plt.figure(figsize=(10,10))
ax = sns.countplot(x="medium_of_operation", data=df, palette="rocket")
total = len(df['medium_of_operation'])
patches = ax.patches
for p in ax.patches:
        ax.annotate('{:.2f}%'.format(p.get_height()/total * 100), (p.get_x()+0.1, p.get_height()+50))
plt.show()

From the bar plot, we can infer that Desktop and Smartphone are almost equally represented. We can also infer that only about 10% of people use both Smartphone and Desktop.

In [None]:
#plot7
plt.figure(figsize=(10,10))
ax = sns.countplot(x="internet_option", data=df, palette="flare")
total = len(df['internet_option'])
patches = ax.patches
for p in ax.patches:
        ax.annotate('{:.2f}%'.format(p.get_height()/total * 100), (p.get_x()+0.1, p.get_height()+50))
plt.show()

The Internet option is showing equal weightage to all the subclasses being Wi-Fi, Mobile, Fiber_Optic.

In [None]:
#plot8
plt.figure(figsize=(15,15))
ax = sns.countplot(x="days_since_last_login", data=df, palette="crest")
total = len(df['days_since_last_login'])
patches = ax.patches
for p in ax.patches:
        ax.annotate('{:.2f}%'.format(p.get_height()/total * 100), (p.get_x()+0.1, p.get_height()+50))
plt.show()

The days_since_last_login variable is holding the number of days since the customer has logged in. The plot shows that the average lies around 13.The maximum days since logged-in is 31. And the minimum is 1.

In [None]:
#plot9
# 2x2 grid of count plots, each bar labelled with its share of all rows.
binary_cols = ["offer_application_preference",'joined_through_referral','used_special_discount','past_complaint']
# The grid is named `axes` and never rebound inside the loop (the original
# reused `ax` for both the grid and each subplot).
fig, axes = plt.subplots(nrows=2, ncols=2,figsize=(20,10))
total = len(df)
for var, subplot in zip(binary_cols, axes.flatten()):
    sns.countplot(x=var, data=df, ax=subplot, palette="cubehelix")
    for p in subplot.patches:
        subplot.annotate('{:.2f}%'.format(p.get_height()/total * 100), (p.get_x()+0.1, p.get_height()+50))

plt.tight_layout()
plt.show()

Inference for the above bar plots of the 'Yes'/'No' variables: 
The highest 'Yes' count belongs to 'offer_application_preference' and the highest 'No' count belongs to 'past_complaint'.

'offer_application_preference' and 'used_special_discount' are mildly unevenly distributed in the data.

'joined_through_referral' and 'past_complaint' are equally distributed.

In [None]:
#plot10
plt.figure(figsize=(10,10))
# Passing only y gives horizontal bars, so the contradictory orient='v' is dropped.
ax = sns.countplot(y=df['feedback'], palette="Set3")
total = len(df['feedback'])
for p in ax.patches:
        # For horizontal bars the count is the bar width.
        percentage = '{:.1f}%'.format(100 * p.get_width()/total)
        x = p.get_x() + p.get_width() + 0.02
        y = p.get_y() + p.get_height()/2
        ax.annotate(percentage, (x, y))

plt.show()

Maximum number of negative feedback for the variable is poor product quality and maximum number of positive feedback for the variable is user friendly website and reasonable price.

In [None]:
#plot11
plt.figure(figsize=(10,10))
# Passing only y gives horizontal bars, so the contradictory orient='v' is dropped.
ax = sns.countplot(y=df['complaint_status'], palette="Set2")
total = len(df['complaint_status'])
for p in ax.patches:
        # For horizontal bars the count is the bar width.
        percentage = '{:.1f}%'.format(100 * p.get_width()/total)
        x = p.get_x() + p.get_width() + 0.02
        y = p.get_y() + p.get_height()/2
        ax.annotate(percentage, (x, y))

plt.show()

The plot shows that the complaint status' subclasses holds almost equal weights except 'Not Applicable'.

#### Bi-Variate Analysis

In [None]:
#plot13
# One histogram of avg_transaction_value per churn_risk_score class.
fig, ax = plt.subplots(nrows=3, ncols=2,figsize=(20,10))
l1 = df['churn_risk_score'].value_counts().keys().tolist()
j=['orchid','lightgreen','lightblue','pink','tan','green','red']
# Pair each class with its axis and colour directly. The original zipped over
# df.columns with a manual counter, which raised IndexError when there were
# fewer classes than axes.
for score, subplot, colour in zip(l1, ax.flatten(), j):
        df_1 = df[df['churn_risk_score'] == score]
        subplot.set_title("churn risk score = {}". format(score))
        sns.histplot(x=df_1['avg_transaction_value'], ax=subplot,color=colour)
plt.tight_layout()
plt.show()

The customers who are in churn risk score 1 have the average transaction total range is  100000 and customers has the maximum transactions around 80000 to 90000.

The customers who are in churn risk score 2 have the average transaction total range is  100000 and customers has the maximum transactions around 90000.

The customers who are in churn risk score 3 have the average transaction total range is 50000 and customers has the maximum transactions around 30000.

The customers who are in churn risk score 4 have the average transaction total range is  50000 and customers has the maximum transactions around 30000 to 45000.

The customers who are in churn risk score 5 have the average transaction total range is  50000 and customers has the maximum transactions around 20000.

The customers who are in churn risk score -1 have the average transaction total range is 100000, customers has the maximum transactions till 40000 and customers has the minimum transactions from 60000 to 100000 .


In [None]:
#plot14
# One histogram of avg_time_spent per churn_risk_score class.
fig, ax = plt.subplots(nrows=3, ncols=2,figsize=(20,10))
l1 = df['churn_risk_score'].value_counts().keys().tolist()
j=['orchid','lightgreen','lightblue','pink','tan','green','red']
# Pair each class with its axis and colour directly. The original zipped over
# df.columns with a manual counter, which raised IndexError when there were
# fewer classes than axes.
for score, subplot, colour in zip(l1, ax.flatten(), j):
        df_1 = df[df['churn_risk_score'] == score]
        subplot.set_title("churn risk score = {}". format(score))
        sns.histplot(x=df_1['avg_time_spent'], ax=subplot,color=colour)
plt.tight_layout()
plt.show()

We are able to infer that the spread is almost equal for all the 'churn_risk_score' with respect to the 'avg_time_spent'

From all the plots, we could see that the highest count of people 'avg_time_spent' is 0 - 100

There are a few negative values in the plots, indicating that people may have totally forgotten about the website or did not use it for a very long time.

With respective to 'churn_risk_score' 1 and 2 the distribution is similar with respect to counts, and with 'churn_risk_score' 3 to 5 the distribution is similar with respect to counts in 'avg_time_spent'

In [None]:
#plot16
sns.countplot(x=df['joined_through_referral'],hue=df['churn_risk_score'])

From the multiple bar plot, the 'Yes' and 'No' counts of 'joined_through_referral' are distributed almost equally across 'churn_risk_score' values of 3 to 5.

In [None]:
#plot17
plt.figure(figsize=(10,10))
sns.countplot(x=df['preferred_offer_types'],hue=df['churn_risk_score'])

The maximum customers in the churn risk score 3 mostly preferred when there is no offers.

The minimum customers in the churn risk score 2 mostly preferred when there is no offers.

The maximum customers in the churn risk score 3 mostly preferred when there is credit/debit card offers.

The minimum customers in the churn risk score 1 mostly preferred when there is credit/debit card offers.

The maximum customers in the churn risk score 3 mostly preferred when there is Gift vouchers/coupons.

The minimum customers in the churn risk score 2 mostly preferred when there is Gift vouchers/coupons.

In [None]:
#plot18
plt.figure(figsize=(10,10))
sns.countplot(x=df['medium_of_operation'],hue=df['churn_risk_score'])

From the multiple bar plot we can infer that people using only a desktop or only a smartphone are not likely to come back to the website, because they have a similar distribution in 'churn_risk_score'.

In [None]:
#plot19
plt.figure(figsize=(10,10))
sns.countplot(x=df['internet_option'],hue=df['churn_risk_score'])

The maximum customers used wifi or mobile data internet option for visiting the website and the minimum customer used fibre optic or wifi for visiting the website.

In [None]:
#plot20
sns.countplot(hue=df['gender'],x=df['churn_risk_score'])

We are able to infer that the count is almost equal for all the churn scores with respect to the gender except for the unknown

In [None]:
#plot21
sns.countplot(hue=df['region_category'],x=df['churn_risk_score'])

The Town subclass is showing higher churn risk score when compared with other regions

In [None]:
#plot22
plt.figure(figsize=(12,6))
sns.countplot(hue=df['membership_category'],x=df['churn_risk_score'])
plt.legend(loc='upper left')

The customers with no membership and basic membership are not likely
comeback to visit the website where as customers with premium
membership have a chance of coming back to the e-commerce site.

In [None]:
#plot23
plt.figure(figsize=(12,6))
sns.boxenplot(y=df['days_since_last_login'],x=df['churn_risk_score'])

The customers who are visiting the website with no membership and basic membership are not
likely comeback to visit the website where as customers with premium membership has a chance of
coming to the website again.

In [None]:
#plot24
plt.figure(figsize=(12,6))
sns.boxplot(y=df['avg_time_spent'],x=df['churn_risk_score'])

From the above box plot, we can clearly see that there are outliers in average time spent
across the churn risk scores.

In [None]:
#plot25
plt.figure(figsize=(12,6))
sns.boxenplot(y=df['avg_transaction_value'],x=df['churn_risk_score'])

From the above boxen plot,we can clearly see that there is no outliers between average
transaction value and churn risk score.

In [None]:
#plot26
plt.figure(figsize=(12,6))
sns.violinplot(y=df['points_in_wallet'],x=df['churn_risk_score'])

We can infer that the spread is large value of points in wallet with values between 500-1000 all churn risk scores.
And the points in wallet is at its least for churn risk score 4 and 5.

In [None]:
df.info()

In [None]:
#plot27
plt.figure(figsize=(12,6))
sns.countplot(hue=df['used_special_discount'],x=df['churn_risk_score'])

From the above countplot we can clearly understand when there is special discounts maximum
customers visited the website and used the discounts.

In [None]:
df.info()

In [None]:
#plot28
plt.figure(figsize=(12,6))
sns.countplot(hue=df['offer_application_preference'],x=df['churn_risk_score'])

From the multiple barplot we can infer that 'offer_application_preference' of 'Yes' and 'No' has been equally distributed among the 'churn_risk_score' of 3-5


In [None]:
#plot29
plt.figure(figsize=(18,10))
ax = sns.countplot(x=df['past_complaint'],hue=df['churn_risk_score'])

total = len(df['past_complaint'])
patches = ax.patches
for p in ax.patches:
        ax.annotate('{:.2f}%'.format(p.get_height()/total * 100), (p.get_x()+0.01, p.get_height()+50))
plt.show()

The 'Yes' and 'No' counts of 'past_complaint' are equally distributed across 'churn_risk_score' values 3-5, whether or not the person has a past complaint. Most people are not likely to come back to the website.

The past complaint influences the churn risk rate, although it does not show a great difference across its subcategories.

In [None]:
#plot30
plt.figure(figsize=(24,10))
ax = sns.countplot(x=df['complaint_status'],hue=df['churn_risk_score'],palette="viridis")

total = len(df['complaint_status'])
patches = ax.patches
for p in ax.patches:
        ax.annotate('{:.2f}%'.format(p.get_height()/total * 100), (p.get_x()+0.01, p.get_height()+50))
plt.show()

The 'Not-Applicable' count status is extremely high in high 'churn_risk_score' comparing with the other category counts.

The Not-Applicable count status is extremely high in high risk churn scores.

In [None]:
df['churn_risk_score'].value_counts()

In [None]:
pd.crosstab(df['feedback'], df['churn_risk_score'])

In [None]:
pd.crosstab(df['complaint_status'], df['churn_risk_score'])

In [None]:
#The Churn risk score holding -1 value
# Select the -1 rows explicitly rather than relying on a df_1 variable left
# over from the last iteration of an earlier plotting loop.
df.loc[df['churn_risk_score'] == -1, 'feedback'].value_counts(normalize=True) * 100

From the Graph and Cross tab, we can infer that the negative feedback holds a major portion of our dataset.
Also, the following 5 feedback categories influence the churn risk rate at a higher level.
    1.Poor Website
    2.No reason specified
    3.Poor Product Quality
    4.Poor Customer Service
    5.Too many ads
    
Hence doing proportion imputation for the churn risk score : -1.

In [None]:
#plot31
# Pass the series as x: in seaborn >= 0.12 the only positional parameter is `data`.
sns.countplot(x=df['churn_risk_score'])

In [None]:
df['feedback'].value_counts()

In [None]:
# catplot is figure-level: it builds its own figure, sized by height/aspect,
# so the preceding plt.figure() only produced an extra empty figure.
g = sns.catplot(
        x='avg_transaction_value', 
        y='feedback', 
        data=df,
        palette='bright',
        height=7, aspect=1.3,
        kind='bar',
        hue='membership_category', 
        col ='churn_risk_score',
        col_wrap=2)

In [None]:
df['membership_category'].value_counts()

def feed(x, y, z):
    """Return a churn risk score, filling in the -1 placeholder.

    x is the feedback text, y the current churn_risk_score and z the
    membership category. Any score other than -1 is returned unchanged.
    For -1: feedback outside the listed complaints gives 1; a listed
    complaint gives 3 for the listed membership tiers and 5 otherwise.
    """
    # NOTE(review): the accompanying markdown also names 'No reason specified'
    # as a high-churn feedback, but it is not in this list — confirm.
    complaints = ('Poor Product Quality', 'Too many ads', 'Poor Website', 'Poor Customer Service')
    listed_tiers = ('Platinum Membership', 'Premium Membership', 'Gold Membership', 'Silver Membership')
    if y != -1:
        return y
    if x not in complaints:
        return 1
    return 3 if z in listed_tiers else 5
df["churn_risk_score"] = df.apply(lambda x: feed(x['feedback'],x['churn_risk_score'],x['membership_category']), axis = 1)

In [None]:
df[df['churn_risk_score'] == -1]

In [None]:
#plot32
# Pass the series as x: in seaborn >= 0.12 the only positional parameter is `data`.
sns.countplot(x=df['churn_risk_score'])

Bucketising the churn risk score

In [None]:
def bucket(x):
    """Collapse a churn risk score into three classes.

    Scores 1, 2 and -1 map to 1; score 3 maps to 2; every other value maps to 3.
    """
    if x in (1, 2, -1):
        return 1
    return 2 if x == 3 else 3
df["churn_risk_score"] = df.apply(lambda x: bucket(x['churn_risk_score']), axis = 1)

In [None]:
#plot33
# Pass the series as x: in seaborn >= 0.12 the only positional parameter is `data`.
sns.countplot(x=df['churn_risk_score'])

In [None]:
df['churn_risk_score'].value_counts(normalize=True) * 100

In [None]:
df['churn_risk_score'] = df['churn_risk_score'].astype('object')

#### Multi-Variate Analysis

In [None]:
df.info()

In [None]:
pd.crosstab(df['feedback'], df['membership_category'], df['churn_risk_score'], aggfunc='count')

In [None]:
plt.figure(figsize=(20,20))
sns.barplot(data=df, y='feedback', x='churn_risk_score', hue='membership_category')

The churn risk score 1 class with positive feedback have Platinum, Premium, Gold and silver membership.
The churn risk score 2 and 3 classes with negative feedback have some Platinum and Premium memberships and many Basic Membership and No Membership customers.

In [None]:
#plot15
# pairplot is figure-level and creates its own figure, so the preceding
# plt.figure() only produced an extra empty figure.
sns.pairplot(df, hue='churn_risk_score')

The average transaction value is holding maximum range for churn risk score 1 irrespective of the age, days_since_last_login, avg_time_Spent, points_in_wallet. And the values are low for churn risk score 2,3.
We are able to see the separation of clusters between 1 and 2,3 after bucketising.
The points_in_wallet shows the dominance of cluster1 for points above 500, whereas cluster 3 is showing its dominance for points below 500.

In [None]:
#plot34
plt.figure(figsize=(12,10))
# x and y are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.scatterplot(x=df['age'], y=df['avg_transaction_value'], hue=df['churn_risk_score'])

The average transaction value is holding maximum range for churn risk score 1 irrespective of the age, and the values are low for churn risk score 2,3.

In [None]:
#plot35
plt.figure(figsize=(12,10))
# x and y are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.scatterplot(x=df['age'], y=df['avg_time_spent'], hue=df['churn_risk_score'])

From the plots, there is a little dominance in cluster 1 for the higher avg_time_spent with lesser age.

In [None]:
#plot36
plt.figure(figsize=(12,10))
# x and y are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.scatterplot(x=df['age'], y=df['days_since_last_login'], hue=df['churn_risk_score'])

The plot shows that the age and days_since_last_login have almost no relationship with respect to churn risk rate since, the values are equally distributed with increase in age.

In [None]:
#plot36
plt.figure(figsize=(12,10))
# x and y are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.scatterplot(x=df['age'], y=df['points_in_wallet'], hue=df['churn_risk_score'])

The churn risk score 1 is holding the same value irrespective of the age for the values between 700-900.
The churn risk score 2 is holding the same value irrespective of the age for the values between 400-500.
Also, we could see that the churn risk score 3 is showing its dominance for points below 500.
Also, we could see that the churn risk score 1 and 2 is showing its dominance for points above 1000.

In [None]:
#plot38
plt.figure(figsize=(12,10))
# x and y are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.scatterplot(x=df['avg_transaction_value'], y=df['avg_time_spent'], hue=df['churn_risk_score'])

The average transaction value is holding maximum range for churn risk score 1 irrespective of the avg_time_Spent. And the values are low for churn risk score 2,3.
The avg time spent is higher for lesser avg transactional value for churn risk score of 2 and 3.

In [None]:
#plot40
plt.figure(figsize=(12,10))
# x and y are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.scatterplot(x=df['days_since_last_login'], y=df['avg_transaction_value'], hue=df['churn_risk_score'])

The average transaction value is holding maximum range for churn risk score 1 irrespective of the days_since_last_login, and the values are low for churn risk score 2,3.

In [None]:
#plot41
plt.figure(figsize=(12,10))
# x and y are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.scatterplot(x=df['days_since_last_login'], y=df['points_in_wallet'], hue=df['churn_risk_score'])

The churn risk score 1 is holding same value irrespective of the days_since_last_login for the values between 600-800.
The churn risk score is holding same value irrespective of the age for the values between 400-500.
Also, we could see that the churn risk score 3 is showing its dominance for points below 700.
Also, we could see that the churn risk score 1 and 2 is showing its dominance for points above 1000.

In [None]:
#plot41
plt.figure(figsize=(12,10))
# x and y are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.scatterplot(x=df['avg_time_spent'], y=df['points_in_wallet'], hue=df['churn_risk_score'])

The churn risk score 1,2 is holding same value irrespective of the days_since_last_login for the values between 700-800.
The churn risk score 3 is holding same value irrespective of the age for the values between 500-650.
we could see that the churn risk score 3 is showing its dominance for points below 700.
And, we could see that the churn risk score 1 and 2 is showing its dominance for points above 1000.
Also, The avg_time_spent is lesser datapoints below 0.

In [None]:
#plot42
plt.figure(figsize=(12,10))
# x and y are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.scatterplot(x=df['avg_transaction_value'], y=df['points_in_wallet'], hue=df['churn_risk_score'])

The average transaction value is holding maximum range for churn risk score 1 irrespective of the points_in_wallet. And the values are low for churn risk score 2,3.
The points_in_wallet is higher for lesser avg transactional value for churn risk score of 2 and 3.

In [None]:
plt.figure(figsize=(20,20))
# x, y and hue are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.swarmplot(x=df['membership_category'], y=df['points_in_wallet'], hue=df['churn_risk_score'])

In [None]:
plt.figure(figsize=(20,20))
# x, y and hue are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.swarmplot(x=df['feedback'], y=df['avg_transaction_value'], hue=df['churn_risk_score'])

Customers with churn risk rate 1 who has given positive feedback have avg transactional value's range till 100000.Also, we could see that there is no single customer who has range till 100000 with churn risk rate 2 and 3.
On the other hand, the customers with churn risk rate 2 and 3 who have given negative feedback have the average transactional value that is less than 50000.
The clusters' separability is clearly defined.

In [None]:
plt.figure(figsize=(20,20))
# x, y and hue are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.swarmplot(x=df['feedback'], y=df['avg_time_spent'], hue=df['churn_risk_score'])

The plot clearly signifies that the customers who gave negative feedback have avg time spent higher till 3000. On the other hand, the customers who have given positive feedback have the average time spent that is less than 2000 with countable number of outliers.

In [None]:
# Correlate numeric columns only: df.corr() on a frame containing string
# columns raises under pandas >= 2.0. select_dtypes works on every pandas version.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)

In [None]:
vif = pd.DataFrame()
df_n = df.select_dtypes(include=[np.number])
# variance_inflation_factor regresses each column on the others exactly as
# given, so an intercept column is added; without it, uncentred features
# produce inflated VIFs.
exog = sm.add_constant(df_n)
vif["Features"] = df_n.columns
# VIF is reported for the features only; the constant column is skipped.
vif["VIF"] = [variance_inflation_factor(exog.values, exog.columns.get_loc(col)) for col in df_n.columns]
vif.sort_values('VIF', ascending=False)

The heatmap clearly shows that the numerical variables are independent of each other. There is no multicollinearity among the numerical variables.

In [None]:
g = sns.catplot(
        x='churn_risk_score', 
        y='complaint_status', 
        data=df,
        palette='bright',
        height=3, aspect=1.3,
        kind='bar',
        hue='gender', 
        col ='membership_category',
        col_wrap=2)
g.set_titles(
    'Title: {col_name}');

In [None]:
g = sns.catplot(
        x='churn_risk_score', 
        y='preferred_offer_types', 
        data=df,
        palette='bright',
        height=3, aspect=1.3,
        kind='bar',
        hue='gender', 
        col ='membership_category',
        col_wrap=2)
g.set_titles(
    'Title: {col_name}');

In [None]:
g = sns.catplot(
        x='churn_risk_score', 
        y='feedback', 
        data=df,
        palette='bright',
        height=3, aspect=1.3,
        kind='bar',
        hue='complaint_status', 
        col ='membership_category',
        col_wrap=2)
g.set_titles(
    'Title: {col_name}');

In [None]:
g = sns.catplot(
        x='churn_risk_score', 
        y='feedback', 
        data=df,
        palette='bright',
        height=3, aspect=1.3,
        kind='bar',
        hue='complaint_status', 
        col ='membership_category',
        col_wrap=2)
g.set_titles(
    'Title: {col_name}');

In [None]:
g = sns.catplot(
        x='churn_risk_score', 
        y='feedback', 
        data=df,
        palette='bright',
        height=3, aspect=1.3,
        kind='bar',
        hue='gender', 
        col ='membership_category',
        col_wrap=2)
g.set_titles(
    'Title: {col_name}');

The churn risk score 1 class with positive feedback have Platinum, Premium, Gold and Silver membership.
The churn risk score 2 class with negative feedback have some Platinum, Premium membership and many Basic membership and No membership
The churn risk score 3 class with negative feedback have almost all values in Basic and No membership

In [None]:
# catplot creates its own figure, so no plt.figure() call is needed. The grid
# is bound to g so that set_titles acts on this plot; previously g still
# referred to the grid from an earlier cell.
g = sns.catplot(
        x='churn_risk_score', 
        y='feedback', 
        data=df,palette='bright',height=3, aspect=1.3,kind='bar',
        hue='complaint_status', 
        col ='past_complaint',
        col_wrap=2)
g.set_titles(
    'Title: {col_name}');

In [None]:
df_mem = pd.crosstab(df['churn_risk_score'], df['membership_category'])

In [None]:
df_grp = df.groupby('churn_risk_score')['avg_transaction_value'].mean()

In [None]:
df_sum = df.groupby('churn_risk_score')['avg_transaction_value'].sum()

In [None]:
df_concat = pd.concat([df_mem, df_grp, df_sum], axis=1)
#df_concat.to_csv('mem_sum_tran.csv')

In [None]:
df['churn_risk_score'].value_counts(normalize=True) * 100

# Data Preparation for Model Building

In [None]:
df1=df.select_dtypes(include=np.number)
df1 = df1.drop('year',axis=1)
ss=StandardScaler()
df_s=ss.fit_transform(df1)
df_s=pd.DataFrame(df_s,columns=df1.columns,index=df1.index)
df_s.head()

In [None]:
vif = pd.DataFrame()
vif["Features"] = df_s.columns
vif["VIF"] = [variance_inflation_factor(df_s.values, i) for i in range(df_s.shape[1])]
vif.sort_values('VIF', ascending=False)

In [None]:
# Restrict to numeric columns: string and datetime columns make skew() raise
# under pandas >= 2.0.
df.skew(numeric_only=True)

In [None]:
df_n = df.select_dtypes(include=[np.number])
X = df_n[['days_since_last_login','avg_transaction_value']]
PT_yj = PowerTransformer(method='yeo-johnson')
# Keep X's index so that assigning these columns back to df aligns row for
# row even when df does not carry a default 0..n-1 index.
trans_X = pd.DataFrame(PT_yj.fit_transform(X), columns=X.columns, index=X.index)
trans_X.shape

In [None]:
df.shape

In [None]:
# Overwrite the two columns with their Yeo-Johnson-transformed values.
# NOTE(review): df_s, the scaled frame that df2 is built from, was computed
# before this cell, so these transformed columns do not reach the model
# features — confirm whether that is intended.
df['days_since_last_login'] = trans_X['days_since_last_login']
df['avg_transaction_value'] = trans_X['avg_transaction_value']
# Numeric columns only: string and datetime columns make skew() raise under pandas >= 2.0.
df.skew(numeric_only=True)

In [None]:
df.isnull().sum()

In [None]:
# np.object was removed in NumPy 1.24; the 'object' dtype string selects the same columns.
df_cat = df.select_dtypes(include=['object'])
df_cat=df_cat.drop(['last_visit_time'],axis=1)
# Each column gets its own encoder, so the integer codes are independent per column.
for i in df_cat.columns:
    df_cat[i]=LabelEncoder().fit_transform(df_cat[i])
df_cat.head()

In [None]:
df2=pd.concat([df_s,df_cat,df['year']],axis=1)
df2.head()

In [None]:
df['churn_risk_score'] = df['churn_risk_score'].astype('int')

In [None]:
x=df2.drop('churn_risk_score',axis=1)
#x = df2
y=df['churn_risk_score']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 5, stratify=y)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

# Statistical Tests

In [None]:
# churn_risk_score is a group label: C() makes this a one-way ANOVA across
# the classes instead of a linear regression on the numeric code.
m1 = ols('age ~ C(churn_risk_score)', data = df2).fit() 
anov_table1 = anova_lm(m1, typ=1)
anov_table1

In [None]:
# churn_risk_score is a group label: C() makes this a one-way ANOVA across
# the classes instead of a linear regression on the numeric code.
m2 = ols('days_since_last_login ~ C(churn_risk_score)', data = df2).fit() 
anov_table2 = anova_lm(m2, typ=1)
anov_table2

In [None]:
# churn_risk_score is a group label: C() makes this a one-way ANOVA across
# the classes instead of a linear regression on the numeric code.
m3 = ols('avg_time_spent ~ C(churn_risk_score)', data = df2).fit() 
anov_table3 = anova_lm(m3, typ=1)
anov_table3

In [None]:
# churn_risk_score is a group label: C() makes this a one-way ANOVA across
# the classes instead of a linear regression on the numeric code.
m4 = ols('avg_transaction_value ~ C(churn_risk_score)', data = df2).fit() 
anov_table4 = anova_lm(m4, typ=1)
anov_table4

In [None]:
# churn_risk_score is a group label: C() makes this a one-way ANOVA across
# the classes instead of a linear regression on the numeric code.
m5 = ols('points_in_wallet ~ C(churn_risk_score)', data = df2).fit() 
anov_table5 = anova_lm(m5, typ=1)
anov_table5

In [None]:
# churn_risk_score is a group label: C() makes this a one-way ANOVA across
# the classes instead of a linear regression on the numeric code.
m6 = ols('year ~ C(churn_risk_score)', data = df2).fit() 
anov_table6 = anova_lm(m6, typ=1)
anov_table6

In [None]:
ct1=pd.crosstab(df['gender'],df['churn_risk_score'])
stats.chi2_contingency(ct1)

In [None]:
ct2=pd.crosstab(df['region_category'],df['churn_risk_score'])
stats.chi2_contingency(ct2)

In [None]:
ct3=pd.crosstab(df['membership_category'],df['churn_risk_score'])
stats.chi2_contingency(ct3)

In [None]:
ct4=pd.crosstab(df['joined_through_referral'],df['churn_risk_score'])
stats.chi2_contingency(ct4)

In [None]:
ct5=pd.crosstab(df['preferred_offer_types'],df['churn_risk_score'])
stats.chi2_contingency(ct5)

In [None]:
ct6=pd.crosstab(df['medium_of_operation'],df['churn_risk_score'])
stats.chi2_contingency(ct6)

In [None]:
ct7=pd.crosstab(df['internet_option'],df['churn_risk_score'])
stats.chi2_contingency(ct7)

In [None]:
ct8=pd.crosstab(df['used_special_discount'],df['churn_risk_score'])
stats.chi2_contingency(ct8)

In [None]:
ct9=pd.crosstab(df['offer_application_preference'],df['churn_risk_score'])
stats.chi2_contingency(ct9)

In [None]:
ct10=pd.crosstab(df['past_complaint'],df['churn_risk_score'])
stats.chi2_contingency(ct10)

In [None]:
ct11=pd.crosstab(df['complaint_status'],df['churn_risk_score'])
stats.chi2_contingency(ct11)

In [None]:
ct12=pd.crosstab(df['feedback'],df['churn_risk_score'])
stats.chi2_contingency(ct12)

In [None]:
mod1 = ols('churn_risk_score ~ age+days_since_last_login+avg_time_spent+avg_transaction_value+points_in_wallet+year', data = df2).fit()
aov_table =anova_lm(mod1, typ=1)
print("In-Significant numerical variables for predicting churn_risk_score:")
print(aov_table[aov_table['PR(>F)'] > 0.05].index.tolist())

In [None]:
# Chi-square test of independence between churn_risk_score and every
# object-dtype column; columns are split at the 5% significance level.
# np.object was removed in NumPy 1.24, so the 'object' dtype string is used.
l1 = df.select_dtypes(include=['object']).columns
l2 = []  # p-value > 0.05: no evidence of association
l3 = []  # p-value <= 0.05: associated with churn_risk_score
for col in l1:
    ct = pd.crosstab(df['churn_risk_score'],df[col])
    tstat, pval, ddof, exp_val = stats.chi2_contingency(ct)
    if pval > 0.05:
        l2.append(col)
    else:
        l3.append(col)
print("In-Significant categorical variables for predicting churn_risk_score:")
print(l2)

In [None]:
print("Significant categorical variables for predicting churn_risk_score:")
print(l3)

# BASE-MODEL Logistic Regression

In [None]:
y_train.reset_index(drop=True, inplace=True)

In [None]:
X_train.reset_index(inplace=True)

In [None]:
y_train.index

In [None]:
y_train.shape, X_train.shape

In [None]:
X_train.drop('index',axis=1, inplace=True)

In [None]:
ols_model=sm.OLS(y_train,sm.add_constant(X_train)).fit()
ols_model.summary()

In [None]:
score_card = pd.DataFrame(columns=['Model', 'Precision Score', 'Recall Score',
                                   'False Negatives', 'Kappa Score', 'f1-score'])

In [None]:
def update_score_card(model, FN_values, model_name):
    """Add one model's test-set metrics as a new row of the global ``score_card``.

    The model predicts on the notebook-level ``X_test`` and is scored against
    ``y_test``. Precision, recall and F1 are macro-averaged over the classes.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    FN_values : value stored as-is in the 'False Negatives' column (computed by
        the caller from a confusion matrix).
    model_name : label stored in the 'Model' column.

    Returns
    -------
    None. The global ``score_card`` is rebound to a new DataFrame.
    """
    global score_card
    y_pred = model.predict(X_test)
    new_row = pd.DataFrame([{'Model': model_name,
                             'Precision Score': precision_score(y_test, y_pred, average='macro'),
                             'Recall Score': recall_score(y_test, y_pred, average='macro'),
                             'False Negatives': FN_values,
                             'Kappa Score': cohen_kappa_score(y_test, y_pred),
                             'f1-score': f1_score(y_test, y_pred, average='macro')}])
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    # An empty score_card is left out of the concat so it cannot influence the
    # result dtypes (pandas warns about concatenating empty frames).
    frames = [new_row] if score_card.empty else [score_card, new_row]
    score_card = pd.concat(frames, ignore_index=True)

In [None]:
# Baseline: multinomial logistic regression (lbfgs solver) on the training set.
LR_model = LogisticRegression(multi_class='multinomial', solver='lbfgs')
LR_model.fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out test set.
y_pred_xtest = LR_model.predict(X_test)
print(classification_report(y_test, y_pred_xtest))

In [None]:
# Confusion matrix of the logistic-regression test predictions.
ConfusionMatrixDisplay.from_predictions(y_test,y_pred_xtest, cmap='Reds')

In [None]:
# One 2x2 confusion matrix per class (each class vs. the rest).
LR_mul = multilabel_confusion_matrix(y_test, y_pred_xtest)
LR_mul

In [None]:
# [2] selects the third class's matrix and [1][0] its false-negative cell
# (scikit-learn lays each matrix out as [[TN, FP], [FN, TP]]).
# NOTE(review): presumably the third class is churn_risk_score == 3 — confirm.
LR_FN = LR_mul[2][1][0]
update_score_card(LR_model, LR_FN, 'LR_model(Base)')

In [None]:
# Cohen's kappa between true and predicted test labels.
cohen_kappa_score(y_test, y_pred_xtest)

In [None]:
# ROC curves for the logistic regression model, drawn with yellowbrick's ROCAUC.
LR_visualizer = ROCAUC(LR_model)

LR_visualizer.fit(X_train, y_train)        # Fit the training data to the visualizer
LR_visualizer.score(X_test, y_test)        # Evaluate the model on the test data
LR_visualizer.show()   

# Decision Tree

In [None]:
X_train.columns

In [None]:
X_train.drop(['avg_time_spent', 'age', 'year', 'gender', 'internet_option', 'used_special_discount', 'complaint_status'],axis=1, inplace=True)
X_test.drop(['avg_time_spent', 'age', 'year', 'gender', 'internet_option', 'used_special_discount', 'complaint_status'],axis=1, inplace=True)

In [None]:
# Un-tuned decision tree using the entropy split criterion, scored on the test set.
decision_tree_classification = DecisionTreeClassifier(criterion = 'entropy', random_state = 10)#default=gini
decision_tree = decision_tree_classification.fit(X_train, y_train)
y_pred_xtest=decision_tree.predict(X_test)
print(classification_report(y_test, y_pred_xtest))

In [None]:
# One 2x2 confusion matrix per class (each class vs. the rest).
DT_mul = multilabel_confusion_matrix(y_test, y_pred_xtest)
DT_mul

In [None]:
# False negatives of the third class ([2] -> class matrix, [1][0] -> FN cell),
# recorded in the score card together with the macro metrics.
DT_FN = DT_mul[2][1][0]
update_score_card(decision_tree, DT_FN, 'Decision Tree')

In [None]:
# Cohen's kappa between true and predicted test labels.
cohen_kappa_score(y_test, y_pred_xtest)

In [None]:
ConfusionMatrixDisplay.from_predictions(y_test,y_pred_xtest, cmap='Reds')

In [None]:
# ROC curves for the decision tree model, drawn with yellowbrick's ROCAUC.
DT_visualizer = ROCAUC(decision_tree)

DT_visualizer.fit(X_train, y_train)        # Fit the training data to the visualizer
DT_visualizer.score(X_test, y_test)        # Evaluate the model on the test data
DT_visualizer.show()  

In [None]:
X_train.shape

In [None]:
# Decision tree with hand-set size limits and balanced class weights.
decision_tree_classification_tuned = DecisionTreeClassifier(criterion='entropy',
                                                           max_depth=11,
                                                           max_leaf_nodes=9,
                                                           min_samples_leaf=8,
                                                           min_samples_split=3,
                                                           random_state=10,
                                                           class_weight='balanced')
decision_tree_tuned = decision_tree_classification_tuned.fit(X_train, y_train)

# Predict with the tuned tree. This used to call decision_tree.predict (the
# un-tuned model), so the report here and every metric derived from
# y_pred_xtest in the following cells described the wrong model.
y_pred_xtest=decision_tree_tuned.predict(X_test)
print(classification_report(y_test, y_pred_xtest))

In [None]:
# One 2x2 confusion matrix per class, built from y_pred_xtest of the cell above.
DTT_mul = multilabel_confusion_matrix(y_test, y_pred_xtest)
DTT_mul

In [None]:
# False negatives of the third class ([2] -> class matrix, [1][0] -> FN cell).
# update_score_card recomputes its own predictions from decision_tree_tuned.
DTT_FN = DTT_mul[2][1][0]
update_score_card(decision_tree_tuned, DTT_FN, 'Decision Tree Tuned')

In [None]:
# Cohen's kappa between true and predicted test labels.
cohen_kappa_score(y_test, y_pred_xtest)

In [None]:
ConfusionMatrixDisplay.from_predictions(y_test,y_pred_xtest, cmap='Reds')

In [None]:
# ROC curves for the tuned decision tree, drawn with yellowbrick's ROCAUC.
DTT_visualizer = ROCAUC(decision_tree_classification_tuned)

DTT_visualizer.fit(X_train, y_train)        # Fit the training data to the visualizer
DTT_visualizer.score(X_test, y_test)        # Evaluate the model on the test data
DTT_visualizer.show()   

# Random Forest

In [None]:
# Random forest: 30 entropy-based trees with fixed size limits and balanced
# class weights.
rf_classification = RandomForestClassifier(
    n_estimators=30,
    criterion='entropy',
    max_depth=10,
    max_features=8,
    min_samples_split=9,
    min_samples_leaf=10,
    bootstrap=True,
    class_weight='balanced',
    random_state=10,
)

# Train on the training split, then report per-class metrics on the test split.
rf_model = rf_classification.fit(X_train, y_train)
y_pred_xtest = rf_model.predict(X_test)
print(classification_report(y_test, y_pred_xtest))

In [None]:
# Confusion matrix of the random-forest test predictions.
ConfusionMatrixDisplay.from_predictions(y_test,y_pred_xtest, cmap='Reds')

In [None]:
# One 2x2 confusion matrix per class (each class vs. the rest).
RF_mul = multilabel_confusion_matrix(y_test, y_pred_xtest)
RF_mul

In [None]:
# False negatives of the third class ([2] -> class matrix, [1][0] -> FN cell).
RF_FN = RF_mul[2][1][0]
update_score_card(rf_model, RF_FN, 'Random Forest Tuned')

In [None]:
# Cohen's kappa between true and predicted test labels.
cohen_kappa_score(y_test, y_pred_xtest)

In [None]:
# ROC curves for the random forest, drawn with yellowbrick's ROCAUC.
RF_visualizer = ROCAUC(rf_model)

RF_visualizer.fit(X_train, y_train)        # Fit the training data to the visualizer
RF_visualizer.score(X_test, y_test)        # Evaluate the model on the test data
RF_visualizer.show()   

In [None]:
# k-nearest-neighbours classifier with k = 10, scored on the test set.
knn_classification = KNeighborsClassifier(n_neighbors = 10)# default metric: minkowski with p=2, i.e. Euclidean distance
knn_model = knn_classification.fit(X_train, y_train)
y_pred_xtest=knn_model.predict(X_test)
print(classification_report(y_test,y_pred_xtest))

In [None]:
ConfusionMatrixDisplay.from_predictions(y_test,y_pred_xtest, cmap='Reds')

In [None]:
# One 2x2 confusion matrix per class (each class vs. the rest).
KNN_mul = multilabel_confusion_matrix(y_test, y_pred_xtest)
KNN_mul

In [None]:
# False negatives of the third class ([2] -> class matrix, [1][0] -> FN cell).
KNN_FN = KNN_mul[2][1][0]
update_score_card(knn_model, KNN_FN, 'KNN model Tuned')

In [None]:
# Cohen's kappa between true and predicted test labels.
cohen_kappa_score(y_test, y_pred_xtest)

In [None]:
# ROC curves for the KNN model, drawn with yellowbrick's ROCAUC.
KNN_visualizer = ROCAUC(knn_model)

KNN_visualizer.fit(X_train, y_train)        # Fit the training data to the visualizer
KNN_visualizer.score(X_test, y_test)        # Evaluate the model on the test data
KNN_visualizer.show()   

In [None]:
# Gaussian naive Bayes with default settings, scored on the test set.
gnb = GaussianNB()
gnb_model = gnb.fit(X_train, y_train)
y_pred_xtest=gnb_model.predict(X_test)
print(classification_report(y_test,y_pred_xtest))

In [None]:
ConfusionMatrixDisplay.from_predictions(y_test,y_pred_xtest, cmap='Reds')

In [None]:
# One 2x2 confusion matrix per class (each class vs. the rest).
GNB_mul = multilabel_confusion_matrix(y_test, y_pred_xtest)
GNB_mul

In [None]:
# False negatives of the third class ([2] -> class matrix, [1][0] -> FN cell).
GNB_FN = GNB_mul[2][1][0]
update_score_card(gnb_model, GNB_FN, 'Gaussian NB')

In [None]:
# Cohen's kappa between true and predicted test labels.
cohen_kappa_score(y_test, y_pred_xtest)

In [None]:
# ROC curves for the naive Bayes model, drawn with yellowbrick's ROCAUC.
GNB_visualizer = ROCAUC(gnb_model)

GNB_visualizer.fit(X_train, y_train)        # Fit the training data to the visualizer
GNB_visualizer.score(X_test, y_test)        # Evaluate the model on the test data
GNB_visualizer.show()   

In [None]:
# AdaBoost ensemble of 110 estimators with learning rate 1.0.
ada_model = AdaBoostClassifier(n_estimators = 110, random_state = 10, learning_rate=1.0)

# fit the model using fit() on train data
ada_model.fit(X_train, y_train)
y_pred_xtest=ada_model.predict(X_test)
print(classification_report(y_test,y_pred_xtest))

In [None]:
ConfusionMatrixDisplay.from_predictions(y_test,y_pred_xtest, cmap='Reds')

In [None]:
# One 2x2 confusion matrix per class (each class vs. the rest).
ADA_mul = multilabel_confusion_matrix(y_test, y_pred_xtest)
ADA_mul

In [None]:
# False negatives of the third class ([2] -> class matrix, [1][0] -> FN cell).
ADA_FN = ADA_mul[2][1][0]
update_score_card(ada_model, ADA_FN, 'ADA Boost')

In [None]:
# Cohen's kappa between true and predicted test labels.
cohen_kappa_score(y_test, y_pred_xtest)

In [None]:
# ROC curves for the AdaBoost model, drawn with yellowbrick's ROCAUC.
ADA_visualizer = ROCAUC(ada_model)

ADA_visualizer.fit(X_train, y_train)        # Fit the training data to the visualizer
ADA_visualizer.score(X_test, y_test)        # Evaluate the model on the test data
ADA_visualizer.show()   

In [None]:
# Class frequencies in the training labels.
y_train.value_counts()

In [None]:
# Pair every training feature with the importance the fitted AdaBoost model
# reports for it, ordered from most to least important.
important_features = (
    pd.DataFrame({'Features': X_train.columns,
                  'Importance': ada_model.feature_importances_})
    .sort_values('Importance', ascending=False)
)

# Horizontal bar chart, one bar per feature.
sns.barplot(data=important_features, x='Importance', y='Features')

# Title and axis labels, all at font size 15.
plt.title('Feature Importance', fontsize=15)
plt.xlabel('Importance', fontsize=15)
plt.ylabel('Features', fontsize=15)

plt.show()

In [None]:
# Gradient-boosted trees (XGBoost) with 250 estimators.
xgb_model = XGBClassifier(max_depth=10, max_leaves=2, n_estimators=250)

# fit the model using fit() on train data
# Labels 1/2/3 are remapped to 0/1/2 for this model only — presumably because
# XGBoost expects zero-based class labels (confirm). Predictions are therefore
# compared with y_test1, not y_test.
y_train1=y_train.map({1:0,2:1,3:2})
y_test1=y_test.map({1:0,2:1,3:2})
xgb_model.fit(X_train, y_train1)
y_pred_xtest=xgb_model.predict(X_test)
print(classification_report(y_test1,y_pred_xtest))

In [None]:
ConfusionMatrixDisplay.from_predictions(y_test1,y_pred_xtest, cmap='Reds')

In [None]:
# One 2x2 confusion matrix per (remapped) class.
XGB_mul = multilabel_confusion_matrix(y_test1, y_pred_xtest)
XGB_mul

In [None]:
# False negatives of the third class ([2] -> class matrix, [1][0] -> FN cell).
XGB_FN = XGB_mul[2][1][0]

# The row is built here rather than through update_score_card because that
# helper scores against y_test, while XGBoost predicts the remapped labels and
# has to be compared with y_test1.
xgb_row = pd.DataFrame([{'Model': 'XGBoost',
                         'Precision Score': precision_score(y_test1, y_pred_xtest, average='macro'),
                         'Recall Score': recall_score(y_test1, y_pred_xtest, average='macro'),
                         'False Negatives': XGB_FN,
                         'Kappa Score': cohen_kappa_score(y_test1, y_pred_xtest),
                         'f1-score': f1_score(y_test1, y_pred_xtest, average='macro')}])

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# An empty score_card is left out so it cannot influence the result dtypes.
score_card = pd.concat([xgb_row] if score_card.empty else [score_card, xgb_row],
                       ignore_index=True)

In [None]:
# Cohen's kappa between the remapped true labels and the XGBoost predictions.
cohen_kappa_score(y_test1, y_pred_xtest)

In [None]:
# ROC curves for the XGBoost model; the remapped labels are used throughout.
XGB_visualizer = ROCAUC(xgb_model)

XGB_visualizer.fit(X_train, y_train1)        # Fit the training data to the visualizer
XGB_visualizer.score(X_test, y_test1)        # Evaluate the model on the test data
XGB_visualizer.show()   

In [None]:
# Support vector classifier with an RBF kernel (C=100, gamma=0.0001).
svc_model=svm.SVC(C=100,gamma=0.0001, kernel='rbf').fit(X_train,y_train)
y_pred_xtest=svc_model.predict(X_test)
print(classification_report(y_test,y_pred_xtest))

In [None]:
ConfusionMatrixDisplay.from_predictions(y_test,y_pred_xtest, cmap='Reds')

In [None]:
# One 2x2 confusion matrix per class (each class vs. the rest).
SVC_mul = multilabel_confusion_matrix(y_test, y_pred_xtest)
SVC_mul

In [None]:
# False negatives of the third class ([2] -> class matrix, [1][0] -> FN cell).
SVC_FN = SVC_mul[2][1][0]
update_score_card(svc_model, SVC_FN, 'SVC')

In [None]:
# Cohen's kappa between true and predicted test labels.
cohen_kappa_score(y_test, y_pred_xtest)

In [None]:
# ROC curves for the SVC model, drawn with yellowbrick's ROCAUC.
SVC_visualizer = ROCAUC(svc_model)

SVC_visualizer.fit(X_train, y_train)        # Fit the training data to the visualizer
SVC_visualizer.score(X_test, y_test)        # Evaluate the model on the test data
SVC_visualizer.show()   

# SMOTE

In [None]:
# Target as a one-column integer DataFrame, followed by its class counts.
y=df[['churn_risk_score']].astype(dtype='int')
y.value_counts()

In [None]:
# Feature matrix: the numeric columns of df joined column-wise with df_cat,
# minus the target.
# NOTE(review): df_cat is defined earlier in the notebook — presumably the
# encoded categorical columns; confirm.
df1=df.select_dtypes(include=np.number)
X = pd.concat([df1,df_cat],axis=1)
X.drop('churn_risk_score',axis=1,inplace=True)
X.head()

In [None]:
print('Class distribution BEFORE')
print(y.value_counts().sort_index())

ovr_sample = SMOTE(k_neighbors = 5, random_state=100)
X_new, y_new = ovr_sample.fit_resample(X, y)
print('\nClass distribution AFTER')
print(y_new.value_counts().sort_index())
df_new = pd.concat([X_new,y_new],axis=1)

In [None]:
X_new.skew()

In [None]:
X_new.shape

In [None]:
X_new.isnull().sum()

In [None]:
x=X_new
y=y_new

In [None]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 5, stratify=y)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

# Random Forest AFTER SMOTE

In [None]:
# Random forest (30 gini-based trees) trained on the X_train / y_train produced
# in the SMOTE section. Note that this rebinds rf_classification.
rf_classification = RandomForestClassifier(n_estimators = 30, random_state = 10,max_depth=10,max_features=7,min_samples_split=10,min_samples_leaf= 7,criterion = 'gini', bootstrap = True)

# use fit() to fit the model on the train set
smote_rf_model = rf_classification.fit(X_train, y_train)
y_pred_xtest=smote_rf_model.predict(X_test)
# print the performance measures
print(classification_report(y_test,y_pred_xtest))

In [None]:
ConfusionMatrixDisplay.from_predictions(y_test,y_pred_xtest, cmap='Reds')

In [None]:
# One 2x2 confusion matrix per class (kept for the next cell, not displayed).
SMOTE_mul = multilabel_confusion_matrix(y_test, y_pred_xtest)

In [None]:
# False negatives of the third class ([2] -> class matrix, [1][0] -> FN cell).
SMOTE_FN = SMOTE_mul[2][1][0]
update_score_card(smote_rf_model, SMOTE_FN, 'Random Forest After SMOTE')

In [None]:
# Cohen's kappa between true and predicted test labels.
cohen_kappa_score(y_test, y_pred_xtest)

In [None]:
# ROC curves for the post-SMOTE random forest, drawn with yellowbrick's ROCAUC.
SMT_visualizer = ROCAUC(smote_rf_model)

SMT_visualizer.fit(X_train, y_train)        # Fit the training data to the visualizer
SMT_visualizer.score(X_test, y_test)        # Evaluate the model on the test data
SMT_visualizer.show()   

In [None]:

# Overlay the macro-average ROC curve of every model fitted before the SMOTE
# section, using the fpr / tpr / roc_auc values stored on each yellowbrick
# ROCAUC visualizer.
fig = plt.figure(figsize=(15,10))

# (visualizer, line colour, legend prefix). The first entry used to be labelled
# 'LinearRegression' although LR_visualizer wraps the logistic regression model.
roc_curves = [
    (LR_visualizer, 'b', 'LogisticRegression'),
    (DTT_visualizer, 'r', 'Decision_Tree_Tuned'),
    (RF_visualizer, 'grey', 'Random_Forest'),
    (KNN_visualizer, 'm', 'K_Nearest_Neighbors'),
    (GNB_visualizer, 'y', 'Gaussian_NB'),
    (ADA_visualizer, 'k', 'Ada_Boost'),
    (XGB_visualizer, 'brown', 'XGBoost'),
    (SVC_visualizer, 'cyan', 'SVC'),
]
for visualizer, colour, model_label in roc_curves:
    plt.plot(visualizer.fpr['macro'], visualizer.tpr['macro'], colour,
             label='{}_AUC :{}'.format(model_label, round(visualizer.roc_auc['macro'], 2)))

# Diagonal reference line.
plt.plot([0,1], [0,1], color='orange', linestyle='--')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve Analysis', fontweight='bold', fontsize=15)

plt.legend()
plt.show()

In [None]:
# Final comparison table, highest false-negative count first
# (descending order, so the best model on this metric is in the last row).
score_card.sort_values('False Negatives', ascending=False)