In [23]:
from sklearn.naive_bayes import GaussianNB
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import RepeatedStratifiedKFold,cross_val_score,train_test_split
from numpy import mean,std
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score

### loading Data

In [24]:
# Load the raw (imbalanced) churn dataset; the file is read with ';' as the field separator.
data = pd.read_csv('../Datasets/telecommunications_churn.csv', sep=';')
data.head()

Unnamed: 0,account_length,voice_mail_plan,voice_mail_messages,day_mins,evening_mins,night_mins,international_mins,customer_service_calls,international_plan,day_calls,day_charge,evening_calls,evening_charge,night_calls,night_charge,international_calls,international_charge,total_charge,churn
0,128,1,25,265.1,197.4,244.7,10.0,1,0,110,45.07,99,16.78,91,11.01,3,2.7,75.56,0
1,107,1,26,161.6,195.5,254.4,13.7,1,0,123,27.47,103,16.62,103,11.45,3,3.7,59.24,0
2,137,0,0,243.4,121.2,162.6,12.2,0,0,114,41.38,110,10.3,104,7.32,5,3.29,62.29,0
3,84,0,0,299.4,61.9,196.9,6.6,2,1,71,50.9,88,5.26,89,8.86,7,1.78,66.8,0
4,75,0,0,166.7,148.3,186.9,10.1,3,1,113,28.34,122,12.61,121,8.41,3,2.73,52.09,0


In [13]:
#When using oversampled data the accuracy is decreasing
# data = pd.read_csv('../Datasets/oversampled_data.csv')
# del data['Unnamed: 0']
# data.head()

## Using the balanced (oversampled) data decreases the accuracy of Naive Bayes, so the imbalanced dataset is preferred

### Scaling and separating X and y

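### Gaussian Naive Bayes assumes each feature is normally distributed within each class. MinMaxScaler only rescales every feature to the [0, 1] range; it does not change the shape of a distribution, so it is used here just to put the features on a common scale.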

In [25]:
# Target: kept as a one-column DataFrame, which is why its shape is (n_rows, 1).
y = data[['churn']]

# Predictors: every column except the target, rescaled column-wise by MinMaxScaler.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test split
# performed in a later cell -- confirm this is intended.
X = data.drop(columns='churn')
cols = X.columns
std_scr = MinMaxScaler()
X = pd.DataFrame(std_scr.fit_transform(X), columns=cols)

X.shape, y.shape

((3333, 18), (3333, 1))

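### Naive Bayes is a classification technique based on Bayes' Theorem with an assumption of independence among predictors. Multicollinearity does not destabilise the fit the way it does for regression coefficients, but strongly correlated predictors violate the independence assumption, so their evidence is effectively counted more than once.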


### Calculating VIF to check for multicollinearity

In [26]:
#calculating VIF of each independent variables against all other independent variable
#variance inflation factor

def cal_VIF(X):
    """Return the variance inflation factor (VIF) of every column of ``X``.

    Each column index is passed in turn to ``variance_inflation_factor``
    together with the full value matrix of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns are scored against one another.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``X``: the column name under ``'variables'``
        and its score under ``'VIF'``.
    """
    scores = [
        variance_inflation_factor(X.values, col_idx)
        for col_idx in range(X.shape[1])
    ]
    return pd.DataFrame({'variables': X.columns, 'VIF': scores})

## Call cal_VIF on the predictor columns only (the target 'churn' is not part of X).
# X is the scaled predictor frame built from the imbalanced (non-oversampled) dataset.
print(cal_VIF(X))

# Several VIF values in the printed table are far above 5 (the *_mins / *_charge columns,
# total_charge and the voice-mail columns), so strong multicollinearity is present.

                 variables           VIF
0           account_length  1.004366e+00
1          voice_mail_plan  1.190453e+01
2      voice_mail_messages  1.190351e+01
3                 day_mins  1.047616e+07
4             evening_mins  2.237334e+06
5               night_mins  6.387661e+05
6       international_mins  6.907441e+04
7   customer_service_calls  1.002660e+00
8       international_plan  1.008221e+00
9                day_calls  1.004704e+00
10              day_charge  1.246366e+08
11           evening_calls  1.002310e+00
12          evening_charge  3.738703e+07
13             night_calls  1.003217e+00
14            night_charge  8.529402e+06
15     international_calls  1.003815e+00
16    international_charge  9.985041e+05
17            total_charge  3.825585e+02


### Train test split

In [27]:
# Hold out 20% of the rows as a test set; the split is shuffled with a fixed seed
# and stratified on the target.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=12,
    stratify=y,
)

## Naive Bayes Gaussian model cross validation and Model Building

In [28]:
gnb = GaussianNB()

# Repeated stratified 10-fold cross-validation, 3 repeats, fixed seed (30 scores in total).
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# y is a one-column DataFrame; scikit-learn expects a 1-D target, so pass the flattened
# label vector instead of relying on an implicit (and warned-about) conversion.
# NOTE(review): scoring uses the full X / y, which includes the rows held out as the
# test set by the train/test split cell -- confirm this is intended.
n_scores = cross_val_score(
    estimator=gnb,
    X=X,
    y=y.to_numpy().ravel(),
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score='raise',
)

# Report the mean accuracy and its standard deviation over all folds.
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

Accuracy: 0.875 (0.017)


### Model fitting, prediction and evaluation

In [29]:
# Fit on the training split. y_train is a one-column DataFrame, so flatten it to the
# 1-D label vector scikit-learn expects instead of relying on an implicit conversion.
gnb.fit(x_train, y_train.to_numpy().ravel())

# Predict the held-out rows and score them; every metric is expressed as a percentage
# rounded to two decimals.
y_pred = gnb.predict(x_test)
accuracy = round(accuracy_score(y_test, y_pred) * 100, 2)
precision = round(precision_score(y_test, y_pred) * 100, 2)
recall = round(recall_score(y_test, y_pred) * 100, 2)
print("accuracy {}  precision {} recall {}".format(accuracy, precision, recall))

accuracy 88.31  precision 57.6 recall 74.23
