**Import Libraries and Data**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from google.colab import files
uploaded = files.upload()

Saving Churn_Modelling.csv to Churn_Modelling.csv


In [3]:
# Load the dataset into a pandas DataFrame
data = pd.read_csv("Churn_Modelling.csv")
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
# Drop unnecessary columns that don't contribute to the model
columns_to_drop = ['RowNumber', 'CustomerId', 'Surname']
data = data.drop(columns=columns_to_drop)
print(data.columns)

Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Exited'],
      dtype='object')


**Convert String Labels into Numbers​**

*   Geography

In [5]:
# Convert 'Geography' to dummy variables for model compatibility
one_hot_ft = ["Geography"]
data = pd.get_dummies(data, columns=one_hot_ft)

*   Gender

In [6]:
# Encode 'Gender' as binary values (0 and 1)
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
data['gender'] = label_encoder.fit_transform(data['Gender'])

# Drop the original 'Gender' column as it's now encoded
data = data.drop(columns=["Gender"])
data.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain,gender
0,619,42,2,0.0,1,1,1,101348.88,1,True,False,False,0
1,608,41,1,83807.86,1,0,1,112542.58,0,False,False,True,0
2,502,42,8,159660.8,3,1,0,113931.57,1,True,False,False,0
3,699,39,1,0.0,2,0,0,93826.63,0,True,False,False,0
4,850,43,2,125510.82,1,1,1,79084.1,0,False,False,True,0


**Split Data into Training and Testing​**

In [7]:
# Separate the features (X) and the target (y)
X = data.loc[:, data.columns != 'Exited']
y = data.loc[:, data.columns == 'Exited']
print(X)

      CreditScore  Age  Tenure    Balance  NumOfProducts  HasCrCard  \
0             619   42       2       0.00              1          1   
1             608   41       1   83807.86              1          0   
2             502   42       8  159660.80              3          1   
3             699   39       1       0.00              2          0   
4             850   43       2  125510.82              1          1   
...           ...  ...     ...        ...            ...        ...   
9995          771   39       5       0.00              2          1   
9996          516   35      10   57369.61              1          1   
9997          709   36       7       0.00              1          0   
9998          772   42       3   75075.31              2          1   
9999          792   28       4  130142.79              1          1   

      IsActiveMember  EstimatedSalary  Geography_France  Geography_Germany  \
0                  1        101348.88              True              

In [8]:
# Split the data into training and testing sets (30% test, 70% train)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

**Create Naïve Bayes classifiers​**

In [9]:
# Initialize the Gaussian Naïve Bayes classifier
from sklearn.naive_bayes import GaussianNB

classifier = GaussianNB()

**Fit Dataset on Classifier​**

In [10]:
# Fit the classifier to the training data
classifier.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)



**Perform Prediction​**

In [11]:
# Predict on the test set
y_pred = classifier.predict(X_test)
print(y_pred)

[0 0 0 ... 0 1 0]


In [12]:
# Evaluate using confusion matrix and accuracy score
from sklearn.metrics import confusion_matrix, accuracy_score

cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)

# Output the accuracy score
ac

0.7813333333333333

In [13]:
# Print the classification report for detailed metrics
from sklearn.metrics import classification_report as cr
print(cr(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.96      0.87      2379
           1       0.38      0.09      0.15       621

    accuracy                           0.78      3000
   macro avg       0.59      0.53      0.51      3000
weighted avg       0.71      0.78      0.72      3000



**Naive Bayes with Multiple Labels​**

In [14]:
# Load dataset of news articles with multiple categories
from sklearn.datasets import fetch_20newsgroups
news = fetch_20newsgroups()

In [15]:
print(news.keys())

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])


In [16]:
news.target
news.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [17]:
# Define the categories of interest
cats = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

In [18]:
# Load training and test data for these categories
news_train = fetch_20newsgroups(subset='train', categories=cats)
news_test = fetch_20newsgroups(subset='test', categories=cats)

In [19]:
# Verify training data structure
print(news_train.keys())
print(news_train['target_names'])

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])
['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']


In [20]:
# Convert text data into a frequency matrix (CountVectorizer)
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
X_train_tf = count_vect.fit_transform(news_train.data)
X_train_tf.shape

(2257, 35788)

In [21]:
# Apply TF-IDF transformation to the frequency matrix
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_tf)
X_train_tfidf.shape

(2257, 35788)

In [22]:
from sklearn.naive_bayes import MultinomialNB

# Train the classifier on the transformed training data
clf = MultinomialNB().fit(X_train_tfidf, news_train.target)

In [23]:
# Transform the test data and make predictions
X_test_tf = count_vect.transform(news_test.data)
X_test_tfidf = tfidf_transformer.transform(X_test_tf)
predicted = clf.predict(X_test_tfidf)
predicted

array([2, 2, 3, ..., 2, 2, 1])

In [24]:
# Evaluate the performance with accuracy
from sklearn import metrics
from sklearn.metrics import accuracy_score

print("Accuracy", accuracy_score(news_test.target, predicted))

Accuracy 0.8348868175765646


In [25]:
# Generate and display the confusion matrix for multi-class classification
cm = metrics.confusion_matrix(news_test.target, predicted)
confMatrix = pd.DataFrame(cm, columns=['predicted_alt.atheism', 'predicted_soc.religion.christian', 'predicted_comp.graphics', 'predicted_sci.med'],
                          index=['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med'])
print(confMatrix)

                        predicted_alt.atheism  \
alt.atheism                               192   
soc.religion.christian                      2   
comp.graphics                               2   
sci.med                                     2   

                        predicted_soc.religion.christian  \
alt.atheism                                            2   
soc.religion.christian                               347   
comp.graphics                                         11   
sci.med                                                2   

                        predicted_comp.graphics  predicted_sci.med  
alt.atheism                                   6                119  
soc.religion.christian                        4                 36  
comp.graphics                               322                 61  
sci.med                                       1                393  
