In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix

# Load the dataset
data = pd.read_csv('Final_data.csv')


# Selecting features and target
features = data[['Monthly SKUs Purchased', 'Monthly Vendors Purchased', 'Monthly Number of Sales', 'Monthly Sum Sales ($)', 'Monthly Gross Margin ($)']]
target = data['Churned']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)

In [2]:
# Creating the Naive Bayes model
nb_model = GaussianNB()

# Training the model
nb_model.fit(X_train, y_train)

In [3]:
# Predicting the test set results
y_pred = nb_model.predict(X_test)

# Evaluating the algorithm
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     60251
           1       0.00      0.00      0.00       828

    accuracy                           0.99     61079
   macro avg       0.49      0.50      0.50     61079
weighted avg       0.97      0.99      0.98     61079

Confusion Matrix:
[[60251     0]
 [  828     0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
from sklearn.metrics import accuracy_score

# Assuming the Naive Bayes model is already trained as per the previous steps

# Predictions on the training set
train_predictions = nb_model.predict(X_train)

# Predictions on the test set
test_predictions = nb_model.predict(X_test)

# Calculate accuracy on training and test set
train_accuracy = accuracy_score(y_train, train_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)

print(f"Training Accuracy: {train_accuracy:.2f}")
print(f"Test Accuracy: {test_accuracy:.2f}")

# Check for overfitting or underfitting
if train_accuracy > test_accuracy:
    print("The model may be overfitting.")
elif train_accuracy < test_accuracy or (train_accuracy < threshold and test_accuracy < threshold):
    print("The model may be underfitting.")
else:
    print("The model seems to be well-fitted.")

Training Accuracy: 0.99
Test Accuracy: 0.99
The model may be overfitting.


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
file_path = 'Final_data.csv'  # Replace with your file path
data = pd.read_csv(file_path)

# Preprocessing
# Encode categorical variables
label_encoders = {}
categorical_columns = ['Division', 'Customer Type']  # Replace with your categorical columns
for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Drop non-numerical or irrelevant columns for simplicity
# Example: Dropping date columns and any other non-relevant column for the model
data = data.drop(['Customer_Number', 'YearMonth', 'Monthly Last Sales Date', 'Monthly First Sales Date', 'End of Month'], axis=1)

# Split the data into features and target
X = data.drop('Churned', axis=1)
y = data['Churned']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Gaussian Naive Bayes model
gnb = GaussianNB()

# Train the model
gnb.fit(X_train, y_train)

# Predict on the test set
y_pred = gnb.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")

Accuracy: 0.4263261296660118
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.42      0.59     40178
           1       0.02      0.67      0.03       542

    accuracy                           0.43     40720
   macro avg       0.50      0.54      0.31     40720
weighted avg       0.98      0.43      0.59     40720

