## Data Preparation

In [16]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read the churn dataset into a DataFrame (path points at the Colab working directory)
dataset_path = "/content/Churn_Modelling.csv"
df = pd.read_csv(dataset_path)

In [17]:
# Strip currency formatting ('$' signs and ',' separators) from the monetary
# columns and convert them to float.
# The pattern is a raw string: in a normal string literal '\$' is an invalid
# escape sequence, which triggers a warning on current Python versions.
currency_pattern = r'[\$,]'
for money_column in ('Balance', 'EstimatedSalary'):
    # Series.replace(..., regex=True) only rewrites string values, so this
    # also works if the column was already parsed as numeric by read_csv.
    df[money_column] = (
        df[money_column]
        .replace(currency_pattern, '', regex=True)
        .astype(float)
    )







## K-NN:

In [18]:
# Predictor columns; 'Geography' is categorical and is one-hot encoded below
feature_columns = [
    'CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
    'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Geography',
]
X = df[feature_columns]
y = df['Exited']  # target variable

# Expand 'Geography' into indicator columns (drop_first=True omits one level)
X = pd.get_dummies(X, columns=['Geography'], drop_first=True)

# Hold out 20% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train a 5-nearest-neighbours classifier and predict the held-out rows
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = knn_model.predict(X_test)

# Score the predictions against the held-out labels
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Report the results
print(f"K-NN Accuracy: {accuracy}")
print("Classification Report:")
print(report)


K-NN Accuracy: 0.844
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.95      0.91      1607
           1       0.66      0.42      0.52       393

    accuracy                           0.84      2000
   macro avg       0.77      0.69      0.71      2000
weighted avg       0.83      0.84      0.83      2000



## K-NN Hyperparameter Tuning:

In [19]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters: neighbourhood size, vote weighting, and the
# Minkowski power parameter p
param_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
}

# Base estimator whose hyperparameters are searched
knn = KNeighborsClassifier()

# Exhaustive search over the grid, scored by 5-fold cross-validated accuracy
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train, y_train)

# Winning parameter combination and the estimator built with it
best_params = grid_search.best_params_
best_knn_model = grid_search.best_estimator_

# Predict the held-out test rows with the selected model
y_pred_best = best_knn_model.predict(X_test)

# Score those predictions
accuracy_best = accuracy_score(y_test, y_pred_best)
report_best = classification_report(y_test, y_pred_best)

# Report the results
print(f"Best K-NN Parameters: {best_params}")
print(f"Best K-NN Accuracy: {accuracy_best}")
print("Classification Report for Best K-NN Model:")
print(report_best)


Best K-NN Parameters: {'n_neighbors': 7, 'p': 2, 'weights': 'distance'}
Best K-NN Accuracy: 0.846
Classification Report for Best K-NN Model:
              precision    recall  f1-score   support

           0       0.87      0.95      0.91      1607
           1       0.67      0.42      0.52       393

    accuracy                           0.85      2000
   macro avg       0.77      0.69      0.71      2000
weighted avg       0.83      0.85      0.83      2000



## K-Means Clustering:

In [20]:
from sklearn.cluster import KMeans

# Numeric features used to form the clusters
cluster_features = ['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'EstimatedSalary']
X_cluster = df[cluster_features]

# Standardize the clustering features before fitting K-Means
scaler_cluster = StandardScaler()
X_cluster_scaled = scaler_cluster.fit_transform(X_cluster)

# Number of clusters (adjust based on your analysis)
num_clusters = 3

# Fit K-Means and store each row's cluster label on the DataFrame.
# n_init is set explicitly because scikit-learn's default for it differs
# between releases; pinning it keeps results stable across versions.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
df['Cluster'] = kmeans.fit_predict(X_cluster_scaled)

# Per-cluster mean of every numeric column.
# numeric_only=True is required: without it pandas warns (older versions) or
# raises (pandas 2.x) when the frame contains non-numeric columns.
cluster_characteristics = df.groupby('Cluster').mean(numeric_only=True)

# Print the cluster characteristics
print(cluster_characteristics)




           RowNumber    CustomerId  CreditScore        Age    Tenure  \
Cluster                                                                
0        4983.229127  1.568909e+07   650.382678  39.457774  4.978167   
1        5024.857260  1.569209e+07   649.554039  38.375000  5.063046   
2        4992.538745  1.569256e+07   652.457103  38.815498  4.994465   

               Balance  NumOfProducts  HasCrCard  IsActiveMember  \
Cluster                                                            
0        120139.112601       1.000000   0.699136        0.509837   
1           752.769795       1.776747   0.715884        0.518559   
2        120553.931287       2.132841   0.700185        0.519373   

         EstimatedSalary    Exited  
Cluster                             
0           99710.180837  0.255758  
1           98686.417102  0.135917  
2          103193.419197  0.218173  


  cluster_characteristics = df.groupby('Cluster').mean()


In [21]:
# Read the inertia_ attribute from the K-Means model fitted in the clustering
# cell above (scikit-learn documents it as the sum of squared distances of
# samples to their closest cluster center) and print it.
inertia = kmeans.inertia_
print(f"Inertia: {inertia}")


Inertia: 34999.87224228427


In [22]:
from sklearn.metrics import silhouette_score

# Cluster assignments produced by the fitted K-Means model
cluster_labels = kmeans.labels_

# Mean silhouette coefficient over the standardized clustering features
silhouette_avg = silhouette_score(X_cluster_scaled, labels=cluster_labels)
print(f"Silhouette Score: {silhouette_avg}")


Silhouette Score: 0.2078368231781088
