In [93]:
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots
import statsmodels.api as sm
from statsmodels.stats.outliers_influence \
import variance_inflation_factor as VIF
from statsmodels.stats.anova import anova_lm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
summarize,
poly)
from sklearn.metrics import accuracy_score
from ISLP import confusion_table
from ISLP.models import contrast
from sklearn.discriminant_analysis import \
    (LinearDiscriminantAnalysis as LDA,
     QuadraticDiscriminantAnalysis as QDA)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV

In [3]:
train = pd.read_csv("dataset/train.csv")
test = pd.read_csv("dataset/test.csv")

In [8]:
train.corr()

Unnamed: 0,ID,Age,AI_Interaction_Level,Satisfaction_with_AI_Services,AI_Personalization_Effectiveness,AI_Response_Time,Overall_Usage_Frequency,Customer_Service_Interactions,Change_in_Usage_Patterns,Customer_Churn
ID,1.0,-0.00365,0.001213,-0.026875,-0.021246,-0.015012,-0.004322,0.009345,-0.006244,-0.019766
Age,-0.00365,1.0,-0.7946,0.005429,0.009565,-0.010322,-0.005757,-0.012153,-0.002563,0.109484
AI_Interaction_Level,0.001213,-0.7946,1.0,-0.016912,-0.018981,0.010988,0.006177,0.002196,0.001754,-0.153978
Satisfaction_with_AI_Services,-0.026875,0.005429,-0.016912,1.0,0.963572,0.002311,0.006076,0.01527,-0.026991,-0.103294
AI_Personalization_Effectiveness,-0.021246,0.009565,-0.018981,0.963572,1.0,0.002159,0.0033,0.01318,-0.024739,-0.09806
AI_Response_Time,-0.015012,-0.010322,0.010988,0.002311,0.002159,1.0,-0.019246,0.017841,0.013044,-0.016676
Overall_Usage_Frequency,-0.004322,-0.005757,0.006177,0.006076,0.0033,-0.019246,1.0,0.007322,-0.017665,-0.019843
Customer_Service_Interactions,0.009345,-0.012153,0.002196,0.01527,0.01318,0.017841,0.007322,1.0,0.006894,0.003646
Change_in_Usage_Patterns,-0.006244,-0.002563,0.001754,-0.026991,-0.024739,0.013044,-0.017665,0.006894,1.0,0.027809
Customer_Churn,-0.019766,0.109484,-0.153978,-0.103294,-0.09806,-0.016676,-0.019843,0.003646,0.027809,1.0


In [19]:

features = ["AI_Interaction_Level", "Satisfaction_with_AI_Services"]

# Selecting only the specified features for X
allvars= train[features]

design = MS(allvars)
X = design.fit_transform(train)
y = train["Customer_Churn"]
glm = sm.GLM(y,X,family=sm.families.Binomial())
results = glm.fit()
summarize(results)

Unnamed: 0,coef,std err,z,P>|z|
intercept,1.3657,0.282,4.846,0.0
AI_Interaction_Level,-0.4118,0.051,-8.008,0.0
Satisfaction_with_AI_Services,-0.0776,0.01,-7.57,0.0
Age,-0.0062,0.004,-1.629,0.103


In [20]:
# Assuming you have a DataFrame `test` for which you need to make predictions
X_test = design.transform(test)  # Transform the test dataset
y_test = test["Customer_Churn"]   # Actual outcomes

# Get the predicted probabilities for the positive class (1)
predicted_probabilities = results.predict(X_test)

# Convert probabilities to binary predictions using 0.5 as the threshold
predictions = (predicted_probabilities > 0.5).astype(int)


In [22]:
from sklearn.metrics import confusion_matrix, classification_report

# Generate the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Print the confusion matrix
print("Confusion Matrix:")
print(cm)

# Generate a classification report
report = classification_report(y_test, predictions)
print("\nClassification Report:")
print(report)


Confusion Matrix:
[[589 125]
 [409 127]]

Classification Report:
              precision    recall  f1-score   support

           0       0.59      0.82      0.69       714
           1       0.50      0.24      0.32       536

    accuracy                           0.57      1250
   macro avg       0.55      0.53      0.51      1250
weighted avg       0.55      0.57      0.53      1250



In [24]:
lda = LDA(store_covariance=True)

In [62]:
train_model = MS(["AI_Interaction_Level", "Satisfaction_with_AI_Services"]).fit(train)
test_model = MS(["AI_Interaction_Level", "Satisfaction_with_AI_Services"]).fit(test)
x_train = train_model.transform(train)
x_test = test_model.transform(test)
y_train = train["Customer_Churn"]
y_test = test["Customer_Churn"]

x_train, x_test = [M.drop(columns=['intercept']) for M in [x_train,x_test]]
lda.fit(x_train,y_train)


In [28]:
lda_pred = lda.predict(x_test)
confusion_table(lda_pred,y_test)

Truth,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,582,398
1,132,138


In [111]:
accuracy = accuracy_score(y_test, lda_pred)
print(f"Accuracy: {accuracy:.2%}")

Accuracy: 57.60%


In [110]:
lda_prob = lda.predict_proba(x_test)
np.all(np.where(lda_prob[:,1]>=0.5,1,0)==lda_pred)

True

In [34]:
qda = QDA(store_covariance=True)
qda.fit(x_train,y_train)


In [35]:
qda_pred = qda.predict(x_test)
confusion_table(qda_pred,y_test)

Truth,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,605,416
1,109,120


In [108]:
accuracy = accuracy_score(y_test, qda_pred)
print(f"Accuracy: {accuracy:.2%}")

Accuracy: 58.00%


In [37]:
NB = GaussianNB()
NB.fit(x_train,y_train)


In [38]:
nb_labels = NB.predict(x_test)
confusion_table(nb_labels,y_test)

Truth,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,609,416
1,105,120


In [107]:
accuracy = accuracy_score(y_test, nb_labels)
print(f"Accuracy: {accuracy:.2%}")

Accuracy: 58.32%


In [49]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

for K in range(1, 200):
    # Initialize and fit the K-Nearest Neighbors classifier with K neighbors
    knn = KNeighborsClassifier(n_neighbors=K)
    knn_pred = knn.fit(x_train, y_train).predict(x_test)
    
    # Generate the confusion matrix
    C = confusion_matrix(y_test, knn_pred)
    
    # Assuming binary classification, and you're interested in the positive class ('1')
    TP = C[1, 1]  # True Positives: correctly predicted as class '1'
    TN = C[0, 0]  # True Negatives: correctly predicted as class '0'
    FP = C[0, 1]  # False Positives: incorrectly predicted as class '1'
    FN = C[1, 0]  # False Negatives: incorrectly predicted as class '0'
    
    total_predicted_positive = TP + FP  # Total predicted as class '1'
    did_actually_positive = TP  # Correctly predicted as class '1'
    
    if total_predicted_positive > 0:
        accuracy_of_positive_predictions = did_actually_positive / total_predicted_positive
    else:
        accuracy_of_positive_predictions = 0  # To handle division by zero
    
    templ = ('K = {0:2d}: # predicted as positive: {1:3d}, # actually positive: {2:3d}, accuracy of positive predictions: {3:.1%}')
    print(templ.format(K, total_predicted_positive, did_actually_positive, accuracy_of_positive_predictions))


K =  1: # predicted as positive: 575, # actually positive: 250, accuracy of positive predictions: 43.5%
K =  2: # predicted as positive: 238, # actually positive: 101, accuracy of positive predictions: 42.4%
K =  3: # predicted as positive: 515, # actually positive: 241, accuracy of positive predictions: 46.8%
K =  4: # predicted as positive: 237, # actually positive: 119, accuracy of positive predictions: 50.2%
K =  5: # predicted as positive: 541, # actually positive: 269, accuracy of positive predictions: 49.7%
K =  6: # predicted as positive: 238, # actually positive: 126, accuracy of positive predictions: 52.9%
K =  7: # predicted as positive: 496, # actually positive: 267, accuracy of positive predictions: 53.8%
K =  8: # predicted as positive: 265, # actually positive: 140, accuracy of positive predictions: 52.8%
K =  9: # predicted as positive: 584, # actually positive: 308, accuracy of positive predictions: 52.7%
K = 10: # predicted as positive: 438, # actually positive: 232, 

In [50]:
knn134 = KNeighborsClassifier(n_neighbors=134)
knn134.fit(x_train,y_train)
knn134_pred = knn134.predict(x_test)
confusion_table(knn134_pred,y_test)

Truth,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,550,312
1,164,224


In [51]:
np.mean(knn134_pred==y_test)

0.6192

In [55]:
train_model_3 = MS(["AI_Personalization_Effectiveness", "Satisfaction_with_AI_Services","Change_in_Usage_Patterns"]).fit(train)
test_model_3 = MS(["AI_Personalization_Effectiveness", "Satisfaction_with_AI_Services","Change_in_Usage_Patterns"]).fit(test)
x_train_3 = train_model_3.transform(train)
x_test_3 = test_model_3.transform(test)
y_train_3 = train["Customer_Churn"]
y_test_3 = test["Customer_Churn"]

x_train_3, x_test_3 = [M.drop(columns=['intercept']) for M in [x_train_3,x_test_3]]


In [56]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Initialize a dictionary to store accuracy for each K
accuracy_for_each_k = {}

for K in range(1, 400):
    knn = KNeighborsClassifier(n_neighbors=K)
    knn.fit(x_train_3, y_train_3)
    knn_pred_3 = knn.predict(x_test_3)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test_3, knn_pred_3)
    
    # Store accuracy in the dictionary with K as the key
    accuracy_for_each_k[K] = accuracy

# Find the K with the highest accuracy
best_K = max(accuracy_for_each_k, key=accuracy_for_each_k.get)
best_accuracy = accuracy_for_each_k[best_K]

print(f"The best K is {best_K} with an accuracy of {best_accuracy:.2%}")


The best K is 136 with an accuracy of 59.92%


In [90]:
logit = LogisticRegression(C=1e10,solver='liblinear')
logit.fit(x_train,y_train)
logit_prob = logit.predict_proba(x_test)
logit_labels = np.where(logit_prob[:,1] > 0.475, 1, 0)

confusion_table(logit_labels,y_test)

Truth,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,553,372
1,161,164


In [91]:
accuracy = accuracy_score(y_test, logit_labels)
print(f"Accuracy: {accuracy:.2%}")

Accuracy: 57.36%


In [94]:
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strength
    'penalty': ['l1', 'l2'],  # Type of regularization
    'solver': ['liblinear']  # Algorithm to use in the optimization problem
}
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(x_train, y_train)
# Best parameter set found
print("Best parameters found: ", grid_search.best_params_)

# Best score
print("Best score: ", grid_search.best_score_)

# Best estimator (model)
best_model = grid_search.best_estimator_

# You can now use best_model to make predictions


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters found:  {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
Best score:  0.5833999999999999


In [106]:
logit_new = LogisticRegression(C=0.1,solver='liblinear',penalty='l2')
logit_new.fit(x_train,y_train)
logit_prob_new = logit_new.predict_proba(x_test)
logit_labels_new = np.where(logit_prob_new[:,1] > 0.45, 1, 0)
accuracy = accuracy_score(y_test, logit_labels_new)
print(f"Accuracy: {accuracy:.2%}")
confusion_table(logit_labels_new,y_test)


Accuracy: 59.04%


Truth,0,1
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0,496,294
1,218,242
