In [1]:
# load necessary dataset
import pandas as pd
import seaborn as sns
import datetime as dt
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

In [2]:
# load dataset
df = pd.read_csv('bigml_59c28831336c6604c800002a.csv')

# print the first 5 rows
df.head()

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [3]:
# Drop the unnecessary columns
df = df.drop(['phone number', 'account length'], axis=1)
df.head()

Unnamed: 0,state,area code,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,415,no,yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,415,no,yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,415,no,no,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,408,yes,no,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,415,yes,no,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [4]:
# get the valuecounts and percentages of the churn and non-churn
print("Raw Counts")
print(df["churn"].value_counts())
print()
print("Percentages")
print(df["churn"].value_counts(normalize=True))

Raw Counts
False    2850
True      483
Name: churn, dtype: int64

Percentages
False    0.855086
True     0.144914
Name: churn, dtype: float64


In [5]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Step 1: Ensure all categorical variables are converted to numerical using one-hot encoding
df = pd.get_dummies(df, columns=['international plan', 'voice mail plan'], drop_first=True)

# Step 2: Feature Selection
selected_features = ['total day minutes', 'total night minutes', 'customer service calls', 'international plan_yes']

# Step 3: Data Splitting
X = df[selected_features]
y = df['churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [6]:
# Instantiate and fit the StandardScaler on X_train
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
# Relevant imports
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import log_loss

# Creating the model
knn_baseline_model = KNeighborsClassifier()

# Perform cross-validation
knn_baseline_log_loss = -cross_val_score(knn_baseline_model, X_train_scaled, y_train, cv=5, scoring='neg_log_loss').mean()

knn_baseline_log_loss

1.2408876549310428

In [8]:
from sklearn.model_selection import GridSearchCV

# Define hyperparameter grids for grid search
param_grid = {
    'n_neighbors': [3, 5, 10, 15],  # Different number of neighbors
    'weights': ['uniform', 'distance'],  # Uniform and distance-based weighting
    'metric': ['euclidean', 'manhattan'],  # Different distance metrics
}

In [9]:
# Initialize kNN classifier
knn_model = KNeighborsClassifier()

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(knn_model, param_grid, cv=5, scoring='neg_log_loss', n_jobs=-1)

# Fit the models to find the best kNN
grid_search.fit(X_train_scaled, y_train)

In [10]:
# Get the best model and its parameters
best_knn_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_log_loss = -grid_search.best_score_

print("Best kNN Model Parameters:", best_params)
print("Best kNN Model Log Loss:", best_log_loss)

Best kNN Model Parameters: {'metric': 'manhattan', 'n_neighbors': 15, 'weights': 'uniform'}
Best kNN Model Log Loss: 0.6820147144358365


In [11]:
from sklearn.tree import DecisionTreeClassifier

# Create a baseline decision tree model
decision_tree_baseline_model = DecisionTreeClassifier(random_state=42)

# Perform cross-validation to calculate log loss
decision_tree_baseline_log_loss = -cross_val_score(
    decision_tree_baseline_model, X_train_scaled, y_train, cv=5, scoring='neg_log_loss'
).mean()

decision_tree_baseline_log_loss

5.653956268063774

In [12]:
# import necessary 
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [13]:
# Initialize DecisionTreeClassifier with random_state
dt_model = DecisionTreeClassifier(random_state=42)

# Define the hyperparameter grid
param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create GridSearchCV
grid_search_dt = GridSearchCV(dt_model, param_grid, cv=5, scoring='neg_log_loss', n_jobs=-1)

# Fit the grid search to the training data
grid_search_dt.fit(X_train_scaled, y_train)

In [14]:
# Get the best decision tree model and its parameters
best_dt_model = grid_search_dt.best_estimator_
best_params_dt = grid_search_dt.best_params_
best_log_loss_dt = -grid_search_dt.best_score_

print("Best Decision Tree Model Parameters:", best_params_dt)
print("Best Decision Tree Model Log Loss:", best_log_loss_dt)

Best Decision Tree Model Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Best Decision Tree Model Log Loss: 1.6170206717129723


In [15]:
# Replace None with appropriate code
final_model = KNeighborsClassifier(metric='manhattan', n_neighbors=15, weights='distance')

# Fit the model on the full training data
# (scaled or unscaled depending on the model)
final_model.fit(X_train_scaled, y_train)

In [16]:
# Replace None with appropriate code
from sklearn.metrics import accuracy_score, precision_score, recall_score

preds = final_model.predict(X_test_scaled)
probs = final_model.predict_proba(X_test_scaled)

print("log loss: ", log_loss(y_test, probs))
print("accuracy: ", accuracy_score(y_test, preds))
print("precision:", precision_score(y_test, preds))
print("recall:   ", recall_score(y_test, preds))

log loss:  0.6763825108539739
accuracy:  0.86810551558753
precision: 0.6153846153846154
recall:    0.32
