In [1]:
# Step 1: Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Step 2: Load the dataset
# Replace 'telco_customer_churn.csv' with the path to your dataset file
data = pd.read_csv('telco_customer_churn.csv')

# Step 3: Explore the dataset
print("Dataset Shape:", data.shape)
print("First few rows of the dataset:\n", data.head())
print("\nColumns with missing values:\n", data.isnull().sum())

# Step 4: Handle missing values
# Fill numerical missing values with the mean and categorical missing values with the mode
for column in data.columns:
    if data[column].dtype == 'object':
        data[column].fillna(data[column].mode()[0], inplace=True)
    else:
        data[column].fillna(data[column].mean(), inplace=True)

print("\nMissing values after handling:\n", data.isnull().sum())

# Step 5: Encode categorical variables
# Convert the target variable 'Churn' into binary (0/1)
label_encoder = LabelEncoder()
data['Churn'] = label_encoder.fit_transform(data['Churn'])

# Perform one-hot encoding for other categorical columns
categorical_columns = data.select_dtypes(include=['object']).columns
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Step 6: Normalize numerical features
scaler = StandardScaler()
numerical_columns = data.select_dtypes(include=['int64', 'float64']).columns
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

# Step 7: Split the dataset
# Separate features and target variable
X = data.drop('Churn', axis=1)
y = data['Churn']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\nTraining set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Dataset Shape: (7043, 21)
First few rows of the dataset:
    customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  Te

In [2]:
# Step 1: Import necessary libraries
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

# Step 2: Train Logistic Regression Model
print("Training Logistic Regression...")
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)

# Evaluate Logistic Regression
print("\nLogistic Regression Evaluation:")
print("Accuracy:", accuracy_score(y_test, lr_predictions))
print("Confusion Matrix:\n", confusion_matrix(y_test, lr_predictions))
print("Classification Report:\n", classification_report(y_test, lr_predictions))
print("AUC-ROC:", roc_auc_score(y_test, lr_model.predict_proba(X_test)[:, 1]))

# Step 3: Train Random Forest Model
print("\nTraining Random Forest...")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)

# Evaluate Random Forest
print("\nRandom Forest Evaluation:")
print("Accuracy:", accuracy_score(y_test, rf_predictions))
print("Confusion Matrix:\n", confusion_matrix(y_test, rf_predictions))
print("Classification Report:\n", classification_report(y_test, rf_predictions))
print("AUC-ROC:", roc_auc_score(y_test, rf_model.predict_proba(X_test)[:, 1]))

# Step 4: Train Gradient Boosting Model
print("\nTraining Gradient Boosting...")
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)
gb_predictions = gb_model.predict(X_test)

# Evaluate Gradient Boosting
print("\nGradient Boosting Evaluation:")
print("Accuracy:", accuracy_score(y_test, gb_predictions))
print("Confusion Matrix:\n", confusion_matrix(y_test, gb_predictions))
print("Classification Report:\n", classification_report(y_test, gb_predictions))
print("AUC-ROC:", roc_auc_score(y_test, gb_model.predict_proba(X_test)[:, 1]))

Training Logistic Regression...


ValueError: Unknown label type: 'continuous'

In [3]:
print(y_train.unique())  # Show unique values in the target variable
print(y_train.dtype)     # Check the data type of the target variable

[-0.60102348  1.66382851]
float64


In [4]:
y_train = y_train.astype(int)
y_test = y_test.astype(int)

In [5]:
print(y_train.unique())  # Show unique values in the target variable
print(y_train.dtype)     # Check the data type of the target variable

[0 1]
int64


In [6]:
# Check the target variable's encoding
print("Unique values in y_train before conversion:", y_train.unique())

# Convert target variable to integers if needed
y_train = y_train.astype(int)
y_test = y_test.astype(int)

# Confirm the fix
print("Unique values in y_train after conversion:", y_train.unique())

# Now train the Logistic Regression model
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)

# Evaluate Logistic Regression
print("\nLogistic Regression Evaluation:")
print("Accuracy:", accuracy_score(y_test, lr_predictions))
print("Confusion Matrix:\n", confusion_matrix(y_test, lr_predictions))
print("Classification Report:\n", classification_report(y_test, lr_predictions))
print("AUC-ROC:", roc_auc_score(y_test, lr_model.predict_proba(X_test)[:, 1]))

Unique values in y_train before conversion: [0 1]
Unique values in y_train after conversion: [0 1]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Logistic Regression Evaluation:
Accuracy: 0.8261178140525195
Confusion Matrix:
 [[942  94]
 [151 222]]
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.91      0.88      1036
           1       0.70      0.60      0.64       373

    accuracy                           0.83      1409
   macro avg       0.78      0.75      0.76      1409
weighted avg       0.82      0.83      0.82      1409

AUC-ROC: 0.8612794621507759


In [7]:
# Step 1: Import necessary libraries
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

# Step 2: Train Logistic Regression Model
print("Training Logistic Regression...")
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)

# Evaluate Logistic Regression
print("\nLogistic Regression Evaluation:")
print("Accuracy:", accuracy_score(y_test, lr_predictions))
print("Confusion Matrix:\n", confusion_matrix(y_test, lr_predictions))
print("Classification Report:\n", classification_report(y_test, lr_predictions))
print("AUC-ROC:", roc_auc_score(y_test, lr_model.predict_proba(X_test)[:, 1]))

# Step 3: Train Random Forest Model
print("\nTraining Random Forest...")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)

# Evaluate Random Forest
print("\nRandom Forest Evaluation:")
print("Accuracy:", accuracy_score(y_test, rf_predictions))
print("Confusion Matrix:\n", confusion_matrix(y_test, rf_predictions))
print("Classification Report:\n", classification_report(y_test, rf_predictions))
print("AUC-ROC:", roc_auc_score(y_test, rf_model.predict_proba(X_test)[:, 1]))

# Step 4: Train Gradient Boosting Model
print("\nTraining Gradient Boosting...")
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)
gb_predictions = gb_model.predict(X_test)

# Evaluate Gradient Boosting
print("\nGradient Boosting Evaluation:")
print("Accuracy:", accuracy_score(y_test, gb_predictions))
print("Confusion Matrix:\n", confusion_matrix(y_test, gb_predictions))
print("Classification Report:\n", classification_report(y_test, gb_predictions))
print("AUC-ROC:", roc_auc_score(y_test, gb_model.predict_proba(X_test)[:, 1]))

Training Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Logistic Regression Evaluation:
Accuracy: 0.8261178140525195
Confusion Matrix:
 [[942  94]
 [151 222]]
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.91      0.88      1036
           1       0.70      0.60      0.64       373

    accuracy                           0.83      1409
   macro avg       0.78      0.75      0.76      1409
weighted avg       0.82      0.83      0.82      1409

AUC-ROC: 0.8612794621507759

Training Random Forest...

Random Forest Evaluation:
Accuracy: 0.8034066713981547
Confusion Matrix:
 [[963  73]
 [204 169]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.93      0.87      1036
           1       0.70      0.45      0.55       373

    accuracy                           0.80      1409
   macro avg       0.76      0.69      0.71      1409
weighted avg       0.79      0.80      0.79      1409

AUC-ROC: 0.8562875878559525

Training Gra

In [8]:
from sklearn.model_selection import GridSearchCV

# Define hyperparameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs']
}

# Initialize GridSearchCV
grid_search = GridSearchCV(LogisticRegression(random_state=42), param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train, y_train)

# Get the best model
best_lr_model = grid_search.best_estimator_

# Evaluate the tuned Logistic Regression
best_lr_predictions = best_lr_model.predict(X_test)
best_lr_probabilities = best_lr_model.predict_proba(X_test)[:, 1]

print("\nTuned Logistic Regression Evaluation:")
print("Accuracy:", accuracy_score(y_test, best_lr_predictions))
print("Confusion Matrix:\n", confusion_matrix(y_test, best_lr_predictions))
print("Classification Report:\n", classification_report(y_test, best_lr_predictions))
print("AUC-ROC:", roc_auc_score(y_test, best_lr_probabilities))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


Tuned Logistic Regression Evaluation:
Accuracy: 0.8261178140525195
Confusion Matrix:
 [[942  94]
 [151 222]]
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.91      0.88      1036
           1       0.70      0.60      0.64       373

    accuracy                           0.83      1409
   macro avg       0.78      0.75      0.76      1409
weighted avg       0.82      0.83      0.82      1409

AUC-ROC: 0.8612794621507759


In [9]:
from sklearn.metrics import precision_recall_curve

# Predict probabilities for the test set
y_prob = best_lr_model.predict_proba(X_test)[:, 1]

# Calculate precision, recall, and thresholds
precision, recall, thresholds = precision_recall_curve(y_test, y_prob)

# Find a threshold that improves recall
optimal_threshold = 0.4  # Example threshold
adjusted_predictions = (y_prob >= optimal_threshold).astype(int)

# Evaluate with the adjusted threshold
print("\nEvaluation with Adjusted Threshold (0.4):")
print("Accuracy:", accuracy_score(y_test, adjusted_predictions))
print("Confusion Matrix:\n", confusion_matrix(y_test, adjusted_predictions))
print("Classification Report:\n", classification_report(y_test, adjusted_predictions))


Evaluation with Adjusted Threshold (0.4):
Accuracy: 0.8026969481902059
Confusion Matrix:
 [[875 161]
 [117 256]]
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.84      0.86      1036
           1       0.61      0.69      0.65       373

    accuracy                           0.80      1409
   macro avg       0.75      0.77      0.76      1409
weighted avg       0.81      0.80      0.81      1409



In [10]:
import joblib

# Save the Logistic Regression model
joblib.dump(best_lr_model, 'logistic_regression_model_with_threshold.pkl')
print("Model saved as logistic_regression_model_with_threshold.pkl")

Model saved as logistic_regression_model_with_threshold.pkl


In [11]:
pip install flask

Note: you may need to restart the kernel to use updated packages.


In [1]:
import joblib
from sklearn.preprocessing import StandardScaler

# Assuming `scaler` is your StandardScaler object
joblib.dump(scaler, 'scaler.pkl')

NameError: name 'scaler' is not defined

In [2]:
from sklearn.preprocessing import StandardScaler
import joblib

# Initialize and fit the scaler on the training features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save the scaler to a file
joblib.dump(scaler, 'scaler.pkl')
print("Scaler saved as scaler.pkl")

# Train the model using the scaled features
model = LogisticRegression(random_state=42)
model.fit(X_train_scaled, y_train)

# Save the model
joblib.dump(model, 'logistic_regression_model_with_threshold.pkl')
print("Model saved as logistic_regression_model_with_threshold.pkl")

NameError: name 'X_train' is not defined

In [3]:
data.head()

NameError: name 'data' is not defined