In [1]:
# 1. Imports (run this once at the top)
import os
import sys
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Resolve the directory one level above the current working directory and
# make sure it is on the import path, so the `src.*` imports in the next
# cell can be resolved. The guard keeps re-runs of this cell from adding
# the same entry twice.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if not (project_root in sys.path):
    sys.path.insert(0, project_root)

In [2]:
from src.model_training import prepare_data,  train_logistic_regression, train_random_forest
from src.evaluation import evaluate_model
from src.Feature_engineering import engineer_features

# 2. Load the processed dataset
# The CSV is expected at <project_root>/data/processed/telco_churn_processed.csv.
processed_path = os.path.join(
    project_root,
    "data",
    "processed",
    "telco_churn_processed.csv",
)
df = pd.read_csv(processed_path)

# Report (rows, columns) as a quick check that the file loaded as expected.
print("df shape: " + str(df.shape))

df shape: (7043, 28)


In [3]:
# Preview the top of the loaded frame: a caption line followed by the
# plain-text rendering of df.head().
print("First few rows of the dataframe:", df.head(), sep="\n")

First few rows of the dataframe:
   SeniorCitizen  tenure  MonthlyCharges  TotalCharges  charges_ratio  \
0              0       1           29.85         29.85       1.000000   
1              0      34           56.95       1889.50      33.178227   
2              0       2           53.85        108.15       2.008357   
3              0      45           42.30       1840.75      43.516548   
4              0       2           70.70        151.65       2.144979   

  tenure_group  gender_Male  Partner_Yes  Dependents_Yes  PhoneService_Yes  \
0        0-1yr        False         True           False             False   
1       2-4yrs         True        False           False              True   
2        0-1yr         True        False           False              True   
3       2-4yrs         True        False           False             False   
4        0-1yr        False        False           False              True   

   ...  StreamingMovies_Yes  Contract_One year  Contract_Tw

In [4]:
# Show the full column index of the loaded frame, so the exact column
# names are visible before the data is split.
column_index = df.columns
print(column_index)

Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges',
       'charges_ratio', 'tenure_group', 'gender_Male', 'Partner_Yes',
       'Dependents_Yes', 'PhoneService_Yes', 'MultipleLines_Yes',
       'InternetService_Fiber optic', 'InternetService_No',
       'OnlineSecurity_Yes', 'OnlineBackup_Yes', 'DeviceProtection_Yes',
       'TechSupport_Yes', 'StreamingTV_Yes', 'StreamingMovies_Yes',
       'Contract_One year', 'Contract_Two year', 'PaperlessBilling_Yes',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check',
       'Churn_Yes', 'HighSpender', 'HighChurnRisk'],
      dtype='object')


In [5]:
# 3. Build the train/test splits and the preprocessing object.
# prepare_data returns five values, unpacked here in order:
# train features, test features, train target, test target, preprocessor.
prepared = prepare_data(df)
X_train, X_test, y_train, y_test, preprocessor = prepared

# Print both feature-matrix shapes to confirm the split sizes.
print("X_train: " + str(X_train.shape) + " X_test: " + str(X_test.shape))

X_train: (5634, 26) X_test: (1409, 26)


In [6]:
# Sanity check on the training split created by the earlier prepare_data(df)
# call: count missing values in the training features and training target.
#
# The split is deliberately NOT recomputed here. Repeating prepare_data(df)
# would rebind X_train / X_test / y_train / y_test / preprocessor for no
# benefit and do the preparation work twice.
# NOTE(review): if prepare_data does not fix a random seed, a second call
# could also yield a different split from the one whose shapes were printed
# above -- confirm against src.model_training.
print("X_train missing:", X_train.isna().sum().sum())  # total over all feature columns
print("y_train missing:", y_train.isna().sum())

X_train missing: 0
y_train missing: 0


In [7]:
# Fit both candidate classifiers on the same training split.
# Each trainer receives the identical (features, target, preprocessor) triple;
# logistic regression is trained first, then the random forest.
# NOTE(review): the same `preprocessor` object is handed to both trainers --
# presumably each wraps it in its own pipeline; verify it is not shared
# mutable state inside src.model_training.
training_inputs = (X_train, y_train, preprocessor)
log_model = train_logistic_regression(*training_inputs)
rf_model = train_random_forest(*training_inputs)

In [8]:
# Confirm what kind of object each trainer returned.
print("Log model type: " + str(type(log_model)))
print("RF model type: " + str(type(rf_model)))

# 4. Evaluate models
# Both models are scored on the same held-out test split. evaluate_model
# returns a mapping from which the "accuracy" entry is read below.
held_out = (X_test, y_test)
log_results = evaluate_model(log_model, *held_out)
rf_results = evaluate_model(rf_model, *held_out)

# Report test-set accuracy for each model, logistic regression first.
print("Logistic Regression accuracy: " + str(log_results["accuracy"]))
print("Random Forest accuracy: " + str(rf_results["accuracy"]))

Log model type: <class 'sklearn.pipeline.Pipeline'>
RF model type: <class 'sklearn.pipeline.Pipeline'>
Logistic Regression accuracy: 0.7842441447835344
Random Forest accuracy: 0.7700496806245565
