In [27]:
# notebooks/train_model.ipynb
import os
import warnings

import joblib
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
# Silence every warning from here on: the 'ignore' action is installed with no
# category, message or module restriction.
warnings.filterwarnings('ignore')

# Set MLflow tracking URI
# Runs are recorded through a SQLite URL pointing at a file named mlflow.db.
mlflow.set_tracking_uri("sqlite:///mlflow.db")
# Experiment that the training runs started further down are logged under.
mlflow.set_experiment("churn-prediction-experiment")

# 1. Load Data
# For this example, we'll use the Telco Customer Churn dataset
# You can download it from: https://www.kaggle.com/blastchar/telco-customer-churn
# Or use any other churn dataset from Kaggle

# Create sample data if you don't have the dataset
def create_sample_churn_data(n_samples=5000, seed=42):
    """Build a synthetic stand-in for the Telco Customer Churn table.

    Every column, including ``Churn``, is drawn independently of all the
    others, so the frame carries no learnable signal. It is meant only for
    exercising the pipeline end to end when the real dataset is unavailable.

    Parameters
    ----------
    n_samples : int, default 5000
        Number of customer rows to generate.
    seed : int, default 42
        Seed for the private random generator. With the defaults the output
        is identical to what ``np.random.seed(42)`` followed by module-level
        ``np.random`` calls produces.

    Returns
    -------
    pandas.DataFrame
        ``n_samples`` rows and 21 columns named like the Telco dataset.
    """
    # A private RandomState yields the same stream as seeding the global RNG,
    # but does not reset the global generator for the rest of the notebook.
    rng = np.random.RandomState(seed)

    yes_no = ['Yes', 'No']
    internet_addon = ['Yes', 'No', 'No internet service']

    # The draws happen in key order; keep this order to keep the data reproducible.
    data = {
        'customerID': [f'ID_{i}' for i in range(n_samples)],
        'gender': rng.choice(['Male', 'Female'], n_samples),
        'SeniorCitizen': rng.choice([0, 1], n_samples, p=[0.8, 0.2]),
        'Partner': rng.choice(yes_no, n_samples),
        'Dependents': rng.choice(yes_no, n_samples),
        # randint's upper bound is exclusive, so tenure ranges over 0..71
        'tenure': rng.randint(0, 72, n_samples),
        'PhoneService': rng.choice(yes_no, n_samples, p=[0.9, 0.1]),
        'MultipleLines': rng.choice(['Yes', 'No', 'No phone service'], n_samples),
        'InternetService': rng.choice(['DSL', 'Fiber optic', 'No'], n_samples),
        'OnlineSecurity': rng.choice(internet_addon, n_samples),
        'OnlineBackup': rng.choice(internet_addon, n_samples),
        'DeviceProtection': rng.choice(internet_addon, n_samples),
        'TechSupport': rng.choice(internet_addon, n_samples),
        'StreamingTV': rng.choice(internet_addon, n_samples),
        'StreamingMovies': rng.choice(internet_addon, n_samples),
        'Contract': rng.choice(['Month-to-month', 'One year', 'Two year'], n_samples),
        'PaperlessBilling': rng.choice(yes_no, n_samples),
        'PaymentMethod': rng.choice(['Electronic check', 'Mailed check', 'Bank transfer', 'Credit card'], n_samples),
        'MonthlyCharges': rng.uniform(20, 120, n_samples),
        'TotalCharges': rng.uniform(20, 8000, n_samples),
        'Churn': rng.choice(yes_no, n_samples, p=[0.27, 0.73])
    }

    return pd.DataFrame(data)

# Load or create data
try:
    df = pd.read_csv('data/telco_churn.csv')
except FileNotFoundError:
    # Only a missing file triggers the synthetic fallback. Any other failure
    # (unreadable or malformed CSV, a keyboard interrupt) is surfaced rather
    # than silently training on random data.
    print("Creating sample data...")
    df = create_sample_churn_data()

print(f"Dataset shape: {df.shape}")
print(df.head())

# 2. Data Preprocessing
def preprocess_data(df):
    """Encode the raw churn table for modelling.

    The caller's frame is left untouched; a new frame is returned.

    Processing steps (each one is skipped when its column is absent):
      1. drop the ``customerID`` identifier;
      2. coerce ``TotalCharges`` to numeric and replace unparseable or missing
         entries with the column median;
      3. map the two-valued columns to 1/0 (Yes/Male -> 1, No/Female -> 0);
         any value outside that mapping becomes NaN;
      4. one-hot encode the multi-valued service, contract and payment columns;
      5. map the ``Churn`` target to 1/0.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw customer table.

    Returns
    -------
    pandas.DataFrame
        Encoded copy of ``df``.
    """
    # drop() hands back a new frame, so the assignments below never write
    # into the frame the caller passed in.
    df = df.drop('customerID', axis=1, errors='ignore')

    # Convert TotalCharges to numeric
    if 'TotalCharges' in df.columns:
        total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
        # Assign the filled column back instead of calling fillna(inplace=True)
        # on a column selection: under pandas Copy-on-Write that in-place call
        # only modifies a temporary and the NaNs stay in the frame.
        df['TotalCharges'] = total_charges.fillna(total_charges.median())

    # Encode binary columns
    binary_mapping = {'Yes': 1, 'No': 0, 'Male': 1, 'Female': 0}
    binary_cols = ['gender', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
    for col in binary_cols:
        if col in df.columns:
            df[col] = df[col].map(binary_mapping)

    # Encode multi-class categorical columns
    categorical_cols = ['MultipleLines', 'InternetService', 'OnlineSecurity',
                        'OnlineBackup', 'DeviceProtection', 'TechSupport',
                        'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod']
    present_categoricals = [col for col in categorical_cols if col in df.columns]
    df_encoded = pd.get_dummies(df, columns=present_categoricals)

    # Encode target variable
    if 'Churn' in df_encoded.columns:
        df_encoded['Churn'] = df_encoded['Churn'].map({'Yes': 1, 'No': 0})

    return df_encoded

# Run the raw frame through the encoding step defined above
df_processed = preprocess_data(df)
print(f"Processed dataset shape: {df_processed.shape}")

# 3. Feature Engineering
# Two derived columns: a tenure x monthly-charge interaction, and total charges
# spread over tenure (the +1 keeps zero-tenure rows from dividing by zero).
df_processed['tenure_MonthlyCharges'] = df_processed['tenure'].mul(df_processed['MonthlyCharges'])
df_processed['TotalCharges_per_Month'] = df_processed['TotalCharges'].div(df_processed['tenure'].add(1))

# 4. Split the data
# Everything except the target is a feature; the split is stratified on the
# target so both partitions keep the same churn rate.
X = df_processed.drop(columns='Churn')
y = df_processed.loc[:, 'Churn']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# 5. Scale the features
# The scaler is fitted on the training split only and then applied to the test
# split, so test-set statistics do not influence the transformation.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save the scaler
# Create the output directory first: joblib.dump does not create missing parent
# directories and would otherwise fail on a fresh checkout.
os.makedirs('../app', exist_ok=True)
joblib.dump(scaler, '../app/scaler.pkl')

# 6. Model Training with MLflow Tracking
# Candidate classifiers keyed by their display name; the name doubles as the
# MLflow run name in the training loop.
models = dict([
    ('Logistic Regression', LogisticRegression(random_state=42, max_iter=1000)),
    ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(n_estimators=100, random_state=42)),
])

# Running record of the strongest candidate seen so far (by AUC-ROC)
best_model, best_auc, best_model_name = None, 0, ""

for model_name, model in models.items():
    # Artifact paths use the display name with spaces replaced by underscores
    artifact_name = model_name.replace(" ", "_")

    # One MLflow run per candidate, named after the model
    with mlflow.start_run(run_name=model_name):
        # Fit on the scaled training split
        model.fit(X_train_scaled, y_train)

        # Hard labels feed the threshold metrics; the positive-class
        # probability (column 1) feeds AUC-ROC
        y_pred = model.predict(X_test_scaled)
        y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

        # Evaluate on the held-out split
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_pred_proba)

        # Record the estimator's hyper-parameters, then each metric in turn
        mlflow.log_params(model.get_params())
        for metric_name, metric_value in (
            ("accuracy", accuracy),
            ("precision", precision),
            ("recall", recall),
            ("f1_score", f1),
            ("auc_roc", auc),
        ):
            mlflow.log_metric(metric_name, metric_value)

        # Persist the fitted estimator as a run artifact
        mlflow.sklearn.log_model(model, artifact_name)

        # Render the confusion matrix and attach it to the run; the figure is
        # closed afterwards so figures do not pile up across iterations
        cm = confusion_matrix(y_test, y_pred)
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_title(f'Confusion Matrix - {model_name}')
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Actual')
        mlflow.log_figure(fig, f"confusion_matrix_{artifact_name}.png")
        plt.close(fig)

        # Console summary for this candidate
        print(f"\n{model_name} Results:")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1-Score: {f1:.4f}")
        print(f"AUC-ROC: {auc:.4f}")

        # Keep the candidate with the strictly highest AUC-ROC.
        # NOTE(review): the winner is chosen on the same test split whose
        # score is reported, so the reported best AUC is optimistically
        # biased -- consider selecting via cross-validation on the training
        # split instead.
        if auc > best_auc:
            best_auc, best_model, best_model_name = auc, model, model_name

# 7. Save the best model
# Announce the winner of the comparison and persist it for the serving app
print(f"\nBest Model: {best_model_name} with AUC-ROC: {best_auc:.4f}")
joblib.dump(best_model, '../app/churn_model.pkl')

# Save feature names for API
# The column order of X is the order the scaler and the model were fitted with
feature_names = list(X.columns)
joblib.dump(feature_names, '../app/feature_names.pkl')

print(
    "\nModel training completed!",
    "Best model saved to: ../app/churn_model.pkl",
    "Scaler saved to: ../app/scaler.pkl",
    "Feature names saved to: ../app/feature_names.pkl",
    sep="\n",
)



Creating sample data...
Dataset shape: (5000, 21)
  customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0       ID_0    Male              1      No        Yes      61          Yes   
1       ID_1  Female              0      No         No      68          Yes   
2       ID_2    Male              0      No         No      62           No   
3       ID_3    Male              1     Yes         No       1          Yes   
4       ID_4    Male              0     Yes        Yes      53          Yes   

      MultipleLines InternetService       OnlineSecurity  ...  \
0               Yes     Fiber optic                   No  ...   
1               Yes     Fiber optic  No internet service  ...   
2  No phone service              No                   No  ...   
3               Yes              No                   No  ...   
4  No phone service              No                   No  ...   

      DeviceProtection          TechSupport          StreamingTV  \
0               




Logistic Regression Results:
Accuracy: 0.7250
Precision: 0.0000
Recall: 0.0000
F1-Score: 0.0000
AUC-ROC: 0.5077





Random Forest Results:
Accuracy: 0.7240
Precision: 0.3333
Recall: 0.0036
F1-Score: 0.0072
AUC-ROC: 0.5000





Gradient Boosting Results:
Accuracy: 0.7240
Precision: 0.4286
Recall: 0.0109
F1-Score: 0.0213
AUC-ROC: 0.4983

Best Model: Logistic Regression with AUC-ROC: 0.5077

Model training completed!
Best model saved to: ../app/churn_model.pkl
Scaler saved to: ../app/scaler.pkl
Feature names saved to: ../app/feature_names.pkl


In [28]:
import subprocess
import sys

def install_package(package):
    """Install ``package`` with pip under the interpreter running this kernel.

    ``package`` is handed to pip as a single argument, so a pinned requirement
    string such as ``"pandas==2.1.4"`` is accepted. Raises
    ``subprocess.CalledProcessError`` when pip exits with a non-zero status.
    """
    # sys.executable targets the kernel's own environment rather than whatever
    # "pip" happens to be first on PATH.
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)

# Import name -> pinned pip requirement. The two differ for scikit-learn,
# which is imported as "sklearn".
required_packages = {
    'pandas': 'pandas==2.1.4',
    'numpy': 'numpy==1.26.3',
    'matplotlib': 'matplotlib==3.8.2',
    'seaborn': 'seaborn==0.13.0',
    'sklearn': 'scikit-learn==1.3.2',
    'joblib': 'joblib==1.3.2',
    'mlflow': 'mlflow==2.9.2'
}

# Install only what cannot be imported. A successful import is taken as
# "installed"; the version that is already present is not compared to the pin.
for module_name, package_name in required_packages.items():
    try:
        __import__(module_name)
    except ImportError:
        print(f"Installing {package_name}...")
        install_package(package_name)
        print(f"✓ {package_name} installed successfully")
    else:
        print(f"✓ {module_name} is already installed")

✓ pandas is already installed
✓ numpy is already installed
✓ matplotlib is already installed
✓ seaborn is already installed
✓ sklearn is already installed
✓ joblib is already installed
✓ mlflow is already installed


In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import mlflow
import mlflow.sklearn
import joblib
import warnings
# Silence every warning from here on (no category or module restriction is given).
warnings.filterwarnings('ignore')

# Reaching this line means every import in this cell succeeded.
print("All libraries imported successfully!")

All libraries imported successfully!


In [30]:
# Emergency package installer
%pip install pandas numpy matplotlib seaborn scikit-learn joblib mlflow

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.
