In [3]:
# In a new Colab cell, upload the files
from google.colab import files

# Upload these files one by one:
uploaded = files.upload()
# Upload: Subscription_Management_ML_Pipeline.ipynb
# Upload: subscription_ml_pipeline.py
# Upload: requirements.txt

Saving Subscription_Management_ML_Pipeline.ipynb to Subscription_Management_ML_Pipeline (2).ipynb
Saving subscription_ml_pipeline.py to subscription_ml_pipeline (2).py


In [4]:
# Run this cell first to install all required packages
!pip install pandas numpy matplotlib seaborn scikit-learn xgboost jupyter plotly dash tqdm python-dateutil joblib

# Alternative: Install from requirements file
# !pip install -r requirements.txt


Collecting jupyter
  Downloading jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting dash
  Downloading dash-3.2.0-py3-none-any.whl.metadata (10 kB)
Collecting jupyterlab (from jupyter)
  Downloading jupyterlab-4.4.7-py3-none-any.whl.metadata (16 kB)
Collecting retrying (from dash)
  Downloading retrying-1.4.2-py3-none-any.whl.metadata (5.5 kB)
Collecting async-lru>=1.0.0 (from jupyterlab->jupyter)
  Downloading async_lru-2.0.5-py3-none-any.whl.metadata (4.5 kB)
Collecting jupyter-lsp>=2.0.0 (from jupyterlab->jupyter)
  Downloading jupyter_lsp-2.3.0-py3-none-any.whl.metadata (1.8 kB)
Collecting jupyterlab-server<3,>=2.27.1 (from jupyterlab->jupyter)
  Downloading jupyterlab_server-2.27.3-py3-none-any.whl.metadata (5.9 kB)
Collecting jedi>=0.16 (from ipython>=7.23.1->ipykernel->jupyter)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting json5>=0.9.0 (from jupyterlab-server<3,>=2.27.1->jupyterlab->jupyter)
  Downloading json5-0.12.1-py3-none-any.whl.me

In [5]:
# Verify all packages are installed correctly
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

print("✅ All packages installed successfully!")
print("📊 Ready to run the ML pipeline!")


✅ All packages installed successfully!
📊 Ready to run the ML pipeline!


In [6]:
# Cell 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Machine Learning Libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.neighbors import NearestNeighbors

# Set plotting style
plt.style.use('default')  # Use default style in Colab
sns.set_palette('husl')
np.random.seed(42)

print('✅ Libraries imported successfully!')
print('📊 Ready to build subscription management ML pipeline!')


✅ Libraries imported successfully!
📊 Ready to build subscription management ML pipeline!


GENERATE DATA SET

In [7]:
# Cell 2: Generate Dataset Function
def generate_subscription_dataset(n_customers=10000):
    """Generate a realistic subscription management dataset"""
    np.random.seed(42)

    # Customer demographics
    customer_data = {
        'customer_id': range(1, n_customers + 1),
        'age': np.random.normal(35, 12, n_customers).astype(int),
        'gender': np.random.choice(['Male', 'Female'], n_customers),
        'location': np.random.choice(['Urban', 'Suburban', 'Rural'], n_customers, p=[0.5, 0.3, 0.2]),
        'income_bracket': np.random.choice(['Low', 'Medium', 'High'], n_customers, p=[0.3, 0.5, 0.2]),
        'family_size': np.random.poisson(2.5, n_customers) + 1,
    }

    # Subscription details
    subscription_types = ['Fibernet_Basic', 'Fibernet_Premium', 'Broadband_Copper_Basic', 'Broadband_Copper_Premium']
    contract_types = ['Monthly', 'Quarterly', 'Yearly']

    subscription_data = {
        'subscription_type': np.random.choice(subscription_types, n_customers),
        'contract_type': np.random.choice(contract_types, n_customers, p=[0.6, 0.25, 0.15]),
        'monthly_charge': np.random.uniform(25, 150, n_customers),
        'data_quota_gb': np.random.choice([50, 100, 200, 500, 1000], n_customers),
        'tenure_months': np.random.exponential(18, n_customers).astype(int),
    }

    # Usage patterns
    usage_data = {
        'avg_monthly_usage_gb': np.random.lognormal(4, 1, n_customers),
        'peak_usage_hours': np.random.choice(['Morning', 'Afternoon', 'Evening', 'Night'], n_customers, p=[0.15, 0.25, 0.45, 0.15]),
        'support_tickets_3m': np.random.poisson(1.5, n_customers),
        'payment_delays_6m': np.random.poisson(0.5, n_customers),
        'auto_renew': np.random.choice([0, 1], n_customers, p=[0.3, 0.7]),
    }

    # Create DataFrame
    df = pd.DataFrame({**customer_data, **subscription_data, **usage_data})

    # Create realistic churn based on multiple factors
    churn_probability = (
        0.1 +  # base churn rate
        0.3 * (df['monthly_charge'] > 100).astype(int) +  # high price increases churn
        0.2 * (df['support_tickets_3m'] > 3).astype(int) +  # many support issues
        0.15 * (df['payment_delays_6m'] > 1).astype(int) +  # payment issues
        0.1 * (df['contract_type'] == 'Monthly').astype(int) +  # monthly contracts less sticky
        0.1 * (df['avg_monthly_usage_gb'] > df['data_quota_gb']).astype(int) -  # quota exceeded
        0.15 * (df['auto_renew'] == 1).astype(int) -  # auto renew reduces churn
        0.1 * (df['tenure_months'] > 24).astype(int)  # loyalty reduces churn
    )

    df['churn'] = np.random.binomial(1, np.clip(churn_probability, 0, 1), n_customers)

    # Add derived features
    df['usage_quota_ratio'] = df['avg_monthly_usage_gb'] / df['data_quota_gb']
    df['price_per_gb'] = df['monthly_charge'] / df['data_quota_gb']
    df['is_heavy_user'] = (df['usage_quota_ratio'] > 0.8).astype(int)
    df['is_premium_customer'] = (df['monthly_charge'] > 80).astype(int)
    df['clv'] = (df['monthly_charge'] * df['tenure_months'] * (1 - df['churn'] * 0.5)).round(2)

    return df

# Generate the dataset
df = generate_subscription_dataset(10000)

print('🎯 Dataset Generated Successfully!')
print(f'📊 Total customers: {len(df):,}')
print(f'📈 Features: {df.shape[1]}')
print(f'❌ Churn rate: {df["churn"].mean():.2%}')
print(f'💰 Average CLV: ${df["clv"].mean():,.2f}')

# Display first few rows
df.head()


🎯 Dataset Generated Successfully!
📊 Total customers: 10,000
📈 Features: 22
❌ Churn rate: 20.50%
💰 Average CLV: $1,354.57


Unnamed: 0,customer_id,age,gender,location,income_bracket,family_size,subscription_type,contract_type,monthly_charge,data_quota_gb,...,peak_usage_hours,support_tickets_3m,payment_delays_6m,auto_renew,churn,usage_quota_ratio,price_per_gb,is_heavy_user,is_premium_customer,clv
0,1,40,Female,Suburban,Low,4,Broadband_Copper_Premium,Monthly,124.945168,100,...,Morning,0,0,1,0,1.27056,1.249452,1,1,999.56
1,2,33,Female,Suburban,Low,4,Broadband_Copper_Premium,Quarterly,103.484537,50,...,Afternoon,0,0,0,0,3.860038,2.069691,1,1,724.39
2,3,42,Male,Urban,Medium,6,Fibernet_Basic,Monthly,39.927471,200,...,Night,1,1,1,0,0.581787,0.199637,0,0,279.49
3,4,53,Male,Suburban,High,3,Broadband_Copper_Premium,Yearly,95.740519,50,...,Evening,3,0,0,0,0.726296,1.91481,0,1,478.7
4,5,32,Male,Suburban,Medium,3,Fibernet_Basic,Monthly,137.097075,200,...,Evening,0,0,1,0,0.369812,0.685485,0,1,1919.36


TRAIN MODEL

In [8]:
# 2. Preprocess Data
df_processed = df.copy()
le_gender = LabelEncoder()
df_processed['gender_encoded'] = le_gender.fit_transform(df_processed['gender'])

# One-hot encode categorical variables
df_encoded = pd.get_dummies(df_processed,
                           columns=['location', 'income_bracket', 'subscription_type', 'contract_type', 'peak_usage_hours'],
                           prefix=['loc', 'income', 'sub', 'contract', 'peak'])
df_encoded = df_encoded.drop(['gender'], axis=1)

print("✅ Data preprocessing complete")

# 3. Train Churn Model
X = df_encoded.drop(['customer_id', 'churn'], axis=1)
y = df_encoded['churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train Random Forest for churn prediction
rf_churn = RandomForestClassifier(n_estimators=100, random_state=42)
rf_churn.fit(X_train, y_train)

y_pred = rf_churn.predict(X_test)
y_pred_proba = rf_churn.predict_proba(X_test)[:, 1]

churn_accuracy = accuracy_score(y_test, y_pred)
churn_f1 = f1_score(y_test, y_pred)
churn_auc = roc_auc_score(y_test, y_pred_proba)

print(f"🤖 Churn Model Performance:")
print(f"   Accuracy: {churn_accuracy:.3f}")
print(f"   F1-Score: {churn_f1:.3f}")
print(f"   AUC-ROC: {churn_auc:.3f}")

# 4. Customer Segmentation
features_for_clustering = ['age', 'family_size', 'avg_monthly_usage_gb', 'monthly_charge', 'tenure_months']
X_cluster = df[features_for_clustering]

scaler = StandardScaler()
X_cluster_scaled = scaler.fit_transform(X_cluster)

kmeans = KMeans(n_clusters=5, random_state=42)
df['customer_segment'] = kmeans.fit_predict(X_cluster_scaled)

print("✅ Customer segmentation complete")

# 5. CLV Prediction Model
clv_features = ['age', 'family_size', 'monthly_charge', 'data_quota_gb', 'avg_monthly_usage_gb',
                'tenure_months', 'support_tickets_3m', 'is_heavy_user']

X_clv = df[clv_features]
y_clv = df['clv']

X_train_clv, X_test_clv, y_train_clv, y_test_clv = train_test_split(X_clv, y_clv, test_size=0.2, random_state=42)

rf_clv = RandomForestRegressor(n_estimators=100, random_state=42)
rf_clv.fit(X_train_clv, y_train_clv)

y_pred_clv = rf_clv.predict(X_test_clv)
clv_r2 = r2_score(y_test_clv, y_pred_clv)

print(f"💎 CLV Model Performance:")
print(f"   R² Score: {clv_r2:.3f}")



✅ Data preprocessing complete
🤖 Churn Model Performance:
   Accuracy: 0.873
   F1-Score: 0.593
   AUC-ROC: 0.931
✅ Customer segmentation complete
💎 CLV Model Performance:
   R² Score: 0.914


MODEL SAVING

In [10]:
import pickle

# Save churn model
with open('churn_model.pkl', 'wb') as f:
    pickle.dump(rf_churn, f)

# Save CLV model
with open('clv_model.pkl', 'wb') as f:
    pickle.dump(rf_clv, f)

print("✅ Models saved as 'churn_model.pkl' and 'clv_model.pkl'")

# --- Load them back ---
with open('churn_model.pkl', 'rb') as f:
    loaded_churn_model = pickle.load(f)

with open('clv_model.pkl', 'rb') as f:
    loaded_clv_model = pickle.load(f)

print("✅ Models loaded successfully")


✅ Models saved as 'churn_model.pkl' and 'clv_model.pkl'
✅ Models loaded successfully


TESTING

In [12]:
# Predict churn (0 = active, 1 = churned)
y_pred = rf_churn.predict(X_test)
y_pred_proba = rf_churn.predict_proba(X_test)[:, 1]  # churn probability

print("✅ Churn Model Test Results")
print("First 10 predictions:", y_pred[:10])
print("First 10 churn probabilities:", y_pred_proba[:10])

# Compare with actual values
print("First 10 actual values:", y_test.values[:10])


✅ Churn Model Test Results
First 10 predictions: [0 0 0 0 0 0 0 0 0 0]
First 10 churn probabilities: [0.17 0.08 0.03 0.26 0.08 0.05 0.02 0.44 0.12 0.05]
First 10 actual values: [0 0 0 0 0 0 0 1 0 0]


In [13]:
# Predict CLV values
y_pred_clv = rf_clv.predict(X_test_clv)

print("\n✅ CLV Model Test Results")
print("First 10 predicted CLV:", y_pred_clv[:10])
print("First 10 actual CLV:", y_test_clv.values[:10])



✅ CLV Model Test Results
First 10 predicted CLV: [ 965.716   408.2499 4303.3874 2645.2717  816.8875  868.2516 1384.8959
 1122.6692 1059.2438 3946.9051]
First 10 actual CLV: [1098.31  503.32 4783.81 3053.24  828.26 1070.6  1384.27 1169.74 1093.44
 4852.34]


In [14]:
import pickle

# Load churn model
with open('churn_model.pkl', 'rb') as f:
    churn_model = pickle.load(f)

# Load clv model
with open('clv_model.pkl', 'rb') as f:
    clv_model = pickle.load(f)

# Test loaded models
print("\n✅ Testing loaded models")

# Churn
print("Loaded Churn Predictions:", churn_model.predict(X_test[:5]))
print("Loaded Churn Probabilities:", churn_model.predict_proba(X_test[:5])[:,1])

# CLV
print("Loaded CLV Predictions:", clv_model.predict(X_test_clv[:5]))



✅ Testing loaded models
Loaded Churn Predictions: [0 0 0 0 0]
Loaded Churn Probabilities: [0.17 0.08 0.03 0.26 0.08]
Loaded CLV Predictions: [ 965.716   408.2499 4303.3874 2645.2717  816.8875]


In [15]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

mae = mean_absolute_error(y_test_clv, y_pred_clv)
rmse = np.sqrt(mean_squared_error(y_test_clv, y_pred_clv))
r2 = r2_score(y_test_clv, y_pred_clv)

print(f"CLV Model Metrics:")
print(f"  MAE  : {mae:.2f}")
print(f"  RMSE : {rmse:.2f}")
print(f"  R²   : {r2:.3f}")


CLV Model Metrics:
  MAE  : 221.38
  RMSE : 468.45
  R²   : 0.914


In [None]:
def train_churn_model(self, df_processed):
    """Train churn prediction models"""
    print("🤖 Training churn prediction models...")

    from sklearn.metrics import accuracy_score
    import pickle

    # Prepare features and target
    X = df_processed.drop(['customer_id', 'churn'], axis=1)
    y = df_processed['churn']

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=self.random_state, stratify=y
    )

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Models to train
    trained_models = {
        "Logistic Regression": LogisticRegression(random_state=self.random_state, max_iter=1000),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=self.random_state),
        "Gradient Boosting": GradientBoostingClassifier(random_state=self.random_state),
        "Decision Tree": DecisionTreeClassifier(random_state=self.random_state),
    }

    # Train models
    for name, model in trained_models.items():
        model.fit(X_train_scaled, y_train)

    # ==============================
    # Select Best Model by Accuracy
    # ==============================
    scores = {}
    for name, model in trained_models.items():
        y_pred = model.predict(X_test_scaled)
        scores[name] = accuracy_score(y_test, y_pred)

    best_model_name = max(scores, key=scores.get)
    best_model = trained_models[best_model_name]

    # Save the best model
    with open("trained_model.pkl", "wb") as f:
        pickle.dump(best_model, f)

    print(f"✅ Best churn model '{best_model_name}' saved as trained_model.pkl "
          f"with accuracy {scores[best_model_name]:.2%}")

    return best_model