In [9]:
import pandas as pd 

# Load the dataset.
# The CSV location can be overridden with the TELECOM_CSV environment variable so the
# notebook is runnable on machines other than the author's; when the variable is unset
# the original hard-coded path is used, so existing behavior is unchanged.
import os

DATA_PATH = os.environ.get("TELECOM_CSV", "C:/Users/Prasad/Downloads/telecom_Boosting.csv")
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,CustomerID,MonthlyMinutes,MonthlyDataGB,ContractType,PaymentMethod,TenureMonths,MonthlyCharges,TotalCharges,LatePayments,CustomerSupportCalls,Churn
0,C001,450,12,Month-to-Month,Credit Card,5,75,375,2,3,Yes
1,C002,300,6,One Year,Bank Transfer,18,55,990,0,1,No
2,C003,520,20,Month-to-Month,PayPal,2,95,190,1,2,Yes
3,C004,250,5,Two Year,Credit Card,30,50,1500,0,0,No
4,C005,600,25,Month-to-Month,Electronic Check,1,110,110,3,4,Yes


In [4]:
# --- Data cleaning ---
# Inspect structure and missing-value counts before modifying anything.
df.info()
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(df.isnull().sum())

# Standardize string columns (strip stray whitespace around category labels)
for col in ['ContractType', 'PaymentMethod']:
    df[col] = df[col].str.strip()

# Convert the target to 0/1.
# Guarded by a dtype check so re-running this cell does not call `.str` on a column
# that has already been converted to numbers.
if not pd.api.types.is_numeric_dtype(df['Churn']):
    churn_text = df['Churn'].str.strip()
    churn_codes = churn_text.map({'Yes': 1, 'No': 0})
    # Any label other than 'Yes'/'No' maps to NaN; fail loudly instead of letting it
    # be treated as a missing value further down.
    unexpected = churn_text[churn_codes.isna() & churn_text.notna()]
    if not unexpected.empty:
        raise ValueError(f"Unexpected Churn labels: {sorted(unexpected.unique())}")
    df['Churn'] = churn_codes

# Rows without a target cannot be used for supervised training. Drop them rather than
# imputing a label (a median-imputed target could even end up as 0.5).
df = df.dropna(subset=['Churn']).reset_index(drop=True)
df['Churn'] = df['Churn'].astype('int64')

# If numeric columns were read as strings (e.g. "1,500"), convert them.
numeric_feature_cols = ['MonthlyMinutes','MonthlyDataGB','TenureMonths','MonthlyCharges',
                        'TotalCharges','LatePayments','CustomerSupportCalls']
for col in numeric_feature_cols:
    if not pd.api.types.is_numeric_dtype(df[col]):
        # Commas are treated as thousands separators and removed first;
        # pd.to_numeric alone would coerce "1,500" to NaN.
        df[col] = df[col].astype(str).str.replace(',', '', regex=False).str.strip()
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Impute missing numeric FEATURES with the column median (the target is excluded on purpose)
df[numeric_feature_cols] = df[numeric_feature_cols].fillna(df[numeric_feature_cols].median())

# For any remaining categorical missing, fill with mode
df['ContractType'] = df['ContractType'].fillna(df['ContractType'].mode()[0])
df['PaymentMethod'] = df['PaymentMethod'].fillna(df['PaymentMethod'].mode()[0])


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   CustomerID            30 non-null     object
 1   MonthlyMinutes        30 non-null     int64 
 2   MonthlyDataGB         30 non-null     int64 
 3   ContractType          30 non-null     object
 4   PaymentMethod         30 non-null     object
 5   TenureMonths          30 non-null     int64 
 6   MonthlyCharges        30 non-null     int64 
 7   TotalCharges          30 non-null     int64 
 8   LatePayments          30 non-null     int64 
 9   CustomerSupportCalls  30 non-null     int64 
 10  Churn                 30 non-null     object
dtypes: int64(7), object(4)
memory usage: 2.7+ KB


In [5]:
# Example feature engineering
# The 1e-6 term keeps each ratio defined when its denominator is 0.
df['AvgChargePerMinute'] = df['MonthlyCharges'] / (df['MonthlyMinutes'] + 1e-6)
# NOTE(review): this is monthly data divided by tenure in months, which does not obviously
# match the column name -- confirm the intended definition.
df['DataPerMonthPerUser'] = df['MonthlyDataGB'] / (df['TenureMonths'] + 1e-6)
# Binary flag: 1 when the customer has two or more late payments.
df['HighLatePayments'] = (df['LatePayments'] >= 2).astype(int)

# Categorical columns
cat_cols = ['ContractType','PaymentMethod']

# If using XGBoost/LightGBM, label-encode or one-hot encode:
from sklearn.preprocessing import LabelEncoder

# One fitted encoder per column. Re-fitting a single shared encoder would keep only the
# last column's classes, making the integer codes of earlier columns impossible to
# decode or to re-apply to new data.
label_encoders = {}
df_le = df.copy()
for c in cat_cols:
    le = LabelEncoder()
    df_le[c] = le.fit_transform(df_le[c])
    label_encoders[c] = le


In [6]:
from sklearn.preprocessing import StandardScaler
# Numeric columns (raw and engineered) to standardize.
num_cols = [
    'MonthlyMinutes', 'MonthlyDataGB', 'TenureMonths',
    'MonthlyCharges', 'TotalCharges', 'LatePayments',
    'CustomerSupportCalls', 'AvgChargePerMinute', 'DataPerMonthPerUser',
]

# Work on a copy so `df` itself keeps the unscaled values.
# NOTE(review): `df_scaled` is not referenced by any later cell visible in this notebook.
df_scaled = df.copy()
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_scaled[num_cols])
df_scaled[num_cols] = scaled_values


In [7]:
from sklearn.model_selection import train_test_split
# Build the feature matrix: everything except the target, and except the customer
# identifier when that column is present.
non_feature_cols = ['Churn']
if 'CustomerID' in df_le.columns:
    non_feature_cols.insert(0, 'CustomerID')
X = df_le.drop(columns=non_feature_cols)
y = df_le['Churn']

# 80/20 split, stratified on the target so both parts keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:
# Install packages if not present:
# pip install xgboost lightgbm catboost

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report

# Baseline models: each booster is fitted with library defaults (plus a fixed seed) on
# the training split, then hard predictions and positive-class probabilities are taken
# on the test split.

# --- XGBoost ---
import xgboost as xgb
# `use_label_encoder` is no longer passed: it is obsolete in current XGBoost releases and
# only triggers an "unused parameter" warning. The target is already encoded as 0/1.
xgb_clf = xgb.XGBClassifier(eval_metric='logloss', random_state=42)
xgb_clf.fit(X_train, y_train)
y_pred_xgb = xgb_clf.predict(X_test)
y_prob_xgb = xgb_clf.predict_proba(X_test)[:,1]  # column 1 = probability of class 1

# --- LightGBM ---
import lightgbm as lgb
# NOTE(review): with LightGBM's default min_child_samples=20, a very small training set
# may not allow any split at all -- check that these baseline predictions are not constant.
lgb_clf = lgb.LGBMClassifier(random_state=42)
lgb_clf.fit(X_train, y_train)
y_pred_lgb = lgb_clf.predict(X_test)
y_prob_lgb = lgb_clf.predict_proba(X_test)[:,1]

# --- CatBoost (handles categorical features) ---
from catboost import CatBoostClassifier
# If using CatBoost with original df (not label-encoded), provide cat_features indices:
# here the label-encoded frame is used, so the categorical columns are passed as integers.
cat_clf = CatBoostClassifier(verbose=0, random_state=42)
cat_clf.fit(X_train, y_train)
y_pred_cat = cat_clf.predict(X_test)
y_prob_cat = cat_clf.predict_proba(X_test)[:,1]

# Evaluation helper
def eval_model(y_true, y_pred, y_prob=None):
    """Print a set of binary-classification metrics for one model.

    Parameters
    ----------
    y_true : true labels.
    y_pred : predicted labels.
    y_prob : optional scores/probabilities for the positive class; when given,
        ROC AUC is printed as well.

    Prints accuracy, precision, recall, F1, (optionally) ROC AUC, the confusion
    matrix and the classification report. Returns nothing.
    """
    print("Accuracy:", accuracy_score(y_true, y_pred))

    # These three share the same call shape; zero_division=0 reports 0 instead of
    # warning when a metric is undefined.
    thresholded_metrics = (
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1:", f1_score),
    )
    for label, metric in thresholded_metrics:
        print(label, metric(y_true, y_pred, zero_division=0))

    if y_prob is not None:
        auc = roc_auc_score(y_true, y_prob)
        print("ROC AUC:", auc)

    matrix = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:\n", matrix)

    report = classification_report(y_true, y_pred, zero_division=0)
    print(report)

# Report test-set metrics for each baseline model, in the order they were fitted.
baseline_results = (
    ("=== XGBoost ===", y_pred_xgb, y_prob_xgb),
    ("=== LightGBM ===", y_pred_lgb, y_prob_lgb),
    ("=== CatBoost ===", y_pred_cat, y_prob_cat),
)
for banner, predictions, probabilities in baseline_results:
    print(banner)
    eval_model(y_test, predictions, probabilities)


In [None]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold

# Shared cross-validation scheme for all searches: 5 stratified, shuffled folds with a
# fixed seed so every model is tuned on the same splits.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# XGBoost param grid
xgb_param = {
    'n_estimators': [50,100,200],
    'max_depth': [2,3,4,6],
    'learning_rate': [0.01,0.05,0.1,0.2],
    'subsample': [0.6,0.8,1.0],
    'colsample_bytree': [0.6,0.8,1.0]
}

# Randomized search: 25 sampled configurations, scored by F1 on the positive class.
# `use_label_encoder` is no longer passed: it is obsolete in current XGBoost releases and
# only triggers an "unused parameter" warning. The target is already encoded as 0/1.
xgb_random = RandomizedSearchCV(xgb.XGBClassifier(eval_metric='logloss', random_state=42),
                                param_distributions=xgb_param,
                                n_iter=25,
                                scoring='f1',
                                cv=cv,
                                random_state=42,
                                n_jobs=-1)
xgb_random.fit(X_train, y_train)
print("XGB best:", xgb_random.best_params_, "best score:", xgb_random.best_score_)

# LightGBM param grid
lgb_param = {
    'n_estimators': [50,100,200],
    'num_leaves': [7,15,31],
    'learning_rate': [0.01,0.05,0.1],
    'subsample': [0.6,0.8,1.0],
    'colsample_bytree': [0.6,0.8,1.0],
    # The default (20) requires at least 20 rows in every leaf. The df.info() output in
    # this notebook shows 30 rows in total, so on the CV training folds no split could
    # ever satisfy it and every candidate would be a constant predictor. Smaller values
    # are searched alongside the default.
    'min_child_samples': [2, 5, 10, 20],
}
# subsample_freq=1 turns row subsampling on: LightGBM ignores `subsample` while
# subsample_freq is at its default of 0, which made that grid dimension a no-op.
lgb_random = RandomizedSearchCV(lgb.LGBMClassifier(random_state=42, subsample_freq=1), lgb_param,
                                n_iter=25, scoring='f1', cv=cv, random_state=42, n_jobs=-1)
lgb_random.fit(X_train, y_train)
print("LGB best:", lgb_random.best_params_, "best score:", lgb_random.best_score_)

# CatBoost param grid (smaller search)
cat_param = dict(
    iterations=[50, 100, 200],
    depth=[3, 4, 6],
    learning_rate=[0.01, 0.05, 0.1],
    l2_leaf_reg=[1, 3, 5],
)

# Randomized search over 20 sampled configurations, scored by F1 on the shared CV folds.
# NOTE(review): the search runs with n_jobs=-1; confirm that concurrent CatBoost fits
# behave well in this environment (thread usage, training-log output).
cat_random = RandomizedSearchCV(
    estimator=CatBoostClassifier(verbose=0, random_state=42),
    param_distributions=cat_param,
    n_iter=20,
    scoring='f1',
    cv=cv,
    random_state=42,
    n_jobs=-1,
)
cat_random.fit(X_train, y_train)
print("CatBoost best:", cat_random.best_params_, "best score:", cat_random.best_score_)


In [None]:
# AdaBoostClassifier was used below without ever being imported (NameError on a fresh kernel).
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# scikit-learn renamed AdaBoost's weak-learner argument from `base_estimator` to
# `estimator`, and newer releases reject the old name. Use whichever name the installed
# version actually exposes so the search works on both.
weak_learner_key = (
    'estimator' if 'estimator' in AdaBoostClassifier().get_params() else 'base_estimator'
)

# Param grid for AdaBoost: ensemble size, shrinkage, and depth of the tree weak learner
ada_param = {
    'n_estimators': [50, 100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1.0],
    weak_learner_key: [DecisionTreeClassifier(max_depth=depth) for depth in (1, 2, 3)],
}

# Randomized search over 20 sampled configurations, scored by F1 on the shared CV folds.
ada_random = RandomizedSearchCV(
    AdaBoostClassifier(random_state=42),
    param_distributions=ada_param,
    n_iter=20,
    scoring='f1',
    cv=cv,
    random_state=42,
    n_jobs=-1
)

ada_random.fit(X_train, y_train)
print("AdaBoost best:", ada_random.best_params_, "best score:", ada_random.best_score_)

# Evaluate on test set, using the best configuration refitted on the full training split
best_ada = ada_random.best_estimator_
y_pred = best_ada.predict(X_test)
y_prob = best_ada.predict_proba(X_test)[:,1]  # column 1 = probability of class 1
print("--- AdaBoost Tuned on TEST ---")
eval_model(y_test, y_pred, y_prob)
