In [1]:

!pip install xgboost
!pip install catboost lightgbm




You should consider upgrading via the 'c:\Users\Asus\Documents\ASAH 2025\CAPSTONE\.venv\Scripts\python.exe -m pip install --upgrade pip' command.




You should consider upgrading via the 'c:\Users\Asus\Documents\ASAH 2025\CAPSTONE\.venv\Scripts\python.exe -m pip install --upgrade pip' command.


In [None]:
# =========================================================
# MASTER IMPORT FOR TELCO MACHINE LEARNING PROJECT
# =========================================================

# Core
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Visualization (EDA)
import matplotlib.pyplot as plt
import seaborn as sns

# Saving / Loading
import pickle
import joblib


# SCIKIT-LEARN PREPROCESSING
from sklearn.preprocessing import (
    StandardScaler,
    LabelEncoder,
    label_binarize
)

# SCIKIT-LEARN MODEL SELECTION
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    RandomizedSearchCV
)


# SCIKIT-LEARN METRICS
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    confusion_matrix,
    classification_report
)
from sklearn.metrics.pairwise import cosine_similarity

# MACHINE LEARNING MODELS
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier


In [3]:
df = pd.read_csv('data/raw/data_capstone.csv')
df.head()

Unnamed: 0,customer_id,plan_type,device_brand,avg_data_usage_gb,pct_video_usage,avg_call_duration,sms_freq,monthly_spend,topup_freq,travel_score,complaint_count,target_offer
0,C00001,Prepaid,Realme,1.5,0.804146,7.98,13,70000.0,4,0.284419,0,General Offer
1,C00002,Postpaid,Vivo,1.09,0.107686,9.56,9,63000.0,3,0.115086,0,General Offer
2,C00003,Postpaid,Xiaomi,3.24,0.313894,4.61,13,89000.0,7,0.402998,0,General Offer
3,C00004,Prepaid,Apple,5.32,0.420158,6.96,8,67000.0,4,0.302169,0,General Offer
4,C00005,Prepaid,Huawei,1.91,0.251638,11.01,21,72000.0,5,0.487911,0,General Offer


In [4]:

# Load training data (sudah balanced dengan SMOTE)
X_train = np.load('data/processed/X_train.npy')
y_train = np.load('data/processed/y_train.npy')

# Load testing data (original)
X_test = np.load('data/processed/X_test.npy')
y_test = np.load('data/processed/y_test.npy')

# Load artifacts
with open('data/processed/scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

with open('data/processed/label_encoder.pkl', 'rb') as f:
    label_encoder = pickle.load(f)

with open('data/processed/feature_names.pkl', 'rb') as f:
    feature_names = pickle.load(f)

print(f"Training samples: {len(y_train):,}")
print(f"Testing samples: {len(y_test):,}")
print(f"Features: {len(feature_names)}")

Training samples: 42,669
Testing samples: 1,954
Features: 15


In [5]:
behavior_features = [
    'avg_data_usage_gb',
    'pct_video_usage',
    'avg_call_duration',
    'sms_freq',
    'monthly_spend',
    'topup_freq',
    'travel_score',
    'complaint_count'
]

X = df[behavior_features]


In [8]:
# Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [13]:
print(df.columns.tolist())


['customer_id', 'plan_type', 'device_brand', 'avg_data_usage_gb', 'pct_video_usage', 'avg_call_duration', 'sms_freq', 'monthly_spend', 'topup_freq', 'travel_score', 'complaint_count', 'target_offer']


In [14]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_offer_cosine(customer_index, top_k=6):
    
    # Hitung cosine similarity antara satu customer vs semua
    sim_scores = cosine_similarity(
        X_scaled[customer_index].reshape(1, -1),
        X_scaled
    )[0]

    # Urutkan customer paling mirip (descending), skip diri sendiri
    similar_indices = sim_scores.argsort()[::-1][1 : top_k+1]

    # Ambil offer dari pelanggan mirip
    similar_offers = df.loc[similar_indices, "target_offer"]

    # Hitung distribusi offer
    freq = similar_offers.value_counts()

    # Ambil rekomendasi (offer paling sering)
    recommended = freq.idxmax()

    return recommended, freq, similar_indices


In [15]:
rec, freq, idx = recommend_offer_cosine(1500, top_k=7)

print("üéØ Recommended Offer:", rec)
print("\nüìä Similar Customer Offer Distribution:\n", freq)
print("\nüßç Similar Customer Index:", idx)


üéØ Recommended Offer: Retention Offer

üìä Similar Customer Offer Distribution:
 target_offer
Retention Offer         3
General Offer           2
Device Upgrade Offer    2
Name: count, dtype: int64

üßç Similar Customer Index: [1949 9789 4426 2916 1392 5225 8535]


In [16]:
def precision_at_k(k=7, sample=300):

    correct = 0

    for i in np.random.choice(len(df), sample, replace=False):
        rec, _, _ = recommend_offer_cosine(i, top_k=k)

        true_offer = df.iloc[i]['target_offer']

        if rec == true_offer:
            correct += 1

    return correct / sample


In [17]:
accuracy = precision_at_k(k=7, sample=500)
print("Precision@7:", accuracy)


Precision@7: 0.728


In [18]:
# Load data
df = pd.read_csv('data/raw/data_capstone.csv')

# 1.a Feature engineering - add cost_per_gb and engagement_score (simple proxy)
df['cost_per_gb'] = df['monthly_spend'] / df['avg_data_usage_gb']
df['cost_per_gb'].replace([np.inf, -np.inf], np.nan, inplace=True)
df['cost_per_gb'].fillna(df['cost_per_gb'].median(), inplace=True)

# engagement_score: normalized combination of usage, pct_video_usage, topup_freq
df['engagement_score'] = (
    (df['avg_data_usage_gb'].rank(pct=True) * 0.5) +
    (df['pct_video_usage'].rank(pct=True) * 0.3) +
    (df['topup_freq'].rank(pct=True) * 0.2)
)

# 1.b Features to use (numerical + optionally encoded categorical)
features = [
    'avg_data_usage_gb', 'pct_video_usage', 'avg_call_duration', 'sms_freq',
    'monthly_spend', 'topup_freq', 'travel_score', 'complaint_count',
    'cost_per_gb', 'engagement_score'
]

# Keep some categorical for CatBoost handling later
cat_features = ['plan_type', 'device_brand']

# 1.c Target encode (use the mapping you provided if you want numeric labels)
le = LabelEncoder()
df['target_label'] = le.fit_transform(df['target_offer'])  # stores encoded classes 0..8

print("Classes:", le.classes_)
# store mapping dict if needed
target_mapping = {cls: int(lbl) for lbl, cls in enumerate(le.classes_)}
print("Target mapping:", target_mapping)


Classes: ['Data Booster' 'Device Upgrade Offer' 'Family Plan Offer' 'General Offer'
 'Retention Offer' 'Roaming Pass' 'Streaming Partner Pack' 'Top-up Promo'
 'Voice Bundle']
Target mapping: {'Data Booster': 0, 'Device Upgrade Offer': 1, 'Family Plan Offer': 2, 'General Offer': 3, 'Retention Offer': 4, 'Roaming Pass': 5, 'Streaming Partner Pack': 6, 'Top-up Promo': 7, 'Voice Bundle': 8}


In [19]:
# 2) Split - stratify by target to preserve distribution
X = df[features + cat_features]  # note: cat_features included (for CatBoost we will pass indices)
y = df['target_label'].values

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=42
)

# 2.a Scaling numeric features for models that need it (XGBoost/LGB prefer scaled numeric but not required)
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train[features])
X_val_num = scaler.transform(X_val[features])
X_test_num = scaler.transform(X_test[features])

# Create dataframe-like structures for model input
X_train_model = pd.DataFrame(X_train_num, columns=features, index=X_train.index)
X_val_model = pd.DataFrame(X_val_num, columns=features, index=X_val.index)
X_test_model = pd.DataFrame(X_test_num, columns=features, index=X_test.index)

# attach categorical columns back (as raw strings) for CatBoost
X_train_model[cat_features] = X_train[cat_features].reset_index(drop=True)
X_val_model[cat_features]   = X_val[cat_features].reset_index(drop=True)
X_test_model[cat_features]  = X_test[cat_features].reset_index(drop=True)

# 2.b Sample weights to handle imbalance (inverse freq)
class_counts = np.bincount(y_train)
class_weights = {i: (len(y_train) / (len(class_counts) * count)) for i, count in enumerate(class_counts)}
sample_weight_train = np.array([class_weights[int(lbl)] for lbl in y_train])
print("Class weights (train):", class_weights)


Class weights (train): {0: np.float64(1.3938669852648347), 1: np.float64(0.7400359446030236), 2: np.float64(13.645224171539962), 3: np.float64(0.18304960644334614), 4: np.float64(1.4592453616843861), 5: np.float64(11.965811965811966), 6: np.float64(4.320987654320987), 7: np.float64(3.003003003003003), 8: np.float64(16.203703703703702)}


In [20]:
# 3) Evaluation helper
def evaluate_model(clf, X_test_input, y_test, model_name="model", proba=True, catboost=False):
    """
    clf: trained classifier (fitted)
    X_test_input: dataframe with numeric + categorical as needed
    y_test: true labels
    """
    # If classifier expects numpy numeric only (XGBoost/LGB), provide numeric features only
    if model_name in ("xgboost", "lightgbm"):
        X_eval = X_test_input[features].values
    else:
        X_eval = X_test_input  # CatBoost can take dataframe with cat columns
    
    # Predictions
    y_pred = clf.predict(X_eval)
    # Probabilities (for ROC-AUC)
    try:
        y_proba = clf.predict_proba(X_eval)
    except Exception:
        y_proba = None
    
    f1_w = f1_score(y_test, y_pred, average='weighted')
    print(f"==== Evaluation {model_name} ====")
    print("F1-weighted: {:.4f}".format(f1_w))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=le.classes_))
    
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix shape:", cm.shape)
    
    # ROC-AUC (multiclass) if probabilities available
    if y_proba is not None:
        y_test_binarized = label_binarize(y_test, classes=np.arange(len(le.classes_)))
        try:
            auc = roc_auc_score(y_test_binarized, y_proba, multi_class='ovr')
            print("ROC-AUC (ovr):", auc)
        except Exception as e:
            print("ROC-AUC error:", e)
    
    return {"f1_weighted": f1_w, "confusion_matrix": cm, "y_pred": y_pred, "y_proba": y_proba}


In [21]:
def evaluate_model(model, X_test, y_test, model_name="Model"):
    import numpy as np
    
    if not isinstance(X_test, np.ndarray):
        X_test = X_test.values

    print(f"\n================= Evaluasi {model_name} =================")
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    print(f"Accuracy Score : {acc:.4f}")
    print(f"Weighted F1    : {f1:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    return {"model": model_name, "accuracy": acc, "f1_weighted": f1}


SEMUA MODEL TANPA AKURASI

### LightGBM tuning 

In [22]:
# Model LightGBM tuning
lgb_model = lgb.LGBMClassifier(objective='multiclass', random_state=42, n_jobs=-1)

param_dist_lgb = {
    'num_leaves': [31, 50, 70, 100],
    'learning_rate': [0.01, 0.03, 0.05, 0.1],
    'n_estimators': [100, 300, 600],
    'max_depth': [6, 10, 15, -1],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 0.01, 0.1],
    'reg_lambda': [0, 0.01, 0.1],
    'class_weight': [None, 'balanced']
}

rsearch_lgb = RandomizedSearchCV(
    estimator=lgb_model,
    param_distributions=param_dist_lgb,
    n_iter=20,
    scoring='f1_weighted',
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
    verbose=2,
    n_jobs=-1,
    random_state=42
)

# fit using numeric features only (LightGBM accepts numpy)
rsearch_lgb.fit(X_train_model[features].values, y_train, sample_weight=sample_weight_train)
print("Best LGB params:", rsearch_lgb.best_params_)
best_lgb = rsearch_lgb.best_estimator_

# Evaluate on test
res_lgb = evaluate_model(best_lgb, X_test_model[features], y_test, model_name="lightgbm")





Fitting 3 folds for each of 20 candidates, totalling 60 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002917 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1832
[LightGBM] [Info] Number of data points in the train set: 7000, number of used features: 10
[LightGBM] [Info] Start training from score -3.636604
[LightGBM] [Info] Start training from score -4.269742
[LightGBM] [Info] Start training from score -1.355296
[LightGBM] [Info] Start training from score -5.666684
[LightGBM] [Info] Start training from score -3.590766
[LightGBM] [Info] Start training from score -1.486632
[LightGBM] [Info] Start training from score -2.505201
[LightGBM] [Info] Start training from score -2.869073
[LightGBM] [Info] Start training from score -1.183446
Best LGB params: {'subsample': 1.0, 'reg_lambda': 0.01, 'reg_alpha': 0, 'num_leaves': 100, 'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.03, 'colsample_byt

### CatBoost tuning

In [23]:
# =============================
# FIX DATA KATEGORIK (WAJIB)
# =============================
# CatBoost tidak boleh ada NaN pada kolom kategori
# dan harus bertipe string/objek

for c in cat_features:
    X_train_model[c] = X_train_model[c].fillna("Unknown").astype(str)
    X_val_model[c] = X_val_model[c].fillna("Unknown").astype(str)
    X_test_model[c] = X_test_model[c].fillna("Unknown").astype(str)

# Ambil index kolom kategori (lebih aman daripada nama kolom)
cat_cols_idx = [X_train_model.columns.get_loc(c) for c in cat_features]


# =============================
# PARAMETER GRID
# =============================
cat_params_grid = [
    {'iterations': 400, 'depth': 6, 'learning_rate': 0.05, 'l2_leaf_reg': 3},
    {'iterations': 600, 'depth': 8, 'learning_rate': 0.03, 'l2_leaf_reg': 5},
    {'iterations': 300, 'depth': 10, 'learning_rate': 0.05, 'l2_leaf_reg': 3},
    {'iterations': 800, 'depth': 8, 'learning_rate': 0.02, 'l2_leaf_reg': 7},
]


# =============================
# CATBOOST TRAINING LOOP
# =============================
best_cat = None
best_score = -np.inf

for params in cat_params_grid:
    print("Training CatBoost dengan params:", params)

    model_cb = CatBoostClassifier(
        iterations=params['iterations'],
        depth=params['depth'],
        learning_rate=params['learning_rate'],
        l2_leaf_reg=params['l2_leaf_reg'],
        random_seed=42,
        verbose=100,
        loss_function='MultiClass',
        thread_count=-1,
        class_weights=[class_weights[i] for i in range(len(class_weights))]
    )

    # Train model
    model_cb.fit(
        X_train_model,
        y_train,
        cat_features=cat_cols_idx,             # gunakan index kolom kategori
        eval_set=(X_val_model, y_val),
        use_best_model=True,
        verbose=False
    )

    # Evaluasi
    y_val_pred = model_cb.predict(X_val_model)
    f1_w = f1_score(y_val, y_val_pred, average='weighted')
    print("Val F1-weighted:", f1_w)

    if f1_w > best_score:
        best_score = f1_w
        best_cat = model_cb

print("\nBest CatBoost val F1:", best_score)


# =============================
# FINAL EVALUATION ON TEST SET
# =============================
res_cat = evaluate_model(best_cat, X_test_model, y_test, model_name="catboost")


Training CatBoost dengan params: {'iterations': 400, 'depth': 6, 'learning_rate': 0.05, 'l2_leaf_reg': 3}
Val F1-weighted: 0.8369662691233885
Training CatBoost dengan params: {'iterations': 600, 'depth': 8, 'learning_rate': 0.03, 'l2_leaf_reg': 5}
Val F1-weighted: 0.8341673489090355
Training CatBoost dengan params: {'iterations': 300, 'depth': 10, 'learning_rate': 0.05, 'l2_leaf_reg': 3}
Val F1-weighted: 0.8253883126368211
Training CatBoost dengan params: {'iterations': 800, 'depth': 8, 'learning_rate': 0.02, 'l2_leaf_reg': 7}
Val F1-weighted: 0.833126454586688

Best CatBoost val F1: 0.8369662691233885

Accuracy Score : 0.8173
Weighted F1    : 0.8357

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       119
           1       0.55      0.84      0.67       225
           2       0.21      0.75      0.33        12
           3       1.00      0.77      0.87       911
           4       0.91      0.84      0.87   

In [24]:
# AGGREGASI HASIL & SIMPAN MODEL
# Collect results
results = pd.DataFrame([
    {'model': 'lightgbm', 'f1_weighted': res_lgb['f1_weighted']},
    {'model': 'catboost', 'f1_weighted': res_cat['f1_weighted']},
]).sort_values('f1_weighted', ascending=False).reset_index(drop=True)

print(results)

# Save best model (example)
best_model_name = results.loc[0, 'model']
if best_model_name == 'lightgbm':
    joblib.dump(best_lgb, "best_model_lgb.joblib")
elif best_model_name == 'xgboost':
    joblib.dump(best_xgb, "best_model_xgb.joblib")
else:
    best_cat.save_model("best_model_catboost.cbm")


      model  f1_weighted
0  lightgbm     0.846122
1  catboost     0.835651


##### https://chatgpt.com/s/t_691c43865d7c8191bb77a31cb9beba06
##### https://chatgpt.com/s/t_691c43ba95048191969c1f1bf9971125

## Cosine Similarity (tidak ada akurasi)

#### Pure Cosine Similarity (CONTENT BASED)

- Tidak menggunakan machine learning.
- Menggunakan pendekatan pure similarity berbasis cosine.
- Memberikan rekomendasi dengan cara melihat pelanggan yang paling mirip secara perilaku.
- Sederhana, cepat, dan cocok sebagai baseline rekomendasi modern.

In [25]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

df = pd.read_csv("data/raw/data_capstone.csv")

num_cols = [
    'avg_data_usage_gb','pct_video_usage','avg_call_duration',
    'sms_freq','monthly_spend','topup_freq','travel_score','complaint_count'
]

X = df[num_cols]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

sim_matrix = cosine_similarity(X_scaled)

def recommend_cosine(target_index, top_k=10):
    sims = list(enumerate(sim_matrix[target_index]))
    sims = sorted(sims, key=lambda x: x[1], reverse=True)[1:top_k+1]
    offers = df.loc[[i for i,_ in sims],'target_offer'].value_counts()
    return offers.idxmax(), offers

print(recommend_cosine(100,10))


('General Offer', target_offer
General Offer             5
Device Upgrade Offer      4
Streaming Partner Pack    1
Name: count, dtype: int64)


#### KNN CLASSIFIER (metric = cosine)

Model KNN ini:
- Menggunakan cosine similarity untuk mencari pelanggan dengan perilaku paling mirip.
- Menggunakan 10 tetangga untuk memutuskan offer terbaik.
- Memberikan baseline sederhana untuk prediksi target_offer.

In [26]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

y = df["target_offer"]
le = LabelEncoder()
y_enc = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_enc, test_size=0.2, random_state=42, stratify=y_enc
)

knn = KNeighborsClassifier(n_neighbors=10, metric='cosine')
knn.fit(X_train, y_train)
pred = knn.predict(X_test)

print("Accuracy:", accuracy_score(y_test, pred))


Accuracy: 0.738


##### CLUSTERING + COSINE RECOMMENDER
###### KMeans (cosine distance via normalized vectors) 
- Mengelompokkan pelanggan dengan K-Means.
- Berdasarkan cluster yang sama ‚Üí menawarkan offer yang paling relevan berdasarkan mayoritas.
- Sederhana, cepat, dan cocok untuk baseline rekomendasi berbasis segmentasi pelanggan.

In [27]:
from sklearn.cluster import KMeans

km = KMeans(n_clusters=8, random_state=42)
km.fit(X_scaled)

df["cluster"] = km.labels_

def recommend_cluster(customer_index):
    cl = df.loc[customer_index, "cluster"]
    offers = df[df["cluster"] == cl]["target_offer"].value_counts()
    return offers.idxmax(), offers

print(recommend_cluster(100))


('General Offer', target_offer
General Offer             886
Device Upgrade Offer      148
Roaming Pass               88
Top-up Promo               61
Streaming Partner Pack     60
Data Booster               54
Retention Offer            19
Voice Bundle                8
Family Plan Offer           5
Name: count, dtype: int64)


#### MATRIX FACTORIZATION + COSINE

Program ini:
- Menggunakan NMF untuk menemukan pola tersembunyi di balik preferensi offer.
- Mengubah pelanggan menjadi vektor latent.
- Menggunakan cosine similarity untuk mencari pelanggan mirip.
- Merekomendasikan offer terbaik berdasarkan perilaku pelanggan serupa.

##### Model ini bekerja baik saat data penawaran bersifat kategori dan ingin menemukan pola preferensi tersembunyi.

In [28]:
from sklearn.decomposition import NMF

mat = pd.get_dummies(df["target_offer"])
nmf = NMF(n_components=20, init="random", random_state=42)
embed = nmf.fit_transform(mat)

sim_latent = cosine_similarity(embed)

def recommend_latent(idx, top_k=10):
    sims = np.argsort(sim_latent[idx])[::-1][1:top_k+1]
    offers = df.iloc[sims]["target_offer"].value_counts()
    return offers.idxmax(), offers

print(recommend_latent(100))


('General Offer', target_offer
General Offer    10
Name: count, dtype: int64)


#### AUTOENCODER + COSINE EMBEDDING

In [29]:
!pip install tensorflow




You should consider upgrading via the 'c:\Users\Asus\Documents\ASAH 2025\CAPSTONE\.venv\Scripts\python.exe -m pip install --upgrade pip' command.





Program ini membangun Autoencoder untuk:
- mengkompres data pelanggan menjadi embedding,
- menghitung kemiripan antar pelanggan, dan
- akhirnya membuat sistem rekomendasi penawaran berbasis similarity.

##### Model ini unggul ketika fitur numerik banyak dan sulit dianalisis secara manual.

In [30]:
import tensorflow as tf
from tensorflow.keras import layers, models

input_dim = X_scaled.shape[1]

inp = layers.Input(shape=(input_dim,))
enc = layers.Dense(32, activation="relu")(inp)
emb = layers.Dense(16, activation="relu")(enc)

dec = layers.Dense(32, activation="relu")(emb)
out = layers.Dense(input_dim)(dec)

AE = models.Model(inp, out)
encoder = models.Model(inp, emb)

AE.compile(optimizer="adam", loss="mse")
AE.fit(X_scaled, X_scaled, epochs=30, batch_size=128, verbose=0)

embeddings = encoder.predict(X_scaled)
sim_emb = cosine_similarity(embeddings)

def recommend_ae(idx, top_k=10):
    sims = np.argsort(sim_emb[idx])[::-1][1:top_k+1]
    offers = df.iloc[sims]["target_offer"].value_counts()
    return offers.idxmax(), offers

print(recommend_ae(100))


[1m313/313[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m2s[0m 5ms/step
('Device Upgrade Offer', target_offer
Device Upgrade Offer      6
General Offer             3
Streaming Partner Pack    1
Name: count, dtype: int64)


#### https://chatgpt.com/s/t_691c491b5eb08191ae3b5250484ac3ae

In [31]:
!pip install imbalanced-learn



You should consider upgrading via the 'c:\Users\Asus\Documents\ASAH 2025\CAPSTONE\.venv\Scripts\python.exe -m pip install --upgrade pip' command.





In [32]:
dftest = pd.read_csv('data/processed/processed_data.csv')
dftest.head()

Unnamed: 0,avg_data_usage_gb,pct_video_usage,avg_call_duration,sms_freq,monthly_spend,topup_freq,travel_score,complaint_count,plan_type_Prepaid,device_brand_Huawei,device_brand_Oppo,device_brand_Realme,device_brand_Samsung,device_brand_Vivo,device_brand_Xiaomi,target
0,-1.061621,2.04672,-0.486489,-0.521547,-0.86055,0.593283,0.013842,-0.699682,0.796856,-0.409856,-0.399751,2.376994,-0.408663,-0.402674,-0.414445,3
1,-1.159227,-1.507241,-0.14744,-1.552817,-1.011679,0.008441,-1.055531,-0.699682,-1.254932,-0.409856,-0.399751,-0.420699,-0.408663,2.483401,-0.414445,3
2,-0.647391,-0.454985,-1.209649,-0.521547,-0.450343,2.34781,0.762691,-0.699682,-1.254932,-0.409856,-0.399751,-0.420699,-0.408663,-0.402674,2.412867,3
3,-0.15222,0.087267,-0.705368,-1.810634,-0.925319,0.593283,0.125934,-0.699682,0.796856,-0.409856,-0.399751,-0.420699,-0.408663,-0.402674,-0.414445,3
4,-0.964016,-0.772672,0.163712,1.540993,-0.81737,1.178125,1.298936,-0.699682,0.796856,2.439882,-0.399751,-0.420699,-0.408663,-0.402674,-0.414445,3


In [33]:

df = pd.read_csv("data/processed/processed_data.csv")
print(df.columns)


Index(['avg_data_usage_gb', 'pct_video_usage', 'avg_call_duration', 'sms_freq',
       'monthly_spend', 'topup_freq', 'travel_score', 'complaint_count',
       'plan_type_Prepaid', 'device_brand_Huawei', 'device_brand_Oppo',
       'device_brand_Realme', 'device_brand_Samsung', 'device_brand_Vivo',
       'device_brand_Xiaomi', 'target'],
      dtype='object')


Kode ini berfungsi sebagai pipeline pra-pemrosesan dan evaluasi:
- Memuat dataset final
- Menormalisasi fitur numerik
- Meng-encode label target
- Membagi data train & test
- Menyediakan fungsi evaluasi yang lengkap (Accuracy, F1, Report, Confusion Matrix, ROC-AUC)
#####  ‚Üí Anda dapat langsung melatih model apa pun dan menggunakan eval_metrics() untuk mengukur performanya.

In [34]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import (
    f1_score, classification_report, confusion_matrix,
    roc_auc_score, accuracy_score
)

# =========================================================
# 1Ô∏è‚É£ LOAD FINAL DATASET
# =========================================================
df = pd.read_csv("data/processed/processed_data.csv")   # FINAL VERSION

target_col = "target"

numerical_cols = [
    'avg_data_usage_gb','pct_video_usage','avg_call_duration',
    'sms_freq','monthly_spend','topup_freq','travel_score','complaint_count'
]

X = df[numerical_cols].copy()
y = df[target_col].copy()

# =========================================================
# 2Ô∏è‚É£ NORMALISASI + LABEL ENCODING
# =========================================================
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

le = LabelEncoder()
y_enc = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_enc, test_size=0.2, random_state=42, stratify=y_enc
)

# =========================================================
# 3Ô∏è‚É£ FUNGSI EVALUASI ‚Üí WAJIB‚Äº
# =========================================================
def eval_metrics(y_true, y_pred, y_pred_proba):
    print("\nüìå ACCURACY:", accuracy_score(y_true, y_pred))
    print("\nüìå F1-Weighted:", f1_score(y_true, y_pred, average='weighted'))
    print("\nüìå CLASS REPORT:\n", classification_report(y_true, y_pred, target_names=le.classes_))
    print("\nüìå CONFUSION MATRIX:\n", confusion_matrix(y_true, y_pred))
    print("\nüìå ROC-AUC:", roc_auc_score(y_true, y_pred_proba, multi_class='ovr'))


In [35]:
# Final Cosine Pipeline (one-cell)
# Paste and run in Jupyter/Colab. Adjust install commands if needed.

import os
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import (
    accuracy_score, f1_score, classification_report,
    confusion_matrix, roc_auc_score
)

# -------------------------
# Helper: Try common dataset paths
# -------------------------
possible_paths = [
    "data/processed/processed_data.csv",
    "data/raw/data_capstone.csv",
    "data/processed/processed_data.csv",
    "data/raw/data_capstone.csv",
    "processed_data.csv",
    "data_capstone.csv"
]

df_path = None
for p in possible_paths:
    if os.path.exists(p):
        df_path = p
        break

if df_path is None:
    raise FileNotFoundError(f"Dataset not found. Checked: {possible_paths}")

print("Loading dataset from:", df_path)
df = pd.read_csv(df_path)

# -------------------------
# Identify target column
# If user uses different name, try common names
# -------------------------
possible_targets = ["target_offer", "target", "offer", "label", "target_label"]
target_col = None
for t in possible_targets:
    if t in df.columns:
        target_col = t
        break

if target_col is None:
    # fallback: choose last column if likely categorical
    cand = df.columns[-1]
    if df[cand].dtype == object or len(df[cand].unique()) < 50:
        target_col = cand
        print(f"[WARN] target not found in common names; using last column: '{target_col}'")
    else:
        raise KeyError("Target column not found. Expected one of "
                       f"{possible_targets}. Columns: {list(df.columns)}")

print("Using target column:", target_col)

# -------------------------
# Numeric features (expected)
# -------------------------
numerical_cols = [
    'avg_data_usage_gb','pct_video_usage','avg_call_duration',
    'sms_freq','monthly_spend','topup_freq','travel_score','complaint_count'
]

# Safety: if any missing numeric col, attempt to auto-detect numeric features
missing = [c for c in numerical_cols if c not in df.columns]
if missing:
    print(f"[WARN] Some expected numeric columns missing: {missing}")
    # auto-select numeric columns except the target and id-like columns
    auto_numeric = df.select_dtypes(include=[np.number]).columns.tolist()
    # remove target if numeric
    auto_numeric = [c for c in auto_numeric if c != target_col]
    # choose up to 8 numeric features (prefer previously listed if present)
    final_numeric = [c for c in numerical_cols if c in df.columns] + [c for c in auto_numeric if c not in numerical_cols]
    numerical_cols = final_numeric[:8]
    print("Auto-selected numeric columns:", numerical_cols)
else:
    print("Using expected numeric columns.")

# ensure we have at least 2 numeric features
if len(numerical_cols) < 2:
    raise ValueError("Not enough numeric features for similarity. Columns found: " + str(numerical_cols))

X = df[numerical_cols].copy()
y = df[target_col].copy()

# -------------------------
# Preprocessing & splits
# -------------------------
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

le = LabelEncoder()
y_enc = le.fit_transform(y)

# maintain indices for mapping predictions computed across full DF
indices = np.arange(len(df))

# train/test split for supervised evaluation
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    X_scaled, y_enc, indices, test_size=0.2, random_state=42, stratify=y_enc
)

# -------------------------
# Evaluation helper (robust)
# -------------------------
def eval_metrics(y_true, y_pred, y_proba, label_encoder=le):
    """y_true, y_pred are integer-encoded labels; y_proba shape (n_samples, n_classes)"""
    # convert label names to strings to avoid sklearn errors
    tn = [str(c) for c in label_encoder.classes_]
    acc = accuracy_score(y_true, y_pred)
    f1w = f1_score(y_true, y_pred, average='weighted')
    print("Accuracy:", acc)
    print("F1 (weighted):", f1w)
    print("\nClassification report:")
    print(classification_report(y_true, y_pred, target_names=tn))
    print("Confusion matrix:")
    print(confusion_matrix(y_true, y_pred))
    try:
        auc = roc_auc_score(y_true, y_proba, multi_class='ovr')
    except Exception as e:
        auc = f"ROC-AUC error: {e}"
    print("ROC-AUC (ovr):", auc)
    return {"accuracy": acc, "f1_weighted": f1w, "roc_auc": auc}

# helper: convert counts Series -> proba vector aligned to label encoder
def series_to_proba_vector(counts_series, label_encoder=le):
    probs = np.zeros(len(label_encoder.classes_), dtype=float)
    if counts_series is None or len(counts_series)==0:
        # uniform small prob
        probs += 1.0 / len(probs)
        return probs
    for lab, val in counts_series.items():
        try:
            idx = list(label_encoder.classes_).index(lab)
            probs[idx] = val
        except ValueError:
            # if stored labels are encoded numbers, try convert
            try:
                lab_int = int(lab)
                labname = label_encoder.inverse_transform([lab_int])[0]
                idx = list(label_encoder.classes_).index(labname)
                probs[idx] = val
            except Exception:
                continue
    s = probs.sum()
    if s > 0:
        probs = probs / s
    else:
        probs += 1.0 / len(probs)
    return probs

# -------------------------
# Precompute full-similarity matrix once
# -------------------------
sim_matrix = cosine_similarity(X_scaled)

# We'll store results summary
results = []

# -------------------------
# Model 1: Pure Cosine Majority from Top-K (predictions for full df; evaluate on test subset)
# -------------------------
def predict_cosine_full(top_k=40):
    labels = []
    probas = []
    for i in range(len(df)):
        sims = list(enumerate(sim_matrix[i]))
        sims = sorted(sims, key=lambda x: x[1], reverse=True)[1:top_k+1]
        neigh_idx = [x[0] for x in sims]
        counts = df.iloc[neigh_idx][target_col].value_counts()
        proba = series_to_proba_vector(counts)
        label = np.argmax(proba)
        labels.append(label)
        probas.append(proba)
    return np.array(labels), np.vstack(probas)

print("\n=== Running Model 1: Pure Cosine ===")
y_pred_full_cosine, y_proba_full_cosine = predict_cosine_full(top_k=40)
# Evaluate on test set indices
ytest_true = y_enc[idx_test]
ytest_pred = y_pred_full_cosine[idx_test]
ytest_proba = y_proba_full_cosine[idx_test]
res1 = eval_metrics(ytest_true, ytest_pred, ytest_proba)
results.append(("PureCosine", res1))

# -------------------------
# Model 2: KNN (metric='cosine') - supervised
# -------------------------
print("\n=== Running Model 2: KNN (cosine) ===")
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=15, metric="cosine", weights="distance", n_jobs=-1)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
y_proba_knn = knn.predict_proba(X_test)
res2 = eval_metrics(y_test, y_pred_knn, y_proba_knn)
results.append(("KNN_Cosine", res2))

# -------------------------
# Model 3: KMeans clustering -> cluster-majority offer
# -------------------------
print("\n=== Running Model 3: KMeans Clustering ===")
from sklearn.cluster import KMeans
k_clusters = max(2, len(le.classes_))
kmeans = KMeans(n_clusters=k_clusters, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_scaled)
df["_cluster"] = clusters
cluster_major = df.groupby("_cluster")[target_col].agg(lambda s: s.value_counts().idxmax())

# predictions for full df
y_pred_cluster_full = []
y_proba_cluster_full = []
for c in clusters:
    counts = df[df["_cluster"]==c][target_col].value_counts(normalize=True)
    y_proba_cluster_full.append(series_to_proba_vector(counts))
    y_pred_cluster_full.append(np.argmax(series_to_proba_vector(counts)))
y_pred_cluster_full = np.array(y_pred_cluster_full)
y_proba_cluster_full = np.vstack(y_proba_cluster_full)

# evaluate on test subset
ytest_pred_cluster = y_pred_cluster_full[idx_test]
ytest_proba_cluster = y_proba_cluster_full[idx_test]
res3 = eval_metrics(ytest_true, ytest_pred_cluster, ytest_proba_cluster)
results.append(("KMeans_ClusterMajor", res3))

# -------------------------
# Model 4: NMF embedding + cosine on embedding
# -------------------------
print("\n=== Running Model 4: NMF Embedding ===")
from sklearn.decomposition import NMF
# choose components <= n_features
n_features = X_scaled.shape[1]
nmf_components = min(6, n_features)
# NMF needs non-negative input; use shifting to positive
X_nmf_input = X_scaled - X_scaled.min() + 1e-6
nmf = NMF(n_components=nmf_components, init="nndsvd", random_state=42, max_iter=500)
emb_nmf = nmf.fit_transform(X_nmf_input)
sim_emb_nmf = cosine_similarity(emb_nmf)

def predict_nmf_full(top_k=40):
    labels = []
    probas = []
    for i in range(len(df)):
        sims = np.argsort(sim_emb_nmf[i])[-(top_k+1):-1]  # exclude itself
        neigh_idx = sims
        counts = df.iloc[neigh_idx][target_col].value_counts()
        proba = series_to_proba_vector(counts)
        labels.append(np.argmax(proba))
        probas.append(proba)
    return np.array(labels), np.vstack(probas)

y_pred_nmf_full, y_proba_nmf_full = predict_nmf_full(top_k=40)
ytest_pred_nmf = y_pred_nmf_full[idx_test]
ytest_proba_nmf = y_proba_nmf_full[idx_test]
res4 = eval_metrics(ytest_true, ytest_pred_nmf, ytest_proba_nmf)
results.append(("NMF_Embedding", res4))

# -------------------------
# Model 5: Hybrid Cosine Features + CatBoost / XGBoost
# -------------------------
print("\n=== Running Model 5: Hybrid Cosine Features + (CatBoost/XGBoost) ===")
def build_cosine_offer_features(sim_mtx, k=40):
    feat = np.zeros((sim_mtx.shape[0], len(le.classes_)))
    for i in range(sim_mtx.shape[0]):
        neigh = np.argsort(sim_mtx[i])[-(k+1):]
        neigh = neigh[neigh != i]
        neigh = neigh[-k:]
        counts = df.iloc[neigh][target_col].value_counts(normalize=True)
        feat[i,:] = series_to_proba_vector(counts)
    return feat

sim_feat = build_cosine_offer_features(sim_matrix, k=40)
X_hybrid = np.hstack([X_scaled, sim_feat])

Xh_tr, Xh_te, yh_tr, yh_te, idxh_tr, idxh_te = train_test_split(
    X_hybrid, y_enc, indices, test_size=0.2, random_state=42, stratify=y_enc
)

model_hybrid = None
y_pred_hybrid = None
y_proba_hybrid = None

# Try CatBoost
try:
    from catboost import CatBoostClassifier
    model_cb = CatBoostClassifier(iterations=600, depth=8, learning_rate=0.05, loss_function="MultiClass", verbose=100)
    model_cb.fit(Xh_tr, yh_tr, eval_set=(Xh_te, yh_te))
    model_hybrid = model_cb
    y_pred_hybrid = model_cb.predict(Xh_te)
    y_proba_hybrid = model_cb.predict_proba(Xh_te)
    print("[INFO] Trained CatBoost hybrid model.")
except Exception as e:
    print("[WARN] CatBoost failed or not installed:", e)
    try:
        import xgboost as xgb
        model_xgb = xgb.XGBClassifier(n_estimators=300, max_depth=6, learning_rate=0.05, use_label_encoder=False, eval_metric='mlogloss')
        model_xgb.fit(Xh_tr, yh_tr, eval_set=[(Xh_te, yh_te)], verbose=100)
        model_hybrid = model_xgb
        y_pred_hybrid = model_xgb.predict(Xh_te)
        y_proba_hybrid = model_xgb.predict_proba(Xh_te)
        print("[INFO] Trained XGBoost hybrid model.")
    except Exception as e2:
        print("[ERROR] Both CatBoost and XGBoost unavailable/failed:", e2)

if model_hybrid is not None and y_pred_hybrid is not None:
    res5 = eval_metrics(yh_te, y_pred_hybrid, y_proba_hybrid)
    results.append(("Hybrid_CB_XGB", res5))
    # save model
    os.makedirs("/mnt/data/models", exist_ok=True)
    try:
        # CatBoost has save_model; XGBoost has save_model too
        if hasattr(model_hybrid, "save_model"):
            model_hybrid.save_model("/mnt/data/models/hybrid_model.bin")
        else:
            import joblib
            joblib.dump(model_hybrid, "/mnt/data/models/hybrid_model.joblib")
        print("[INFO] Saved hybrid model to /mnt/data/models/")
    except Exception as e:
        print("[WARN] Save model failed:", e)
else:
    print("[WARN] Hybrid model training skipped due to missing libs.")

# -------------------------
# Model 6: Autoencoder embedding + cosine
# -------------------------
print("\n=== Running Model 6: Autoencoder Embedding ===")
ae_success = False
try:
    import tensorflow as tf
    from tensorflow.keras import layers, Model, callbacks

    input_dim = X_scaled.shape[1]
    encoding_dim = min(32, input_dim*2)

    inp = layers.Input(shape=(input_dim,))
    x = layers.Dense(128, activation='relu')(inp)
    x = layers.Dense(64, activation='relu')(x)
    encoded = layers.Dense(encoding_dim, activation='relu', name='embed')(x)
    x = layers.Dense(64, activation='relu')(encoded)
    x = layers.Dense(128, activation='relu')(x)
    decoded = layers.Dense(input_dim, activation='linear')(x)

    ae = Model(inp, decoded)
    encoder = Model(inp, encoded)
    ae.compile(optimizer='adam', loss='mse')

    es = callbacks.EarlyStopping(monitor='loss', patience=8, restore_best_weights=True)
    ae.fit(X_scaled, X_scaled, epochs=60, batch_size=128, callbacks=[es], verbose=0)

    emb_ae = encoder.predict(X_scaled)
    sim_emb_ae = cosine_similarity(emb_ae)

    y_pred_ae_full = []
    y_proba_ae_full = []
    for i in range(len(df)):
        sims = np.argsort(sim_emb_ae[i])[-40-1:-1]
        counts = df.iloc[sims][target_col].value_counts()
        proba = series_to_proba_vector(counts)
        y_proba_ae_full.append(proba)
        y_pred_ae_full.append(np.argmax(proba))
    y_pred_ae_full = np.array(y_pred_ae_full)
    y_proba_ae_full = np.vstack(y_proba_ae_full)

    ytest_pred_ae = y_pred_ae_full[idx_test]
    ytest_proba_ae = y_proba_ae_full[idx_test]
    res6 = eval_metrics(ytest_true, ytest_pred_ae, ytest_proba_ae)
    results.append(("Autoencoder_AE", res6))
    ae_success = True
except Exception as e:
    print("[WARN] Autoencoder step failed:", e)

# -------------------------
# Summary
# -------------------------
print("\n=== Summary of results ===")
for name, met in results:
    print(name, "->", met)
print("\nPipeline finished. Models (if trained) saved to /mnt/data/models")


Loading dataset from: data/processed/processed_data.csv
Using target column: target
Using expected numeric columns.

=== Running Model 1: Pure Cosine ===
Accuracy: 0.7415557830092119
F1 (weighted): 0.705969021812627

Classification report:
              precision    recall  f1-score   support

           0       0.76      0.87      0.81       157
           1       0.53      0.46      0.50       293
           2       0.00      0.00      0.00        16
           3       0.78      0.90      0.83      1185
           4       0.79      0.72      0.75       148
           5       1.00      0.06      0.11        18
           6       0.44      0.08      0.13        51
           7       0.00      0.00      0.00        73
           8       0.00      0.00      0.00        13

    accuracy                           0.74      1954
   macro avg       0.48      0.34      0.35      1954
weighted avg       0.69      0.74      0.71      1954

Confusion matrix:
[[ 136    6    0   12    2    0    1 

https://chatgpt.com/s/t_691d611fdff881918f0e371174d83550

CatBoost (Data Murni)

1. Memuat dataset Telco.
2. Mendeteksi fitur kategori secara otomatis.
3. Membuat Pool CatBoost yang optimal.
4. Menghitung class weight agar seimbang.
5. Melatih model CatBoost high-accuracy (depth 8, iterations 1500).
6. Menghasilkan evaluasi lengkap (F1, AUC, Confusion Matrix).
7. Menyimpan model siap pakai untuk produksi.

In [36]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    f1_score, classification_report, confusion_matrix, roc_auc_score
)
import numpy as np
import joblib

# Load dataset raw Telco
df = pd.read_csv("data/raw/data_capstone.csv")

# Target
target = "target_offer"

# Fitur
X = df.drop(columns=[target])
y = df[target]

# Identifikasi categorical features otomatis
cat_features = X.select_dtypes(include=["object", "category"]).columns.tolist()
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

print("Categorical Features:", cat_features)
print("Numeric Features:", numeric_features)


Categorical Features: ['customer_id', 'plan_type', 'device_brand']
Numeric Features: ['avg_data_usage_gb', 'pct_video_usage', 'avg_call_duration', 'sms_freq', 'monthly_spend', 'topup_freq', 'travel_score', 'complaint_count']


In [37]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [38]:
for c in cat_features:
    X_train[c] = X_train[c].astype("category")
    X_test[c] = X_test[c].astype("category")


In [None]:
train_pool = Pool(X_train, y_train, cat_features=cat_features)
test_pool = Pool(X_test, y_test, cat_features=cat_features)


In [39]:
model = CatBoostClassifier(
    loss_function="MultiClass",
    eval_metric="TotalF1",        # optimasi F1 langsung
    learning_rate=0.05,
    depth=8,
    iterations=1500,
    l2_leaf_reg=3,
    random_seed=42,
    verbose=200,
    class_weights='Balanced',     # menangani imbalance
    task_type="CPU"
)


In [40]:
from catboost import CatBoostClassifier, Pool
import numpy as np

# Hitung class_weights berdasarkan distribusi kelas
class_counts = y_train.value_counts().sort_index()
class_weights = (1 / class_counts).values

train_pool = Pool(X_train, y_train, cat_features=cat_features)
test_pool  = Pool(X_test,  y_test,  cat_features=cat_features)

model = CatBoostClassifier(
    loss_function="MultiClass",
    eval_metric="TotalF1",
    learning_rate=0.05,
    depth=8,
    iterations=1500,
    l2_leaf_reg=3,
    random_seed=42,
    verbose=200,
    class_weights=class_weights
)

model.fit(train_pool)


0:	learn: 0.8735410	total: 759ms	remaining: 18m 58s
200:	learn: 0.9986058	total: 1m 42s	remaining: 11m 1s
400:	learn: 0.9992868	total: 3m 18s	remaining: 9m 3s
600:	learn: 0.9995871	total: 4m 55s	remaining: 7m 22s
800:	learn: 0.9996100	total: 6m 34s	remaining: 5m 43s
1000:	learn: 0.9997253	total: 8m 11s	remaining: 4m 5s
1200:	learn: 0.9997482	total: 9m 50s	remaining: 2m 27s
1400:	learn: 0.9997482	total: 11m 33s	remaining: 49s
1499:	learn: 0.9997482	total: 12m 22s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1b54b9d5720>

In [41]:
model.fit(train_pool)


0:	learn: 0.8735410	total: 585ms	remaining: 14m 37s
200:	learn: 0.9986058	total: 1m 37s	remaining: 10m 29s
400:	learn: 0.9992868	total: 3m 13s	remaining: 8m 51s
600:	learn: 0.9995871	total: 4m 50s	remaining: 7m 15s
800:	learn: 0.9996100	total: 6m 28s	remaining: 5m 38s
1000:	learn: 0.9997253	total: 8m 6s	remaining: 4m 2s
1200:	learn: 0.9997482	total: 9m 51s	remaining: 2m 27s
1400:	learn: 0.9997482	total: 11m 30s	remaining: 48.8s
1499:	learn: 0.9997482	total: 12m 19s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1b54b9d5720>

In [42]:
y_pred = model.predict(test_pool)
y_pred_int = np.argmax(model.predict_proba(test_pool), axis=1)
y_test_int = y_test.astype("category").cat.codes.values


In [43]:
f1 = f1_score(y_test_int, y_pred_int, average="weighted")
print("Weighted F1:", f1)


Weighted F1: 0.9932208933549366


In [44]:
print(classification_report(y_test_int, y_pred_int))


              precision    recall  f1-score   support

           0       0.98      0.99      0.99       159
           1       1.00      0.99      0.99       300
           2       1.00      1.00      1.00        16
           3       1.00      0.99      1.00      1214
           4       1.00      0.99      1.00       152
           5       0.76      1.00      0.86        19
           6       0.95      1.00      0.97        52
           7       1.00      0.99      0.99        74
           8       0.93      0.93      0.93        14

    accuracy                           0.99      2000
   macro avg       0.96      0.99      0.97      2000
weighted avg       0.99      0.99      0.99      2000



In [None]:
cm = confusion_matrix(y_test_int, y_pred_int)
print("Confusion Matrix:")
print(cm)



Confusion Matrix:
[[ 158    1    0    0    0    0    0    0    0]
 [   1  297    0    0    0    2    0    0    0]
 [   0    0   16    0    0    0    0    0    0]
 [   2    0    0 1207    0    2    3    0    0]
 [   0    0    0    0  151    0    0    0    1]
 [   0    0    0    0    0   19    0    0    0]
 [   0    0    0    0    0    0   52    0    0]
 [   0    0    0    0    0    1    0   73    0]
 [   0    0    0    0    0    1    0    0   13]]


###### ROC-AUC Multi-Class

In [46]:
y_pred_proba = model.predict_proba(test_pool)
auc = roc_auc_score(y_test_int, y_pred_proba, multi_class='ovr')
print("ROC-AUC (OVR):", auc)


ROC-AUC (OVR): 0.9997942979921588


In [47]:
# Simpan Model
model.save_model("catboost_telco_best.cbm")
print("Model saved as catboost_telco_best.cbm")


Model saved as catboost_telco_best.cbm


# CatBoost Menggunakan data_capstone.csv

Versi paling akurat
1. ‚úî Memakai Pool ‚Üí kategori diproses optimal
2. ‚úî Menggunakan konfigurasi lebih kuat
3. ‚úî Laporan evaluasi lebih mudah dibaca

- Tidak overfitting secara umum, karena skor test masih sangat tinggi dan stabil.
- Yang terjadi adalah ketidakseimbangan data, sehingga kelas kecil tidak bisa dipelajari dengan baik.

In [48]:
# ==========================================================
#   MODEL CATBOOST KHUSUS TELCO | TARGET OFFER PREDICTION
# ==========================================================

import pandas as pd
from catboost import CatBoostClassifier, Pool

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    f1_score,
    classification_report,
    confusion_matrix,
    roc_auc_score
)


# =========================================
# LOAD DATASET
# =========================================
df = pd.read_csv("data/raw/data_capstone.csv")  # sesuai memory Anda

# =========================================
# PILIH FITUR
# =========================================
selected_features = [
    'plan_type',
    'device_brand',
    'avg_data_usage_gb',
    'pct_video_usage',
    'monthly_spend',
    'topup_freq',
    'travel_score',
    'complaint_count'
]

target = "target_offer"

X = df[selected_features].copy()
y = df[target].copy()

# =========================================
# ENCODE TARGET LABEL
# =========================================
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# =========================================
# TRAIN TEST SPLIT
# =========================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded
)

# =========================================
# CATBOOST NEEDS CATEGORICAL INDEX
# =========================================
cat_features = [0, 1]  # plan_type, device_brand (index in selected_features)

train_pool = Pool(X_train, y_train, cat_features=cat_features)
test_pool = Pool(X_test, y_test, cat_features=cat_features)

# =========================================
# TRAIN CATBOOST MODEL
# =========================================
model = CatBoostClassifier(
    iterations=1200,
    learning_rate=0.05,
    depth=8,
    loss_function="MultiClass",
    eval_metric="TotalF1",
    random_seed=42,
    verbose=False
)

model.fit(train_pool)

# =========================================
# PREDIKSI
# =========================================
y_pred = model.predict(test_pool)
y_pred = y_pred.flatten().astype(int)

y_pred_proba = model.predict_proba(test_pool)

# =========================================
# EVALUATION METRICS
# =========================================

print("\n====================================")
print("‚ö° CATBOOST EVALUATION METRICS")
print("====================================\n")

# 1. F1 SCORE
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"üîπ F1-Score (Weighted): {f1:.4f}")

# 2. CLASSIFICATION REPORT
print("\nüîπ Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# 3. CONFUSION MATRIX
print("\nüîπ Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# 4. ROC-AUC MULTICLASS
try:
    auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')
    print(f"\nüîπ ROC-AUC (ovr): {auc:.4f}")
except:
    print("\n‚ö†Ô∏è ROC-AUC tidak bisa dihitung karena distribusi kelas tertentu tidak valid.")

# =========================================
# SIMPAN MODEL
# =========================================
model.save_model("catboost_telco_model.cbm")
print("\nüìÅ Model berhasil disimpan: catboost_telco_model.cbm")



‚ö° CATBOOST EVALUATION METRICS

üîπ F1-Score (Weighted): 0.9726

üîπ Classification Report:
                        precision    recall  f1-score   support

          Data Booster       0.99      0.99      0.99       159
  Device Upgrade Offer       0.99      0.99      0.99       300
     Family Plan Offer       0.00      0.00      0.00        16
         General Offer       0.98      1.00      0.99      1214
       Retention Offer       1.00      0.99      1.00       152
          Roaming Pass       0.75      0.95      0.84        19
Streaming Partner Pack       0.93      1.00      0.96        52
          Top-up Promo       1.00      0.99      0.99        74
          Voice Bundle       0.00      0.00      0.00        14

              accuracy                           0.98      2000
             macro avg       0.74      0.77      0.75      2000
          weighted avg       0.97      0.98      0.97      2000


üîπ Confusion Matrix:
[[ 158    1    0    0    0    0    0    0    

# CatBoost menggunakan data preposesing

In [None]:
df = pd.read_csv('data/processed/processed_data.csv')
df.head()

Unnamed: 0,avg_data_usage_gb,pct_video_usage,avg_call_duration,sms_freq,monthly_spend,topup_freq,travel_score,complaint_count,plan_type_Prepaid,device_brand_Huawei,device_brand_Oppo,device_brand_Realme,device_brand_Samsung,device_brand_Vivo,device_brand_Xiaomi,target
0,-1.061621,2.04672,-0.486489,-0.521547,-0.86055,0.593283,0.013842,-0.699682,0.796856,-0.409856,-0.399751,2.376994,-0.408663,-0.402674,-0.414445,3
1,-1.159227,-1.507241,-0.14744,-1.552817,-1.011679,0.008441,-1.055531,-0.699682,-1.254932,-0.409856,-0.399751,-0.420699,-0.408663,2.483401,-0.414445,3
2,-0.647391,-0.454985,-1.209649,-0.521547,-0.450343,2.34781,0.762691,-0.699682,-1.254932,-0.409856,-0.399751,-0.420699,-0.408663,-0.402674,2.412867,3
3,-0.15222,0.087267,-0.705368,-1.810634,-0.925319,0.593283,0.125934,-0.699682,0.796856,-0.409856,-0.399751,-0.420699,-0.408663,-0.402674,-0.414445,3
4,-0.964016,-0.772672,0.163712,1.540993,-0.81737,1.178125,1.298936,-0.699682,0.796856,2.439882,-0.399751,-0.420699,-0.408663,-0.402674,-0.414445,3


- Hanya menggunakan train‚Äìtest split, tanpa validation.
- Tidak ada early stopping.
- Regularisasi minimal.
- Tidak memantau performa selama training (tidak ada eval_set).
- Jumlah iterasi lebih besar (600).
- Fokus pada efisiensi RAM, tetapi lebih rawan overfitting

##### OVER FITTING PARAH

In [57]:
# ==========================================================
#   MODEL CATBOOST TELCO | NUMPY VERSION (SAFE MODE)
# ==========================================================

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier

from sklearn.metrics import (
    f1_score,
    classification_report,
    confusion_matrix,
    roc_auc_score
)

# LOAD DATA SPLIT

X_train = np.load("data/processed/x_train.npy")
y_train = np.load("data/processed/y_train.npy")
X_test  = np.load("data/processed/x_test.npy")
y_test  = np.load("data/processed/y_test.npy")


# CATBOOST SAFE CONFIG (ANTI BAD-ALLOCATION)

model = CatBoostClassifier(
    iterations=600,          # dari 1500 ‚Üí 600 (lebih ringan, masih akurat)
    learning_rate=0.05,
    depth=6,                 # dari 10 ‚Üí 6 (mengurangi RAM 50%)
    loss_function="MultiClass",
    eval_metric="TotalF1",
    random_seed=42,
    task_type="CPU",
    grow_policy="Depthwise", # lebih hemat memori
    verbose=False
)

model.fit(X_train, y_train)

# PREDIKSI
y_pred = model.predict(X_test).astype(int).flatten()
y_pred_proba = model.predict_proba(X_test)

# EVALUATION METRICS

print("\n====================================")
print("‚ö° CATBOOST EVALUATION METRICS (SAFE MODE)")
print("====================================\n")

f1 = f1_score(y_test, y_pred, average='weighted')
print(f"üîπ F1-Score (Weighted): {f1:.4f}")

print("\nüîπ Classification Report:")
print(classification_report(y_test, y_pred))

print("\nüîπ Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

try:
    auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')
    print(f"\nüîπ ROC-AUC (ovr): {auc:.4f}")
except:
    print("\n‚ö†Ô∏è ROC-AUC tidak bisa dihitung karena ada kelas kosong.")

# SAVE MODEL

model.save_model("catboost_telco_model.cbm")
print("\nüìÅ Model berhasil disimpan ‚Üí catboost_telco_model.cbm")



‚ö° CATBOOST EVALUATION METRICS (SAFE MODE)

üîπ F1-Score (Weighted): 0.9939

üîπ Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.96      0.97       157
           1       0.99      1.00      0.99       293
           2       1.00      1.00      1.00        16
           3       1.00      1.00      1.00      1185
           4       0.99      1.00      0.99       148
           5       0.89      0.94      0.92        18
           6       1.00      1.00      1.00        51
           7       1.00      1.00      1.00        73
           8       0.92      0.92      0.92        13

    accuracy                           0.99      1954
   macro avg       0.98      0.98      0.98      1954
weighted avg       0.99      0.99      0.99      1954


üîπ Confusion Matrix:
[[ 151    3    0    1    1    1    0    0    0]
 [   1  292    0    0    0    0    0    0    0]
 [   0    0   16    0    0    0    0    0    0]
 [   1    0    0 11

https://chatgpt.com/s/t_691ea8aa0e7c8191b7eb688db360f0fa
OVERFITTING

- Menggunakan train‚Äìvalidation‚Äìtest, sehingga model dipantau selama training.
- Memakai early stopping untuk menghentikan training jika performa tidak meningkat.
- Regularisasi lebih lengkap:
subsample,colsample_bylevel,l2_leaf_reg,random_strength
- Training menggunakan eval_set, sehingga performa dievaluasi setiap iterasi.
- Iterasi lebih sedikit (400) karena sudah dibantu kontrol overfitting.
- Lebih aman, stabil, dan minim overfitting.

##### KODE INI MASIH OVERFITTING

In [62]:
from catboost import CatBoostClassifier
import numpy as np
from sklearn.metrics import f1_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split

# ==========================================================
# LOAD DATA
# ==========================================================
X_train = np.load("data/processed/x_train.npy")
y_train = np.load("data/processed/y_train.npy")
X_test  = np.load("data/processed/x_test.npy")
y_test  = np.load("data/processed/y_test.npy")

# Validation split
X_train2, X_val, y_train2, y_val = train_test_split(
    X_train, y_train,
    test_size=0.15,
    random_state=42,
    stratify=y_train
)

# ==========================================================
# CATBOOST ‚Äî CPU SAFE & ANTI OVERFITTING
# ==========================================================
model = CatBoostClassifier(
    iterations=400,
    learning_rate=0.05,
    depth=6,
    loss_function="MultiClass",
    eval_metric="TotalF1",
    random_seed=42,
    task_type="CPU",
    grow_policy="Depthwise",

    # === FIX: ONLY CPU-SAFE OPTION ===
    bootstrap_type="Bernoulli",
    subsample=0.8,

    # === Regularization ===
    l2_leaf_reg=5,
    random_strength=1.2,
    colsample_bylevel=0.8,

    early_stopping_rounds=40,
    verbose=False
)

# TRAIN
model.fit(
    X_train2, y_train2,
    eval_set=(X_val, y_val),
    use_best_model=True
)

# ==========================================================
# PREDIKSI
# ==========================================================
y_pred = model.predict(X_test).flatten()
y_pred_proba = model.predict_proba(X_test)

# ==========================================================
# METRIK
# ==========================================================
print("\n=== CATBOOST FINAL METRICS ===")
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"F1-Weighted: {f1:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

try:
    auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')
    print(f"\nROC-AUC (ovr): {auc:.4f}")
except:
    print("\nROC-AUC tidak dapat dihitung.")

# SAVE MODEL
model.save_model("catboost_telco_model.cbm")
print("\nModel saved ‚Üí catboost_telco_model.cbm")



=== CATBOOST FINAL METRICS ===
F1-Weighted: 0.9899

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.96      0.96       157
           1       0.99      0.99      0.99       293
           2       0.94      1.00      0.97        16
           3       1.00      0.99      1.00      1185
           4       0.97      1.00      0.99       148
           5       0.82      1.00      0.90        18
           6       0.96      1.00      0.98        51
           7       1.00      1.00      1.00        73
           8       0.87      1.00      0.93        13

    accuracy                           0.99      1954
   macro avg       0.95      0.99      0.97      1954
weighted avg       0.99      0.99      0.99      1954


Confusion Matrix:
[[ 150    3    0    0    1    3    0    0    0]
 [   2  290    1    0    0    0    0    0    0]
 [   0    0   16    0    0    0    0    0    0]
 [   2    0    0 1175    3    1    2    0    2]
 [   0  

feat:add initial CatBoost model (baseline, still overfitting)