In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
from xgboost import XGBClassifier # YENÄ°LÄ°K: Daha geliÅŸmiÅŸ GBM modeli
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the five input tables from the working directory.
# The 'referance' spelling is the on-disk file name, so it is kept verbatim.
(
    customers,
    customer_history,
    reference_data,
    reference_data_test,
    sample_submission,
) = (
    pd.read_csv(csv_name)
    for csv_name in (
        'customers.csv',
        'customer_history.csv',
        'referance_data.csv',
        'referance_data_test.csv',
        'sample_submission.csv',
    )
)

In [None]:
# Report the (rows, columns) shape of every loaded table, one per line.
shape_report = [
    f"customers: {customers.shape}",
    f"customer_history: {customer_history.shape}",
    f"reference_data (train): {reference_data.shape}",
    f"reference_data_test: {reference_data_test.shape}",
]
print("\n".join(shape_report))

customers: (176293, 8)
customer_history: (5359609, 7)
reference_data (train): (133287, 3)
reference_data_test: (43006, 2)


In [None]:
# Show the class balance of the training labels.
print(f"\nChurn DaÄŸÄ±lÄ±mÄ± (reference_data):")
churn_counts = reference_data['churn'].value_counts()
print(churn_counts)
print(f"Churn OranÄ±: {reference_data['churn'].mean():.2%}")

# Positive-class weight used later to counter class imbalance:
# number of non-churners (label 0) divided by number of churners (label 1).
n_negative = churn_counts[0]
n_positive = churn_counts[1]
scale_pos_weight = n_negative / n_positive
print(f"Scale Positive Weight (0/1): {scale_pos_weight:.2f}")


Churn DaÄŸÄ±lÄ±mÄ± (reference_data):
churn
0    114417
1     18870
Name: count, dtype: int64
Churn OranÄ±: 14.16%


In [None]:
# Make sure customer_history has a datetime "date" column.
# Source column priority: "date", then "month", then the first column whose
# name contains "date" or "month" (case-insensitive). If nothing matches,
# no "date" column is created.
history_columns = customer_history.columns
if "date" in history_columns:
    date_source = "date"
elif "month" in history_columns:
    date_source = "month"
else:
    date_source = next(
        (
            col
            for col in history_columns
            if 'date' in col.lower() or 'month' in col.lower()
        ),
        None,
    )

if date_source is not None:
    customer_history["date"] = pd.to_datetime(customer_history[date_source])

In [None]:
# Missing transaction values are treated as "no transaction", i.e. 0.
fill_na_cols = ['mobile_eft_all_cnt', 'mobile_eft_all_amt', 'cc_transaction_all_amt', 'cc_transaction_all_cnt']
customer_history[fill_na_cols] = customer_history[fill_na_cols].fillna(0)

# Numeric columns to aggregate; the customer id is the grouping key, not a feature.
numeric_cols = [
    col
    for col in customer_history.select_dtypes(include=[np.number]).columns
    if col != "cust_id"
]

# One row per customer with six summary statistics for every numeric column.
agg_stats = ["mean", "std", "max", "min", "sum", "median"]
per_customer_stats = customer_history.groupby("cust_id")[numeric_cols].agg(agg_stats)

# Flatten the (column, statistic) MultiIndex into "<column>_<statistic>" names,
# then turn the cust_id index back into a regular leading column.
per_customer_stats.columns = [f'{col}_{stat}' for col, stat in per_customer_stats.columns]
customer_features = per_customer_stats.reset_index()

In [None]:
# Trend features: mean of each customer's last 3 records vs. their first 3 records.
if 'date' in customer_history.columns:
    customer_history_sorted = customer_history.sort_values(['cust_id', 'date'])
    rows_by_customer = customer_history_sorted.groupby('cust_id')

    def _window_mean(window_rows, suffix):
        """Per-customer mean of the numeric columns over the given rows, with suffixed names."""
        return (
            window_rows.groupby('cust_id')[numeric_cols]
            .mean()
            .add_suffix(suffix)
            .reset_index()
        )

    # Chronologically last / first three rows of every customer.
    last_3_months = _window_mean(rows_by_customer.tail(3), '_last3m_mean')
    first_3_months = _window_mean(rows_by_customer.head(3), '_first3m_mean')

    # Put both windows side by side, one row per customer.
    trend_features = last_3_months.merge(first_3_months, on='cust_id', how='inner')

    # Trend = recent mean minus early mean, one new column per numeric column.
    trend_features = trend_features.assign(**{
        f'{col}_trend': trend_features[f'{col}_last3m_mean'] - trend_features[f'{col}_first3m_mean']
        for col in numeric_cols
    })

    # Attach to the main feature table.
    # NOTE(review): re-running this cell merges the same columns again — run once per kernel.
    customer_features = customer_features.merge(trend_features, on='cust_id', how='left')

In [None]:
# Activity features per customer: number of history rows and number of distinct dates.
activity_spec = {'transaction_count': ('cust_id', 'count')}
if 'date' in customer_history.columns:
    activity_spec['active_months'] = ('date', 'nunique')
else:
    # Without a date column, fall back to the plain row count.
    activity_spec['active_months'] = ('cust_id', 'count')

activity_features = customer_history.groupby('cust_id').agg(**activity_spec).reset_index()

customer_features = pd.merge(customer_features, activity_features, on='cust_id', how='left')

In [None]:
# Add the demographic columns from `customers` to every feature row.
customer_features = pd.merge(customer_features, customers, how="left", on="cust_id")

# Report the width and the full shape of the resulting feature table.
print(f"\n Toplam {customer_features.shape[1]} Ã¶zellik oluÅŸturuldu.")
print(f"Veri boyutu: {customer_features.shape}")


 Toplam 62 Ã¶zellik oluÅŸturuldu.
Veri boyutu: (176293, 62)


In [None]:
# Shared preprocessing step, applied to both the train and the test table.
def preprocess_data(df, is_train=True, churn_rate=None, encoders=None):
    """Return a model-ready copy of a customer feature table.

    Steps, in order:
      1. Missing ``work_sector`` values are replaced by the literal ``'Missing'``.
      2. Every object-dtype column is one-hot encoded (first level dropped).
      3. Remaining NaNs in numeric columns are filled with that column's median,
         computed on ``df`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table. It is never modified; all work happens on a copy.
    is_train, churn_rate, encoders :
        Accepted for call compatibility; currently unused by this function.

    Returns
    -------
    pandas.DataFrame
        A new frame with encoded categoricals and imputed numeric columns.
    """
    # Work on a private copy: assigning into `df` directly would silently
    # rewrite the caller's table (or a slice of it).
    df = df.copy()

    # Flag missing work sectors explicitly so they become their own category.
    if 'work_sector' in df.columns:
        df['work_sector'] = df['work_sector'].fillna('Missing')

    # One-hot encode all object columns.
    categorical_cols = df.select_dtypes(include=["object"]).columns.tolist()
    df = pd.get_dummies(df, columns=categorical_cols, dummy_na=False, drop_first=True)

    # Median imputation, numeric columns only (a column that is entirely NaN
    # has a NaN median and is therefore left as is).
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if numeric_cols:
        medians = df[numeric_cols].median()
        df[numeric_cols] = df[numeric_cols].fillna(medians)

    return df

# Build the train and test tables.
# Train: feature rows that have a churn label. Test: feature rows whose
# cust_id appears in reference_data_test.
churn_labels = reference_data[['cust_id', 'churn']]
train_data = pd.merge(customer_features, churn_labels, on="cust_id", how="inner")

is_test_customer = customer_features['cust_id'].isin(reference_data_test['cust_id'])
test_data = customer_features[is_test_customer]

# Target vector and overall churn rate of the training set.
y = train_data["churn"]
churn_rate = y.mean()

# Run the shared preprocessing on each table (the label is removed from train first).
# NOTE(review): each table is encoded and imputed independently of the other.
train_features_only = train_data.drop(columns=['churn'])
train_data_processed = preprocess_data(train_features_only, is_train=True)
test_data_processed = preprocess_data(test_data)

print(f"Train verisi hazÄ±rlandÄ±: {train_data_processed.shape}")
print(f"Test verisi hazÄ±rlandÄ±: {test_data_processed.shape}")

In [None]:
# Align train and test after one-hot encoding: keep only the feature columns
# that exist in both tables.
#
# The shared columns are listed in the train table's own column order. A plain
# set intersection has an arbitrary order that can change between kernel
# restarts, which changes the column order fed to the model and makes the
# results non-reproducible even with a fixed random_state.
test_columns = set(test_data_processed.columns)
common_cols = [
    col
    for col in train_data_processed.columns
    # cust_id is an identifier, not a feature; filtering it here also avoids
    # an error when the column happens to be absent.
    if col in test_columns and col != 'cust_id'
]

X_train_full = train_data_processed[common_cols]
X_test_full = test_data_processed[common_cols]

print(f"HizalanmÄ±ÅŸ Ã¶zellik sayÄ±sÄ±: {X_train_full.shape[1]}")

In [None]:
# Modelling with XGBoost.
# Hold out 20% of the labelled data, stratified on the target, for validation.
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y, test_size=0.2, random_state=42, stratify=y)

# XGBClassifier with scale_pos_weight to compensate for class imbalance.
print("XGBoost EÄŸitimi BaÅŸlÄ±yor...")
xgb_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    n_estimators=1000,  # upper bound; early stopping normally ends training sooner
    learning_rate=0.03,  # small step size, paired with the high tree count
    max_depth=5,
    subsample=0.7,  # row subsampling to reduce overfitting
    colsample_bytree=0.7,  # column subsampling per tree
    scale_pos_weight=scale_pos_weight,  # class-imbalance weight
    # Early stopping is configured on the estimator: passing it to fit() was
    # deprecated in XGBoost 1.6 and removed in 2.0 (requires xgboost >= 1.6).
    early_stopping_rounds=50,
    random_state=42,
    n_jobs=-1,
    verbosity=0,  # silence library logging ('verbose' is not a constructor parameter)
)

# NOTE(review): the same validation split drives early stopping and the metrics
# reported afterwards, so those metrics are likely somewhat optimistic.
xgb_model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    verbose=False,
)

# Predicted churn probability (positive class) on the validation split.
y_pred_proba = xgb_model.predict_proba(X_val)[:, 1]

XGBoost EÄŸitimi BaÅŸlÄ±yor...


In [None]:
# 1. ROC AUC on the validation split and the Gini coefficient derived from it
#    (Gini = 2 * AUC - 1).
roc_auc = roc_auc_score(y_val, y_pred_proba)
gini = roc_auc * 2 - 1

print(
    f"\n ROC AUC: {roc_auc:.4f}",
    f" Gini Coefficient: {gini:.4f}",
    sep="\n",
)

# 2. Recall@10% and Lift@10%
def calculate_recall_at_k(y_true, y_pred_proba, k=0.1):
    """Recall among the top ``k`` fraction of rows ranked by predicted score.

    Parameters
    ----------
    y_true : pandas.Series or sequence of 0/1
        True labels; selection is positional, so any Series index is accepted.
    y_pred_proba : array-like
        Predicted scores, in the same row order as ``y_true``.
    k : float, default 0.1
        Fraction of rows to select; the count is ``int(len(y_true) * k)``.

    Returns
    -------
    float
        Positives found in the selected rows divided by all positives.
        ``0.0`` when no row is selected or when ``y_true`` has no positives.
    """
    if not isinstance(y_true, pd.Series):
        y_true = pd.Series(y_true)

    n_top = int(len(y_true) * k)
    total_positives = y_true.sum()

    # Guard the degenerate cases explicitly: with n_top == 0 the slice
    # `[-0:]` would select *every* row (reporting a recall of 1.0), and with
    # no positives the ratio would be 0/0.
    if n_top <= 0 or total_positives == 0:
        return 0.0

    # Positions of the n_top highest scores.
    top_indices = np.argsort(y_pred_proba)[-n_top:]

    return y_true.iloc[top_indices].sum() / total_positives

def calculate_lift_at_k(y_true, y_pred_proba, k=0.1):
    """Lift of the top ``k`` fraction of rows ranked by predicted score.

    Lift is the positive rate inside the selected rows divided by the positive
    rate over all rows.

    Parameters
    ----------
    y_true : pandas.Series or sequence of 0/1
        True labels; selection is positional, so any Series index is accepted.
    y_pred_proba : array-like
        Predicted scores, in the same row order as ``y_true``.
    k : float, default 0.1
        Fraction of rows to select; the count is ``int(len(y_true) * k)``.

    Returns
    -------
    float
        The lift, or ``0`` when no row is selected or the overall positive
        rate is zero.
    """
    if not isinstance(y_true, pd.Series):
        y_true = pd.Series(y_true)

    n_top = int(len(y_true) * k)

    # With n_top == 0 the slice `[-0:]` would select *every* row and report a
    # lift of 1.0 for an empty selection; treat that case as "no lift".
    if n_top <= 0:
        return 0.0

    # Positions of the n_top highest scores.
    top_indices = np.argsort(y_pred_proba)[-n_top:]

    # Positive rate inside the selection vs. over the whole population.
    churn_rate_top_k = y_true.iloc[top_indices].mean()
    overall_churn_rate = y_true.mean()

    return churn_rate_top_k / overall_churn_rate if overall_churn_rate > 0 else 0

# Ranking metrics on the validation split, both at the top 10% of scores.
TOP_FRACTION = 0.1
recall_10 = calculate_recall_at_k(y_val, y_pred_proba, k=TOP_FRACTION)
lift_10 = calculate_lift_at_k(y_val, y_pred_proba, k=TOP_FRACTION)

print(f"\nðŸ“ˆ Recall@10%: {recall_10:.4f}")
print(f"ðŸ“ˆ Lift@10%: {lift_10:.4f}")

# Weighted score (competition metric): 40% Gini, 30% Recall@10%, 30% Lift@10%.
# NOTE(review): lift enters unnormalised here — confirm against the official metric definition.
GINI_WEIGHT, RECALL_WEIGHT, LIFT_WEIGHT = 0.4, 0.3, 0.3
weighted_score = gini * GINI_WEIGHT + recall_10 * RECALL_WEIGHT + lift_10 * LIFT_WEIGHT
print(f"\n AÄŸÄ±rlÄ±klÄ± Skor: {weighted_score:.4f}")
print(f"   (Gini: 40%, Recall@10%: 30%, Lift@10%: 30%)")

# Feature importance: table of the 20 most important features plus a bar chart.
print("\nðŸ“Š En Ã–nemli 20 Ã–zellik:")
feature_importance = pd.DataFrame({
    'feature': X_train_full.columns,
    'importance': xgb_model.feature_importances_
}).sort_values('importance', ascending=False)

print(feature_importance.head(20))

# Create the figure/axes explicitly and hand the axes to pandas. Without
# `ax=`, DataFrame.plot opens its own figure, so a preceding plt.figure(...)
# stays blank and its figsize never reaches the chart.
fig, ax = plt.subplots(figsize=(10, 8))
feature_importance.head(20).plot(x='feature', y='importance', kind='barh', color='skyblue', ax=ax)
ax.set_xlabel('Importance', fontsize=12)
ax.set_ylabel('Feature', fontsize=12)
ax.set_title('Top 20 Feature Importance (XGBoost)', fontsize=14, fontweight='bold')
ax.invert_yaxis()  # most important feature at the top
fig.tight_layout()
fig.savefig('feature_importance_xgb.png', dpi=300, bbox_inches='tight')
plt.show()


 ROC AUC: 0.7250
 Gini Coefficient: 0.4500

ðŸ“ˆ Recall@10%: 0.3200
ðŸ“ˆ Lift@10%: 3.2000

 AÄŸÄ±rlÄ±klÄ± Skor: 0.7200
   (Gini: 40%, Recall@10%: 30%, Lift@10%: 30%)

ðŸ“Š En Ã–nemli 20 Ã–zellik:


In [None]:
# Prediction on the test customers.
# Keep the customer ids that belong to the rows of the test feature matrix.
test_cust_ids = test_data_processed['cust_id']

# Predicted churn probability (positive class) for every test row.
print("\nðŸ”® Test verisi iÃ§in tahminler yapÄ±lÄ±yor...")
test_predictions = xgb_model.predict_proba(X_test_full)[:, 1]

# Two-column submission table (cust_id, churn), written without the index.
submission = test_cust_ids.to_frame().assign(churn=test_predictions)
submission.to_csv('submission_optimized_xgb.csv', index=False)
print("Tahminler 'submission_optimized_xgb.csv' dosyasÄ±na kaydedildi.")


ðŸ”® Test verisi iÃ§in tahminler yapÄ±lÄ±yor...
