In [None]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score, roc_auc_score, recall_score
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score, recall_score, precision_score
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from imblearn.over_sampling import SMOTE


warnings.simplefilter(action='ignore', category=FutureWarning)
df=pd.read_csv('C:\\Users\\PC\\Desktop\\train_set.csv')
df_test=pd.read_csv('C:\\Users\\PC\\Desktop\\test_set.csv')

In [3]:
df['auto_payment'].fillna(0, inplace=True)
df['call_drops'].fillna(0, inplace=True)
#Sıkıntılı olursa geri dön
tenure_median = df['tenure'].median()
df['tenure'] = df['tenure'].fillna(tenure_median)
data_usage_median = df['data_usage'].median()
df['data_usage'] = df['data_usage'].fillna(data_usage_median)
monthly_charge_median = df['monthly_charge'].median()
df['monthly_charge'] = df['monthly_charge'].fillna(monthly_charge_median)
avg_values = df.groupby("service_type")["avg_call_duration"].transform("mean")
df["avg_call_duration"] = df["avg_call_duration"].fillna(avg_values)
df["avg_call_duration"] = df["avg_call_duration"].fillna(0)
roaming_usage_median = df['roaming_usage'].median()
df['roaming_usage'] = df['roaming_usage'].fillna(roaming_usage_median)

In [4]:
df = pd.get_dummies(df, columns=['service_type'], drop_first=False)
df.drop(['id', 'apps'], axis=1, inplace=True)

In [5]:
# Log dönüşümü uygulama
for col in ['avg_top_up_count', 'monthly_charge', 'tenure', 'age']:
    df[col] = np.log1p(df[col])  # log(1 + x) dönüşümünü uygula

X = df.drop(columns=["churn"])
y = df["churn"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

model = XGBClassifier(random_state=42, scale_pos_weight=1, learning_rate=0.05, max_depth=5, n_estimators=100)
cv_scores = cross_val_score(model, X_train_resampled, y_train_resampled, cv=3, scoring='roc_auc')

print(f"Average ROC AUC score from 3-fold cross-validation: {cv_scores.mean()}")

model.fit(X_train_resampled, y_train_resampled)
y_pred = model.predict(X_test)

Average ROC AUC score from 3-fold cross-validation: 0.9365698010131357


In [6]:
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
recall = recall_score(y_test, y_pred)

tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(f"True Positives (TP): {tp}")
print(f"True Negatives (TN): {tn}")
print(f"False Positives (FP): {fp}")
print(f"False Negatives (FN): {fn}")

print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print(f"Recall Score: {recall:.4f}")

True Positives (TP): 8240
True Negatives (TN): 1346820
False Positives (FP): 231811
False Negatives (FN): 13129
F1 Score: 0.0630
ROC AUC Score: 0.7337
Recall Score: 0.3856


In [7]:
df_test['auto_payment'].fillna(0, inplace=True)
df_test['call_drops'].fillna(0, inplace=True)
df_test['tenure'] = df_test['tenure'].fillna(tenure_median)
df_test['data_usage'] = df_test['data_usage'].fillna(data_usage_median)
df_test['monthly_charge'] = df_test['monthly_charge'].fillna(monthly_charge_median)
df_test['avg_call_duration'] = df_test.groupby("service_type")["avg_call_duration"].transform("mean")
df_test["avg_call_duration"] = df_test["avg_call_duration"].fillna(0)
df_test['roaming_usage'] = df_test['roaming_usage'].fillna(roaming_usage_median)

In [8]:
df_test = pd.get_dummies(df_test, columns=['service_type'], drop_first=False)
df_test.drop(['id', 'apps'], axis=1, inplace=True)

In [None]:
# Test setine log dönüşümü uygula
for col in ['avg_top_up_count', 'monthly_charge', 'tenure', 'age']:
    df_test[col] = np.log1p(df_test[col])

X_test_set = df_test.drop(columns=["churn"])
y_test_set = df_test["churn"]

y_pred_test = model.predict(X_test_set)
y_pred_proba_test = model.predict_proba(X_test_set)[:, 1]

conf_matrix_test = confusion_matrix(y_test_set, y_pred_test)

TP_test = conf_matrix_test[1, 1]
TN_test = conf_matrix_test[0, 0]
FP_test = conf_matrix_test[0, 1]
FN_test = conf_matrix_test[1, 0]

recall_test = recall_score(y_test_set, y_pred_test)
f1_test = f1_score(y_test_set, y_pred_test)
roc_auc_test = roc_auc_score(y_test_set, y_pred_proba_test)

print("==== TEST SET METRİKLERİ ====")
print(f"True Positives (TP): {TP_test}")
print(f"True Negatives (TN): {TN_test}")
print(f"False Positives (FP): {FP_test}")
print(f"False Negatives (FN): {FN_test}")
print(f"Recall: {recall_test:.4f}")
print(f"F1 Score: {f1_test:.4f}")
print(f"ROC AUC Score: {roc_auc_test:.4f}")


==== TEST SET METRİKLERİ ====
True Positives (TP): 10402
True Negatives (TN): 1683272
False Positives (FP): 289646
False Negatives (FN): 16680
Recall: 0.3841
F1 Score: 0.0636
ROC AUC Score: 0.7348
