# Step 1: Import Libraries and Modules

In [None]:
import pandas as pd
from data_loader import DataLoader
from features import FeatureEngineer
from train import Trainer
from evaluate import plot_confusion_matrix

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns


# Step 2: Load Raw Data

In [None]:

# Read the raw Telco customer-churn CSV via the project's DataLoader and pass
# the loaded frame straight through its basic cleaning step.
data_path = '../data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv'
loader = DataLoader(data_path)
df = loader.basic_cleaning(loader.load_data())

# Sanity check: confirmation message, frame dimensions, then the first rows.
for summary in ("Dataset loaded and cleaned", df.shape, df.head()):
    print(summary)


# Step 3: Feature Engineering

In [None]:
from sklearn.preprocessing import LabelEncoder

target_col = 'Churn'

# Identify categorical and numeric feature columns. The target is filtered out
# of BOTH lists, so it is never passed to the feature encoder or the scaler
# regardless of the dtype it arrives with (object or already numeric).
categorical_cols = [
    col for col in df.select_dtypes(include='object').columns
    if col != target_col
]
numeric_cols = [
    col for col in df.select_dtypes(include='number').columns
    if col != target_col
]

# Encode the target as integer class codes. LabelEncoder numbers the classes
# in sorted order, so for Yes/No labels this yields No -> 0, Yes -> 1.
le_target = LabelEncoder()
df[target_col] = le_target.fit_transform(df[target_col])

# NOTE(review): both transforms below run on the full dataframe, before any
# train/test split. If scale_features fits a scaler on the data it is given,
# held-out rows influence the scaling statistics -- confirm this is acceptable.
fe = FeatureEngineer()
df = fe.encode_categoricals(df, categorical_cols)
df = fe.scale_features(df, numeric_cols)

print("Feature engineering completed")
print(df.head())


# Step 4: Train Multiple Models


In [None]:
# Hand the engineered frame and the label column name to the project's Trainer.
# train_models() is consumed below as a mapping of model name -> metrics, where
# each metrics entry is indexed by 'accuracy', 'f1_score' and 'roc_auc'.
target_col = "Churn"  # same label column used during feature engineering
trainer = Trainer(df, target_col)
results = trainer.train_models()

# One summary line per trained model, metrics rounded to four decimals.
print("\nModel Training Results:")
for model_name, metrics in results.items():
    summary_line = f"{model_name}: Accuracy={metrics['accuracy']:.4f}, F1={metrics['f1_score']:.4f}, ROC-AUC={metrics['roc_auc']:.4f}"
    print(summary_line)


# Step 5: Evaluate Best Model (Random Forest)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Separate the label column from the predictors.
y = df[target_col]
X = df.drop(target_col, axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
# NOTE(review): the split is not stratified on y, and it is made independently
# of whatever split Trainer.train_models() used -- confirm the two agree.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree Random Forest on the training rows, then predict the held-out
# rows. fit() returns the estimator itself, so best_model is the fitted model.
best_model = RandomForestClassifier(random_state=42, n_estimators=100)
y_pred = best_model.fit(X_train, y_train).predict(X_test)


# Plot Confusion Matrix

In [None]:

# Compare the held-out labels with the Random Forest predictions using the
# project's confusion-matrix plotting helper.
plot_confusion_matrix(
    y_test,
    y_pred,
    title="Random Forest Confusion Matrix",
)


# Plot Feature Importance

In [None]:
# Pair each predictor column with the importance score the fitted forest
# assigned to it.
importances = best_model.feature_importances_
feature_names = X.columns

# Build the table first, then order it from most to least important so the
# bar chart reads top-down.
importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
importance_df = importance_df.sort_values('importance', ascending=False)

# Horizontal bars: one row per feature, bar length = importance.
plt.figure(figsize=(10, 6))
sns.barplot(data=importance_df, x='importance', y='feature')
plt.title("Random Forest Feature Importance")
plt.show()
