# Import necessary libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Data Processing

In [5]:


# Load the raw customer churn table from the Excel workbook.
data = pd.read_excel("customer_churn_large_dataset.xlsx")

# Step 1: Data Preprocessing

# 1.1 Handling Missing Data (if any)
# Drop every row that contains at least one missing value.
data.dropna(inplace=True)

# 1.2 Encode Categorical Variables
# Fit one LabelEncoder per column and keep each of them, so the fitted
# category -> integer mapping of every column stays available afterwards
# (refitting a single shared encoder would discard the 'Location' mapping).
label_encoders = {}
for categorical_column in ('Location', 'Gender'):
    column_encoder = LabelEncoder()
    data[categorical_column] = column_encoder.fit_transform(data[categorical_column])
    label_encoders[categorical_column] = column_encoder

# Backward-compatible alias: as before, `label_encoder` ends up being the
# encoder that was fitted on 'Gender'.
label_encoder = label_encoders['Gender']

# 1.3 Split the data into features (X) and the target variable (y)
# 'CustomerID' and 'Name' are dropped together with the target column.
X = data.drop(columns=['CustomerID', 'Name', 'Churn'])
y = data['Churn']

# 1.4 Feature Scaling
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# made later in this file, so held-out rows influence the scaling statistics.
numeric_columns = ['Age', 'Subscription_Length_Months', 'Monthly_Bill', 'Total_Usage_GB']
scaler = StandardScaler()
X[numeric_columns] = scaler.fit_transform(X[numeric_columns])



# Step 2: Feature Selection/Engineering

In [6]:
# Step 2: Feature Selection/Engineering

# 2.1 Identify Relevant Features
# Rank the features by the importances of a Random Forest fitted on X and y.
from sklearn.ensemble import RandomForestClassifier

# Number of top-ranked features to keep (adjust as needed).
N = 5

# Fit a seeded Random Forest purely to read its feature importances.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X, y)

# Label each importance with its column name, most important first.
feature_importances = pd.Series(
    rf_classifier.feature_importances_,
    index=X.columns,
).sort_values(ascending=False)

# Keep only the N highest-ranked columns.
selected_features = feature_importances.index[:N]
X_selected = X.loc[:, selected_features]

# Model Selection

In [None]:
# Step 3: Hold out 20% of the rows as a test set (seeded for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 4: Build one untrained, identically seeded estimator per display name
models = {
    display_name: estimator_class(random_state=42)
    for display_name, estimator_class in (
        ("Logistic Regression", LogisticRegression),
        ("Random Forest", RandomForestClassifier),
        ("XGBoost", XGBClassifier),
        ("Support Vector Machine", SVC),
    )
}

# Model Training and Evaluation

In [8]:
# Step 5: Create dictionaries to store the evaluation metrics for each model
# (each one maps a model's display name to its score on the test set)
accuracy_scores = {}
precision_scores = {}
recall_scores = {}
f1_scores = {}
roc_auc_scores = {}


def _positive_class_scores(fitted_model, features):
    """Return continuous positive-class scores suitable for ROC AUC.

    Prefers ``predict_proba`` and falls back to ``decision_function`` for
    estimators that expose no probabilities. Hard class predictions are used
    only as a last resort, because they reduce the ROC curve to one point.
    """
    if hasattr(fitted_model, "predict_proba"):
        # Assumes a binary target whose positive class is the second entry of
        # the estimator's classes -- TODO confirm against the 'Churn' labels.
        return fitted_model.predict_proba(features)[:, 1]
    if hasattr(fitted_model, "decision_function"):
        return fitted_model.decision_function(features)
    return fitted_model.predict(features)


# Step 6: Train and evaluate each model
for model_name, model in models.items():
    # Fit the model on the training data
    model.fit(X_train, y_train)

    # Hard class predictions drive the threshold-based metrics
    y_pred = model.predict(X_test)

    # Continuous scores drive ROC AUC, which ranks samples across thresholds
    y_score = _positive_class_scores(model, X_test)

    # Calculate evaluation metrics
    accuracy_scores[model_name] = accuracy_score(y_test, y_pred)
    precision_scores[model_name] = precision_score(y_test, y_pred)
    recall_scores[model_name] = recall_score(y_test, y_pred)
    f1_scores[model_name] = f1_score(y_test, y_pred)
    roc_auc_scores[model_name] = roc_auc_score(y_test, y_score)

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [None]:
# Step 7: Report the stored test-set metrics, one section per model
print("Model Evaluation Metrics:")
for model_name in models:
    # Gather the section's lines first, then emit them one per line
    report_lines = [
        f"Model: {model_name}",
        f"Accuracy: {accuracy_scores[model_name]:.4f}",
        f"Precision: {precision_scores[model_name]:.4f}",
        f"Recall: {recall_scores[model_name]:.4f}",
        f"F1-Score: {f1_scores[model_name]:.4f}",
        f"ROC AUC: {roc_auc_scores[model_name]:.4f}",
    ]
    print("\n".join(report_lines))
    # Blank separator between models (a newline plus print's own newline)
    print("\n")

In [4]:
import pickle

# Persist every entry of `models` to its own pickle file, named after the model
for model_name, model in models.items():
    pickle_path = f'{model_name}_model.pkl'
    with open(pickle_path, 'wb') as model_file:
        pickle.dump(model, model_file)
