In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

# Load dataset
# NOTE: absolute, machine-specific path; adjust it when running elsewhere.
customer_data = pd.read_csv('C:/users/admin/Documents/ADS_Assignments/Telco-Customer-Churn.csv')

# Preprocessing
# Coerce TotalCharges to a numeric dtype; entries that cannot be parsed become NaN.
customer_data['TotalCharges'] = pd.to_numeric(customer_data['TotalCharges'], errors='coerce')
# Replace those NaNs with the column mean. The result is assigned back to the
# frame rather than using `inplace=True` on the selected column: that chained
# form operates on a temporary under pandas Copy-on-Write and would leave the
# NaNs in `customer_data`.
customer_data['TotalCharges'] = customer_data['TotalCharges'].fillna(customer_data['TotalCharges'].mean())

# Integer-encode every remaining object-dtype column except the row identifier.
# A fresh LabelEncoder is fitted per column, so the fitted encoders are not kept.
# NOTE(review): this presumably also encodes the 'Churn' target - confirm its dtype.
for categorical_column in customer_data.select_dtypes(include=['object']).columns:
    if categorical_column != 'customerID':
        customer_data[categorical_column] = LabelEncoder().fit_transform(customer_data[categorical_column])

# Splitting dataset
# The target is the 'Churn' column; predictors are everything else except the
# 'customerID' column.
y_target = customer_data['Churn']
X_features = customer_data.drop(columns=['customerID', 'Churn'])

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
(X_train_data, X_test_data,
 y_train_target, y_test_target) = train_test_split(
    X_features,
    y_target,
    test_size=0.3,
    random_state=42,
)

# Scaling features
# The scaler is fitted on the training rows only, then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train_data)
X_train_data_scaled = scaler.transform(X_train_data)
X_test_data_scaled = scaler.transform(X_test_data)

# Training and evaluating models
# Baseline 1: logistic regression with the library's default settings.
logistic_regression_model = LogisticRegression()
logistic_regression_model.fit(X_train_data_scaled, y_train_target)
logreg_predictions = logistic_regression_model.predict(X_test_data_scaled)

# Baseline 2: random forest with default hyperparameters. The forest is seeded
# with the same value as the train/test split so the reported metrics are
# reproducible from run to run.
random_forest_model = RandomForestClassifier(random_state=42)
random_forest_model.fit(X_train_data_scaled, y_train_target)
rf_predictions = random_forest_model.predict(X_test_data_scaled)

# Metrics
# All scores are computed on the held-out test split.
logreg_accuracy = accuracy_score(y_test_target, logreg_predictions)
logreg_precision = precision_score(y_test_target, logreg_predictions)
logreg_recall = recall_score(y_test_target, logreg_predictions)
logreg_f1 = f1_score(y_test_target, logreg_predictions)

rf_accuracy = accuracy_score(y_test_target, rf_predictions)
rf_precision = precision_score(y_test_target, rf_predictions)
rf_recall = recall_score(y_test_target, rf_predictions)
rf_f1 = f1_score(y_test_target, rf_predictions)

print(f"Logistic Regression - Accuracy: {logreg_accuracy}, Precision: {logreg_precision}, Recall: {logreg_recall}, F1: {logreg_f1}")
print(f"Random Forest - Accuracy: {rf_accuracy}, Precision: {rf_precision}, Recall: {rf_recall}, F1: {rf_f1}")

# Hyperparameter tuning
# Exhaustive search over 3 x 3 forest settings with 5-fold cross-validation on
# the training split. Every forest in this section is seeded so that the chosen
# parameters, the selected features and the final metrics are reproducible.
param_grid = {'n_estimators': [10, 50, 100], 'max_depth': [None, 10, 20]}
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5)
grid_search.fit(X_train_data_scaled, y_train_target)
best_params = grid_search.best_params_

# Feature selection
# Recursive feature elimination keeps 10 features, ranked by a forest built
# with the tuned parameters. The selector is fitted on the training split only
# and then applied unchanged to the test split.
feature_selector = RFE(RandomForestClassifier(random_state=42, **best_params), n_features_to_select=10)
X_train_rfe = feature_selector.fit_transform(X_train_data_scaled, y_train_target)
X_test_rfe = feature_selector.transform(X_test_data_scaled)

# Final model: tuned parameters, trained on the selected features only.
# Note that the parameters were tuned on the full feature set, not on this subset.
rf_optimized_model = RandomForestClassifier(random_state=42, **best_params)
rf_optimized_model.fit(X_train_rfe, y_train_target)
rf_optimized_predictions = rf_optimized_model.predict(X_test_rfe)

# Evaluation of the optimized model
rf_optimized_accuracy = accuracy_score(y_test_target, rf_optimized_predictions)
rf_optimized_precision = precision_score(y_test_target, rf_optimized_predictions)
rf_optimized_recall = recall_score(y_test_target, rf_optimized_predictions)
rf_optimized_f1 = f1_score(y_test_target, rf_optimized_predictions)

print(f"Optimized Random Forest - Accuracy: {rf_optimized_accuracy}, Precision: {rf_optimized_precision}, Recall: {rf_optimized_recall}, F1: {rf_optimized_f1}")

# Identifying important features
# `feature_selector.support_` is the boolean mask of the columns RFE kept, so
# indexing the original column names with it labels each importance value of
# the model that was trained on those selected columns.
important_features = pd.Series(rf_optimized_model.feature_importances_, index=X_features.columns[feature_selector.support_])
important_features = important_features.sort_values(ascending=False)

print("\nImportant Features:")
print(important_features)

# Conclusion
# Pick the winner from the measured test-set F1 scores instead of asserting it
# up front; on ties the first entry in the mapping wins.
model_f1_scores = {
    "Logistic Regression": logreg_f1,
    "Random Forest": rf_f1,
    "Optimized Random Forest": rf_optimized_f1,
}
best_model_name = max(model_f1_scores, key=model_f1_scores.get)
print(f"\nBased on the evaluation metrics (highest F1), the {best_model_name} model is the best-performing model.")
# The importances below always come from the optimized random forest,
# whichever model scored best above.
print("The top features contributing to customer churn prediction are:")
print(important_features.head(5))

Logistic Regression - Accuracy: 0.8106956933270232, Precision: 0.68125, Recall: 0.5696864111498258, F1: 0.6204933586337761
Random Forest - Accuracy: 0.7931850449597728, Precision: 0.6634844868735084, Recall: 0.4843205574912892, F1: 0.5599194360523666
