In [25]:
from pathlib import Path

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.svm import SVC
from xgboost import XGBClassifier

import sys
from pathlib import Path

# Make the project-local `src` package importable from this notebook.
# `project_root` is the parent of the kernel's working directory.
# NOTE(review): this presumes the notebook is launched from a direct
# subdirectory of the repository (e.g. notebooks/) — confirm.
project_root = Path.cwd().parent

# Only register the path once: re-running this cell must not keep stacking
# duplicate entries onto sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from src.features.feature_engineering import feature_engineering
from src.utils.split_data import split_data
from src.utils.model_helpers import run_grid_search, save_model

In [26]:
# Load data
# Resolve the raw CSV relative to the project root rather than a user-specific
# absolute path, so the notebook runs on any checkout of the repository.
# NOTE(review): assumes `project_root` (defined in the import cell) is the
# repository root that contains data/raw/ — confirm.
raw_data_path = project_root / "data" / "raw" / "WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(raw_data_path)

In [27]:
# Feature engineering: turn the raw frame into model inputs `X` and target `y`.
# NOTE(review): the actual transformations live in
# src.features.feature_engineering and are not visible from this notebook.
X, y = feature_engineering(df)

# Train/test split: the test partition is only used further down to score the
# tuned estimators.
# NOTE(review): split ratio, stratification and seeding are decided inside
# src.utils.split_data — confirm they are fixed so results are reproducible.
X_train, X_test, y_train, y_test = split_data(X, y)

In [28]:
# Random forest search space: 4 * 3 * 3 * 2 = 72 hyper-parameter combinations.
rf_parameters = dict(
    n_estimators=[50, 100, 150, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    max_features=['sqrt', 'log2'],
)

# XGBoost search space: 3 * 3 * 3 = 27 hyper-parameter combinations.
xgb_parameters = dict(
    max_depth=[3, 5, 7],
    learning_rate=[0.1, 0.01, 0.001],
    subsample=[0.5, 0.7, 1],
)

# SVM search space, split per kernel so that only the hyper-parameters a kernel
# actually uses are varied:
#   - linear: C only                     ->  3 candidates
#   - rbf:    C and gamma                ->  9 candidates
#   - poly:   C, gamma and degree        -> 27 candidates
# A single flat grid would cross `gamma`/`degree` with kernels that ignore them
# (81 candidates, 42 of them duplicate fits); this list covers the same distinct
# models with 39.
# NOTE(review): relies on run_grid_search forwarding this object unchanged to
# scikit-learn's GridSearchCV, which accepts a list of grids — confirm.
svm_parameters = [
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10],
    },
    {
        'kernel': ['rbf'],
        'gamma': [0.1, 1, 10],
        'C': [0.1, 1, 10],
    },
    {
        'kernel': ['poly'],
        'gamma': [0.1, 1, 10],
        'C': [0.1, 1, 10],
        'degree': [2, 3, 4],
    },
]

In [29]:
# Untuned base estimators handed to the grid search. Only the random forest is
# given an explicit seed; the other two are built with library defaults.
rf_model, xgb_model, svm_model = (
    RandomForestClassifier(random_state=42),
    XGBClassifier(),
    SVC(),
)

In [30]:
# Registry of candidates: display name -> base estimator ('model') and the
# hyper-parameter grid to search for it ('param').
models = {
    label: {'model': estimator, 'param': grid}
    for label, estimator, grid in [
        ('Random Forest', rf_model, rf_parameters),
        ('XGBoost', xgb_model, xgb_parameters),
        ('Support Vector Machine', svm_model, svm_parameters),
    ]
}

In [None]:
# Tune every registered model, score the best estimator of each search on the
# test split, and persist it to disk.
results = {}      # model name -> F1 of the tuned estimator on the test split
best_param = {}   # model name -> best hyper-parameters reported by the search

# Create the destination up front so a long search is not followed by a failed
# save because the folder is missing.
# NOTE(review): the path is relative to the kernel's working directory, exactly
# as before — confirm that is where the artefacts are meant to live.
model_dir = "outputs/models"
Path(model_dir).mkdir(parents=True, exist_ok=True)

for name, config in models.items():
    model = config['model']
    params = config['param']
    print(f"\nRunning GridSearchCV for: {name}")
    grid = run_grid_search(model, params, X_train, y_train)

    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)

    # f1_score is sklearn.metrics.f1_score (imported in the first cell).
    f1 = f1_score(y_test, y_pred)

    # Record per-model results inside the loop so an interrupted run still
    # keeps the entries of the models that already finished.
    results[name] = f1
    best_param[name] = grid.best_params_

    # File name is the display name lower-cased with spaces -> underscores.
    slug = name.lower().replace(' ', '_')
    save_model(best_model, f"{model_dir}/{slug}_tuned.pkl")


Running GridSearchCV for: Random Forest
Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV] END max_depth=None, max_features=sqrt, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=None, max_features=sqrt, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=None, max_features=sqrt, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=None, max_features=sqrt, min_samples_split=2, n_estimators=100; tot

In [34]:
# Show the test-set F1 per model, then the winning hyper-parameters, one per line.
print(results, best_param, sep="\n")

{'Random Forest': 0.5714285714285714, 'XGBoost': 0.5891238670694864}
{'Random Forest': {'max_depth': 10, 'max_features': 'log2', 'min_samples_split': 10, 'n_estimators': 150}, 'XGBoost': {'learning_rate': 0.1, 'max_depth': 3, 'subsample': 0.7}}
