In [22]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from scipy.stats import yeojohnson
import scipy.stats.mstats as mstats
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Read the data from the CSV file
data = pd.read_csv("./ifood_df.csv")

# Define lists of column names
skewed_features = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth', 'MntTotal', 'MntRegularProds']
numerical_cols = ['Recency', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth', 'Customer_Days', 'MntTotal', 'MntRegularProds']

# Define skewness correction function
def skewness_correction(X):
    for feature in skewed_features:
        X[feature], _ = yeojohnson(X[feature] + 1)  # Adding 1 to avoid issues with zero values
    return X

# Define FunctionTransformer for skewness correction
skewness_corrector = FunctionTransformer(skewness_correction)

# Define binning and encoding functions
def binning_and_encoding(X):
    X_binned = X.copy()
    # Define the maximum income value
    max_income = X_binned['Income'].max()

    # Define the bin edges for income
    bin_edges_income = [0, 30000, 75000, max_income]

    # Define the bin labels for income
    bin_labels_income = ['Income_Low', 'Income_Average', 'Income_High']

    # Create the income_bins column using binning
    X_binned['Income'] = pd.cut(X_binned['Income'], bins=bin_edges_income, labels=bin_labels_income, right=False)

    # Define the bin edges for age
    age_bin_edges = [0, 20, 40, 60, float('inf')]  # Define the age bins as [0-30], [31-50], [51-70], [71 and above]

    # Define the bin labels for age
    age_bin_labels = ['Age_Young', 'Age_Adult', 'Age_Middle-aged', 'Age_Senior']

    # Create the age_bins column using binning
    X_binned['Age'] = pd.cut(X_binned['Age'], bins=age_bin_edges, labels=age_bin_labels, right=False)

    # Perform one-hot encoding on 'Age' and 'Income' columns
    X_encoded = pd.get_dummies(X_binned, columns=['Age', 'Income'], drop_first=False)

    return X_encoded

# Define FunctionTransformer for binning and encoding
binning_and_encoding_transformer = FunctionTransformer(binning_and_encoding)

# Define scaling function
def scale_numerical_columns(X):
    X_scaled = X.copy()
    # Apply winsorization to the MntRegularProds column because of outliers
    X_scaled['MntRegularProds'] = mstats.winsorize(X_scaled['MntRegularProds'], limits=[0.05, 0.05])

    # Initialize MinMaxScaler
    scaler = MinMaxScaler()

    # Apply Min-Max scaling to the numerical columns
    X_scaled[numerical_cols] = scaler.fit_transform(X_scaled[numerical_cols])

    return X_scaled

# Define FunctionTransformer for scaling
scaling_transformer = FunctionTransformer(scale_numerical_columns)

# Integrate scaling into the preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('skewness_corrector', skewness_corrector, skewed_features),
        ('binning_and_encoding', binning_and_encoding_transformer, ['Income', 'Age']),
        ('scaling', scaling_transformer, numerical_cols)
    ])

# Select features and target
X = data.drop(columns=['Response'])  # Adjust accordingly if 'Response' is not the target column
y = data['Response']

# Apply preprocessing steps
X_preprocessed = preprocessor.fit_transform(X)

print("Shape of X:", X.shape)
print("Shape of X_preprocessed:", X_preprocessed.shape)

# Initialize the RFE selector with the model and desired number of features to select
estimator = LogisticRegression()  # For RFE initialization
# Initialize the RFE selector with the model and desired number of features to select
selector_rfe = RFE(estimator, n_features_to_select=5, step=1)

# Fit the RFE selector to your preprocessed data
selector_rfe.fit(X_preprocessed, y)

print("Length of selector_rfe.support_:", len(selector_rfe.support_))


# Get the indices of selected features
selected_features_rfe_indices = np.where(selector_rfe.support_)[0]  # Get the indices where support_ is True
print(selected_features_rfe_indices)
selected_features_rfe = np.array(X.columns)[selected_features_rfe_indices]

print(selected_features_rfe)

# Create pipeline with GridSearchCV
pipeline = Pipeline([
    ('feature_selection', selector_rfe),
    ('clf', GridSearchCV(estimator=None, param_grid=None, cv=5, scoring='accuracy'))  # Placeholder for GridSearchCV
])

# Define different models and their hyperparameter grids
models = {
    'Logistic Regression': {
        'model': LogisticRegression(),
        'params': {
            'C': [0.001, 0.01, 0.1, 1, 10, 100],
            'penalty': ['l1', 'l2'],
            'solver': ['liblinear', 'saga']
        }
    },
    'Random Forest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10]
        }
    },
    'SVM': {
        'model': SVC(),
        'params': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf']
        }
    },
    'Gradient Boosting': {
        'model': GradientBoostingClassifier(),
        'params': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 0.5],
            'max_depth': [3, 5, 7]
        }
    }
}

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

# Fit the pipeline using different models and hyperparameter grids
best_models = {}
for name, model in models.items():
    print(f"Training {name}...")
    pipeline.named_steps['clf'].param_grid = model['params']
    pipeline.named_steps['clf'].estimator = model['model']
    pipeline.fit(X_train, y_train)
    
    # Save best model
    best_models[name] = pipeline.named_steps['clf'].best_estimator_

# Evaluate and select the best model based on validation performance
best_model_name = None
best_model_score = -1
for name, model in best_models.items():
    # Only use selected features for testing
    X_test_selected = X_test[:, selector_rfe.support_]
    score = model.score(X_test_selected, y_test)
    print(f"{name} Accuracy: {score}")
    if score > best_model_score:
        best_model_name = name
        best_model_score = score

print(f"Best Model: {best_model_name} with Accuracy: {best_model_score}")

Shape of X: (2205, 38)
Shape of X_preprocessed: (2205, 35)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Length of selector_rfe.support_: 35
[ 8  9 20 21 32]
['MntSweetProducts' 'MntGoldProds' 'Complain' 'Z_CostContact'
 'education_Graduation']
Training Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Training Random Forest...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Training SVM...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Training Gradient Boosting...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Logistic Regression Accuracy: 0.8684807256235828
Random Forest Accuracy: 0.8571428571428571
SVM Accuracy: 0.8594104308390023
Gradient Boosting Accuracy: 0.8616780045351474
Best Model: Logistic Regression with Accuracy: 0.8684807256235828


## Factorized Version

In [None]:
# Define preprocessing functions
def skewness_correction(X):
    skewed_features = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
                       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',
                       'NumStorePurchases', 'NumWebVisitsMonth', 'MntTotal', 'MntRegularProds']
    for feature in skewed_features:
        X[feature], _ = yeojohnson(X[feature] + 1)  # Adding 1 to avoid issues with zero values
    return X

def binning_and_encoding(X):
    X_binned = X.copy()
    max_income = X_binned['Income'].max()
    bin_edges_income = [0, 30000, 75000, max_income]
    bin_labels_income = ['Income_Low', 'Income_Average', 'Income_High']
    X_binned['Income'] = pd.cut(X_binned['Income'], bins=bin_edges_income, labels=bin_labels_income, right=False)
    age_bin_edges = [0, 20, 40, 60, float('inf')]
    age_bin_labels = ['Age_Young', 'Age_Adult', 'Age_Middle-aged', 'Age_Senior']
    X_binned['Age'] = pd.cut(X_binned['Age'], bins=age_bin_edges, labels=age_bin_labels, right=False)
    X_encoded = pd.get_dummies(X_binned, columns=['Age', 'Income'], drop_first=False)
    return X_encoded

def scale_numerical_columns(X):
    X_scaled = X.copy()
    X_scaled['MntRegularProds'] = mstats.winsorize(X_scaled['MntRegularProds'], limits=[0.05, 0.05])
    scaler = MinMaxScaler()
    numerical_cols = ['Recency', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
                      'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',
                      'NumStorePurchases', 'NumWebVisitsMonth', 'Customer_Days', 'MntTotal', 'MntRegularProds']
    X_scaled[numerical_cols] = scaler.fit_transform(X_scaled[numerical_cols])
    return X_scaled

# Create preprocessing pipeline
preprocessing_pipeline = Pipeline([
    ('skewness_corrector', FunctionTransformer(skewness_correction)),
    ('binning_and_encoding', FunctionTransformer(binning_and_encoding)),
    ('scaling', FunctionTransformer(scale_numerical_columns))
])

# Read the data from the CSV file
data = pd.read_csv("./ifood_df.csv")

# Select features and target
X = data.drop(columns=['Response'])
y = data['Response']

# Apply preprocessing pipeline
X_preprocessed = preprocessing_pipeline.fit_transform(X)

# Initialize the RFE selector with the model and desired number of features to select
estimator = LogisticRegression()
selector_rfe = RFE(estimator, n_features_to_select=5, step=1)

# Create modeling pipeline
modeling_pipeline = Pipeline([
    ('feature_selection', selector_rfe),
    ('clf', GridSearchCV(estimator=None, param_grid=None, cv=5, scoring='accuracy'))
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

# Fit the modeling pipeline using different models and hyperparameter grids
models = {
    'Logistic Regression': {
        'model': LogisticRegression(),
        'params': {
            'C': [0.001, 0.01, 0.1, 1, 10, 100],
            'penalty': ['l1', 'l2'],
            'solver': ['liblinear', 'saga']
        }
    },
    'Random Forest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10]
        }
    },
    'SVM': {
        'model': SVC(),
        'params': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf']
        }
    },
    'Gradient Boosting': {
        'model': GradientBoostingClassifier(),
        'params': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 0.5],
            'max_depth': [3, 5, 7]
        }
    }
}

best_models = {}
for name, model in models.items():
    print(f"Training {name}...")
    modeling_pipeline.named_steps['clf'].param_grid = model['params']
    modeling_pipeline.named_steps['clf'].estimator = model['model']
    modeling_pipeline.fit(X_train, y_train)
    best_models[name] = modeling_pipeline.named_steps['clf'].best_estimator_

# Evaluate and select the best model based on validation performance
best_model_name = None
best_model_score = -1
for name, model in best_models.items():
    X_test_selected = X_test[:, selector_rfe.support_]
    score = model.score(X_test_selected, y_test)
    print(f"{name} Accuracy: {score}")
    if score > best_model_score:
        best_model_name = name
        best_model_score = score

print(f"Best Model: {best_model_name} with Accuracy: {best_model_score}")