In [1]:
from functions import *

import boto3
import pandas as pd
import io
from itertools import combinations
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

# Project-wide settings; later cells read the 'project_path_s3' and 'bucket_name' keys from it.
# read_settings is not defined in this notebook — presumably supplied by `from functions import *`; TODO confirm.
settings = read_settings()

In [2]:
# Read the train and validation data from S3
# Object keys are built under the project's S3 prefix taken from settings.
train_s3_key = f"{settings['project_path_s3']}/data/train/train.csv"
valid_s3_key = f"{settings['project_path_s3']}/data/valid/valid.csv"

# read_csv_from_s3 is not defined in this notebook (presumably it comes from `functions`);
# it is assumed to return a pandas DataFrame for the given bucket and key — TODO confirm.
train_df = read_csv_from_s3(settings['bucket_name'], train_s3_key)
valid_df = read_csv_from_s3(settings['bucket_name'], valid_s3_key)

In [3]:
def get_all_combinations(columns):
    """
    Enumerate every non-empty subset of the given columns.

    Subsets are produced in order of increasing size; within one size they
    follow the order yielded by itertools.combinations.

    Parameters:
    - columns: list, the column names to combine

    Returns:
    - list of tuples, one tuple of column names per subset
    """
    return [
        subset
        for size in range(1, len(columns) + 1)
        for subset in combinations(columns, size)
    ]

In [4]:
def build_and_evaluate_models(train_df, valid_df, target_column):
    """
    Build and evaluate a logistic regression model for every feature subset.

    Every column of train_df other than target_column is treated as a candidate
    feature. One LogisticRegression(max_iter=1000) is fitted on the training data
    for each non-empty subset returned by get_all_combinations, then scored on
    both the training and the validation data.

    Parameters:
    - train_df: pd.DataFrame, the training dataset
    - valid_df: pd.DataFrame, the validation dataset; it is indexed with the same
      feature columns and target column as train_df
    - target_column: str, the name of the target column

    Returns:
    - list of tuples, each tuple contains model combination, train accuracy, valid accuracy,
      train AUC, and valid AUC; the list is sorted by valid AUC, highest first
    """
    results = []
    feature_columns = [col for col in train_df.columns if col != target_column]
    combinations_list = get_all_combinations(feature_columns)

    # The targets do not depend on the feature subset, so extract them once.
    train_target = train_df[target_column]
    valid_target = valid_df[target_column]

    for combination in combinations_list:
        # Slice the feature matrices once per subset instead of once per metric.
        selected = list(combination)
        train_features = train_df[selected]
        valid_features = valid_df[selected]

        model = LogisticRegression(max_iter=1000)
        model.fit(train_features, train_target)

        train_accuracy = accuracy_score(train_target, model.predict(train_features))
        valid_accuracy = accuracy_score(valid_target, model.predict(valid_features))

        # Column 1 of predict_proba is used as the score, which presumes a binary
        # target whose positive class is the second entry of model.classes_ — TODO confirm.
        train_auc = roc_auc_score(train_target, model.predict_proba(train_features)[:, 1])
        valid_auc = roc_auc_score(valid_target, model.predict_proba(valid_features)[:, 1])

        results.append((combination, train_accuracy, valid_accuracy, train_auc, valid_auc))

    # Rank on held-out performance: index 4 is valid AUC (index 3 is train AUC,
    # which would reward fitting the training data rather than generalisation).
    return sorted(results, key=lambda x: x[4], reverse=True)  # Sort by valid AUC

# Ensure the target column is correctly specified
target_column = 'target'  # Change this if your target column has a different name

# Build and evaluate models
# One model is fitted per non-empty subset of the non-target columns of train_df,
# so the amount of work grows exponentially with the number of feature columns.
results = build_and_evaluate_models(train_df, valid_df, target_column)

# Print the results
# One line per feature subset, in the order returned by build_and_evaluate_models;
# metrics are formatted to four decimal places.
for combination, train_acc, valid_acc, train_auc, valid_auc in results:
    print(f"Features: {combination}, Train Accuracy: {train_acc:.4f}, Valid Accuracy: {valid_acc:.4f}, Train AUC: {train_auc:.4f}, Valid AUC: {valid_auc:.4f}")

Features: ('col_1', 'col_2', 'col_3', 'col_4', 'col_5', 'col_6', 'col_7', 'col_8'), Train Accuracy: 0.8470, Valid Accuracy: 0.8480, Train AUC: 0.9394, Valid AUC: 0.9401
Features: ('col_1', 'col_2', 'col_3', 'col_5', 'col_6', 'col_7', 'col_8'), Train Accuracy: 0.8470, Valid Accuracy: 0.8480, Train AUC: 0.9394, Valid AUC: 0.9401
Features: ('col_1', 'col_2', 'col_3', 'col_6', 'col_7', 'col_8'), Train Accuracy: 0.8470, Valid Accuracy: 0.8479, Train AUC: 0.9394, Valid AUC: 0.9401
Features: ('col_1', 'col_2', 'col_3', 'col_4', 'col_6', 'col_7', 'col_8'), Train Accuracy: 0.8470, Valid Accuracy: 0.8479, Train AUC: 0.9394, Valid AUC: 0.9401
Features: ('col_1', 'col_2', 'col_4', 'col_5', 'col_6', 'col_7', 'col_8'), Train Accuracy: 0.8347, Valid Accuracy: 0.8367, Train AUC: 0.9256, Valid AUC: 0.9264
Features: ('col_1', 'col_2', 'col_5', 'col_6', 'col_7', 'col_8'), Train Accuracy: 0.8347, Valid Accuracy: 0.8365, Train AUC: 0.9256, Valid AUC: 0.9264
Features: ('col_1', 'col_2', 'col_4', 'col_6', 'c