In [1]:
# Pinned dependencies for MLflow tracking. %pip (rather than !pip) installs into
# the environment of the running kernel instead of whatever `pip` is on PATH.
%pip install mlflow==2.13.2 sagemaker-mlflow==0.1.0

[0m

In [1]:
from functions import *

import boto3
import pandas as pd
import io
import mlflow
from mlflow.tracking import MlflowClient
from itertools import combinations
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

# Project configuration shared by every later cell. This notebook reads the keys
# 'mlflow_arn', 'mlflow_experiment_name', 'project_path_s3' and 'bucket_name'.
# NOTE(review): read_settings presumably comes from `from functions import *` — confirm.
settings = read_settings()

In [2]:
# Point MLflow at the tracking server given by settings['mlflow_arn'] and make
# settings['mlflow_experiment_name'] the active experiment, so that later
# mlflow.start_run() calls log into it.
# NOTE(review): the value is presumably a SageMaker MLflow tracking-server ARN — confirm.
mlflow.set_tracking_uri(settings['mlflow_arn'])
mlflow.set_experiment(settings['mlflow_experiment_name'])

KeyboardInterrupt: 

In [3]:
# Read the train and validation data from S3.
# Both object keys live under settings['project_path_s3'].
train_s3_key = f"{settings['project_path_s3']}/data/train/train.csv"
valid_s3_key = f"{settings['project_path_s3']}/data/valid/valid.csv"

# NOTE(review): read_csv_from_s3 is not defined in this notebook (presumably from
# `functions`); later cells index the results like pandas DataFrames — confirm.
train_df = read_csv_from_s3(settings['bucket_name'], train_s3_key)
valid_df = read_csv_from_s3(settings['bucket_name'], valid_s3_key)

In [5]:
def get_all_combinations(columns):
    """
    Enumerate every non-empty subset of the given columns.

    Subsets are listed by increasing size (all singles first, then all pairs,
    and so on up to the full set); within one size they follow the order
    produced by itertools.combinations.

    Parameters:
    - columns: list, the candidate column names

    Returns:
    - list of tuples, one tuple of column names per subset
    """
    subset_sizes = range(1, len(columns) + 1)
    return [
        subset
        for size in subset_sizes
        for subset in combinations(columns, size)
    ]

In [6]:
def build_and_evaluate_models(train_df, valid_df, target_column):
    """
    Build and evaluate a logistic regression model for every non-empty feature subset.

    Every column of train_df other than target_column is a candidate feature.
    Each subset is fitted inside its own MLflow run, which records one
    ``use_<column>`` parameter per candidate feature, the four metrics
    (train/valid accuracy and AUC) and the fitted model under "model".

    Note: the number of runs is 2**n - 1 for n candidate features.

    Parameters:
    - train_df: pd.DataFrame, the training dataset
    - valid_df: pd.DataFrame, the validation dataset
    - target_column: str, the name of the target column

    Returns:
    - list of tuples (combination, train accuracy, valid accuracy, train AUC,
      valid AUC), sorted by valid AUC in descending order
    """
    results = []
    feature_columns = [col for col in train_df.columns if col != target_column]
    y_train = train_df[target_column]
    y_valid = valid_df[target_column]

    for combination in get_all_combinations(feature_columns):
        # Select the feature subset once instead of re-slicing for every call.
        selected = list(combination)
        X_train = train_df[selected]
        X_valid = valid_df[selected]

        with mlflow.start_run():
            model = LogisticRegression(max_iter=1000, random_state=42)
            model.fit(X_train, y_train)

            train_accuracy = accuracy_score(y_train, model.predict(X_train))
            valid_accuracy = accuracy_score(y_valid, model.predict(X_valid))

            # AUC is computed from the probability in column 1 of predict_proba.
            train_auc = roc_auc_score(y_train, model.predict_proba(X_train)[:, 1])
            valid_auc = roc_auc_score(y_valid, model.predict_proba(X_valid)[:, 1])

            # Log parameters: which variables are used
            for col in feature_columns:
                mlflow.log_param(f"use_{col}", col in combination)

            # Log metrics
            mlflow.log_metric("train_accuracy", train_accuracy)
            mlflow.log_metric("valid_accuracy", valid_accuracy)
            mlflow.log_metric("train_auc", train_auc)
            mlflow.log_metric("valid_auc", valid_auc)

            mlflow.sklearn.log_model(model, "model")

            results.append((combination, train_accuracy, valid_accuracy, train_auc, valid_auc))

    # Index 4 of each tuple is the validation AUC (index 3 is the training AUC).
    return sorted(results, key=lambda x: x[4], reverse=True)

# Ensure the target column is correctly specified.
# Every other column of train_df is treated as a candidate feature by
# build_and_evaluate_models.
target_column = 'target'  # Change this if your target column has a different name

# Build and evaluate models (one MLflow run per feature subset)
results = build_and_evaluate_models(train_df, valid_df, target_column)

# Print the results: one summary line per feature subset, in the order returned above
for combination, train_acc, valid_acc, train_auc, valid_auc in results:
    print(f"Features: {combination}, Train Accuracy: {train_acc:.4f}, Valid Accuracy: {valid_acc:.4f}, Train AUC: {train_auc:.4f}, Valid AUC: {valid_auc:.4f}")



Features: ('col_1', 'col_2', 'col_3', 'col_4', 'col_5', 'col_6', 'col_7', 'col_8'), Train Accuracy: 0.8470, Valid Accuracy: 0.8480, Train AUC: 0.9394, Valid AUC: 0.9401
Features: ('col_1', 'col_2', 'col_3', 'col_5', 'col_6', 'col_7', 'col_8'), Train Accuracy: 0.8470, Valid Accuracy: 0.8480, Train AUC: 0.9394, Valid AUC: 0.9401
Features: ('col_1', 'col_2', 'col_3', 'col_6', 'col_7', 'col_8'), Train Accuracy: 0.8470, Valid Accuracy: 0.8479, Train AUC: 0.9394, Valid AUC: 0.9401
Features: ('col_1', 'col_2', 'col_3', 'col_4', 'col_6', 'col_7', 'col_8'), Train Accuracy: 0.8470, Valid Accuracy: 0.8479, Train AUC: 0.9394, Valid AUC: 0.9401
Features: ('col_1', 'col_2', 'col_4', 'col_5', 'col_6', 'col_7', 'col_8'), Train Accuracy: 0.8347, Valid Accuracy: 0.8367, Train AUC: 0.9256, Valid AUC: 0.9264
Features: ('col_1', 'col_2', 'col_5', 'col_6', 'col_7', 'col_8'), Train Accuracy: 0.8347, Valid Accuracy: 0.8365, Train AUC: 0.9256, Valid AUC: 0.9264
Features: ('col_1', 'col_2', 'col_4', 'col_6', 'c

In [7]:
# Initialize the client
client = MlflowClient()

# Get the experiment ID by name. The name is bound to a variable so the error
# message below can reference it (it was previously an undefined name, which
# turned the intended ValueError into a NameError).
experiment_name = settings['mlflow_experiment_name']
experiment = client.get_experiment_by_name(experiment_name)
if experiment is None:
    raise ValueError(f"Experiment with name '{experiment_name}' not found")

experiment_id = experiment.experiment_id

# List all runs in the experiment.
# NOTE(review): search_runs is paginated (see max_results / page_token); a single
# call returns only the first page — confirm it covers every run of interest.
runs = client.search_runs(experiment_ids=[experiment_id])

In [9]:
# Function to flatten the RunData dictionary
def flatten_rundata(run_data):
    """
    Flatten one run into a single flat dictionary.

    Parameters:
    - run_data: object exposing `.data.metrics`, `.data.params` and `.data.tags`
      (dict-like); presumably an MLflow run as returned by search_runs.

    Returns:
    - dict holding every metric, then every param (params win on a name clash),
      plus 'run_name' (the 'mlflow.runName' tag) and 'n_vars' (the number of
      params whose key contains 'use_col_' and whose value is the string 'True')
    """
    payload = run_data.data
    enabled_flags = [
        name
        for name, flag in payload.params.items()
        if 'use_col_' in name and flag == 'True'
    ]
    return {
        **payload.metrics,
        **payload.params,
        'run_name': payload.tags['mlflow.runName'],
        'n_vars': len(enabled_flags),
    }

# One flat record per run returned by the search above
flattened_data = list(map(flatten_rundata, runs))

# Tabulate the records: one row per run, one column per metric / param / derived field
df = pd.DataFrame(flattened_data)

# Show the table
print(df)

     train_accuracy  valid_accuracy  train_auc  valid_auc use_col_1 use_col_2  \
0          0.847017         0.84805   0.939422   0.940125      True      True   
1          0.793267         0.79140   0.880264   0.878690     False      True   
2          0.790733         0.79005   0.878273   0.877857      True     False   
3          0.834750         0.83665   0.925614   0.926351      True      True   
4          0.847033         0.84805   0.939422   0.940125      True      True   
..              ...             ...        ...        ...       ...       ...   
250        0.502433         0.50985   0.501855   0.507830     False     False   
251        0.502733         0.51000   0.500265   0.497897     False     False   
252        0.571217         0.57235   0.594743   0.598441     False     False   
253        0.653533         0.64845   0.701612   0.697410     False      True   
254        0.648200         0.65405   0.696769   0.703276      True     False   

    use_col_3 use_col_4 use

In [10]:
# Group by 'n_vars' and get the indices of the rows with the highest 'valid_auc' within each group.
# idxmax returns one index label of df per group (i.e. per number of features used).
idx = df.groupby('n_vars')['valid_auc'].idxmax()

# Use these indices to filter the DataFrame: one best run per feature count,
# renumbered from 0.
df_highest_valid_auc = df.loc[idx].reset_index(drop=True)

# Display the resulting DataFrame
print(df_highest_valid_auc)

   train_accuracy  valid_accuracy  train_auc  valid_auc use_col_1 use_col_2  \
0        0.648200         0.65405   0.696769   0.703276      True     False   
1        0.707950         0.71100   0.782236   0.785957      True      True   
2        0.764767         0.76920   0.849540   0.853760      True      True   
3        0.821933         0.82380   0.911628   0.911659      True      True   
4        0.835000         0.83660   0.925609   0.926338      True      True   
5        0.847033         0.84795   0.939420   0.940117      True      True   
6        0.847033         0.84805   0.939422   0.940125      True      True   
7        0.847017         0.84805   0.939422   0.940125      True      True   

  use_col_3 use_col_4 use_col_7 use_col_8 use_col_5 use_col_6  \
0     False     False     False     False     False     False   
1     False     False     False     False     False     False   
2     False     False      True     False     False     False   
3     False     False      T

In [7]:
# Refit one model on a fixed 6-feature subset inside a named MLflow run and keep
# its predicted probabilities (train_predictions_proba / valid_predictions_proba)
# for the export cell below.
# NOTE(review): the subset is hard-coded; presumably picked from the per-n_vars
# summary above — confirm it still matches the latest search results.
vars_best_iteration = ["col_1", "col_2", "col_3", "col_6", "col_7", "col_8"]
# All candidate features; used only to log one use_<col> flag per column.
feature_columns = [col for col in train_df.columns if col != target_column]

# NOTE(review): unlike build_and_evaluate_models, this run does not call
# mlflow.sklearn.log_model, so no model artifact is stored — confirm intended.
with mlflow.start_run(run_name = "best-valid-auc-6-vars"):
    model = LogisticRegression(max_iter=1000, random_state=42)
    model.fit(train_df[vars_best_iteration], train_df[target_column])

    # Hard class predictions, used for accuracy only
    train_predictions = model.predict(train_df[vars_best_iteration])
    valid_predictions = model.predict(valid_df[vars_best_iteration])

    train_accuracy = accuracy_score(train_df[target_column], train_predictions)
    valid_accuracy = accuracy_score(valid_df[target_column], valid_predictions)

    # Column 1 of predict_proba; used for AUC here and exported to S3 later
    train_predictions_proba = model.predict_proba(train_df[vars_best_iteration])[:, 1]
    valid_predictions_proba = model.predict_proba(valid_df[vars_best_iteration])[:, 1]
    
    train_auc = roc_auc_score(train_df[target_column], train_predictions_proba)
    valid_auc = roc_auc_score(valid_df[target_column], valid_predictions_proba)

    # Log parameters: which variables are used
    for col in feature_columns:
        mlflow.log_param(f"use_{col}", col in vars_best_iteration)

    # Log metrics
    mlflow.log_metric("train_accuracy", train_accuracy)
    mlflow.log_metric("valid_accuracy", valid_accuracy)
    mlflow.log_metric("train_auc", train_auc)
    mlflow.log_metric("valid_auc", valid_auc)

In [8]:
# Destination keys for the exported predictions, under the project's S3 prefix
base_s3_path = settings['project_path_s3']
paths = {
    'train_pred': f"{base_s3_path}/output/base_pred/train.csv",
    'valid_pred': f"{base_s3_path}/output/base_pred/valid.csv",
}

# Pair each predicted probability with its label by position. The labels are
# assigned as a plain array (not a Series) so pandas does not align on index,
# which would silently insert NaN labels if train_df / valid_df did not carry a
# default 0..n-1 index. The label is read through target_column, like the
# training cell; the output column keeps the name 'target'.
df_train_predictions_proba = pd.DataFrame(train_predictions_proba, columns=['pred'])
df_train_predictions_proba['target'] = train_df[target_column].to_numpy()
df_valid_predictions_proba = pd.DataFrame(valid_predictions_proba, columns=['pred'])
df_valid_predictions_proba['target'] = valid_df[target_column].to_numpy()

# NOTE(review): save_df_to_s3 is not defined in this notebook; decimal_places
# presumably controls float rounding in the written CSV — confirm.
save_df_to_s3(df_train_predictions_proba, settings['bucket_name'], paths['train_pred'], decimal_places = 5)
save_df_to_s3(df_valid_predictions_proba, settings['bucket_name'], paths['valid_pred'], decimal_places = 5)

In [9]:
# Preview the first 10 lines of the exported training predictions straight from S3.
# `head` stops reading after 10 lines, which is the likely cause of the
# "[Errno 32] Broken pipe" message in the saved output.
!aws s3api get-object --bucket {settings['bucket_name']} --key {paths['train_pred']} /dev/stdout | head -n 10

pred,target
0.99987,1
0.99387,1
0.07881,0
0.01232,0
0.57458,1
0.03212,0
0.99525,1
0.96508,1
0.83500,1

[Errno 32] Broken pipe


In [10]:
# Preview the first 10 lines of the exported validation predictions straight from S3.
# `head` stops reading after 10 lines, which is the likely cause of the
# "[Errno 32] Broken pipe" message in the saved output.
!aws s3api get-object --bucket {settings['bucket_name']} --key {paths['valid_pred']} /dev/stdout | head -n 10

pred,target
0.00512,0
0.49614,0
0.10881,0
0.15601,0
0.89603,1
0.46593,1
0.18065,0
0.07806,0
0.50506,1

[Errno 32] Broken pipe
