In [1]:
import argparse
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import os
import pandas as pd
import mlflow

In [2]:
def select_first_file(path):
    """Selects first file in folder, use under assumption there is only one file in folder
    Args:
        path (str): path to directory or file to choose
    Returns:
        str: full path of selected file
    """
    files = os.listdir(path)
    return os.path.join(path, files[0])

In [3]:
def main(args):
    """Main function of the script."""

    # paths are mounted as folder, therefore, we are selecting the file from folder
    train_df = pd.read_csv(select_first_file(args.train_data))
    # Extracting the label column
    y_train = train_df.pop("is_bug_inc")
    # convert the dataframe values to array
    X_train = train_df.values

    print(f"Training with data of shape {X_train.shape}")

    clf = LogisticRegression(C=args.regression_C, penalty=args.regression_penalty, solver=args.regression_solver)
    mlflow.log_param("C", args.regression_C)
    mlflow.log_param("penalty", args.regression_penalty)
    mlflow.log_param("solver", args.regression_solver)

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_train)
    fpr, tpr, thresholds = metrics.roc_curve(y_train, y_pred)
    precision = metrics.precision_score(y_train, y_pred)
    recall = metrics.recall_score(y_train, y_pred)
    f1 = metrics.f1_score(y_train, y_pred)
    auc = metrics.auc(fpr, tpr)
    print(precision, recall, f1, auc)

    mlflow.log_metric("train precision", precision)
    mlflow.log_metric("train recall", recall)
    mlflow.log_metric("train f1", f1)
    mlflow.log_metric("train auc", auc)

    # Save the model
    mlflow.sklearn.save_model(sk_model=clf, path=args.model_output)

In [15]:
class MyArgs:
    def __init__(self, /, **kwargs):
        self.__dict__.update(kwargs)

args = MyArgs(
            train_data = "local_run/prep_outputs/train",
            regression_C = 1.0,
            regression_penalty = "l1",
            regression_solver = "liblinear",
            model_output = "local_run/train_exp3",
            )

os.makedirs(args.model_output, exist_ok = True)

In [16]:
mlflow.start_run()

lines = [
    f"Train dataset input path: {args.train_data}",
    f"Model output path: {args.model_output}",
    f"C: {args.regression_C}",
    f"penalty: {args.regression_penalty}",
    f"solver: {args.regression_solver}",
]

for line in lines:
    print(line)

main(args)

mlflow.end_run()

Train dataset input path: local_run/prep_outputs/train
Model output path: local_run/train_exp3
C: 1.0
penalty: l1
solver: liblinear
Training with data of shape (1542, 8)
0.7946287519747235 0.6523994811932555 0.7165242165242164 0.7418936446173798


