In [1]:
def load_and_clean_data():
    
    import pandas as pd
    import numpy as np
    
    data = pd.read_csv("https://raw.githubusercontent.com/TripathiAshutosh/dataset/main/banking.csv")
    
    print("Null/missingalues available in the data: \n")
    print(data.isna().sum())
    data = data.dropna()
    print("The data after dropping the na values are: \n")
    print(data.isna().sum())
    
    print("--------data imported and cleaned----------")

    return data

In [2]:
def preprocessing(data):
    
    import pandas as pd
    import numpy as np
    
    data = load_and_clean_data()

    data['education'] = np.where(data['education'] == 'basic.9y', 'Basic', data['education'])
    data['education'] = np.where(data['education'] == 'basic.6y', 'Basic', data['education'])
    data['education'] = np.where(data['education'] == 'basic.4y', 'Basic', data['education'])
    
    categorical_vars = ['job','marital','education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
    for var in categorical_vars:
        cat_list = 'var' + '_' + var
        cat_list = pd.get_dummies(data[var], prefix = var) # one hot encoding
        data_new = data.join(cat_list)
        data = data_new
    
    categorical_vars = ['job','marital','education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
    
    data_vars = data.columns.values.tolist()
    
    keeping = [i for i in data_vars if i not in categorical_vars]
    
    final_df = data[keeping]
    
    final_df.columns = final_df.columns.str.replace(".", "_")
    final_df.columns = final_df.columns.str.replace(" ", "_")
    
    print(final_df.head())
    
    print("Education column pre-processed, categorical variables one-hot encoded. Ready to input data to model")
    
    return final_df

In [3]:

def train_test_split(final_df: pd.DataFrame):

    import pandas as pd
    import numpy as np
    from sklearn.model_selection import train_test_split

    #final_df = preprocessing()
    
    X = final_df.loc[:, final_df.columns != 'y']
    y = final_df.loc[:, final_df.columns == 'y']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, stratify = y, random_state = 47)
    
    # np.save(f'data/X_train.npy', X_train) # saved as a numpy binary file (efficient to save and load)
    # np.save(f'data/X_test.npy', X_test)
    # np.save(f'data/y_train.npy', y_train)
    # np.save(f'data/y_test.npy', y_test)
    
    print("\n---- X_train ----")
    print("\n")
    print(X_train.head())
    
    print("\n---- X_test ----")
    print("\n")
    print(X_test.head())
    
    print("\n---- y_test ----")
    print("\n")
    print(y_test.head())

    return X_train, X_test, y_train, y_test

NameError: name 'pd' is not defined

In [None]:

def training_basic_classifier(X_train,y_train):
    import mlflow
    from sklearn.ensemble import RandomForestClassifier
    import pandas as pd
    import numpy as np
    import pickle
    
    #X_train, X_test, y_train, y_test = train_test_split()

    model = RandomForestClassifier(n_estimators=150)
    model.fit(X_train, y_train)

    #mlflow.set_tracking_uri("http://localhost:8000")

    with mlflow.start_run():
        mlflow.log_param("n_estimators", 150)
        # Log any other hyperparameters you want to track
        
        mlflow.sklearn.log_model(model, "model")
        
        with open(f'data/model.pkl', 'wb') as f:
            pickle.dump(model, f)
        
    print("\nRandomForest classifier is trained on banking data and saved to PV location /data/model.pkl ----")
    return model


In [4]:

def predict_on_test_data(model,X_test):
    import pandas as pd
    import numpy as np
    import sklearn

    #model = training_basic_classifier()

    print("---- Inside predict_on_test_data component ----")
    
    X_test = np.load('data/X_test.npy', allow_pickle=True)
    y_pred = model.predict(X_test)
    np.save('data/y_pred.npy', y_pred)

    print("\n---- Predicted classes ----")
    print("\n")
    print(y_pred)

    return y_pred

In [5]:
import kfp.dsl as dsl
import mlflow

def get_mlflow_server_url():
    # Replace 'your_mlflow_service_name' with the name of the MLflow service in your Kubernetes cluster
    mlflow_service_name = 'localhost:5000'

    # Retrieve the MLflow server URL using the Kubernetes API
    service_url = mlflow.get_service_url(mlflow_service_name)
    return service_url


@dsl.component
def load_data_component():
    return dsl.ContainerOp(
        name='load-data',
        image='python:3.8',
        command=['python', 'load_data_script.py'],
        output_artifact_paths={'data': '/data'},
    )

@dsl.component
def preprocess_data_component(data):
    return dsl.ContainerOp(
        name='preprocess-data',
        image='python:3.8',
        command=['python', 'preprocessing_script.py'],
        arguments=['--data', data],
        output_artifact_paths={'preprocessed_data': '/data'},
    )

@dsl.component
def split_data_component(final_df):
    return dsl.ContainerOp(
        name='split-data',
        image='python:3.8',
        command=['python', 'data_split_script.py'],
        arguments=['--final_df', final_df],
        output_artifact_paths={
            'X_train': '/data/X_train.npy',
            'X_test': '/data/X_test.npy',
            'y_train': '/data/y_train.npy',
            'y_test': '/data/y_test.npy',
        },
    )

@dsl.component
def train_classifier_component(X_train, y_train):
    return dsl.ContainerOp(
        name='train-classifier',
        image='python:3.8',
        command=['python', 'model_building_script.py'],
        arguments=[
            '--X_train', X_train,
            '--y_train', y_train
        ],
        output_artifact_paths={'model': '/data/model.pkl'},
    )

@dsl.component
def predict_test_data_component(model, X_test):
    return dsl.ContainerOp(
        name='predict-test-data',
        image='python:3.8',
        command=['python', 'prediction_script.py'],
        arguments=[
            '--model', model,
            '--X_test', X_test
        ],
        output_artifact_paths={'y_pred': '/data/y_pred.npy'},
    )


In [6]:
import kfp.dsl as dsl
@dsl.pipeline(name='ML Pipeline', description='A pipeline for ML model training and prediction')
def ml_pipeline():
    # Load data
    load_data_task = load_data_component()

    # Preprocess data
    preprocess_task = preprocess_data_component(data=load_data_task.output)

    # Split data
    split_data_task = split_data_component(preprocessed_data=preprocess_task.output)

    # Train classifier with MLflow tracking
    train_classifier_task = train_classifier_component(
        X_train=split_data_task.outputs['X_train'], 
        y_train=split_data_task.outputs['y_train']
    )

    predictions_task = predict_test_data_component(model = train_classifier_task.output, X_test=split_data_task.outputs['X_test'])

    # Set the MLflow server URL
    mlflow_server_url = get_mlflow_server_url()

    # Log the MLflow server URL to the pipeline output
    dsl.ContainerOp(
        name='log-mlflow-server-url',
        image='python:3.8',
        command=['python', '-c', f'print("{mlflow_server_url}")']
    )


In [8]:
import kfp


# Set your Kubeflow experiment and run names
experiment_name = 'my-ml-experiment'
run_name = 'my-ml-run'

# Initialize the Kubeflow client
client = kfp.Client()

# Compile the pipeline
pipeline_filename = 'ml_pipeline.yaml'
kfp.compiler.Compiler().compile(ml_pipeline, pipeline_filename)

# Create and run the pipeline
client.create_run_from_pipeline_func(ml_pipeline, experiment_name=experiment_name, run_name=run_name)


ValueError: I/O operation on closed file