In [43]:
import mlflow
import kfp
import kfp.components as comp
import requests
import kfp.dsl as dsl
import sys

In [44]:
sys.executable

'C:\\Users\\hp India\\anaconda3\\envs\\mlops\\python.exe'

## Functions and components

In [45]:
def load_and_clean_data():
    
    import pandas as pd
    import numpy as np
    
    data = pd.read_csv("https://raw.githubusercontent.com/TripathiAshutosh/dataset/main/banking.csv")
    
    print("Null/missingalues available in the data: \n")
    print(data.isna().sum())
    data = data.dropna()
    print("The data after dropping the na values are: \n")
    print(data.isna().sum())
    
    data.to_csv(f'data/initial_data.csv', index = False)
    print("--------data imported and cleaned----------")

In [46]:
create_step_load_and_clean_data = kfp.components.create_component_from_func(
    func=load_and_clean_data,
    base_image='python:3.7',
    packages_to_install=['pandas','numpy']
)

In [47]:
def preprocessing():
    
    import pandas as pd
    import numpy as np
    
    data = pd.read_csv(f'data/initial_data.csv')
    
    data['education'] = np.where(data['education'] == 'basic.9y', 'Basic', data['education'])
    data['education'] = np.where(data['education'] == 'basic.6y', 'Basic', data['education'])
    data['education'] = np.where(data['education'] == 'basic.4y', 'Basic', data['education'])
    
    categorical_vars = ['job','marital','education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
    for var in categorical_vars:
        cat_list = 'var' + '_' + var
        cat_list = pd.get_dummies(data[var], prefix = var) # one hot encoding
        data_new = data.join(cat_list)
        data = data_new
    
    categorical_vars = ['job','marital','education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
    
    data_vars = data.columns.values.tolist()
    
    keeping = [i for i in data_vars if i not in categorical_vars]
    
    final_df = data[keeping]
    
    final_df.columns = final_df.columns.str.replace(".", "_")
    final_df.columns = final_df.columns.str.replace(" ", "_")
    
    print(final_df.head())
    
    final_df.to_csv(f'data/preprocessed_df.csv', index = False)
    print("Education column pre-processed, categorical variables one-hot encoded. Ready to input data to model")
    


In [48]:
create_step_prprocessing = kfp.components.create_component_from_func(
    func=preprocessing,
    base_image='python:3.7',
    packages_to_install=['pandas','numpy']
)

In [49]:
def train_test_split():
    import pandas as pd
    import numpy as np
    from sklearn.model_selection import train_test_split
    
    final_df = pd.read_csv(f'data/preprocessed_df.csv')
    
    X = final_df.loc[:, final_df.columns != 'y']
    y = final_df.loc[:, final_df.columns == 'y']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, stratify = y, random_state = 47)
    
    np.save(f'data/X_train.npy', X_train) # saved as a numpy binary file (efficient to save and load)
    np.save(f'data/X_test.npy', X_test)
    np.save(f'data/y_train.npy', y_train)
    np.save(f'data/y_test.npy', y_test)
    
    print("\n---- X_train ----")
    print("\n")
    print(X_train.head())
    
    print("\n---- X_test ----")
    print("\n")
    print(X_test.head())
    
    print("\n---- y_test ----")
    print("\n")
    print(y_test.head())

In [50]:
create_step_train_test_split = kfp.components.create_component_from_func(
    func=train_test_split,
    base_image='python:3.7',
    packages_to_install=['pandas','numpy','scikit-learn']
)

In [51]:
def training_basic_classifier():
    #import mlflow
    from sklearn.ensemble import RandomForestClassifier
    import pandas as pd
    import numpy as np
    import pickle
    
    X_train = np.load(f'data/X_train.npy', allow_pickle=True)
    y_train = np.load(f'data/y_train.npy', allow_pickle=True)
    
    model = RandomForestClassifier(n_estimators=150)
    model.fit(X_train, y_train)
    

  
    with open(f'data/model.pkl', 'wb') as f:
            pickle.dump(model, f)
        
    print("\nRandomForest classifier is trained on banking data and saved to PV location /data/model.pkl ----")


In [52]:
create_step_training_basic_classifier = kfp.components.create_component_from_func(
    func=training_basic_classifier,
    base_image='python:3.7',
    packages_to_install=['pandas','numpy','scikit-learn'] #'mlflow'
)

In [53]:
def log_mlflow():
    import mlflow
    import pickle
    import sklearn
    import pandas 
    import numpy
    with open('data/model.pkl', 'rb') as f:
        model = pickle.load(f)
    mlflow.set_tracking_uri("http://127.0.0.1:52111")
    with mlflow.start_run():
        mlflow.sklearn.log_model(model, "model")
        
    

In [54]:
create_step_mlflow_logging= kfp.components.create_component_from_func(
    func=log_mlflow,
    base_image='python:3.7',
    packages_to_install=['pandas','numpy','scikit-learn','mlflow']
)

In [55]:
def predict_on_test_data():
    import pandas as pd
    import numpy as np
    import sklearn
    import pickle
    print("---- Inside predict_on_test_data component ----")
    with open('data/model.pkl', 'rb') as f:
        model = pickle.load(f)

    X_test = np.load('data/X_test.npy', allow_pickle=True)
    y_pred = model.predict(X_test)
    np.save('data/y_pred.npy', y_pred)

    print("\n---- Predicted classes ----")
    print("\n")
    print(y_pred)


In [56]:
create_step_predict_on_test_data = kfp.components.create_component_from_func(
    func=predict_on_test_data,
    base_image='python:3.7',
    packages_to_install=['pandas','numpy','scikit-learn']
)

In [57]:

"""

def predict_prob_on_test_data():
    import pandas as pd
    import numpy as np
    import sklearn
    
    import pickle
    print("---- Inside predict_prob_on_test_data component ----")
    with open(f'data/model.pkl','rb') as f:
        model = pickle.load(f)
        
    X_test = np.load(f'data/X_test.npy',allow_pickle=True)
    y_pred_prob = model.predict_proba(X_test)
    np.save(f'data/y_pred_prob.npy', y_pred_prob)
    
    print("\n---- Predicted Probabilities ----")
    print("\n")
    print(y_pred_prob)
    
    
    create_step_predict_prob_on_test_data = kfp.components.create_component_from_func(
    func=predict_prob_on_test_data,
    base_image='python:3.7',
    packages_to_install=['pandas','numpy','scikit-learn']
)
   
""" 



'\n\ndef predict_prob_on_test_data():\n    import pandas as pd\n    import numpy as np\n    import sklearn\n    \n    import pickle\n    print("---- Inside predict_prob_on_test_data component ----")\n    with open(f\'data/model.pkl\',\'rb\') as f:\n        model = pickle.load(f)\n        \n    X_test = np.load(f\'data/X_test.npy\',allow_pickle=True)\n    y_pred_prob = model.predict_proba(X_test)\n    np.save(f\'data/y_pred_prob.npy\', y_pred_prob)\n    \n    print("\n---- Predicted Probabilities ----")\n    print("\n")\n    print(y_pred_prob)\n    \n    \n    create_step_predict_prob_on_test_data = kfp.components.create_component_from_func(\n    func=predict_prob_on_test_data,\n    base_image=\'python:3.7\',\n    packages_to_install=[\'pandas\',\'numpy\',\'scikit-learn\']\n)\n   \n'

In [58]:
"""
def get_metrics():
    import mlflow
    from sklearn.metrics import accuracy_score, precision_score, recall_score, log_loss
    import pandas as pd
    import numpy as np
    from sklearn import metrics
    mlflow.set_tracking_uri("http://localhost:5000")

    
    y_true = np.load(f'data/y_test.npy', allow_pickle=True)
    y_pred = np.load(f'data/y_pred.npy', allow_pickle=True)
    y_pred_prob = np.load(f'data/y_pred_prob.npy', allow_pickle=True)

    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    entropy = log_loss(y_true, y_pred_prob)

    metrics_dict = {'accuracy': round(acc, 2), 'precision': round(prec, 2), 'recall': round(recall, 2), 'entropy': round(entropy, 2)}
    print("\n Model Metrics:", metrics_dict)
    
    create_step_get_metrics = kfp.components.create_component_from_func(
    func=get_metrics,
    base_image='python:3.7',
    packages_to_install=['pandas','numpy','scikit-learn','mlflow']
)
    """

'\ndef get_metrics():\n    import mlflow\n    from sklearn.metrics import accuracy_score, precision_score, recall_score, log_loss\n    import pandas as pd\n    import numpy as np\n    from sklearn import metrics\n    mlflow.set_tracking_uri("http://localhost:5000")\n\n    \n    y_true = np.load(f\'data/y_test.npy\', allow_pickle=True)\n    y_pred = np.load(f\'data/y_pred.npy\', allow_pickle=True)\n    y_pred_prob = np.load(f\'data/y_pred_prob.npy\', allow_pickle=True)\n\n    acc = accuracy_score(y_true, y_pred)\n    prec = precision_score(y_true, y_pred)\n    recall = recall_score(y_true, y_pred)\n    entropy = log_loss(y_true, y_pred_prob)\n\n    metrics_dict = {\'accuracy\': round(acc, 2), \'precision\': round(prec, 2), \'recall\': round(recall, 2), \'entropy\': round(entropy, 2)}\n    print("\n Model Metrics:", metrics_dict)\n    \n    create_step_get_metrics = kfp.components.create_component_from_func(\n    func=get_metrics,\n    base_image=\'python:3.7\',\n    packages_to_install=

In [59]:
 
@dsl.pipeline(
   name='Banking Term Deposit classifier pipeline with kuberflow',
   description='A sample pipeline that performs Ramdom Classifer classifier task'
)
# Define parameters to be fed into pipeline
def banking_classifier_pipeline(data_path: str):
    vop = dsl.VolumeOp(
    name="t-vol",
    resource_name="t-vol", 
    size="1Gi", 
    modes=dsl.VOLUME_MODE_RWO)
    
    # Here we are creating persistent volume, with name, size as 1GB and read_write once operation as mode
    
    # Now we will bind all the components
    
    prepare_data_cleaning_task = create_step_load_and_clean_data().add_pvolumes({data_path: vop.volume})
    
    preprocessing_task = create_step_prprocessing().add_pvolumes({data_path: vop.volume}).after(prepare_data_cleaning_task)
    
    
    train_test_split = create_step_train_test_split().add_pvolumes({data_path: vop.volume}).after(preprocessing_task)
    
    classifier_training = create_step_training_basic_classifier().add_pvolumes({data_path: vop.volume}).after(train_test_split)
    
    mlflow_log = create_step_mlflow_logging().add_pvolumes({data_path: vop.volume}).after(classifier_training)
    
    log_predicted_class = create_step_predict_on_test_data().add_pvolumes({data_path: vop.volume}).after(mlflow_log)
    
    #log_predicted_probabilities = create_step_predict_prob_on_test_data().add_pvolumes({data_path: vop.volume}).after(log_predicted_class)
    
    #log_metrics_task = create_step_get_metrics().add_pvolumes({data_path: vop.volume}).after(log_predicted_probabilities)

    # disabling cache and getting outputs of fresh runs
    
    prepare_data_cleaning_task.execution_options.caching_strategy.max_cache_staleness = "P0D"
    preprocessing_task.execution_options.caching_strategy.max_cache_staleness = "P0D"
    train_test_split.execution_options.caching_strategy.max_cache_staleness = "P0D"
   # oversampling_data.execution_options.caching_strategy.max_cache_staleness = "P0D"
    classifier_training.execution_options.caching_strategy.max_cache_staleness = "P0D"
    log_predicted_class.execution_options.caching_strategy.max_cache_staleness = "P0D"
   # log_predicted_probabilities.execution_options.caching_strategy.max_cache_staleness = "P0D"
   # log_metrics_task.execution_options.caching_strategy.max_cache_staleness = "P0D"
    

In [60]:
kfp.compiler.Compiler().compile(
    pipeline_func=banking_classifier_pipeline,
    package_path='kuberflow_mlflow_mixed.yaml')

In [61]:
client = kfp.Client()

In [62]:
DATA_PATH = '/data'

import datetime
print(datetime.datetime.now().date())


pipeline_func = banking_classifier_pipeline
experiment_name = 'banking_classifier_exp' +"_"+ str(datetime.datetime.now().date())
run_name = pipeline_func.__name__ + ' run'
namespace = "kubeflow"

arguments = {"data_path":DATA_PATH}

kfp.compiler.Compiler().compile(pipeline_func,  
  '{}.zip'.format(experiment_name))

run_result = client.create_run_from_pipeline_func(pipeline_func, 
                                                  experiment_name=experiment_name, 
                                                  run_name=run_name, 
                                                  arguments=arguments)

2023-07-25


In [63]:
import mlflow
import pickle
with open('data/model.pkl', 'rb') as f:
        model = pickle.load(f)


FileNotFoundError: [Errno 2] No such file or directory: 'data/model.pkl'