In [None]:
! pip install kfp
!pip install google-cloud-pipeline-components
!pip install gcsfs
!pip install fsspec
!pip install scikit-learn

In [None]:
# Imports used throughout the notebook.
from google.cloud import aiplatform
from kfp.v2.dsl import pipeline, component, InputPath, OutputPath, Dataset
import joblib
import gcsfs
import fsspec
import pandas as pd
import numpy as np

# Set parameters
project_id = 'ise543-final-project-421906'
location = 'us-central1'

# Initialise the Vertex AI SDK with the project and region defined above.
aiplatform.init(project=project_id, location=location)

In [None]:
# GCS locations of outputs from earlier pipeline runs.
# The first is the executor output JSON of the imputation step; the second is
# the joblib file written by the gradient-boosting training step.
impute_multiple_features_training_artifact_path = (
    "gs://final-bucket-2/795244162107/"
    "final-project-pipeline-20240501182840/"
    "impute-multiple-features-training_-3870435342435745792/"
    "executor_output.json"
)
model_path = (
    "gs://final-bucket-2/795244162107/"
    "final-project-pipeline-20240502212050/"
    "train-gradient-boosting_-5544868406236282880/"
    "trained_model_artifact/model.joblib"
)

In [None]:
# Parse the executor output JSON with pandas, then convert the resulting
# frame into a plain nested dict so it can be navigated by key.
executor_output_frame = pd.read_json(impute_multiple_features_training_artifact_path)
impute_dictionary = executor_output_frame.to_dict()
impute_dictionary

{'artifacts': {'feature_medians': {'artifacts': [{'name': 'projects/795244162107/locations/us-central1/metadataStores/default/artifacts/7653179628138035221',
     'uri': 'gs://final-bucket-2/795244162107/final-project-pipeline-20240501182840/impute-multiple-features-training_-3870435342435745792/feature_medians',
     'metadata': {'medians': {'cigsPerDay': 20.0,
       'totChol': 233.0,
       'glucose': 78.0,
       'BMI': 25.38,
       'heartRate': 75.0},
      'modes': {'education': 1.0, 'BPMeds': 0.0}}}]},
  'imputed_dataset_path': {'artifacts': [{'name': 'projects/795244162107/locations/us-central1/metadataStores/default/artifacts/15032303969185358295',
     'uri': 'gs://final-bucket-2/795244162107/final-project-pipeline-20240501182840/impute-multiple-features-training_-3870435342435745792/imputed_dataset_path',
     'metadata': {}}]}}}

In [None]:
# NOTE: despite its name, this variable holds the whole `metadata` dict of the
# `feature_medians` artifact (both 'medians' and 'modes'), not only the
# cigsPerDay median.
feature_medians_artifact = impute_dictionary['artifacts']['feature_medians']['artifacts'][0]
training_cigsPerDay_median = feature_medians_artifact['metadata']
training_cigsPerDay_median

{'medians': {'cigsPerDay': 20.0,
  'totChol': 233.0,
  'glucose': 78.0,
  'BMI': 25.38,
  'heartRate': 75.0},
 'modes': {'education': 1.0, 'BPMeds': 0.0}}

In [None]:
# Look up the artifact metadata once, then split it into its two sub-dicts.
imputation_metadata = impute_dictionary['artifacts']['feature_medians']['artifacts'][0]['metadata']
medians = imputation_metadata['medians']
modes = imputation_metadata['modes']

# Individual fill values, one per imputed feature.
median_cigsPerDay = medians['cigsPerDay']
median_totChol = medians['totChol']
median_glucose = medians['glucose']
median_BMI = medians['BMI']
median_heartRate = medians['heartRate']
mode_education = modes['education']
mode_BPMeds = modes['BPMeds']

# Echo every value so the cell output documents what will be used downstream.
for label, value in [
    ("Median of Cigarettes Per Day:", median_cigsPerDay),
    ("Median of Total Cholesterol:", median_totChol),
    ("Median of Glucose:", median_glucose),
    ("Median of BMI:", median_BMI),
    ("Median of Heart Rate:", median_heartRate),
    ("Mode of Education:", mode_education),
    ("Mode of BPMeds:", mode_BPMeds),
]:
    print(label, value)



Median of Cigarettes Per Day: 20.0
Median of Total Cholesterol: 233.0
Median of Glucose: 78.0
Median of BMI: 25.38
Median of Heart Rate: 75.0
Mode of Education: 1.0
Mode of BPMeds: 0.0


In [None]:
# Open the model file through a GCS file system handle and deserialize it.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load model files from a trusted location.
fs = gcsfs.GCSFileSystem()

with fs.open(model_path, 'rb') as model_file:
    model = joblib.load(model_file)

model

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [None]:
@component(packages_to_install=["pandas", "numpy", "fsspec", "gcsfs"])
def impute_multiple_features_validation(validation_dataset_path: str,
                                        imputed_dataset_path: OutputPath('Dataset'),
                                        median_cigsPerDay: float,
                                        median_totChol: float,
                                        median_glucose: float,
                                        median_BMI: float,
                                        median_heartRate: float,
                                        mode_education: float,
                                        mode_BPMeds: float):
    """Fill missing values in a dataset using precomputed statistics.

    Reads the CSV at ``validation_dataset_path``, replaces missing values in
    each of the seven columns below with the corresponding argument, and
    writes the result as CSV (without the index) to ``imputed_dataset_path``.

    Args:
        validation_dataset_path: Location of the CSV to impute.
        imputed_dataset_path: Output path for the imputed CSV.
        median_cigsPerDay: Fill value for ``cigsPerDay``.
        median_totChol: Fill value for ``totChol``.
        median_glucose: Fill value for ``glucose``.
        median_BMI: Fill value for ``BMI``.
        median_heartRate: Fill value for ``heartRate``.
        mode_education: Fill value for ``education``.
        mode_BPMeds: Fill value for ``BPMeds``.

    Raises:
        KeyError: If any of the seven columns is absent from the dataset.
    """
    import pandas as pd

    df = pd.read_csv(validation_dataset_path)

    fill_values = {
        'cigsPerDay': median_cigsPerDay,
        'totChol': median_totChol,
        'glucose': median_glucose,
        'BMI': median_BMI,
        'heartRate': median_heartRate,
        'education': mode_education,
        'BPMeds': mode_BPMeds,
    }

    # Assign the filled column back instead of calling
    # `df[col].fillna(..., inplace=True)`: the chained in-place form operates
    # on a temporary Series and does not update `df` under pandas
    # Copy-on-Write, which would silently skip the imputation.
    for column, fill_value in fill_values.items():
        df[column] = df[column].fillna(fill_value)

    df.to_csv(imputed_dataset_path, index=False)

  return component_factory.create_component_from_func(


In [None]:
@component(packages_to_install=["pandas", "numpy", "fsspec", "gcsfs"])
def perform_initial_data_preparation(input_dataset_path: InputPath('Dataset'),
                                     output_dataset_path: OutputPath(Dataset)):
    """Apply the fixed feature transformations and write the result as CSV.

    Steps, in order: drop the ``a1c`` column; cap ``totChol`` at 700 and
    ``BMI`` at 50; replace ``totChol`` and ``income`` with ``log(x + 1)``;
    set ``cigsPerDay`` to 0 where ``currentSmoker`` is 0; one-hot encode with
    ``drop_first=True``.

    Args:
        input_dataset_path: Path of the CSV to transform.
        output_dataset_path: Path the transformed CSV is written to
            (index omitted).
    """
    import numpy as np
    import pandas as pd

    prepared = pd.read_csv(input_dataset_path).drop(columns=["a1c"])

    # Cap the upper tail first, then log-transform the capped cholesterol.
    prepared["totChol"] = np.log(prepared["totChol"].clip(upper=700) + 1)
    prepared["BMI"] = prepared["BMI"].clip(upper=50)
    prepared["income"] = np.log(prepared["income"] + 1)

    # Rows flagged as non-smokers get a cigarette count of zero.
    non_smoker_rows = prepared['currentSmoker'] == 0
    prepared.loc[non_smoker_rows, 'cigsPerDay'] = 0

    encoded = pd.get_dummies(prepared, drop_first=True)
    encoded.to_csv(output_dataset_path, index=False)

In [None]:
@component(packages_to_install=["pandas", "numpy", "scikit-learn", "joblib", "fsspec", "gcsfs"])
def perform_predictions(dataset_for_prediction_path: InputPath('Dataset'),
                        model_path: str,
                        predictions_path: OutputPath('Dataset')):
    """Score a prepared dataset with a stored model and save the predictions.

    The model is loaded with joblib from ``model_path`` via gcsfs. Every
    column except ``patientID`` is passed to ``predict``. The output CSV
    contains only ``patientID`` and ``prediction``.

    Args:
        dataset_for_prediction_path: Path of the CSV to score; must contain a
            ``patientID`` column.
        model_path: Location of the joblib-serialized model.
        predictions_path: Path the predictions CSV is written to
            (index omitted).
    """
    import pandas as pd
    import joblib
    import gcsfs

    # NOTE(review): joblib.load unpickles the file, which can execute
    # arbitrary code; `model_path` must point at a trusted artifact.
    with gcsfs.GCSFileSystem().open(model_path, 'rb') as model_file:
        trained_model = joblib.load(model_file)

    scoring_frame = pd.read_csv(dataset_for_prediction_path)

    # The identifier is not a model feature; keep it only to label the output.
    feature_frame = scoring_frame.drop('patientID', axis=1)
    predicted_labels = trained_model.predict(feature_frame)

    predictions = scoring_frame[['patientID']].assign(prediction=predicted_labels)
    predictions.to_csv(predictions_path, index=False)

In [None]:
from kfp.v2.dsl import pipeline, Output, Dataset
# Model artifact used as the default for the pipeline's `model_uri` parameter.
model_path = 'gs://final-bucket-2/795244162107/final-project-pipeline-20240502212050/train-gradient-boosting_-5544868406236282880/trained_model_artifact/model.joblib'

# Read the imputation metadata once and unpack the per-feature fill values
# that serve as defaults for the pipeline parameters below.
_imputation_metadata = impute_dictionary['artifacts']['feature_medians']['artifacts'][0]['metadata']
_training_medians = _imputation_metadata['medians']
_training_modes = _imputation_metadata['modes']

median_cigsPerDay = _training_medians['cigsPerDay']
median_totChol = _training_medians['totChol']
median_glucose = _training_medians['glucose']
median_BMI = _training_medians['BMI']
median_heartRate = _training_medians['heartRate']
mode_education = _training_modes['education']
mode_BPMeds = _training_modes['BPMeds']


@pipeline(name='final_project_inference_pipeline')
def final_project_inference_pipeline(dataset_for_predictions_path: str,
                               median_cigsPerDay: float = median_cigsPerDay,
                               median_totChol: float = median_totChol,
                               median_glucose: float = median_glucose,
                               median_BMI: float = median_BMI,
                               median_heartRate: float = median_heartRate,
                               mode_education: float = mode_education,
                               mode_BPMed: float = mode_BPMeds,
                               model_uri: str = model_path):
    """Inference pipeline: impute, prepare features, then predict.

    Chains three components: ``impute_multiple_features_validation`` ->
    ``perform_initial_data_preparation`` -> ``perform_predictions``.

    Args:
        dataset_for_predictions_path: Location of the CSV to score.
        median_cigsPerDay: Fill value for missing ``cigsPerDay``.
        median_totChol: Fill value for missing ``totChol``.
        median_glucose: Fill value for missing ``glucose``.
        median_BMI: Fill value for missing ``BMI``.
        median_heartRate: Fill value for missing ``heartRate``.
        mode_education: Fill value for missing ``education``.
        mode_BPMed: Fill value for missing ``BPMeds``.
        model_uri: Location of the joblib-serialized model to load.
    """
    imputed_data = impute_multiple_features_validation(
        validation_dataset_path=dataset_for_predictions_path,
        median_cigsPerDay=median_cigsPerDay,
        median_totChol=median_totChol,
        median_glucose=median_glucose,
        median_BMI=median_BMI,
        median_heartRate=median_heartRate,
        mode_education=mode_education,
        # The pipeline parameter is spelled `mode_BPMed` (no trailing "s").
        # Forward that parameter, not the notebook-level `mode_BPMeds`
        # variable, so a value supplied at submission time takes effect.
        mode_BPMeds=mode_BPMed
    )

    test_data_preparation = perform_initial_data_preparation(
        input_dataset_path=imputed_data.outputs['imputed_dataset_path']
    )

    perform_predictions(
        dataset_for_prediction_path=test_data_preparation.outputs['output_dataset_path'],
        model_path=model_uri
    )



In [23]:
from kfp.v2 import compiler

# Compile the pipeline function to a JSON spec, then submit that same spec
# file to Vertex AI as a pipeline job.
pipeline_spec_path = 'final_project_inference_pipeline.json'

compiler.Compiler().compile(
    pipeline_func=final_project_inference_pipeline,
    package_path=pipeline_spec_path,
)

# Only the dataset location is supplied; every other pipeline parameter
# falls back to its declared default.
run_parameters = {
    'dataset_for_predictions_path': 'gs://final-bucket-2/Final Project Evaluation Dataset - Student.csv',
}

pipeline_job = aiplatform.PipelineJob(
    display_name='final_project_inference_pipeline',
    template_path=pipeline_spec_path,
    pipeline_root='gs://final-bucket-2',
    parameter_values=run_parameters,
    enable_caching=True,
)

pipeline_job.run()

INFO:google.cloud.aiplatform.pipeline_jobs:Creating PipelineJob
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob created. Resource name: projects/795244162107/locations/us-central1/pipelineJobs/final-project-inference-pipeline-20240503011057
INFO:google.cloud.aiplatform.pipeline_jobs:To use this PipelineJob in another session:
INFO:google.cloud.aiplatform.pipeline_jobs:pipeline_job = aiplatform.PipelineJob.get('projects/795244162107/locations/us-central1/pipelineJobs/final-project-inference-pipeline-20240503011057')
INFO:google.cloud.aiplatform.pipeline_jobs:View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/final-project-inference-pipeline-20240503011057?project=795244162107
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/795244162107/locations/us-central1/pipelineJobs/final-project-inference-pipeline-20240503011057 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.pipeline_jobs:Pipeli