In [5]:
from typing import NamedTuple

import kfp
import kfp.components as components
import kfp.dsl as dsl
#import kubeflow.fairing.utils
from kfp.dsl import InputPath, OutputPath

In [49]:
def get_data(url_path:InputPath(str),data_dir:OutputPath(str)):
    """Load the source CSV, split it, and write the two halves into ``data_dir``.

    Args:
        url_path: Location handed straight to ``pandas.read_csv``.
            NOTE(review): this is declared as ``InputPath(str)``, so the
            pipeline passes a path to a local file holding the argument value,
            not the value itself -- confirm this is the intended contract.
        data_dir: Output directory. ``train.csv`` and ``test.csv`` are created
            inside it (the directory is created when missing).

    Raises:
        Any exception raised by ``pandas.read_csv``. It is deliberately not
        swallowed: continuing without a DataFrame would only fail later with an
        unrelated ``NameError`` and hide the real cause.
    """
    import os

    import pandas as pd
    from sklearn.model_selection import train_test_split

    df = pd.read_csv(url_path)

    os.makedirs(data_dir, exist_ok=True)

    # Default split proportions; no seed is set, so every run yields a different split.
    train, test = train_test_split(df)

    # index=False keeps the row index out of the files, so readers do not pick
    # it up as an extra "Unnamed: 0" column.
    train.to_csv(os.path.join(data_dir, "train.csv"), index=False)
    test.to_csv(os.path.join(data_dir, "test.csv"), index=False)

In [50]:
def train_model(data_dir:InputPath(str),model_dir:OutputPath(str)):
    """Fit a random forest on ``train.csv`` and pickle it to ``model_dir/model.pkl``.

    Args:
        data_dir: Directory containing ``train.csv``; the file must have a
            ``quality`` column, which is used as the target.
        model_dir: Output directory for the pickled model (created when missing).
    """
    import os
    import pickle

    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier

    train = pd.read_csv(os.path.join(data_dir, "train.csv"))
    train_x = train.drop(["quality"], axis=1)
    # A 1-D Series is the target shape scikit-learn expects; a single-column
    # DataFrame triggers a DataConversionWarning.
    train_y = train["quality"]

    rf = RandomForestClassifier()
    rf.fit(train_x, train_y)

    os.makedirs(model_dir, exist_ok=True)
    with open(os.path.join(model_dir, "model.pkl"), "wb") as f:
        pickle.dump(rf, f)

In [58]:
def evaluate_model(
    data_dir: InputPath(str), model_dir: InputPath(str), metrics_path: OutputPath(str)
) -> NamedTuple("EvaluationOutput", [("mlpipeline_metrics", "Metrics")]):
    """Score the pickled model on ``test.csv`` and publish its accuracy.

    Args:
        data_dir: Directory containing ``test.csv`` (needs a ``quality`` column).
        model_dir: Directory containing ``model.pkl``.
        metrics_path: File the metrics JSON is written to.

    Returns:
        An ``EvaluationOutput`` named tuple whose single ``mlpipeline_metrics``
        field holds the same metrics serialized as a JSON string.
    """
    import json
    import os
    import pickle
    from collections import namedtuple

    import pandas as pd
    from sklearn.metrics import accuracy_score

    test = pd.read_csv(os.path.join(data_dir, "test.csv"))
    test_x = test.drop(["quality"], axis=1)
    test_y = test["quality"]

    # pickle executes code on load: only use this with model files produced by
    # a trusted upstream step.
    with open(os.path.join(model_dir, "model.pkl"), "rb") as f:
        model = pickle.load(f)

    pred_y = model.predict(test_x)
    accuracy = accuracy_score(test_y, pred_y) * 100

    # NOTE(review): the value is already scaled to 0-100; confirm how the
    # "PERCENTAGE" format is rendered so it is not scaled a second time.
    metrics = {
        'metrics': [
            {
                'name': "accuracy_score",
                'numberValue': float(accuracy),
                'format': "PERCENTAGE",
            }
        ]
    }
    with open(metrics_path, 'w') as f:
        json.dump(metrics, f)

    example_outputs = namedtuple('EvaluationOutput', ['mlpipeline_metrics'])
    return example_outputs(json.dumps(metrics))

In [67]:
def export_model(model_dir:InputPath(str),metrics_path:InputPath(str),export_bucket:str,model_name:str,model_version:int):
    """Upload every file under ``model_dir`` to an S3-compatible bucket.

    Files land under ``<model_name>/<model_version>/<path relative to model_dir>``.
    The bucket is created when it does not exist yet, and the bucket contents
    are printed once the upload is finished.

    Args:
        model_dir: Local directory tree to upload.
        metrics_path: Not read by this function; declaring it as an input makes
            the export step depend on the step that produces the metrics.
        export_bucket: Destination bucket name.
        model_name: First key prefix component.
        model_version: Second key prefix component.

    Connection settings default to the in-cluster MinIO values and can be
    overridden with the ``MINIO_ENDPOINT_URL``, ``MINIO_ACCESS_KEY`` and
    ``MINIO_SECRET_KEY`` environment variables, so real credentials never need
    to be written into the notebook.
    """
    import os

    import boto3
    from botocore.client import Config

    s3 = boto3.client(
        "s3",
        endpoint_url=os.environ.get("MINIO_ENDPOINT_URL", "http://minio.kubeflow"),
        aws_access_key_id=os.environ.get("MINIO_ACCESS_KEY", "minio"),
        aws_secret_access_key=os.environ.get("MINIO_SECRET_KEY", "minio123"),
        config=Config(signature_version="s3v4"),
    )

    existing_buckets = {bucket["Name"] for bucket in s3.list_buckets()["Buckets"]}
    if export_bucket not in existing_buckets:
        # NOTE(review): the bucket and the uploaded objects are world-readable
        # (and the bucket world-writable); confirm that is intended.
        s3.create_bucket(ACL="public-read-write", Bucket=export_bucket)

    for root, _dirs, files in os.walk(model_dir):
        for filename in files:
            local_path = os.path.join(root, filename)
            s3_path = os.path.relpath(local_path, model_dir)

            s3.upload_file(
                local_path,
                export_bucket,
                f"{model_name}/{model_version}/{s3_path}",
                ExtraArgs={"ACL": "public-read"},
            )

    # list_buckets() takes no Bucket argument and returns no object listing;
    # listing objects is a separate call. A single call returns at most one
    # page of keys, which is enough for this informational printout.
    response = s3.list_objects_v2(Bucket=export_bucket)
    print(f"All objects in {export_bucket}")
    # "Contents" is absent for an empty bucket, and the per-object field is "Key".
    for obj in response.get("Contents", []):
        print("{}/{}".format(export_bucket, obj["Key"]))

In [68]:
# Container image passed as base_image to every func_to_container_op call in this notebook.
BASE_IMAGE = "mesosphere/kubeflow:1.2.0-tensorflow-2.4.0"

In [69]:
def train_and_serve(input_bucket:str,model_dir:str,data_dir:str,export_bucket:str,model_name:str,model_version:int):
    """Wire the download -> train -> evaluate -> export -> serve steps together.

    Args:
        input_bucket: Value passed to the ``get_data`` step.
        model_dir: Unused here; step outputs are passed between tasks directly.
        data_dir: Unused here, for the same reason.
        export_bucket: Bucket the trained model is uploaded to and served from.
        model_name: Key prefix of the uploaded model and name of the served model.
        model_version: Version component of the upload key.

    Relies on ``kfserving`` (the serving component factory) and ``NAMESPACE``
    being defined elsewhere in the notebook; neither is created in this function.
    """
    downloadOp = components.func_to_container_op(get_data, base_image=BASE_IMAGE)(
        input_bucket
    ).set_retry(5)

    trainOp = components.func_to_container_op(train_model, base_image=BASE_IMAGE)(
        downloadOp.output
    )

    evaluateOp = components.func_to_container_op(evaluate_model, base_image=BASE_IMAGE)(
        downloadOp.output, trainOp.output
    )

    # export_model takes (model_dir, metrics_path, export_bucket, model_name,
    # model_version). The evaluation task exposes two outputs, so the metrics
    # file has to be selected by name: the "_path" suffix of the `metrics_path`
    # parameter is stripped when the component is generated, giving "metrics".
    exportOp = components.func_to_container_op(export_model, base_image=BASE_IMAGE)(
        trainOp.output,
        evaluateOp.outputs["metrics"],
        export_bucket,
        model_name,
        model_version,
    )

    # NOTE(review): the exported artifact is a pickled scikit-learn model while
    # the serving framework is set to "tensorflow" -- confirm this is intended.
    kfservingOp = kfserving(
        action="apply",
        model_uri=f"s3://{export_bucket}/{model_name}",
        model_name=model_name,
        namespace=NAMESPACE,
        framework="tensorflow",
        watch_timeout="300",
    )
    kfservingOp.after(exportOp)
    

In [None]:
@dsl.pipeline(
    name="End-to-End WineQuality Pipeline",
    description="A sample pipeline to demonstrate multi-step model training, evaluation, export, and serving",
)
def Wine_pipeline(input_bucket:str='wine-ml-model',
                  model_dir='/train/model',
                  data_dir='/train/data',
                  export_bucket='WineQualityBucket',
                  model_name:str="Wine-Model",
                  model_version:int=1,
                 ):
    """Pipeline entry point: forwards every parameter to ``train_and_serve``.

    NOTE(review): the default ``export_bucket`` and ``model_name`` contain
    upper-case characters; S3-style bucket names and Kubernetes resource names
    are normally restricted to lower case -- confirm these defaults are accepted
    by the target cluster.
    """
    train_and_serve(input_bucket=input_bucket,
                    model_dir=model_dir,
                    data_dir=data_dir,
                    export_bucket=export_bucket,
                    model_name=model_name,
                    model_version=model_version)


In [None]:
# Submit the pipeline as a one-off run and block until it finishes.
# NOTE(review): INPUT_BUCKET, EXPORT_BUCKET and NAMESPACE are not defined in any
# visible cell; they must exist in the kernel before this cell is executed.
pipeline_func=Wine_pipeline
# The run name is the function name with "run" appended directly (no separator).
run_name=pipeline_func.__name__ + "run"
experiment_name="end to end Wine Quality"
# Values for the pipeline parameters declared on Wine_pipeline.
arguments = {
    "input_bucket": INPUT_BUCKET,
    "model_dir": "/train/model",
    "data_dir": "/train/data",
    "export_bucket": EXPORT_BUCKET,
    "model_name": "Wine-Model",
    "model_version": "1",
}
client=kfp.Client()
run_result=client.create_run_from_pipeline_func(
    pipeline_func,
    experiment_name=experiment_name,
    run_name=run_name,
    arguments=arguments,
    namespace=NAMESPACE
)
# Wait for the run to finish; the timeout argument is 900.
completed_run = run_result.wait_for_run_completion(timeout=900)
# Last expression of the cell: the final run status is shown as the cell output.
completed_run.run.status

In [1]:
import os

import pandas as pd

# Location of the breast-cancer CSV. Set the BREAST_CANCER_CSV environment
# variable to run this notebook on another machine; the fallback is the
# original author's local path.
DATA_PATH = os.environ.get(
    "BREAST_CANCER_CSV",
    r"C:\Users\krishna\Desktop\Datasets\Breast_cancer_data.csv",
)
df=pd.read_csv(DATA_PATH)

In [2]:
df.head()

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,diagnosis
0,17.99,10.38,122.8,1001.0,0.1184,0
1,20.57,17.77,132.9,1326.0,0.08474,0
2,19.69,21.25,130.0,1203.0,0.1096,0
3,11.42,20.38,77.58,386.1,0.1425,0
4,20.29,14.34,135.1,1297.0,0.1003,0


In [4]:
import numpy as np

# Preview the first rows. This has to be the last expression of the cell,
# otherwise the notebook evaluates it and discards the result without display.
df.head()

In [5]:
df.isna().any()

mean_radius        False
mean_texture       False
mean_perimeter      True
mean_area           True
mean_smoothness    False
diagnosis          False
dtype: bool

In [6]:
# Replace the missing values of the two incomplete columns with that column's mean.
for column in ('mean_perimeter', 'mean_area'):
    df[column] = df[column].fillna(np.mean(df[column]))

In [7]:
df.isna().any()

mean_radius        False
mean_texture       False
mean_perimeter     False
mean_area          False
mean_smoothness    False
diagnosis          False
dtype: bool

In [8]:
# Every column except the last is a feature; the last column is the target.
x, y = df.iloc[:, :-1].values, df.iloc[:, -1].values

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pickle

In [10]:
# Hold out a quarter of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=100,
)

In [13]:
import os

# Seed the forest so that re-running the notebook reproduces the same model
# (and therefore the same predictions and scores further down).
rf = RandomForestClassifier(random_state=100)
rf.fit(x_train, y_train)

# open(..., 'wb') does not create missing parent directories, so make sure the
# target folder exists before writing the pickle.
model_path = os.path.join('folder', 'data', 'model_file.pkl')
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, 'wb') as f:
    pickle.dump(rf, f)

In [15]:
# Predict labels for the held-out test features with the fitted forest.
y_pred=rf.predict(x_test)

In [16]:
y_pred

array([0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1], dtype=int64)

In [17]:
from sklearn.metrics import accuracy_score

# Share of test rows predicted correctly, expressed on a 0-100 scale.
accuracy = 100 * accuracy_score(y_test, y_pred)

In [18]:
accuracy

93.00699300699301

In [21]:
from sklearn.metrics import confusion_matrix

# Confusion matrix over the test set, with the label order pinned to [0, 1].
cm = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[0, 1])

In [22]:
cm

array([[51,  5],
       [ 5, 82]], dtype=int64)

In [24]:
# Sorted distinct labels present in the test targets; used below to map
# confusion-matrix indices back to label values.
vocab=np.unique(y_test)
vocab

array([0, 1], dtype=int64)

In [28]:
# Start from an empty list of confusion-matrix records.
data=[]

In [31]:
# Flatten the confusion matrix into (predicted_label, target_label, count)
# records: the outer index of `cm` is used as the target, the inner as the prediction.
# The list is rebuilt from scratch on every execution, so re-running this cell
# cannot append a second copy of the records.
data = [
    (vocab[predicted_index], vocab[target_index], count)
    for target_index, target_row in enumerate(cm)
    for predicted_index, count in enumerate(target_row)
]

In [32]:
data

[(0, 0, 51), (1, 0, 5), (0, 1, 5), (1, 1, 82)]

In [83]:
import boto3

# SECURITY: credentials must never be written into a notebook. With no explicit
# keys, boto3 resolves them from its standard chain (AWS_ACCESS_KEY_ID /
# AWS_SECRET_ACCESS_KEY environment variables, the shared credentials file, or
# an attached IAM role).
# The access key pair that used to be embedded in this cell has been exposed to
# everyone with access to the notebook or its history: revoke and rotate it.
s3 = boto3.client("s3")

# Upload the pickled model produced earlier in the notebook.
s3.upload_file(
    "folder/data/model_file.pkl",
    "elasticbeanstalk-ap-south-1-779156938541",
    "data/model_file.pkl",
)