### Tutorial Pipeline for Artifact Generation and File Transfer

You can find the full README at:

https://github.com/ydataai/academy/blob/master/tutorials/Pipelines/README.md


### Input variables

In [None]:
# GCP project that owns the bucket used by this tutorial
gcp_bucket_project = "gcp_project"

# Bucket locations (gs:// URIs) used by the pipeline.
# - input_files_path: folder the input file (Data.csv) is read from; this example
#   reads from the root folder of the bucket (/).
# - output_files_path: folder the generated artifacts are uploaded to; this example
#   uploads to the Telco_runs folder (/Telco_runs/).
input_files_path = "gs://bucket_name/"
output_files_path = "gs://bucket_name/Telco_runs/"

# GCP service account key used to download the samples and upload the results to
# your private bucket. Replace every "XXX" placeholder with the values from your
# own key file.
# NOTE: this is a credential - do not commit a notebook that contains a real key.
gcp_service_account = {
    "type": "service_account",
    "project_id": "XXX",
    "private_key_id": "XXX",
    "private_key": "-----BEGIN PRIVATE KEY-----\nXXX\n-----END PRIVATE KEY-----\n",
    "client_email": "XXX",
    "client_id": "XXX",
    "auth_uri": "XXX",
    "token_uri": "XXX",
    "auth_provider_x509_cert_url": "XXX",
    "client_x509_cert_url": "XXX",
}

# Python packages installed in every pipeline step
# (handed to each component through packages_to_install)
pip_packages_to_install = [
    "scikit-learn==0.22.2",
    "numpy==1.17.2",
    "pandas==1.0.3",
    "xgboost==1.0.2",
    "gcsfs",
    "tensorflow==2.2.0",
    "seaborn==0.9.0",
    "matplotlib==3.1.1",
    "mpld3==0.5.1",
]

# TelcoChurnXGB parameters: default values of the pipeline's XGBoost inputs
n_estimators = 100
verbosity = 0
max_depth = 2
eta = 1
silent = 0

# Kubeflow Pipelines (kfp) names used when uploading and running the pipeline
pipeline_name = "Telco Merchant Churn Prediction Pipeline"
pipeline_description = "Churn predictions using XGBoost Algorithm"
experiment_name = "Telco experiment"
run_name = "Sample Run"


In [None]:
import kfp
import typing


#### Read the Data file from GCS Bucket

In [None]:
from typing import NamedTuple
from kfp.components import *

def read_data(
    df_churn_op: OutputPath(),
    mlpipeline_ui_metadata: OutputPath(),
    in_gcp_bucket_project: str,
    in_gcp_bucket_input_path: str,
    in_gcp_sa_json: dict,
):
    """Download ``Data.csv`` from the input bucket and publish a preview table.

    The complete dataset is written to ``df_churn_op`` for the downstream
    steps. The first five rows of six selected columns are uploaded next to
    the input file as ``Data_Sample.csv`` and referenced from the UI metadata
    as a table artifact.

    Args:
        df_churn_op: Local path the complete dataset is written to (CSV).
        mlpipeline_ui_metadata: Local path the UI metadata JSON is written to.
        in_gcp_bucket_project: GCP project passed to ``gcsfs.GCSFileSystem``.
        in_gcp_bucket_input_path: ``gs://`` folder that contains ``Data.csv``;
            a trailing slash is appended when it is missing.
        in_gcp_sa_json: Service account key; dumped to disk as JSON and
            exposed through ``GOOGLE_APPLICATION_CREDENTIALS``.
    """

    import pandas as pd
    import gcsfs
    from tensorflow.python.lib.io import file_io
    import json
    import os
    import io

    # Write the key first, then point the environment variable at the file that
    # was actually written, so the lookup does not depend on the working directory.
    sa_key_path = os.path.abspath("gcp_sa.json")
    with io.open(sa_key_path, "w", encoding="utf-8") as f:
        json.dump(in_gcp_sa_json, f, ensure_ascii=False)
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = sa_key_path

    # The returned handle is not used below.
    # NOTE(review): presumably kept to connect gcsfs with the configured project
    # before pandas touches the bucket - confirm before removing.
    gcsfs.GCSFileSystem(project=in_gcp_bucket_project)

    # include slash at the end of path
    if not in_gcp_bucket_input_path.endswith("/"):
        in_gcp_bucket_input_path += "/"

    df_churn = pd.read_csv(in_gcp_bucket_input_path + "Data.csv")
    df_churn.to_csv(df_churn_op, index=False)

    # A DataFrame too long cannot be displayed in the Artifacts, so only a
    # small sample is published.
    preview_columns = [
        "customerID",
        "gender",
        "tenure",
        "Contract",
        "TotalCharges",
        "Churn",
    ]
    sample_uri = in_gcp_bucket_input_path + "Data_Sample.csv"
    df_disp = df_churn.iloc[0:5][preview_columns]
    df_disp.to_csv(sample_uri, index=False)

    metadata = {
        "outputs": [
            {
                "type": "table",
                "storage": "gcs",
                "format": "csv",
                # The sample's header is exactly the selected column list.
                "header": list(df_disp.columns),
                "source": sample_uri,
            }
        ]
    }

    # Written both to the fixed root location and to the declared output path.
    for metadata_path in ("/mlpipeline-ui-metadata.json", mlpipeline_ui_metadata):
        with file_io.FileIO(metadata_path, "w") as f:
            json.dump(metadata, f)


In [None]:
# Turn read_data into a container-based pipeline component. The component
# definition is also saved to ./read-data-func.yaml, and the listed pip packages
# are installed in the step before the function runs.
kfp_read_data = kfp.components.func_to_container_op(
    func=read_data,
    output_component_file="./read-data-func.yaml",
    packages_to_install=pip_packages_to_install,
)


#### Statistical Analysis and Artifact Generation for Categorical Data Features

In [None]:
def categorical_analysis(
    df_churn_ip: InputPath(),
    in_gcp_bucket_project: str,
    in_gcp_bucket_output_path: str,
    in_gcp_sa_json: dict,
    mlpipeline_ui_metadata: OutputPath(),
    df_churn_op: OutputPath(),
):
    """Plot the categorical features against churn and publish the graphs.

    Five figures are rendered to HTML with mpld3 and uploaded below
    ``<output path>/Artifacts/Graphs/``; each one is registered as a
    ``web-app`` entry in the UI metadata. The input dataset is passed through
    unchanged to ``df_churn_op``.

    Args:
        df_churn_ip: Local path of the churn dataset (CSV).
        in_gcp_bucket_project: GCP project passed to ``gcsfs.GCSFileSystem``.
        in_gcp_bucket_output_path: ``gs://`` folder the graphs are uploaded
            to; a trailing slash is appended when it is missing.
        in_gcp_sa_json: Service account key; dumped to disk as JSON and
            exposed through ``GOOGLE_APPLICATION_CREDENTIALS``.
        mlpipeline_ui_metadata: Local path the UI metadata JSON is written to.
        df_churn_op: Local path the unmodified dataset is written to (CSV).
    """

    import pandas as pd
    import gcsfs
    import seaborn as sns
    import matplotlib.pyplot as plt
    import warnings
    import mpld3
    import json
    import io
    import os
    from tensorflow.python.lib.io import file_io

    warnings.simplefilter(action="ignore", category=FutureWarning)
    warnings.simplefilter(action="ignore", category=UserWarning)

    # Write the key first, then point the environment variable at the file that
    # was actually written, so the lookup does not depend on the working directory.
    sa_key_path = os.path.abspath("gcp_sa.json")
    with io.open(sa_key_path, "w", encoding="utf-8") as f:
        json.dump(in_gcp_sa_json, f, ensure_ascii=False)
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = sa_key_path

    # The returned handle is not used below.
    # NOTE(review): presumably kept to connect gcsfs with the configured project
    # - confirm before removing.
    gcsfs.GCSFileSystem(project=in_gcp_bucket_project)

    # include slash at the end of path
    if not in_gcp_bucket_output_path.endswith("/"):
        in_gcp_bucket_output_path += "/"

    graphs_uri = in_gcp_bucket_output_path + "Artifacts/Graphs/"
    uploaded_graphs = []  # upload order is the display order in the metadata

    # Helpers are nested so that they are part of this function's own source.
    def upload_current_figure(file_name):
        """Render the current matplotlib figure to HTML and upload it."""
        html = mpld3.fig_to_html(plt.gcf())
        uri = graphs_uri + file_name
        with file_io.FileIO(uri, "w") as f:
            f.write(html)
        uploaded_graphs.append(uri)

    df = pd.read_csv(df_churn_ip)
    # Pass the dataset through before any column is modified below.
    df.to_csv(df_churn_op, index=False)

    sns.set(style="white")
    # Blank TotalCharges entries are treated as 0.
    df["TotalCharges"] = df["TotalCharges"].replace(" ", 0).astype("float32")

    # Churn Plot
    sns.catplot(
        y="Churn",
        kind="count",
        data=df,
        height=2.0,
        aspect=3.0,
        palette="bright",
        legend=True,
    )
    upload_current_figure("churn_plot.html")

    def barplot_percentages(feature, orient="v", axis_name="percentage of customers"):
        """Bar plot of the share of all customers per (feature value, Churn)."""
        g = df.groupby(feature)["Churn"].value_counts().to_frame()
        g = g.rename({"Churn": axis_name}, axis=1).reset_index()
        g[axis_name] = g[axis_name] / len(df)
        if orient == "v":
            ax = sns.barplot(
                x=feature,
                y=axis_name,
                hue="Churn",
                data=g,
                orient=orient,
                palette="bright",
            )
            ax.set_yticklabels(["{:,.0%}".format(y) for y in ax.get_yticks()])
        else:
            ax = sns.barplot(
                x=axis_name,
                y=feature,
                hue="Churn",
                data=g,
                orient=orient,
                palette="bright",
            )
            ax.set_xticklabels(["{:,.0%}".format(x) for x in ax.get_xticks()])
        # No data is added by this call.
        # NOTE(review): presumably it only re-runs autoscaling - confirm before removing.
        ax.plot()

    # Genders
    df["churn_rate"] = df["Churn"].replace("No", 0).replace("Yes", 1)
    grid = sns.FacetGrid(df, col="SeniorCitizen", height=4, aspect=0.9)
    grid.map(
        sns.barplot, "gender", "churn_rate", palette="bright", order=["Female", "Male"]
    )
    upload_current_figure("Genders.html")

    # Multiple Lines
    plt.figure(figsize=(9, 4.5))
    barplot_percentages("MultipleLines", orient="v")
    upload_current_figure("Multiple_lines.html")

    # Service-wise Columns analysis
    cols = [
        "OnlineSecurity",
        "OnlineBackup",
        "DeviceProtection",
        "TechSupport",
        "StreamingTV",
        "StreamingMovies",
    ]
    with_internet = df[df["InternetService"] != "No"]
    services = pd.melt(with_internet[cols]).rename({"value": "Has service"}, axis=1)
    plt.figure(figsize=(10, 4.5))
    ax = sns.countplot(data=services, x="variable", hue="Has service", palette="bright")
    ax.set(xlabel="Additional service", ylabel="Num of customers")
    upload_current_figure("Servicewise_analysis.html")

    # Service-wise Columns analysis2 (churned customers only)
    plt.figure(figsize=(10, 4.5))
    churned = df[(df.InternetService != "No") & (df.Churn == "Yes")]
    churned_services = pd.melt(churned[cols]).rename({"value": "Has service"}, axis=1)
    ax = sns.countplot(
        data=churned_services,
        x="variable",
        hue="Has service",
        hue_order=["No", "Yes"],
        palette="bright",
    )
    ax.set(xlabel="Additional service", ylabel="Num of churns")
    upload_current_figure("Servicewise_analysis2.html")

    # Generating Metadata: one web-app entry per uploaded graph
    metadata = {
        "version": 1,
        "outputs": [
            {"type": "web-app", "storage": "gcs", "source": uri}
            for uri in uploaded_graphs
        ],
    }

    # Written both to the fixed root location and to the declared output path.
    for metadata_path in ("/mlpipeline-ui-metadata.json", mlpipeline_ui_metadata):
        with file_io.FileIO(metadata_path, "w") as f:
            json.dump(metadata, f)


In [None]:
# Turn categorical_analysis into a container-based pipeline component. The
# component definition is also saved to ./categorical_analysis.yaml, and the
# listed pip packages are installed in the step before the function runs.
kfp_categorical_analysis = kfp.components.func_to_container_op(
    func=categorical_analysis,
    output_component_file="./categorical_analysis.yaml",
    packages_to_install=pip_packages_to_install,
)


#### Combined Statistical Analysis and Artifact Generation for Numerical and Categorical Features 

In [None]:
def mixed_analysis(
    df_churn_ip: InputPath(),
    df_churn_ip2: InputPath(),
    in_gcp_bucket_project: str,
    in_gcp_bucket_output_path: str,
    in_gcp_sa_json: dict,
    mlpipeline_ui_metadata: OutputPath(),
    df_churn_op: OutputPath(),
):
    """Plot monthly charges per internet service, split by churn.

    A violin plot is rendered to HTML with mpld3, uploaded to
    ``<output path>/Artifacts/Graphs/violinplot2.html`` and registered as a
    ``web-app`` entry in the UI metadata. The input dataset is passed through
    unchanged to ``df_churn_op``.

    Args:
        df_churn_ip: Local path of the churn dataset (CSV).
        df_churn_ip2: Second dataset input. Its content is not read by this
            step; wiring it to an upstream output only makes this step wait
            for that upstream step.
        in_gcp_bucket_project: GCP project passed to ``gcsfs.GCSFileSystem``.
        in_gcp_bucket_output_path: ``gs://`` folder the graph is uploaded to;
            a trailing slash is appended when it is missing.
        in_gcp_sa_json: Service account key; dumped to disk as JSON and
            exposed through ``GOOGLE_APPLICATION_CREDENTIALS``.
        mlpipeline_ui_metadata: Local path the UI metadata JSON is written to.
        df_churn_op: Local path the unmodified dataset is written to (CSV).
    """

    import pandas as pd
    import gcsfs
    import seaborn as sns
    import matplotlib.pyplot as plt
    import warnings
    import mpld3
    import json
    import io
    import os
    from tensorflow.python.lib.io import file_io

    warnings.simplefilter(action="ignore", category=FutureWarning)
    warnings.simplefilter(action="ignore", category=UserWarning)

    # Write the key first, then point the environment variable at the file that
    # was actually written, so the lookup does not depend on the working directory.
    sa_key_path = os.path.abspath("gcp_sa.json")
    with io.open(sa_key_path, "w", encoding="utf-8") as f:
        json.dump(in_gcp_sa_json, f, ensure_ascii=False)
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = sa_key_path

    # The returned handle is not used below.
    # NOTE(review): presumably kept to connect gcsfs with the configured project
    # - confirm before removing.
    gcsfs.GCSFileSystem(project=in_gcp_bucket_project)

    # include slash at the end of path
    if not in_gcp_bucket_output_path.endswith("/"):
        in_gcp_bucket_output_path += "/"

    df = pd.read_csv(df_churn_ip)
    # Pass the dataset through before any column is modified below.
    df.to_csv(df_churn_op, index=False)

    sns.set(style="white")
    # Blank TotalCharges entries are treated as 0.
    df["TotalCharges"] = df["TotalCharges"].replace(" ", 0).astype("float32")

    # Internet Service vs Monthly Charges
    sns.catplot(
        x="InternetService",
        y="MonthlyCharges",
        hue="Churn",
        kind="violin",
        split=True,
        palette="pastel",
        data=df,
        height=4.2,
        aspect=1.4,
    )

    html = mpld3.fig_to_html(plt.gcf())
    graph_uri = in_gcp_bucket_output_path + "Artifacts/Graphs/violinplot2.html"
    with file_io.FileIO(graph_uri, "w") as f:
        f.write(html)

    metadata = {
        "version": 1,
        "outputs": [{"type": "web-app", "storage": "gcs", "source": graph_uri}],
    }

    # Written both to the fixed root location and to the declared output path.
    for metadata_path in ("/mlpipeline-ui-metadata.json", mlpipeline_ui_metadata):
        with file_io.FileIO(metadata_path, "w") as f:
            json.dump(metadata, f)


In [None]:
# Turn mixed_analysis into a container-based pipeline component. The component
# definition is also saved to ./mixed_analysis.yaml, and the listed pip packages
# are installed in the step before the function runs.
kfp_mixed_analysis = kfp.components.func_to_container_op(
    func=mixed_analysis,
    output_component_file="./mixed_analysis.yaml",
    packages_to_install=pip_packages_to_install,
)


In [None]:
def numerical_analysis(
    df_churn_ip: InputPath(),
    in_gcp_bucket_project: str,
    in_gcp_bucket_output_path: str,
    in_gcp_sa_json: dict,
    mlpipeline_ui_metadata: OutputPath(),
    df_churn_op: OutputPath(),
):
    """Plot the density of ``TotalCharges`` for churned vs. retained customers.

    The KDE plot is rendered to HTML with mpld3, uploaded to
    ``<output path>/Artifacts/Graphs/TotalCharges.html`` and registered as a
    ``web-app`` entry in the UI metadata. The input dataset is passed through
    unchanged to ``df_churn_op``.

    Args:
        df_churn_ip: Local path of the churn dataset (CSV).
        in_gcp_bucket_project: GCP project passed to ``gcsfs.GCSFileSystem``.
        in_gcp_bucket_output_path: ``gs://`` folder the graph is uploaded to;
            a trailing slash is appended when it is missing.
        in_gcp_sa_json: Service account key; dumped to disk as JSON and
            exposed through ``GOOGLE_APPLICATION_CREDENTIALS``.
        mlpipeline_ui_metadata: Local path the UI metadata JSON is written to.
        df_churn_op: Local path the unmodified dataset is written to (CSV).
    """

    import pandas as pd
    import gcsfs
    import seaborn as sns
    import matplotlib.pyplot as plt
    import warnings
    import mpld3
    import json
    import io
    import os
    from tensorflow.python.lib.io import file_io

    warnings.simplefilter(action="ignore", category=FutureWarning)
    warnings.simplefilter(action="ignore", category=UserWarning)

    # Write the key first, then point the environment variable at the file that
    # was actually written, so the lookup does not depend on the working directory.
    sa_key_path = os.path.abspath("gcp_sa.json")
    with io.open(sa_key_path, "w", encoding="utf-8") as f:
        json.dump(in_gcp_sa_json, f, ensure_ascii=False)
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = sa_key_path

    # The returned handle is not used below.
    # NOTE(review): presumably kept to connect gcsfs with the configured project
    # - confirm before removing.
    gcsfs.GCSFileSystem(project=in_gcp_bucket_project)

    # include slash at the end of path
    if not in_gcp_bucket_output_path.endswith("/"):
        in_gcp_bucket_output_path += "/"

    df = pd.read_csv(df_churn_ip)
    # Pass the dataset through before any column is modified below.
    df.to_csv(df_churn_op, index=False)

    sns.set(style="white")
    # Blank TotalCharges entries are treated as 0.
    df["TotalCharges"] = df["TotalCharges"].replace(" ", 0).astype("float32")

    # kdeplots - tenure, Monthly Charges, Total Charges
    def kdeplot(feature):
        """Plot, upload and return the URI of the KDE graph for ``feature``."""
        plt.figure(figsize=(9, 4))
        plt.title("KDE for {}".format(feature))
        for churn_value, color in (("No", "navy"), ("Yes", "orange")):
            sns.kdeplot(
                df[df["Churn"] == churn_value][feature].dropna(),
                color=color,
                label="Churn: {}".format(churn_value),
            )

        html = mpld3.fig_to_html(plt.gcf())
        uri = in_gcp_bucket_output_path + "Artifacts/Graphs/{}.html".format(feature)
        with file_io.FileIO(uri, "w") as f:
            f.write(html)
        return uri

    graph_uri = kdeplot("TotalCharges")

    metadata = {
        "version": 1,
        "outputs": [{"type": "web-app", "storage": "gcs", "source": graph_uri}],
    }

    # Written both to the fixed root location and to the declared output path.
    for metadata_path in ("/mlpipeline-ui-metadata.json", mlpipeline_ui_metadata):
        with file_io.FileIO(metadata_path, "w") as f:
            json.dump(metadata, f)


In [None]:
# Turn numerical_analysis into a container-based pipeline component. The
# component definition is also saved to ./numerical_analysis.yaml, and the
# listed pip packages are installed in the step before the function runs.
kfp_numerical_analysis = kfp.components.func_to_container_op(
    func=numerical_analysis,
    output_component_file="./numerical_analysis.yaml",
    packages_to_install=pip_packages_to_install,
)


#### Data Cleaning and One-Hot Encoding

In [None]:
from typing import NamedTuple
from kfp.components import *

def one_hot_encode(
    df_churn_ip: InputPath(), df_churn_imputed: InputPath(), df_one_hot: OutputPath()
):
    """Clean the churn dataset and encode its categorical columns.

    Blank (single space) cells are treated as missing and the affected rows
    are dropped, ``customerID`` is removed, the Yes/No and gender columns are
    mapped to 1/0, and the multi-valued categorical columns are expanded into
    indicator columns named ``<column>#<value>``.

    Args:
        df_churn_ip: Local path of the raw churn dataset (CSV).
        df_churn_imputed: Second dataset input. Its content is not read by
            this step; wiring it to an upstream output only makes this step
            wait for that upstream step.
        df_one_hot: Local path the encoded dataset is written to (CSV).

    Raises:
        KeyError: If one of the expected dataset columns is missing.
    """

    import pandas as pd
    import numpy as np

    df_churn = pd.read_csv(df_churn_ip)

    # Every column in which a blank cell marks a missing value.
    empty_cols = [
        "customerID",
        "gender",
        "SeniorCitizen",
        "Partner",
        "Dependents",
        "tenure",
        "PhoneService",
        "MultipleLines",
        "InternetService",
        "OnlineSecurity",
        "OnlineBackup",
        "DeviceProtection",
        "TechSupport",
        "StreamingTV",
        "StreamingMovies",
        "Contract",
        "PaperlessBilling",
        "PaymentMethod",
        "MonthlyCharges",
        "TotalCharges",
        "Churn",
    ]
    df_churn[empty_cols] = df_churn[empty_cols].replace(" ", np.nan)

    df_churn = df_churn.drop(columns=["customerID"]).dropna()

    yes_no = {"Yes": 1, "No": 0}
    for col in ["Partner", "Dependents", "PhoneService", "PaperlessBilling"]:
        df_churn[col] = df_churn[col].replace(yes_no)

    # Encoding column 'gender'
    df_churn["gender"] = df_churn["gender"].replace({"Male": 1, "Female": 0})

    category_cols = [
        "PaymentMethod",
        "MultipleLines",
        "InternetService",
        "OnlineSecurity",
        "OnlineBackup",
        "DeviceProtection",
        "TechSupport",
        "StreamingTV",
        "StreamingMovies",
        "Contract",
    ]
    # One call keeps the untouched columns first and appends the indicator
    # columns in the order of category_cols.
    df_churn = pd.get_dummies(df_churn, columns=category_cols, prefix_sep="#")

    df_churn["Churn"] = df_churn["Churn"].replace(yes_no)

    df_churn.to_csv(df_one_hot, index=False)


In [None]:
# Turn one_hot_encode into a container-based pipeline component. The component
# definition is also saved to ./one-hot-encode-func.yaml, and the listed pip
# packages are installed in the step before the function runs.
kfp_one_hot_encode = kfp.components.func_to_container_op(
    func=one_hot_encode,
    output_component_file="./one-hot-encode-func.yaml",
    packages_to_install=pip_packages_to_install,
)


#### Machine Learning Algorithm - XGBoost

In [None]:
from typing import NamedTuple
from kfp.components import *

def xgb_model(
    df_churn_ip: InputPath(),
    n_estimators: int,
    verbosity: int,
    max_depth: int,
    eta: int,
    silent: int,
    conf_matr: OutputPath(),
    mlpipeline_ui_metadata: OutputPath(),
    in_gcp_bucket_project: str,
    in_gcp_sa_json: dict,
    in_gcp_bucket_output_path: str,
    mlpipeline_metrics: OutputPath(),
):
    """Train an XGBoost churn classifier and publish its evaluation artifacts.

    The encoded dataset is split into train and test sets
    (``random_state=0``), an ``XGBClassifier`` is fitted with the given
    hyperparameters, and the test-set confusion matrix and ROC curve are
    uploaded as CSV files below ``<output path>/Artifacts/``. Accuracy,
    precision, recall, F1 and AUC (rounded to four decimals) are written as
    pipeline metrics.

    Args:
        df_churn_ip: Local path of the encoded dataset (CSV) with a ``Churn``
            target column.
        n_estimators: Forwarded to ``XGBClassifier``.
        verbosity: Forwarded to ``XGBClassifier``.
        max_depth: Forwarded to ``XGBClassifier``.
        eta: Forwarded to ``XGBClassifier``.
        silent: Forwarded to ``XGBClassifier``.
        conf_matr: Local path the confusion matrix CSV is written to.
        mlpipeline_ui_metadata: Local path the UI metadata JSON is written to.
        in_gcp_bucket_project: GCP project passed to ``gcsfs.GCSFileSystem``.
        in_gcp_sa_json: Service account key; dumped to disk as JSON and
            exposed through ``GOOGLE_APPLICATION_CREDENTIALS``.
        in_gcp_bucket_output_path: ``gs://`` folder the artifacts are uploaded
            to; a trailing slash is appended when it is missing.
        mlpipeline_metrics: Local path the metrics JSON is written to.
    """

    import pandas as pd
    import xgboost as xgb
    import json
    import os
    import io
    import gcsfs
    from tensorflow.python.lib.io import file_io
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import (
        confusion_matrix,
        accuracy_score,
        roc_auc_score,
        roc_curve,
        precision_score,
        recall_score,
        f1_score,
    )

    # include slash at the end of path
    if not in_gcp_bucket_output_path.endswith("/"):
        in_gcp_bucket_output_path += "/"

    # Write the key first, then point the environment variable at the file that
    # was actually written, so the lookup does not depend on the working directory.
    sa_key_path = os.path.abspath("gcp_sa.json")
    with io.open(sa_key_path, "w", encoding="utf-8") as f:
        json.dump(in_gcp_sa_json, f, ensure_ascii=False)
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = sa_key_path

    def rounded(value):
        """Return ``value`` rounded to four decimals as a plain float."""
        return float("%.4f" % value)

    df_churn = pd.read_csv(df_churn_ip).dropna()

    y1 = df_churn["Churn"]
    X1 = df_churn.drop(["Churn"], axis=1)

    X_train, X_test, y_train, y_test = train_test_split(X1, y1, random_state=0)

    # Use the hyperparameters supplied by the pipeline instead of fixed values.
    clfxg = xgb.XGBClassifier(
        objective="binary:logistic",
        n_estimators=n_estimators,
        verbosity=verbosity,
        max_depth=max_depth,
        eta=eta,
        silent=silent,
    )
    clfxg.fit(X_train, y_train)

    y_test_pred = clfxg.predict(X_test)

    # Column 1 holds the probability of the positive class (Churn == 1), which
    # is the score roc_curve / roc_auc_score expect.
    y_test_proba = clfxg.predict_proba(X_test)[:, 1]

    xgb_score = rounded(accuracy_score(y_test, y_test_pred))
    xgb_precision = rounded(precision_score(y_test, y_test_pred))
    xgb_recall = rounded(recall_score(y_test, y_test_pred))
    xgb_f1 = rounded(f1_score(y_test, y_test_pred))

    print("Accuracy, Precision, Recall, f1: ")
    print(xgb_score, xgb_precision, xgb_recall, xgb_f1)

    cm = confusion_matrix(y_test, y_test_pred)
    print("Confusion Matrix: {}".format(cm))

    fpr, tpr, thresholds = roc_curve(y_test, y_test_proba)
    auc_score = rounded(roc_auc_score(y_test, y_test_proba))
    print("Auc score: ")
    print(auc_score)

    # Converting the matrix to a Dataframe: one row per (target, predicted) pair.
    # The list position is the class label (0 -> "Not Churned", 1 -> "Churned").
    flag_list = ["Not Churned", "Churned"]
    cm_columns = ["target", "predicted", "count"]
    data = [
        (flag_list[target_index], flag_list[predicted_index], count)
        for target_index, target_row in enumerate(cm)
        for predicted_index, count in enumerate(target_row)
    ]

    df_cm = pd.DataFrame(data, columns=cm_columns)
    print(df_cm)

    with file_io.FileIO(conf_matr, "w") as f:
        df_cm.to_csv(f, columns=cm_columns, header=False, index=False)

    # The returned handle is not used below.
    # NOTE(review): presumably kept to connect gcsfs with the configured project
    # - confirm before removing.
    gcsfs.GCSFileSystem(project=in_gcp_bucket_project)

    conf_matrix_uri = in_gcp_bucket_output_path + "Artifacts/XGBConf_mat.csv"
    with file_io.FileIO(conf_matrix_uri, "w") as f:
        df_cm.to_csv(f, columns=cm_columns, header=False, index=False)

    # roc curve
    roc_columns = ["fpr", "tpr", "thresholds"]
    roc_uri = in_gcp_bucket_output_path + "Artifacts/XGBROC_curve.csv"
    df_roc = pd.DataFrame({"fpr": fpr, "tpr": tpr, "thresholds": thresholds})
    with file_io.FileIO(roc_uri, "w") as f:
        df_roc.to_csv(f, columns=roc_columns, header=False, index=False)

    # Artifact generator - metadata
    metadata = {
        "version": 1,
        "outputs": [
            {
                "type": "confusion_matrix",
                "format": "csv",
                "storage": "gcs",
                "schema": [
                    {"name": "target", "type": "CATEGORY"},
                    {"name": "predicted", "type": "CATEGORY"},
                    {"name": "count", "type": "NUMBER"},
                ],
                "source": conf_matrix_uri,
                # Labels are strings so that they match the CSV data.
                "labels": flag_list,
            },
            {
                "type": "roc",
                "format": "csv",
                "storage": "gcs",
                "schema": [
                    {"name": "fpr", "type": "NUMBER"},
                    {"name": "tpr", "type": "NUMBER"},
                    {"name": "thresholds", "type": "NUMBER"},
                ],
                "source": roc_uri,
            },
        ],
    }

    # Written both to the fixed root location and to the declared output path.
    for metadata_path in ("/mlpipeline-ui-metadata.json", mlpipeline_ui_metadata):
        with file_io.FileIO(metadata_path, "w") as f:
            json.dump(metadata, f)

    # Each entry: "name" is shown as the column name in the runs table,
    # "numberValue" must be numeric, and "format" is "RAW" (displayed as is) or
    # "PERCENTAGE" (displayed as a percentage).
    metrics = {
        "metrics": [
            {"name": name, "numberValue": value, "format": "RAW"}
            for name, value in (
                ("accuracy-score", xgb_score),
                ("precision-score", xgb_precision),
                ("recall", xgb_recall),
                ("f1-score", xgb_f1),
                ("auc-score", auc_score),
            )
        ]
    }

    # Written both to the fixed root location and to the declared output path.
    for metrics_path in ("/mlpipeline-metrics.json", mlpipeline_metrics):
        with file_io.FileIO(metrics_path, "w") as f:
            json.dump(metrics, f)


In [None]:
# Turn xgb_model into a container-based pipeline component. The component
# definition is also saved to ./xgb-model-func.yaml, and the listed pip packages
# are installed in the step before the function runs.
kfp_xgb_model = kfp.components.func_to_container_op(
    func=xgb_model,
    output_component_file="./xgb-model-func.yaml",
    packages_to_install=pip_packages_to_install,
)


#### Defining the Pipeline Execution Sequence and Input-Output scheme

In [None]:
import kfp.dsl as dsl

# Create pipeline and set default values
@dsl.pipeline(name=pipeline_name, description=pipeline_description)
def TelcoChurnXGB_func(
    in_n_estimators: int = n_estimators,
    in_verbosity: int = verbosity,
    in_max_depth: int = max_depth,
    in_eta: int = eta,
    in_silent: int = silent,
    in_input_files_path: str = input_files_path,
    in_output_files_path: str = output_files_path,
    in_gcp_bucket_project: str = gcp_bucket_project,
    in_gcp_service_account: dict = gcp_service_account,
):
    # Arguments shared by every step that uploads artifacts to the output folder.
    output_bucket_args = dict(
        in_gcp_bucket_project=in_gcp_bucket_project,
        in_gcp_bucket_output_path=in_output_files_path,
        in_gcp_sa_json=in_gcp_service_account,
    )

    # Step 1: download the raw dataset from the input folder.
    load_step = kfp_read_data(
        in_gcp_bucket_project=in_gcp_bucket_project,
        in_gcp_bucket_input_path=in_input_files_path,
        in_gcp_sa_json=in_gcp_service_account,
    )
    raw_dataset = load_step.outputs["df_churn_op"]

    # Steps 2-4: the analysis steps are chained through their pass-through
    # dataset outputs, so they run one after the other.
    categorical_step = kfp_categorical_analysis(
        df_churn_ip=raw_dataset,
        **output_bucket_args
    )

    mixed_step = kfp_mixed_analysis(
        df_churn_ip=raw_dataset,
        df_churn_ip2=categorical_step.outputs["df_churn_op"],
        **output_bucket_args
    )

    numerical_step = kfp_numerical_analysis(
        df_churn_ip=mixed_step.outputs["df_churn_op"],
        **output_bucket_args
    )

    # Step 5: clean and one-hot encode the raw dataset once the analysis is done.
    encode_step = kfp_one_hot_encode(
        df_churn_ip=raw_dataset,
        df_churn_imputed=numerical_step.outputs["df_churn_op"],
    )

    # Step 6: train and evaluate the XGBoost model on the encoded dataset.
    kfp_xgb_model(
        df_churn_ip=encode_step.outputs["df_one_hot"],
        n_estimators=in_n_estimators,
        verbosity=in_verbosity,
        max_depth=in_max_depth,
        eta=in_eta,
        silent=in_silent,
        **output_bucket_args
    )


#### Compiling the Pipeline

In [None]:
import kfp.compiler as comp

# Compile the pipeline function into a package named after the function:
# TelcoChurnXGB_func.pipeline.tar.gz (written to the current directory).
pipeline_func = TelcoChurnXGB_func
pipeline_filename = pipeline_func.__name__ + ".pipeline.tar.gz"

comp.Compiler().compile(pipeline_func, pipeline_filename)


#### Running the Pipeline

In [None]:
import json
import uuid
from random import randrange

client = kfp.Client()

# check if pipeline already exists -> if not, create a new one
# (named pipeline_filter so the builtin filter() is not shadowed in later cells)
# NOTE(review): "op": 1 is presumably the EQUALS predicate - confirm against the
# Kubeflow Pipelines filter API.
pipeline_filter = json.dumps(
    {"predicates": [{"key": "name", "op": 1, "string_value": pipeline_name}]}
)
pipeline = client.pipelines.list_pipelines(filter=pipeline_filter)

if pipeline.pipelines is None:
    print("Creating a new pipeline: " + pipeline_name)
    pipeline = client.pipeline_uploads.upload_pipeline(
        pipeline_filename, name=pipeline_name, description=pipeline_description
    )
else:
    print("Pipeline already exists: " + pipeline_name)
    pipeline = pipeline.pipelines[0]

# every upload gets a unique version label: "<pipeline name> [<uuid4>]"
pipeline_version = str(uuid.uuid4())
pipeline_version_name = pipeline_name + " [" + pipeline_version + "]"

print("Creating a new pipeline version: " + pipeline_version_name)
client.pipeline_uploads.upload_pipeline_version(
    pipeline_filename,
    name=pipeline_version_name,
    pipelineid=pipeline.id,
)

# create a new experiment or use an existing one
print("Creating a new experiment or use an existing one: " + experiment_name)
experiment = client.create_experiment(name=experiment_name)

# create a new run with a random identifier (0-999) appended to the run name
run_random_id = str(randrange(1000))
run_display_name = run_name + " [" + run_random_id + "]"
print("Creating a new run: " + run_display_name)
new_run = client.run_pipeline(
    experiment.id, run_display_name, pipeline_id=pipeline.id
)
