### Telco Customer Churn Prediction using Random Forest

In [1]:
import kfp
import typing

#### Read the Data file from GCS Bucket

In [2]:
#### Read the data file from the GCS bucket

from typing import NamedTuple
from kfp.components import *

def read_data(file_name: str, df_churn_op: OutputPath(), mlpipeline_ui_metadata: OutputPath()):
    """Load the churn CSV, hand it to the next step and publish a five-row preview table.

    Args:
        file_name: Location of the source CSV, passed straight to ``pandas.read_csv``.
        df_churn_op: Output path that receives the complete dataset as CSV.
        mlpipeline_ui_metadata: Output path that receives the JSON describing the
            preview table artifact.
    """
    # The imports stay inside the function: it is converted into a component with
    # func_to_container_op, so it has to bring in everything it uses itself.
    import pandas as pd
    import gcsfs
    from tensorflow.python.lib.io import file_io
    import json

    sample_uri = 'gs://pipelines_artifacts/Artifacts/Data_Sample.csv'
    sample_columns = ['customerID', 'gender', 'tenure', 'Contract', 'TotalCharges', 'Churn']

    # NOTE(review): this handle is never used below; presumably it is created to
    # check GCS credentials early - confirm before removing.
    fs = gcsfs.GCSFileSystem(project='YDataSynthetic', token = 'cloud')

    df_churn = pd.read_csv(file_name)
    df_churn.to_csv(df_churn_op, index=False)

    # A long DataFrame cannot be displayed in the Artifacts view, so only the
    # first five rows of a handful of columns are uploaded as the preview.
    df_disp = df_churn.iloc[0:5][sample_columns]
    df_disp.to_csv(sample_uri, index=False)

    # The table header is exactly the list of columns written to the preview
    # file, so there is no need to read the file back from GCS to obtain it.
    metadata = {
        'outputs' : [{
            'type': 'table',
            'storage': 'gcs',
            'format': 'csv',
            'header': list(sample_columns),
            'source': sample_uri
        }]
    }

    # The same JSON is written to the container root and to the declared output path.
    for target in ('/mlpipeline-ui-metadata.json', mlpipeline_ui_metadata):
        with file_io.FileIO(target, 'w') as f:
            json.dump(metadata, f)
        
    

In [3]:
# Turn read_data into a component factory; the component definition is also
# saved to ./read-data-func.yaml.
kfp_read_data = kfp.components.func_to_container_op(
    func=read_data,
    output_component_file='./read-data-func.yaml',
    packages_to_install=['numpy==1.17.2', 'pandas==1.0.3', 'gcsfs'],
)


#### Statistical Analysis and Artifact Generation for Categorical Data Features

In [4]:
def categorical_analysis(df_churn_ip :InputPath(), mlpipeline_ui_metadata: OutputPath(), df_churn_op :OutputPath()):
    """Plot the categorical churn features and publish the plots as web-app artifacts.

    Two figures are produced and uploaded as HTML: a count plot of ``Churn`` and a
    pair of bar plots showing the share of customers per ``Partner`` / ``Dependents``
    value, split by ``Churn``. The input data is forwarded unchanged.

    Args:
        df_churn_ip: Path of the input churn CSV.
        mlpipeline_ui_metadata: Output path that receives the artifact metadata JSON.
        df_churn_op: Output path that receives an unmodified copy of the input CSV.
    """
    # The imports and helpers stay inside the function: it is converted into a
    # component with func_to_container_op, so it has to be self-contained.
    import pandas as pd
    import gcsfs
    import seaborn as sns
    import matplotlib.pyplot as plt

    import warnings
    warnings.simplefilter(action='ignore', category=FutureWarning)
    warnings.simplefilter(action='ignore', category=UserWarning)

    import mpld3
    from tensorflow.python.lib.io import file_io
    import json

    graphs_uri = 'gs://pipelines_artifacts/Artifacts/Graphs/'
    axis_y = "percentage of customers"

    def publish_current_figure(file_name):
        """Convert the current figure to HTML with mpld3, upload it and return its URI."""
        # Seaborn plots cannot be viewed directly on the pipelines dashboard;
        # they are converted to .html and uploaded for the artifact viewer.
        html = mpld3.fig_to_html(plt.gcf())
        uri = graphs_uri + file_name
        with file_io.FileIO(uri, 'w') as f:
            f.write(html)
        return uri

    def churn_share_by(frame, feature):
        """Return, per (feature value, Churn value), the count divided by the number of rows."""
        shares = frame.groupby(feature)["Churn"].value_counts() / len(frame)
        return shares.to_frame().rename({"Churn": axis_y}, axis=1).reset_index()

    # NOTE(review): this handle is never used below; presumably it is created to
    # check GCS credentials early - confirm before removing.
    fs = gcsfs.GCSFileSystem(project='YDataSynthetic', token = 'cloud')

    df = pd.read_csv(df_churn_ip)
    # Forward the data before anything below modifies the frame.
    df.to_csv(df_churn_op, index=False)

    sns.set(style="white")
    # Blank TotalCharges entries become 0 so the column can be cast to float.
    # None of the plots below read this column.
    df['TotalCharges'] = df['TotalCharges'].replace(" ", 0).astype('float32')

    # Churn plot
    sns.catplot(y="Churn", kind="count", data=df, height=2.0, aspect=3.0, palette = 'bright',
                legend = True)
    churn_plot_uri = publish_current_figure('churn_plot.html')

    # Partners and Dependents
    fig, axis = plt.subplots(1, 2, figsize=(12,4))
    panels = (('Partner', "Has partner"), ('Dependents', "Has dependents"))
    for panel_axis, (feature, title) in zip(axis, panels):
        panel_axis.set_title(title)
        sns.barplot(x=feature, y=axis_y, hue='Churn', data=churn_share_by(df, feature),
                    ax=panel_axis, palette = 'bright')
    partners_uri = publish_current_figure('Partners_dependents.html')

    # A metadata json file has to be generated and dumped to the root level of the
    # container for artifacts to be generated. Metadata supports the following
    # artifacts: static HTML, ROC curve, confusion matrix, tables, markdown.
    metadata = {
        'version' : 1,   #Check the Version of your Kubeflow
        'outputs' : [
            {'type': 'web-app', 'storage': 'gcs', 'source': churn_plot_uri},
            {'type': 'web-app', 'storage': 'gcs', 'source': partners_uri},
        ]
    }

    # The file at the container root must have exactly this name; when
    # troubleshooting artifacts, first check that it was created in the container.
    # The same JSON is also written to the declared output path.
    for target in ('/mlpipeline-ui-metadata.json', mlpipeline_ui_metadata):
        with file_io.FileIO(target, 'w') as f:
            json.dump(metadata, f)



In [5]:
# Turn categorical_analysis into a component factory; the component definition
# is also saved to ./categorical_analysis.yaml.
kfp_categorical_analysis = kfp.components.func_to_container_op(
    func=categorical_analysis,
    output_component_file='./categorical_analysis.yaml',
    packages_to_install=[
        'gcsfs',
        'scikit-learn==0.22.2',
        'numpy==1.17.2',
        'pandas==1.0.3',
        'seaborn==0.9.0',
        'matplotlib==3.1.1',
        'mpld3==0.5.1',
    ],
)

#### Combined Statistical Analysis and Artifact Generation for Numerical and Categorical Features 

In [6]:
def mixed_analysis(df_churn_ip :InputPath(), df_churn_ip2 :InputPath(),
                   mlpipeline_ui_metadata: OutputPath(), df_churn_op :OutputPath()):
    """Publish a violin plot of MonthlyCharges per MultipleLines value, split by Churn.

    Args:
        df_churn_ip: Path of the churn CSV that is analysed and forwarded.
        df_churn_ip2: Path of a second CSV; it is read but not used afterwards.
        mlpipeline_ui_metadata: Output path that receives the artifact metadata JSON.
        df_churn_op: Output path that receives an unmodified copy of ``df_churn_ip``.
    """
    ## Import Required Libraries
    import pandas as pd
    import numpy as np
    import gcsfs
    import seaborn as sns
    import matplotlib.pyplot as plt
    from sklearn.ensemble import RandomForestClassifier

    import warnings
    for ignored_category in (FutureWarning, UserWarning):
        warnings.simplefilter(action='ignore', category=ignored_category)

    import mpld3
    from tensorflow.python.lib.io import file_io
    import json

    violin_uri = 'gs://pipelines_artifacts/Artifacts/Graphs/violinplot1.html'

    gcs = gcsfs.GCSFileSystem(project='YDataSynthetic', token = 'cloud')

    churn = pd.read_csv(df_churn_ip)
    # NOTE(review): the second input is loaded and then ignored; presumably it
    # only exists to make this step wait for the step that produces it - confirm.
    upstream = pd.read_csv(df_churn_ip2)

    # Forward an untouched copy before the frame is modified below.
    churn.copy(deep=True).to_csv(df_churn_op, index=False)

    sns.set(style="white")
    # Blank TotalCharges entries become 0 so the column can be cast to float.
    churn['TotalCharges'] = churn['TotalCharges'].replace(" ", 0).astype('float32')

    # Derived columns; none of them is read by the plot below.
    charges_per_tenure = churn['TotalCharges'] / churn['tenure']
    churn['total_charges_to_tenure_ratio'] = charges_per_tenure
    churn['monthly_charges_diff'] = churn['MonthlyCharges'] - charges_per_tenure
    churn['churn_rate'] = churn['Churn'].replace("No", 0).replace("Yes", 1)

    # Multiple-Lines vs Monthly Charges
    sns.catplot(x="MultipleLines", y="MonthlyCharges", hue="Churn", kind="violin",
                split=True, palette="pastel", data=churn, height=4.2, aspect=1.4)

    # mpld3 converts the current figure to HTML, which is then uploaded.
    violin_html = mpld3.fig_to_html(plt.gcf())
    with file_io.FileIO(violin_uri, 'w') as handle:
        handle.write(violin_html)

    ui_metadata = {
        'version' : 1,
        'outputs' : [
            {'type': 'web-app', 'storage': 'gcs', 'source': violin_uri},
        ]
    }

    # Written first to the container root, then to the declared output path.
    for destination in ('/mlpipeline-ui-metadata.json', mlpipeline_ui_metadata):
        with file_io.FileIO(destination, 'w') as handle:
            json.dump(ui_metadata, handle)




In [7]:
# Turn mixed_analysis into a component factory; the component definition is
# also saved to ./mixed_analysis.yaml.
kfp_mixed_analysis = kfp.components.func_to_container_op(
    func=mixed_analysis,
    output_component_file='./mixed_analysis.yaml',
    packages_to_install=[
        'gcsfs',
        'scikit-learn==0.22.2',
        'numpy==1.17.2',
        'pandas==1.0.3',
        'seaborn==0.9.0',
        'matplotlib==3.1.1',
        'mpld3==0.5.1',
    ],
)

#### Statistical Analysis and Artifact Generation for Numerical Data Features

In [8]:
def numerical_analysis(df_churn_ip :InputPath(), mlpipeline_ui_metadata: OutputPath(), df_churn_op: OutputPath()):
    """Publish a KDE plot of tenure and a scatter grid of charges against tenure.

    Args:
        df_churn_ip: Path of the input churn CSV.
        mlpipeline_ui_metadata: Output path that receives the artifact metadata JSON.
        df_churn_op: Output path that receives an unmodified copy of the input CSV.
    """
    ## Import Required Libraries
    import pandas as pd
    import numpy as np
    import gcsfs
    import seaborn as sns
    import matplotlib.pyplot as plt

    import warnings
    for ignored_category in (FutureWarning, UserWarning):
        warnings.simplefilter(action='ignore', category=ignored_category)

    import mpld3
    from tensorflow.python.lib.io import file_io
    import json

    graph_uri_template = 'gs://pipelines_artifacts/Artifacts/Graphs/{}.html'

    def upload_current_figure(stem):
        """Convert the current figure to HTML, upload it as <stem>.html and return the URI."""
        markup = mpld3.fig_to_html(plt.gcf())
        target = graph_uri_template.format(stem)
        with file_io.FileIO(target, 'w') as handle:
            handle.write(markup)
        return target

    def publish_kde(feature):
        """Draw the KDE of `feature` for churned and retained customers and upload it."""
        plt.figure(figsize=(9, 4))
        plt.title("KDE for {}".format(feature))
        for churn_value, line_color in (('No', 'navy'), ('Yes', 'orange')):
            values = churn.loc[churn['Churn'] == churn_value, feature].dropna()
            sns.kdeplot(values, color=line_color, label='Churn: {}'.format(churn_value))
        return upload_current_figure(feature)

    gcs = gcsfs.GCSFileSystem(project='YDataSynthetic', token = 'cloud')

    churn = pd.read_csv(df_churn_ip)

    # Forward an untouched copy before the frame is modified below.
    churn.copy(deep=True).to_csv(df_churn_op, index=False)

    sns.set(style="white")
    # Blank TotalCharges entries become 0 so the column can be cast to float.
    churn['TotalCharges'] = churn['TotalCharges'].replace(" ", 0).astype('float32')

    # KDE plot - only tenure is plotted
    tenure_uri = publish_kde('tenure')

    # Scatter plot - Monthly and Total Charges vs Tenure
    # NOTE(review): this figure is presumably left empty because PairGrid appears
    # to create its own figure - confirm before removing the call.
    plt.figure(figsize=(9, 4))
    grid = sns.PairGrid(churn, y_vars=["tenure"], x_vars=["MonthlyCharges", "TotalCharges"],
                        height=4.5, hue="Churn", aspect=1.1)
    grid.map(plt.scatter, alpha=0.6)
    scatter_uri = upload_current_figure('scatterplot1')

    ui_metadata = {
        'version' : 1,
        'outputs' : [
            {'type': 'web-app', 'storage': 'gcs', 'source': tenure_uri},
            {'type': 'web-app', 'storage': 'gcs', 'source': scatter_uri},
        ]
    }

    # Written first to the container root, then to the declared output path.
    for destination in ('/mlpipeline-ui-metadata.json', mlpipeline_ui_metadata):
        with file_io.FileIO(destination, 'w') as handle:
            json.dump(ui_metadata, handle)




In [9]:
# Turn numerical_analysis into a component factory; the component definition
# is also saved to ./numerical_analysis.yaml.
kfp_numerical_analysis = kfp.components.func_to_container_op(
    func=numerical_analysis,
    output_component_file='./numerical_analysis.yaml',
    packages_to_install=[
        'gcsfs',
        'scikit-learn==0.22.2',
        'numpy==1.17.2',
        'pandas==1.0.3',
        'seaborn==0.9.0',
        'matplotlib==3.1.1',
        'mpld3==0.5.1',
    ],
)

#### Data Cleaning and One-Hot Encoding

In [10]:
from typing import NamedTuple
from kfp.components import *

def one_hot_encode(df_churn_ip: InputPath(), df_churn_imputed :InputPath(), df_one_hot: OutputPath()):
    """Clean the churn data and encode every categorical column as numbers.

    Blank (" ") cells are treated as missing and rows containing any missing
    value are dropped. Yes/No columns, ``gender`` and ``Churn`` become 0/1, and
    the multi-valued categorical columns are expanded into ``<column>#<value>``
    indicator columns.

    Args:
        df_churn_ip: Path of the raw churn CSV to encode.
        df_churn_imputed: Path of a second CSV; it is read but not used afterwards.
        df_one_hot: Output path that receives the encoded CSV.
    """
    import pandas as pd
    import numpy as np

    yes_no_codes = {"Yes": 1, "No": 0}
    blank_checked_columns = [
        'customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
        'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
        'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
        'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
        'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn',
    ]
    # Two-valued columns and the integer code assigned to each value.
    two_valued_codes = {
        'Partner': yes_no_codes,
        'Dependents': yes_no_codes,
        'PhoneService': yes_no_codes,
        'PaperlessBilling': yes_no_codes,
        'gender': {"Male": 1, "Female": 0},
    }
    multi_valued_columns = [
        'PaymentMethod', 'MultipleLines', 'InternetService', 'OnlineSecurity',
        'OnlineBackup', 'DeviceProtection',
        'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
    ]

    frame = pd.read_csv(df_churn_ip)
    # NOTE(review): the second input is loaded and then ignored; presumably it
    # only exists to make this step wait for the step that produces it - confirm.
    pd.read_csv(df_churn_imputed)

    # A single blank marks an empty cell; turn it into NaN so dropna removes the row.
    for column in blank_checked_columns:
        frame[column] = frame[column].replace(" ", np.nan)

    frame = frame.drop(columns=['customerID']).dropna()

    # Binary encoding
    for column, codes in two_valued_codes.items():
        frame[column] = frame[column].replace(codes)

    # One-hot encoding: each source column is removed and its indicator columns
    # are appended at the end, in the order listed above.
    for column in multi_valued_columns:
        indicators = pd.get_dummies(frame[column], drop_first=False).add_prefix("{}#".format(column))
        frame = frame.drop(columns=column).join(indicators)

    frame['Churn'] = frame['Churn'].replace(yes_no_codes)

    # Output the encoded file
    frame.to_csv(df_one_hot, index=False)


In [11]:
# Turn one_hot_encode into a component factory; the component definition is
# also saved to ./one-hot-encode-func.yaml.
# NOTE(review): one_hot_encode imports only pandas and numpy, so scikit-learn
# and imbalanced-learn are presumably not needed here - confirm before removing.
kfp_one_hot_encode = kfp.components.func_to_container_op(
    func=one_hot_encode,
    output_component_file='./one-hot-encode-func.yaml',
    packages_to_install=[
        'scikit-learn==0.22.2',
        'numpy==1.17.2',
        'pandas==1.0.3',
        'imbalanced-learn==0.6.2',
    ],
)

#### Machine Learning Algorithm - Random Forest

In [12]:
from typing import NamedTuple
from kfp.components import *


def rf_model(df_churn_ip: InputPath(), n_estimators: int, max_depth: int, criterion: str, max_features: str,
              min_samples_split: int,
              conf_matr: OutputPath(),
              mlpipeline_ui_metadata: OutputPath(), mlpipeline_metrics: OutputPath()):
    """Train a random forest on the encoded churn data and publish its evaluation.

    The data is split with ``train_test_split`` (``random_state=0``), a
    ``RandomForestClassifier`` (``random_state=42``) is fitted on the training
    part, and the test part is used to compute accuracy, precision, recall, F1,
    ROC AUC, a confusion matrix and a ROC curve. The confusion matrix and ROC
    curve are uploaded as artifacts and the scores are written as pipeline metrics.

    Args:
        df_churn_ip: Path of the encoded CSV; must contain a 0/1 ``Churn`` column.
        n_estimators: Forwarded to ``RandomForestClassifier(n_estimators=...)``.
        max_depth: Forwarded to ``RandomForestClassifier(max_depth=...)``.
        criterion: Forwarded to ``RandomForestClassifier(criterion=...)``.
        max_features: Forwarded to ``RandomForestClassifier(max_features=...)``.
        min_samples_split: Forwarded to ``RandomForestClassifier(min_samples_split=...)``.
        conf_matr: Output path that receives the confusion matrix CSV.
        mlpipeline_ui_metadata: Output path that receives the artifact metadata JSON.
        mlpipeline_metrics: Output path that receives the metrics JSON.

    Raises:
        ValueError: If class 1 (churned) is absent from the training split, since
            no probability column exists for it.
    """
    # The imports and helpers stay inside the function: it is converted into a
    # component with func_to_container_op, so it has to be self-contained.
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, precision_score, recall_score, f1_score
    import json
    import gcsfs
    from tensorflow.python.lib.io import file_io

    conf_matrix_uri = 'gs://pipelines_artifacts/Artifacts/Conf_matRF.csv'
    roc_uri = 'gs://pipelines_artifacts/Artifacts/ROC_curveRF.csv'

    def four_decimals(value):
        """Round a score to four decimal places and return it as a plain float."""
        return float('%.4f' % value)

    def write_json(payload, targets):
        """Dump `payload` as JSON to every path in `targets`."""
        for target in targets:
            with file_io.FileIO(target, 'w') as f:
                json.dump(payload, f)

    def raw_metric(name, value):
        """Build one pipeline-metric entry.

        `name` is shown as the column name in the runs table, `value` must be
        numeric, and the format is "RAW" (the other supported value is "PERCENTAGE").
        """
        return {'name': name, 'numberValue': value, 'format': "RAW"}

    df_churn = pd.read_csv(df_churn_ip)
    df_churn.dropna(inplace=True)

    y1 = df_churn['Churn']
    X1 = df_churn.drop(['Churn'],axis=1)

    #Split Data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X1, y1, random_state=0)

    #fit the model on the Data and train it
    rfc_best = RandomForestClassifier(random_state=42, max_features=max_features, n_estimators=n_estimators,
                                      max_depth=max_depth, criterion=criterion,
                                      min_samples_split=min_samples_split)
    rfc_best.fit(X_train, y_train)
    y_test_pred = rfc_best.predict(X_test)

    # predict_proba columns follow rfc_best.classes_. The ROC curve and AUC need
    # the score of the positive class (1 = churned); taking column 0 would score
    # the negative class and yield a mirrored curve and 1 - AUC.
    churned_column = list(rfc_best.classes_).index(1)
    y_test_proba = rfc_best.predict_proba(X_test)[:, churned_column]

    #Get Metrics scores
    rf_score = four_decimals(rfc_best.score(X_test, y_test))
    rf_precision = four_decimals(precision_score(y_test, y_test_pred))
    rf_recall = four_decimals(recall_score(y_test, y_test_pred))
    rf_f1 = four_decimals(f1_score(y_test, y_test_pred))

    print("Accuraccy, Precision, Recall, f1: ")
    print(rf_score, rf_precision, rf_recall, rf_f1)

    #Confusion Matrix
    # labels is pinned so the matrix is always 2x2 in the order [0, 1], matching
    # the two display labels below even if a class is missing from the test split.
    cm = confusion_matrix(y_test, y_test_pred, labels=[0, 1])
    print("Confusion Matrix: {}".format(cm))

    #True and False Positive Rates
    fpr, tpr, thresholds = roc_curve(y_test, y_test_proba)
    auc_score = four_decimals(roc_auc_score(y_test, y_test_proba))
    print('Auc score: ')
    print(auc_score)

    #Converting the Confusion matrix to a Dataframe
    #For generating the Confusion Matrix Artifact, the matrix has to be converted from a numpy
    #array to one (target, predicted, count) row per cell
    flag_list = ['Not Churned','Churned']
    data = [(flag_list[target_index], flag_list[predicted_index], count)
            for target_index, target_row in enumerate(cm)
            for predicted_index, count in enumerate(target_row)]

    df_cm = pd.DataFrame(data, columns=['target', 'predicted', 'count'])
    print(df_cm)

    # NOTE(review): this handle is never used below; presumably it is created to
    # check GCS credentials early - confirm before removing.
    fs = gcsfs.GCSFileSystem(project='YDataSynthetic', token = 'cloud')

    # The confusion matrix goes to the declared output path and to GCS, where the
    # artifact viewer reads it.
    for target in (conf_matr, conf_matrix_uri):
        with file_io.FileIO(target, 'w') as f:
            df_cm.to_csv(f, columns=['target', 'predicted', 'count'], header=False, index=False)

    #roc curve
    #For generating the ROC curve artifact, fpr, tpr and thresholds are written as a headerless CSV
    #with the columns in exactly this order
    df_roc = pd.DataFrame({'fpr': fpr, 'tpr': tpr, 'thresholds': thresholds})
    with file_io.FileIO(roc_uri, 'w') as f:
        df_roc.to_csv(f, columns=['fpr', 'tpr', 'thresholds'], header=False, index=False)

    #Artifact generator - metadata
    metadata = {
        'version' : 1,
        'outputs' : [{
            'type': 'confusion_matrix',
            'format': 'csv',
            'storage': 'gcs',
            'schema': [   #schema is required in the exact same form for generating the artifact
                {'name': 'target', 'type': 'CATEGORY'},
                {'name': 'predicted', 'type': 'CATEGORY'},
                {'name': 'count', 'type': 'NUMBER'},
            ],
            'source': conf_matrix_uri,
            # The labels are strings so that they match the values stored in the CSV.
            'labels': flag_list
        },
        {
            'type': 'roc',
            'format': 'csv',
            'storage': 'gcs',
            'schema': [  #schema is required in the exact same form for generating the artifact
                {'name': 'fpr', 'type': 'NUMBER'},
                {'name': 'tpr', 'type': 'NUMBER'},
                {'name': 'thresholds', 'type': 'NUMBER'},
            ],
            'source': roc_uri
        }
        ]
    }

    #Output the file to the container-level root with the exact same name as below,
    #and also to the declared output path
    write_json(metadata, ('/mlpipeline-ui-metadata.json', mlpipeline_ui_metadata))

    #The metric scores can be output as Pipeline Metrics by generating a json file as below
    metrics = {
        'metrics': [
            raw_metric('accuracy-score', rf_score),
            raw_metric('precision-score', rf_precision),
            raw_metric('recall', rf_recall),
            raw_metric('f1-score', rf_f1),
            raw_metric('auc-score', auc_score),
        ]
    }

    #Dump the metrics json file with the exact same name to the container-root directory,
    #and also to the declared output path
    write_json(metrics, ('/mlpipeline-metrics.json', mlpipeline_metrics))
        

In [13]:
# Turn rf_model into a component factory; the component definition is also
# saved to ./rf-model-func.yaml.
kfp_rf_model = kfp.components.func_to_container_op(
    func=rf_model,
    output_component_file='./rf-model-func.yaml',
    packages_to_install=['scikit-learn==0.22.2', 'numpy==1.17.2', 'pandas==1.0.3', 'gcsfs'],
)

#### Defining the Pipeline Execution Sequence and Input-Output scheme

In [14]:
import kfp.dsl as dsl

@dsl.pipeline(name='Telco Merchant Churn Prediction Pipeline',description='Churn predictions using Random Forest Algorithm')
def TelcoChurnRF_func(file_path = "gs://pipelines_artifacts/Artifacts/Data.csv",
                n_estimators = 100, max_depth = 8, criterion = 'gini',  max_features='auto', min_samples_split=2):
    """Wire the churn components together: read, analyse, encode, train.

    The three analysis steps are chained through their ``df_churn_op`` outputs
    (categorical -> mixed -> numerical). The encoding step takes the raw data
    plus the numerical step's output, and the model is trained on the encoded data.

    Args:
        file_path: Location of the source CSV given to the read step.
        n_estimators: Passed to the model step.
        max_depth: Passed to the model step.
        criterion: Passed to the model step.
        max_features: Passed to the model step.
        min_samples_split: Passed to the model step.
    """
    # Output of the read step; it feeds three of the downstream steps.
    raw_data = kfp_read_data(file_name=file_path).outputs['df_churn_op']

    categorical_step = kfp_categorical_analysis(df_churn_ip=raw_data)
    mixed_step = kfp_mixed_analysis(
        df_churn_ip=raw_data,
        df_churn_ip2=categorical_step.outputs['df_churn_op'],
    )
    numerical_step = kfp_numerical_analysis(df_churn_ip=mixed_step.outputs['df_churn_op'])

    encoding_step = kfp_one_hot_encode(
        df_churn_ip=raw_data,
        df_churn_imputed=numerical_step.outputs['df_churn_op'],
    )
    kfp_rf_model(
        df_churn_ip=encoding_step.outputs['df_one_hot'],
        n_estimators=n_estimators,
        max_depth=max_depth,
        criterion=criterion,
        max_features=max_features,
        min_samples_split=min_samples_split,
    )

#For an operation with a single return value, the output reference can be accessed using either the `task.output` or the `task.outputs['output_name']` syntax
#For an operation with multiple return values, the output references can be accessed using the `task.outputs['output_name']` syntax

#### Compiling the Pipeline

In [15]:
import kfp.compiler as comp

# Compile the pipeline function into an archive named after the function.
pipeline_func = TelcoChurnRF_func
pipeline_filename = '{}.pipeline.tar.gz'.format(pipeline_func.__name__)
comp.Compiler().compile(pipeline_func, pipeline_filename)

