In [33]:
# Install pinned dependencies if needed (uncomment to run; %pip installs into the active kernel's environment)
#%pip install google-cloud-aiplatform==1.66.0 kfp==2.6.0 scikit-learn==1.4.2 pandas==2.2.2 joblib==1.4.2 --quiet

import os
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, precision_recall_fscore_support
import json
from kfp import dsl
from kfp.dsl import component, Input, Output, Dataset, Model, Metrics


In [34]:
# ---- Project-wide configuration for the churn pipeline ----

# Google Cloud project and the region used for Vertex AI calls
PROJECT_ID, REGION = "de2025-471807", "us-central1"

# One bucket per stage: raw data, exported models, pipeline scratch space
DATA_BUCKET = "gs://spotify_data_25"
MODEL_BUCKET = "gs://spotify_models"
TEMP_BUCKET = "gs://spotify_temp"

# Locations derived from the buckets above
DATA_URI = "/".join([DATA_BUCKET, "spotify_churn_dataset.csv"])
PIPELINE_ROOT = "/".join([TEMP_BUCKET, "pipeline_root"])
PIPELINE_NAME = "spotify_churn_pipeline_lr"

# Model inputs (numeric columns only for now) and the label column
NUMERIC_FEATURES = [
    "age",
    "listening_time",
    "songs_played_per_day",
    "skip_rate",
    "ads_listened_per_week",
]
TARGET_COL = "is_churned"

print("✅ Configuration loaded")


✅ Configuration loaded


In [35]:
@component(packages_to_install=["pandas","gcsfs"])
def data_ingestion(gcs_csv_uri: str, dataset: Output[Dataset]):
    """Read a CSV and re-write it as this step's dataset artifact.

    Args:
        gcs_csv_uri: Location of the source CSV; handed straight to
            ``pandas.read_csv``.
        dataset: Output artifact. The loaded frame is written to
            ``dataset.path`` as CSV without the index column.
    """
    import pandas as pd

    # NOTE(review): the run recorded in this notebook failed in this task;
    # confirm the identity running the pipeline can read `gcs_csv_uri`.
    raw_frame = pd.read_csv(gcs_csv_uri)
    frame_shape = raw_frame.shape
    print("✅ Data loaded:", frame_shape)

    raw_frame.to_csv(dataset.path, index=False)


In [36]:
@component(packages_to_install=["pandas","scikit-learn==1.4.2","joblib"])
def train_logistic_regression(
    dataset: Input[Dataset],
    model_artifact: Output[Model],
    metrics: Output[Metrics],
    numeric_features_json: str,
    target_col: str = "is_churned",
    test_size: float = 0.2,
    random_state: int = 42,
):
    """Train, evaluate and export a logistic-regression churn classifier.

    The CSV at ``dataset.path`` is split into stratified train/test sets.
    A scaler + logistic-regression pipeline is fitted on the training part,
    evaluated on the held-out part, and written to
    ``<model_artifact.path>/model.pkl``.

    Args:
        dataset: Input artifact holding the training CSV.
        model_artifact: Output artifact; treated as a directory that
            receives ``model.pkl``.
        metrics: Output artifact that receives accuracy, ROC AUC, precision,
            recall and F1 measured on the test split.
        numeric_features_json: JSON-encoded list of feature column names.
        target_col: Name of the label column (cast to ``int``).
        test_size: Fraction of rows held out for evaluation.
        random_state: Seed for the train/test split.

    Raises:
        KeyError: If a feature column or the target column is absent.
    """
    import json, os, pickle, pandas as pd
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score, roc_auc_score, precision_recall_fscore_support
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    feature_cols = json.loads(numeric_features_json)
    df = pd.read_csv(dataset.path)

    # Fail early with one message naming every absent column instead of a
    # pandas KeyError that surfaces only during indexing.
    missing = [col for col in [*feature_cols, target_col] if col not in df.columns]
    if missing:
        raise KeyError(f"Columns missing from dataset: {missing}")

    X = df[feature_cols]
    y = df[target_col].astype(int)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Scaling and classification live in ONE estimator so the exported file
    # exposes predict()/predict_proba() on raw feature values. A bare
    # (scaler, model) tuple has no predict() and cannot be served as-is.
    classifier = Pipeline(
        steps=[
            ("scaler", StandardScaler()),
            ("model", LogisticRegression(max_iter=200, class_weight="balanced", solver="liblinear")),
        ]
    )
    classifier.fit(X_train, y_train)

    y_pred = classifier.predict(X_test)
    y_prob = classifier.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average="binary")

    metrics.log_metric("accuracy",  float(acc))
    metrics.log_metric("roc_auc",   float(auc))
    metrics.log_metric("precision", float(precision))
    metrics.log_metric("recall",    float(recall))
    metrics.log_metric("f1",        float(f1))

    print("✅ Model Performance:")
    print(json.dumps({"accuracy":acc,"roc_auc":auc,"precision":precision,"recall":recall,"f1":f1},indent=2))

    os.makedirs(model_artifact.path, exist_ok=True)
    # Written with the standard pickle module because the file is named
    # model.pkl; Vertex AI's export convention pairs model.pkl with pickle
    # and model.joblib with joblib. joblib.load can still read this file.
    with open(os.path.join(model_artifact.path, "model.pkl"), "wb") as model_file:
        pickle.dump(classifier, model_file)


In [37]:
@component(packages_to_install=["google-cloud-aiplatform","gcsfs"])
def register_model(
    project: str,
    region: str,
    model_bucket: str,
    model_artifact: Input[Model],
):
    """Copy the trained model to the model bucket and register it in Vertex AI.

    Args:
        project: Google Cloud project passed to ``aiplatform.init``.
        region: Location passed to ``aiplatform.init``.
        model_bucket: Bucket URI; the model is stored under
            ``<model_bucket>/spotify_lr_model/``.
        model_artifact: Input artifact; ``model.pkl`` is read from the
            directory at ``model_artifact.path``.
    """
    from google.cloud import aiplatform
    import gcsfs, os, shutil

    aiplatform.init(project=project, location=region)
    fs = gcsfs.GCSFileSystem()

    # Directory that holds the exported model; this exact directory (not the
    # bucket root) is what gets registered as the model's artifact location.
    gcs_model_dir = model_bucket.rstrip("/") + "/spotify_lr_model"
    gcs_model_path = gcs_model_dir + "/model.pkl"
    local_model = os.path.join(model_artifact.path, "model.pkl")

    # Upload model to your dedicated model bucket. The local file is opened
    # first so a missing artifact fails before any remote write handle is
    # created, and the copy is streamed rather than read fully into memory.
    with open(local_model, "rb") as f_in, fs.open(gcs_model_path, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)

    # Register in Vertex Model Registry
    model = aiplatform.Model.upload(
        display_name="spotify-churn-lr",
        artifact_uri=gcs_model_dir,
        serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-4:latest",
        description="Spotify churn Logistic Regression numeric-only model",
    )

    print("✅ Model registered in Vertex AI:", model.resource_name)


In [38]:
@dsl.pipeline(
    description="Spotify Churn Prediction Pipeline (Logistic Regression, numeric-only)",
    name=PIPELINE_NAME,
)
def spotify_churn_pipeline(
    gcs_csv_uri: str = DATA_URI,        # source CSV handed to the ingestion step
    project: str = PROJECT_ID,          # project used by the registration step
    region: str = REGION,               # location used by the registration step
    model_bucket: str = MODEL_BUCKET,   # bucket that receives the exported model
):
    """Wire ingestion, training and model registration into one pipeline."""
    # The feature list is serialised here, when this function body executes,
    # and reaches the training step as a plain JSON string.
    features_json = json.dumps(NUMERIC_FEATURES)

    # Step 1: load the CSV into a dataset artifact
    ingest_task = data_ingestion(gcs_csv_uri=gcs_csv_uri)

    # Step 2: train on the ingested dataset
    train_task = train_logistic_regression(
        dataset=ingest_task.outputs["dataset"],
        numeric_features_json=features_json,
        target_col=TARGET_COL,
    )

    # Step 3: publish the trained model; nothing consumes this task's result
    register_model(
        project=project,
        region=region,
        model_bucket=model_bucket,
        model_artifact=train_task.outputs["model_artifact"],
    )

In [39]:
from kfp import compiler

# Where the compiled pipeline definition is written
_build_dir = "pipeline/build"
_package_path = "pipeline/build/spotify_churn_pipeline.json"

# The output directory must exist before the compiler writes into it
os.makedirs(_build_dir, exist_ok=True)

pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(
    pipeline_func=spotify_churn_pipeline,
    package_path=_package_path,
)
print(f"✅ Pipeline compiled: {_package_path}")


✅ Pipeline compiled: pipeline/build/spotify_churn_pipeline.json


In [40]:
from google.cloud import aiplatform

# Point the SDK at the project and region; the temp bucket doubles as the
# staging bucket.
aiplatform.init(
    staging_bucket=TEMP_BUCKET,
    location=REGION,
    project=PROJECT_ID,
)

# Build the job from the compiled template, with artifacts rooted at
# PIPELINE_ROOT.
job = aiplatform.PipelineJob(
    pipeline_root=PIPELINE_ROOT,
    template_path="pipeline/build/spotify_churn_pipeline.json",
    display_name="spotify-churn-lr-pipeline",
)

# Per the output recorded below, this call reports job state repeatedly and
# raises RuntimeError if the job fails.
job.run()


Creating PipelineJob
PipelineJob created. Resource name: projects/951195898169/locations/us-central1/pipelineJobs/spotify-churn-pipeline-lr-20251016163743
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/951195898169/locations/us-central1/pipelineJobs/spotify-churn-pipeline-lr-20251016163743')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/spotify-churn-pipeline-lr-20251016163743?project=951195898169
PipelineJob projects/951195898169/locations/us-central1/pipelineJobs/spotify-churn-pipeline-lr-20251016163743 current state:
3
PipelineJob projects/951195898169/locations/us-central1/pipelineJobs/spotify-churn-pipeline-lr-20251016163743 current state:
3
PipelineJob projects/951195898169/locations/us-central1/pipelineJobs/spotify-churn-pipeline-lr-20251016163743 current state:
3
PipelineJob projects/951195898169/locations/us-central1/pipelineJobs/spotify-churn-pipeline-lr-20251016163743 curren

RuntimeError: Job failed with:
code: 9
message: " The DAG failed because some tasks failed. The failed tasks are: [data-ingestion].; Job (project_id = de2025-471807, job_id = 4519334325958737920) is failed due to the above error.; Failed to handle the job: {project_number = 951195898169, job_id = 4519334325958737920}"
