## Simple incident classifier

In [None]:
import sys
import os
import pandas as pd

from itertools import combinations
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [None]:
# Put the parent of the current working directory on the import path before
# importing `utils`; the append must stay ahead of the import below.
# NOTE(review): presumably the notebook is launched from a sub-folder of the
# directory that holds `utils` — confirm.
sys.path.append(os.path.dirname(os.getcwd()))
from utils import run_query, load_constants

In [None]:
# Load the project configuration once and unpack the entries used below.
constants = load_constants()

# Values from the "GCP" and "VERTEX" sections of the configuration.
# NOTE(review): GOOGLE_CLOUD_PROJECT, GOOGLE_CLOUD_LOCATION and
# GOOGLE_GEMINI_MODEL_15 are not referenced anywhere in the visible cells.
GOOGLE_CLOUD_PROJECT = constants["GCP"]["GOOGLE_CLOUD_PROJECT"]
GOOGLE_CLOUD_LOCATION = constants["GCP"]["GOOGLE_CLOUD_LOCATION"]
GOOGLE_CLOUD_GCS_BUCKET = constants["GCP"]["GOOGLE_CLOUD_GCS_BUCKET"]
GOOGLE_CLOUD_SERVICE_ACCOUNT = constants["GCP"]["GOOGLE_CLOUD_SERVICE_ACCOUNT"]
GOOGLE_GEMINI_MODEL_15 = constants["VERTEX"]["GOOGLE_GEMINI_MODEL_15"]

# Project and dataset that qualify the table names in the queries below.
GOOGLE_CLOUD_BIGQUERY_PROJECT = constants["BIGQUERY"]["GOOGLE_CLOUD_BIGQUERY_PROJECT"]
GOOGLE_CLOUD_BIGQUERY_DATASET = constants["BIGQUERY"]["GOOGLE_CLOUD_BIGQUERY_DATASET"]


# Table names for the raw events and the incident records.
BASE_TABLE_NAME_EVENTS = constants["BIGQUERY"]["BASE_TABLE_NAME_EVENTS"]
BASE_TABLE_NAME_INCIDENTS = constants["BIGQUERY"]["BASE_TABLE_NAME_INCIDENTS"]

In [None]:
# Events: read a 10 percent TABLESAMPLE of the events table.
# NOTE(review): no seed is given, so re-running this cell presumably returns a
# different sample — confirm against the BigQuery TABLESAMPLE documentation.
events_query = f"""
    SELECT *
    FROM `{GOOGLE_CLOUD_BIGQUERY_PROJECT}.{GOOGLE_CLOUD_BIGQUERY_DATASET}.{BASE_TABLE_NAME_EVENTS}` TABLESAMPLE SYSTEM (10 PERCENT) 
"""
events_df = run_query(events_query)

# Incidents: read the whole incidents table (no sampling).
incidents_query = f"""
    SELECT *
    FROM `{GOOGLE_CLOUD_BIGQUERY_PROJECT}.{GOOGLE_CLOUD_BIGQUERY_DATASET}.{BASE_TABLE_NAME_INCIDENTS}`
"""
incidents_df = run_query(incidents_query)

In [None]:
def create_features(df, window_size="1h"):
    """Aggregate raw events into per-element, per-window features.

    The input must provide the columns ``timestamp``, ``network_element_id``,
    ``value`` and ``event``. The input frame is not modified.

    Args:
        df: Raw events, one row per event.
        window_size: Pandas frequency string used to bucket ``timestamp``.

    Returns:
        A DataFrame with one row per (network_element_id, window) holding:

        * ``mean_value`` / ``max_value`` / ``min_value`` / ``count_events``
          for that element in that window;
        * ``network_*`` equivalents computed over all elements in the window;
        * one ``mean_diff_{e1}_{e2}`` column per unordered pair of element
          ids (in order of first appearance). It is filled only on the rows
          of ``e1`` and holds ``mean_value(e1) - mean_value(e2)`` for the same
          window; it is NaN on every other row and when ``e2`` has no data in
          that window.
    """
    df = df.copy()
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    df = df.set_index("timestamp")

    features = df.groupby(["network_element_id", pd.Grouper(freq=window_size)]).agg(
        mean_value=("value", "mean"),
        max_value=("value", "max"),
        min_value=("value", "min"),
        count_events=("event", "count"),
    )
    features = features.reset_index()

    network_wide = df.groupby(pd.Grouper(freq=window_size)).agg(
        network_mean_value=("value", "mean"),
        network_max_value=("value", "max"),
        network_min_value=("value", "min"),
        network_count_events=("event", "count"),
    )
    network_wide = network_wide.reset_index()

    features = pd.merge(features, network_wide, on="timestamp", how="left")

    element_ids = df["network_element_id"].unique()
    if len(element_ids) < 2:
        # No pairs to compare, so there are no mean_diff_* columns to add.
        return features

    # One wide lookup (window x element) replaces a pair of merges per element
    # pair; those merges re-copied the ever-growing feature frame each time.
    wide_means = features.pivot(
        index="timestamp", columns="network_element_id", values="mean_value"
    )
    # Row i of `aligned` holds every element's mean for the window of
    # features row i.
    aligned = wide_means.reindex(index=features["timestamp"])
    aligned.index = features.index

    own_mean = features["mean_value"]
    own_element = features["network_element_id"]

    diff_columns = {}
    for element1, element2 in combinations(element_ids, 2):
        if element2 in aligned.columns:
            diff = own_mean - aligned[element2]
        else:
            # Ids with no aggregated rows (e.g. a missing id) still get their
            # all-NaN column so the column set depends only on element_ids.
            diff = pd.Series(float("nan"), index=features.index)
        # Keep the difference on element1's rows only.
        diff_columns[f"mean_diff_{element1}_{element2}"] = diff.where(
            own_element == element1
        )

    # Attach all pair columns at once to avoid fragmenting the frame.
    return pd.concat(
        [features, pd.DataFrame(diff_columns, index=features.index)], axis=1
    )



def join_with_incidents(features_df, incidents_df):
    """Label feature rows that fall inside an incident window.

    Args:
        features_df: Feature rows with a ``timestamp`` column.
        incidents_df: Incidents with ``start_time``, ``end_time`` and
            ``incident_name`` columns.

    Returns:
        A copy of ``features_df`` with two extra columns:

        * ``incident_occurred``: 1 when ``start_time <= timestamp <= end_time``
          for at least one incident, otherwise 0;
        * ``incident_name``: name of the matching incident (the last one in
          ``incidents_df`` order wins when windows overlap), missing when no
          incident matches. The column is always present, including when
          ``incidents_df`` is empty, so the output schema is stable.
    """
    df = features_df.copy()
    df["incident_occurred"] = 0
    # Created up front with object dtype so names can be written without a
    # dtype change and so the column exists even when there are no incidents.
    df["incident_name"] = pd.Series(float("nan"), index=df.index, dtype="object")

    timestamps = df["timestamp"]
    for _, incident in incidents_df.iterrows():
        in_window = (timestamps >= incident["start_time"]) & (
            timestamps <= incident["end_time"]
        )
        df.loc[in_window, "incident_occurred"] = 1
        df.loc[in_window, "incident_name"] = incident["incident_name"]
    return df

In [None]:
# Build the windowed features (default window "1h") and label each row with
# the incident, if any, whose time range contains the row's timestamp.
events_features = create_features(events_df)
events_with_incidents = join_with_incidents(events_features, incidents_df)

In [None]:
# Replace the element identifier with an integer code; this overwrites the
# column of `events_with_incidents` in place.
le = LabelEncoder()
events_with_incidents["network_element_id"] = le.fit_transform(
    events_with_incidents["network_element_id"]
)

# Column the classifier is trained to predict.
target = "incident_occurred"

# Model inputs: per-element and network-wide aggregates first ...
features = [
    "network_element_id",
    "mean_value",
    "max_value",
    "min_value",
    "count_events",
    "network_mean_value",
    "network_max_value",
    "network_min_value",
    "network_count_events",
]
# ... then one pairwise mean-difference column per pair of raw element ids,
# named the same way `create_features` names them.
features.extend(
    f"mean_diff_{element1}_{element2}"
    for element1, element2 in combinations(events_df["network_element_id"].unique(), 2)
)


In [None]:
# Select the model inputs and fill missing values (e.g. pairwise differences
# that do not apply to a row) with 0. `fillna` is applied without
# `inplace=True`: mutating a column selection in place is the chained-assignment
# pattern pandas warns about, and the non-inplace form gives X its own data.
X = events_with_incidents[features].fillna(0)
y = events_with_incidents[target]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
# Fit a 100-tree random forest on the training split; the fixed seed makes the
# fit reproducible.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [None]:
# Score the held-out rows and report plain accuracy.
# NOTE(review): if incident rows are rare, accuracy alone may look high for a
# model that never predicts an incident — consider checking precision/recall.
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

## Create a Vertex AI Pipeline

In [None]:
from kfp.v2 import dsl
from kfp.v2.dsl import (Dataset,InputPath, Model,OutputPath, component)


# Pipeline component: run `query` with a BigQuery client created for
# `project_id` and write the result to `output_data_path` as CSV (no index).
@component(
    base_image="python:3.10",
    packages_to_install=[
        "pandas",
        "scikit-learn",
        "google-cloud-bigquery",
        "google-cloud-storage",
        "db-dtypes",
    ],
)
def load_data_from_bigquery_op(query: str, project_id: str, output_data_path: OutputPath(Dataset)):
    # Imported inside the body, as in every component in this file.
    from google.cloud import bigquery

    client = bigquery.Client(project=project_id)
    result_frame = client.query(query).to_dataframe()
    result_frame.to_csv(output_data_path, index=False)


# Pipeline component: read the raw events CSV at `input_data_path`, build the
# per-element / per-window feature table and write it to `output_data_path`
# as CSV (no index). `window_size` is a pandas frequency string.
@component(
    packages_to_install=[
        "pandas",
        "scikit-learn",
        "google-cloud-bigquery",
        "google-cloud-storage",
        "db-dtypes"
    ],
    base_image="python:3.10",
)
def create_features_op(
    input_data_path: InputPath(Dataset),
    output_data_path: OutputPath(Dataset),
    window_size: str = '1h',
):
    import pandas as pd
    from itertools import combinations

    def create_features(df, window_size="1h"):
        """Aggregate raw events into per-element, per-window features.

        Returns one row per (network_element_id, window) with the element's
        mean/max/min/count, the network-wide equivalents, and one
        ``mean_diff_{e1}_{e2}`` column per unordered pair of element ids.
        A pair column is filled only on the rows of ``e1`` and is NaN
        elsewhere, or when ``e2`` has no data in that window.
        """
        df = df.copy()
        df["timestamp"] = pd.to_datetime(df["timestamp"])
        df = df.set_index("timestamp")

        features = df.groupby(["network_element_id", pd.Grouper(freq=window_size)]).agg(
            mean_value=("value", "mean"),
            max_value=("value", "max"),
            min_value=("value", "min"),
            count_events=("event", "count"),
        )
        features = features.reset_index()

        network_wide = df.groupby(pd.Grouper(freq=window_size)).agg(
            network_mean_value=("value", "mean"),
            network_max_value=("value", "max"),
            network_min_value=("value", "min"),
            network_count_events=("event", "count"),
        )
        network_wide = network_wide.reset_index()

        features = pd.merge(features, network_wide, on="timestamp", how="left")

        element_ids = df["network_element_id"].unique()
        if len(element_ids) < 2:
            # No pairs to compare, so there are no mean_diff_* columns to add.
            return features

        # One wide lookup (window x element) replaces a pair of merges per
        # element pair; those merges re-copied the growing frame each time.
        wide_means = features.pivot(
            index="timestamp", columns="network_element_id", values="mean_value"
        )
        # Row i of `aligned` holds every element's mean for the window of
        # features row i.
        aligned = wide_means.reindex(index=features["timestamp"])
        aligned.index = features.index

        own_mean = features["mean_value"]
        own_element = features["network_element_id"]

        diff_columns = {}
        for element1, element2 in combinations(element_ids, 2):
            if element2 in aligned.columns:
                diff = own_mean - aligned[element2]
            else:
                # Ids with no aggregated rows still get their all-NaN column.
                diff = pd.Series(float("nan"), index=features.index)
            # Keep the difference on element1's rows only.
            diff_columns[f"mean_diff_{element1}_{element2}"] = diff.where(
                own_element == element1
            )

        # Attach all pair columns at once to avoid fragmenting the frame.
        return pd.concat(
            [features, pd.DataFrame(diff_columns, index=features.index)], axis=1
        )

    df = pd.read_csv(input_data_path)
    features_df = create_features(df, window_size)
    features_df.to_csv(output_data_path, index=False)

# Pipeline component: read the feature CSV and the incident CSV, label every
# feature row that falls inside an incident window, and write the labelled
# table to `output_data_path` as CSV (no index).
@component(
    packages_to_install=["pandas", "scikit-learn", "google-cloud-bigquery", "google-cloud-storage","db-dtypes"],
    base_image="python:3.10",
)
def join_features_op(
    features_data_path: InputPath(Dataset),
    incidents_data_path: InputPath(Dataset),
    output_data_path: OutputPath(Dataset),
):
    import pandas as pd

    def join_with_incidents(features_df, incidents_df):
        """Return a labelled copy of ``features_df``.

        Adds ``incident_occurred`` (1 when ``start_time <= timestamp <=
        end_time`` for some incident, else 0) and ``incident_name`` (name of
        the matching incident, last one wins; missing when nothing matches).
        Both columns are always present, even when ``incidents_df`` is empty.
        The original ``timestamp`` column is left untouched.
        """
        df = features_df.copy()
        df["incident_occurred"] = 0
        # Created up front so the column exists even with zero incidents;
        # object dtype lets names be written without a dtype change.
        df["incident_name"] = pd.Series(float("nan"), index=df.index, dtype="object")

        # Both inputs come from CSV, so the time columns are plain strings.
        # Parse them for the comparison only, normalised to UTC, so that the
        # window test is chronological rather than lexicographic and naive and
        # offset-aware values can be compared.
        timestamps = pd.to_datetime(df["timestamp"], utc=True)

        for _, incident in incidents_df.iterrows():
            start_time = pd.to_datetime(incident["start_time"], utc=True)
            end_time = pd.to_datetime(incident["end_time"], utc=True)

            in_window = (timestamps >= start_time) & (timestamps <= end_time)

            df.loc[in_window, "incident_occurred"] = 1
            df.loc[in_window, "incident_name"] = incident["incident_name"]
        return df

    features_df = pd.read_csv(features_data_path)
    incidents_df = pd.read_csv(incidents_data_path)
    joined_df = join_with_incidents(features_df, incidents_df)
    joined_df.to_csv(output_data_path, index=False)

# Pipeline component: read the labelled feature CSV, train a random forest to
# predict `incident_occurred`, print hold-out accuracy and dump the fitted
# model to `model_output_path` with joblib.
@component(
    packages_to_install=["pandas", "scikit-learn", "google-cloud-bigquery", "google-cloud-storage", "joblib", "db-dtypes"],
    base_image="python:3.10",
)
def train_and_evaluate_model_op(
    input_data_path: InputPath(Dataset),
    model_output_path: OutputPath(Model),
):
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import LabelEncoder
    from sklearn.metrics import accuracy_score
    from joblib import dump

    df = pd.read_csv(input_data_path)

    # Replace the element identifier with an integer code.
    le = LabelEncoder()
    df['network_element_id'] = le.fit_transform(df['network_element_id'])

    target = 'incident_occurred'

    # Everything except the label, the window timestamp and the incident name
    # is a model input. 'incident_name' may be missing from the input (a join
    # that iterated over zero incidents never creates it), so it is dropped
    # only when present instead of raising KeyError.
    non_feature_columns = [target, 'timestamp']
    if 'incident_name' in df.columns:
        non_feature_columns.append('incident_name')

    # Missing values (e.g. pairwise differences that do not apply to a row)
    # are filled with 0.
    X = df.drop(columns=non_feature_columns).fillna(0)
    y = df[target]

    # Hold out 20% of the rows; fixed seeds keep split and fit reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")

    # NOTE(review): only the classifier is saved; the fitted LabelEncoder is
    # not, so whoever loads the model needs its own id encoding — confirm
    # this is intended.
    dump(model, model_output_path)

In [None]:
# Pipeline wiring: load events (10 percent sample) and incidents from
# BigQuery, build windowed features from the events, label them with the
# incidents, then train and evaluate the classifier.
@dsl.pipeline(
    name="telco-incident-prediction-pipeline",
    pipeline_root=f"gs://{GOOGLE_CLOUD_GCS_BUCKET}/pipeline-root-rca",
)
def telco_incident_prediction_pipeline():

    # Step 1: the two BigQuery extracts; each yields a CSV dataset artifact.
    events_csv = load_data_from_bigquery_op(
        project_id=GOOGLE_CLOUD_BIGQUERY_PROJECT,
        query=f"""
            SELECT *
            FROM `{GOOGLE_CLOUD_BIGQUERY_PROJECT}.{GOOGLE_CLOUD_BIGQUERY_DATASET}.{BASE_TABLE_NAME_EVENTS}` TABLESAMPLE SYSTEM (10 PERCENT) 
        """,
    ).outputs["output_data_path"]
    incidents_csv = load_data_from_bigquery_op(
        project_id=GOOGLE_CLOUD_BIGQUERY_PROJECT,
        query=f"""
            SELECT *
            FROM `{GOOGLE_CLOUD_BIGQUERY_PROJECT}.{GOOGLE_CLOUD_BIGQUERY_DATASET}.{BASE_TABLE_NAME_INCIDENTS}`
        """,
    ).outputs["output_data_path"]

    # Step 2: hourly features from the events extract.
    features_csv = create_features_op(
        input_data_path=events_csv,
        window_size="1h",
    ).outputs["output_data_path"]

    # Step 3: label the feature rows with the incident windows.
    labelled_csv = join_features_op(
        features_data_path=features_csv,
        incidents_data_path=incidents_csv,
    ).outputs["output_data_path"]

    # Step 4: train and evaluate; the task handle is not needed afterwards.
    train_and_evaluate_model_op(
        input_data_path=labelled_csv,
    )

In [None]:
# Compile the pipeline function into a YAML template in the current working
# directory; the submission cell loads the template by this same file name.
from kfp import compiler
compiler.Compiler().compile(
    pipeline_func=telco_incident_prediction_pipeline,
    package_path="telco_incident_prediction_pipeline.yml",
)

In [None]:
from google.cloud import aiplatform

# Also determines the template file name below, so it must match the
# `package_path` stem used when the pipeline was compiled.
DISPLAY_NAME = "telco_incident_prediction_pipeline"

# NOTE(review): no project or location is passed here and no
# `aiplatform.init(...)` call is visible, although GOOGLE_CLOUD_PROJECT and
# GOOGLE_CLOUD_LOCATION are loaded earlier; presumably the SDK's ambient
# defaults are relied on — confirm.
job = aiplatform.PipelineJob(
    display_name=DISPLAY_NAME,
    template_path=f"{DISPLAY_NAME}.yml",
)

# Run the job under the service account taken from the configuration.
job.run(service_account=GOOGLE_CLOUD_SERVICE_ACCOUNT)