In [1]:
# Vertex SDK for Python
# %pip (rather than `! pip3`) installs into the environment of the running
# kernel, so the package is importable from this notebook afterwards.
%pip install --upgrade --quiet google-cloud-aiplatform

In [2]:
# Project-wide settings consumed by the Vertex AI initialisation cell.
BUCKET_URI = "gs://temp-mock-oppe-1"  #@param {type:"string"} custom
PROJECT_ID = "numeric-poetry-461213-v8"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

In [3]:
from google.cloud import aiplatform

# Initialise the Vertex AI SDK with the project, region and staging bucket
# configured above.
aiplatform.init(
    project=PROJECT_ID,
    location=LOCATION,
    staging_bucket=BUCKET_URI,
)

In [4]:
# MODEL_ARTIFACT_DIR = "my-models/iris-classifier-week-3"  # @param {type:"string"}
# REPOSITORY = "iris-classifier-repo"  # @param {type:"string"}
# IMAGE = "iris-classifier-img"  # @param {type:"string"}
# MODEL_DISPLAY_NAME = "iris-classifier"  # @param {type:"string"}
# BIGQUERY_DATASET_NAME="iris_classifier_tutorial" #@param {type:"string"} custom
# AI_PLATFORM_MODEL_NAME="iris_classifier_jsd_model" #@param {type:"string"}

# # Set the defaults if no names were specified
# if MODEL_ARTIFACT_DIR == "[your-artifact-directory]":
#     MODEL_ARTIFACT_DIR = "custom-container-prediction-model"

# if REPOSITORY == "[your-repository-name]":
#     REPOSITORY = "custom-container-prediction"

# if IMAGE == "[your-image-name]":
#     IMAGE = "sklearn-fastapi-server"

# if MODEL_DISPLAY_NAME == "[your-model-display-name]":
#     MODEL_DISPLAY_NAME = "sklearn-custom-container"

In [5]:
import pandas as pd
import numpy as np

In [6]:
# Read the Iris CSV into `data`; the bare name below renders it as the cell output.
data = pd.read_csv(filepath_or_buffer="data/iris.csv")
data

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sample_id,event_timestamp
0,5.1,3.5,1.4,0.2,setosa,0,2025-07-09 14:15:04.253162+00:00
1,4.9,3.0,1.4,0.2,setosa,1,2025-06-30 14:15:04.255433+00:00
2,4.7,3.2,1.3,0.2,setosa,2,2025-06-24 14:15:04.255778+00:00
3,4.6,3.1,1.5,0.2,setosa,3,2025-07-08 14:15:04.255990+00:00
4,5.0,3.6,1.4,0.2,setosa,4,2025-07-02 14:15:04.256976+00:00
...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica,145,2025-07-07 14:15:04.286532+00:00
146,6.3,2.5,5.0,1.9,virginica,146,2025-06-21 14:15:04.286748+00:00
147,6.5,3.0,5.2,2.0,virginica,147,2025-07-01 14:15:04.286898+00:00
148,6.2,3.4,5.4,2.3,virginica,148,2025-07-09 14:15:04.287051+00:00


In [7]:
# ! git init
# ! git remote add origin https://github.com/prashanttnegi/MLOPS-MOCK-OPPE.git
# ! git add data/
# ! git config --global user.email "prashant.negi0407@gmail.com"
# ! git config --global user.name "prashanttnegi"
# ! git branch -M main
# ! git commit -m "Uploaded original iris data"
# ! git push origin main
# ! git checkout -b dev

## Modifying the dataset for upload to the BigQuery source

In [8]:
from datetime import datetime, timedelta, timezone
import random

# Work on a copy so the frame uploaded to BigQuery (`df`) is built separately
# from the frame that is written back to disk (`data`).
df = data.copy()

# Add entity and timestamp
df["sample_id"] = df.index

# Take "now" once, as a timezone-aware UTC datetime. `datetime.utcnow()` is
# deprecated and returns a naive value; a single reference instant also means
# rows differ only by the random day offset.
now_utc = datetime.now(timezone.utc)
df["event_timestamp"] = [
    # Each row is back-dated by a random whole number of days in [0, 30].
    pd.to_datetime(now_utc - timedelta(days=random.randint(0, 30)), utc=True)
    for _ in range(len(df))
]

# Mirror the two new columns onto `data` and persist them, so the CSV on disk
# carries the same sample_id / event_timestamp values as `df`.
data['sample_id'] = data.index
data['event_timestamp'] = df['event_timestamp']

data.to_csv('data/iris.csv', index=False)

In [14]:
from google.cloud import bigquery

project_id = "numeric-poetry-461213-v8"  # replace this
dataset_id = "mlops_oppe_mock_1"      # create this manually if it doesn't exist
table_id = f"{project_id}.{dataset_id}.features_table"

client = bigquery.Client(project=project_id)

# Replace the table contents on every run. Without an explicit write
# disposition the load job appends, so re-running this cell would leave
# duplicate copies of the rows in the feature table.
job_config = bigquery.LoadJobConfig(
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)

# Upload to BigQuery
job = client.load_table_from_dataframe(df, table_id, job_config=job_config)
job.result()  # wait for the load job to finish before reporting success

print(f"Uploaded {len(df)} rows to {table_id}")

Uploaded 150 rows to numeric-poetry-461213-v8.mlops_oppe_mock_1.features_table


## Introducing the Feast feature store

! pip install feast[gcp]

In [15]:
# Run `feast apply` from inside feast-store/ so the feature repository defined
# there (feature_store.yaml plus its definitions) is applied.
! cd feast-store/ && feast apply

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
  sample = Entity(name="sample_id", join_keys=["sample_id"])
No project found in the repository. Using project name common_iris defined in feature_store.yaml
Applying changes for project common_iris
Deploying infrastructure for [1m[32miris_features[0m


## Introducing Hyperopt for hyperparameter tuning with MLflow

In [26]:
# screen -S "mlflow_execution"
# pip install mlflow
# mlflow server --host 0.0.0.0 --port 8100

In [16]:
import mlflow
from mlflow import MlflowClient
from mlflow.models import infer_signature
from pprint import pprint

# Point the MLflow fluent API at the tracking server on port 8100.
mlflow.set_tracking_uri("http://127.0.0.1:8100")

# Build a client against the same URI and list the experiments it knows about.
client = MlflowClient(tracking_uri=mlflow.get_tracking_uri())
all_experiments = client.search_experiments()
print(all_experiments)

[<Experiment: artifact_location='mlflow-artifacts:/608862416274110921', creation_time=1752848945495, experiment_id='608862416274110921', last_update_time=1752848945495, lifecycle_stage='active', name='IRIS practice classifier: Mlflow Practice', tags={}>, <Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1752848697318, experiment_id='0', last_update_time=1752848697318, lifecycle_stage='active', name='Default', tags={}>]


In [17]:
# Select the experiment that subsequent MLflow runs are recorded under.
mlflow.set_experiment(
    experiment_name="IRIS practice classifier: Mlflow Practice"
)

<Experiment: artifact_location='mlflow-artifacts:/608862416274110921', creation_time=1752848945495, experiment_id='608862416274110921', last_update_time=1752848945495, lifecycle_stage='active', name='IRIS practice classifier: Mlflow Practice', tags={}>

In [11]:
# %pip (rather than `! pip`) installs into the environment of the running
# kernel, so `hyperopt` is importable from the cells below.
%pip install hyperopt

Collecting hyperopt
  Downloading hyperopt-0.2.7-py2.py3-none-any.whl.metadata (1.7 kB)
Collecting future (from hyperopt)
  Downloading future-1.0.0-py3-none-any.whl.metadata (4.0 kB)
Collecting py4j (from hyperopt)
  Downloading py4j-0.10.9.9-py2.py3-none-any.whl.metadata (1.3 kB)
Downloading hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading future-1.0.0-py3-none-any.whl (491 kB)
Downloading py4j-0.10.9.9-py2.py3-none-any.whl (203 kB)
Installing collected packages: py4j, future, hyperopt
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [hyperopt]2/3[0m [hyperopt]
[1A[2KSuccessfully installed future-1.0.0 hyperopt-0.2.7 py4j-0.10.9.9


In [18]:
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

# Hyperparameter search space handed to hyperopt's `fmin`.
# `criterion` is a categorical choice; the other three are drawn with
# `hp.quniform(label, low, high, q)` using q=1.
space = dict(
    criterion=hp.choice('criterion', ['gini', 'entropy']),
    max_depth=hp.quniform('max_depth', 1, 10, 1),
    min_samples_split=hp.quniform('min_samples_split', 2, 10, 1),
    min_samples_leaf=hp.quniform('min_samples_leaf', 1, 5, 1),
)

In [19]:
import os

import feast
from joblib import dump
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Load Iris entity dataframe (with sample_id and event_timestamp)
iris_df = pd.read_csv("data/iris.csv", sep=",")
iris_df["event_timestamp"] = pd.to_datetime(iris_df["event_timestamp"])
# Keep only the entity key, timestamp and label; the measurements themselves
# are fetched from the feature store below.
iris_df = iris_df.drop(columns=['sepal_length', 'sepal_width', 'petal_width', 'petal_length'])

# Connect to feature store
fs = feast.FeatureStore(repo_path="feast-store/")

# Load features from BigQuery (via Feast)
training_df = fs.get_historical_features(
    entity_df=iris_df,
    features=[
        "iris_features:sepal_length",
        "iris_features:petal_length",
        "iris_features:sepal_width",
        "iris_features:petal_width"
    ],
).to_df()

print("----- Feature schema -----\n")
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that emitted a stray "None" line).
training_df.info()

print("\n----- Example features -----\n")
print(training_df.head())

# Train model
target = "species"

# Use positional slicing for BOTH features and labels so they always cover the
# same 120 rows, whatever index the retrieved frame carries.
# NOTE(review): the row order of the retrieved frame is not the CSV order (see
# the printed head), so "first 120 rows" is an arbitrary subset — confirm intent.
n_train_rows = 120
train_X = training_df.drop(columns=[target, 'event_timestamp']).iloc[:n_train_rows]
print(train_X.head())
train_Y = training_df[target].iloc[:n_train_rows]
print(train_Y.head())
print(len(train_X), len(train_Y))

# NOTE(review): `sample_id` is still part of train_X, so the tree is fitted on
# the entity key as if it were a feature. The prediction cell feeds the same
# sorted column set (including sample_id), so dropping it must be done in both
# places together.
# model = LogisticRegression(max_iter=200)
model = DecisionTreeClassifier(max_depth = 3, random_state = 1)
# Columns are sorted so that training and inference agree on feature order.
model.fit(train_X[sorted(train_X.columns)], train_Y)

# Save the model (create the output directory first so a fresh checkout works)
os.makedirs("artifacts", exist_ok=True)
dump(model, "artifacts/model.joblib")

----- Feature schema -----

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   species          150 non-null    object             
 1   sample_id        150 non-null    int64              
 2   event_timestamp  150 non-null    datetime64[us, UTC]
 3   sepal_length     150 non-null    float64            
 4   petal_length     150 non-null    float64            
 5   sepal_width      150 non-null    float64            
 6   petal_width      150 non-null    float64            
dtypes: datetime64[us, UTC](1), float64(4), int64(1), object(1)
memory usage: 8.3+ KB
None

----- Example features -----

  species  sample_id                  event_timestamp  sepal_length  \
0  setosa         10 2025-07-13 14:28:39.313390+00:00           5.4   
1  setosa         23 2025-06-20 14:28:39.314834+00:00           5.1   
2  setosa  

['artifacts/model.joblib']

In [38]:
# from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

# # Define hyperparameter search space
# space = {
#     'criterion': hp.choice('criterion', ['gini', 'entropy']),
#     'max_depth': hp.quniform('max_depth', 1, 10, 1),
#     'min_samples_split': hp.quniform('min_samples_split', 2, 10, 1),
#     'min_samples_leaf': hp.quniform('min_samples_leaf', 1, 5, 1)
# }

In [20]:
# Feature matrix: every column except the label, entity key and timestamp.
X = data.drop(['species', 'sample_id', 'event_timestamp'], axis="columns")
# Label vector.
Y = data.loc[:, 'species']

In [21]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows for evaluation. The seed is fixed (42, matching the
# tuner and the trees below) so that Restart & Run All reproduces the same
# split and therefore the same reported accuracies.
# Note: this rebinds train_X / train_Y, replacing the Feast-derived frames
# created by the earlier training cell.
train_X, test_X, train_Y, test_Y = train_test_split(
    X, Y, test_size=0.15, random_state=42
)

In [22]:
import pandas as pd
import numpy as np
from pandas.plotting import parallel_coordinates
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import metrics

# Define the objective function
def objective(params):
    """Train and score one decision tree for a single hyperopt trial.

    The tree is fitted on the notebook-level ``train_X`` / ``train_Y`` and
    scored on ``test_X`` / ``test_Y``. Hyperparameters, accuracy, a tag and the
    fitted model are logged to an MLflow run opened with ``nested=True``.

    Args:
        params: Mapping of ``DecisionTreeClassifier`` keyword arguments sampled
            from the search space. ``max_depth``, ``min_samples_split`` and
            ``min_samples_leaf`` are cast to ``int`` before use. The mapping
            passed in is not modified.

    Returns:
        dict: ``{'loss': 1 - accuracy, 'status': STATUS_OK}``, the value
        hyperopt minimises.
    """
    # Cast hyperparameters to int where required. A new dict is built so the
    # caller's mapping keeps the values it was sampled with.
    integer_keys = ('max_depth', 'min_samples_split', 'min_samples_leaf')
    tree_params = {
        name: int(value) if name in integer_keys else value
        for name, value in params.items()
    }

    with mlflow.start_run(nested=True):

        # Log hyperparameters
        mlflow.log_params(tree_params)

        clf = DecisionTreeClassifier(**tree_params, random_state=42)

        clf.fit(train_X, train_Y)

        y_pred = clf.predict(test_X)

        # Calculate accuracy
        accuracy = metrics.accuracy_score(test_Y, y_pred)

        # Log metrics
        mlflow.log_metric("accuracy", accuracy)

        # Log tags
        mlflow.set_tag("Training Info", "DecisionTreeClassifier for Iris Dataset")

        # Log model
        mlflow.sklearn.log_model(clf, "model")

        # hyperopt minimises, so report the error rate as the loss.
        return {'loss': 1 - accuracy, 'status': STATUS_OK}

In [23]:
# Run the hyperparameter search: 10 evaluations of `objective` over `space`,
# proposed by TPE, with a seeded generator for the sampler.
trials = Trials()
search_kwargs = dict(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=10,
    trials=trials,
    rstate=np.random.default_rng(42),
)
best = fmin(**search_kwargs)

print("Best hyperparameters:", best)

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]





🏃 View run aged-ant-381 at: http://127.0.0.1:8100/#/experiments/608862416274110921/runs/ca82936656be428ba22a26bf3b79d24a

🧪 View experiment at: http://127.0.0.1:8100/#/experiments/608862416274110921

 10%|█         | 1/10 [00:05<00:46,  5.12s/trial, best loss: 0.08695652173913049]





🏃 View run painted-ant-147 at: http://127.0.0.1:8100/#/experiments/608862416274110921/runs/e8d0675bd7ec4c459464313684a66e36

🧪 View experiment at: http://127.0.0.1:8100/#/experiments/608862416274110921    

 20%|██        | 2/10 [00:08<00:30,  3.84s/trial, best loss: 0.08695652173913049]





🏃 View run suave-shoat-735 at: http://127.0.0.1:8100/#/experiments/608862416274110921/runs/d72bfa85942b4006ad52eff37b9efd27

🧪 View experiment at: http://127.0.0.1:8100/#/experiments/608862416274110921    

 30%|███       | 3/10 [00:11<00:24,  3.48s/trial, best loss: 0.08695652173913049]





🏃 View run peaceful-crane-118 at: http://127.0.0.1:8100/#/experiments/608862416274110921/runs/6ec1798cd34744ec9be72daa77017b03

🧪 View experiment at: http://127.0.0.1:8100/#/experiments/608862416274110921    

 40%|████      | 4/10 [00:14<00:19,  3.28s/trial, best loss: 0.08695652173913049]





🏃 View run dapper-croc-646 at: http://127.0.0.1:8100/#/experiments/608862416274110921/runs/365d5f5ba06540c9bb0716a92dc3d473

🧪 View experiment at: http://127.0.0.1:8100/#/experiments/608862416274110921    

 50%|█████     | 5/10 [00:17<00:15,  3.16s/trial, best loss: 0.08695652173913049]





🏃 View run resilient-cod-631 at: http://127.0.0.1:8100/#/experiments/608862416274110921/runs/cc7e81225b694bd19f65e1acad47f7b9

🧪 View experiment at: http://127.0.0.1:8100/#/experiments/608862416274110921    

 60%|██████    | 6/10 [00:19<00:12,  3.09s/trial, best loss: 0.08695652173913049]





🏃 View run grandiose-shad-474 at: http://127.0.0.1:8100/#/experiments/608862416274110921/runs/e0b4f4d6a5d84c2599c4801fe7170b0d

🧪 View experiment at: http://127.0.0.1:8100/#/experiments/608862416274110921    

 70%|███████   | 7/10 [00:22<00:09,  3.04s/trial, best loss: 0.08695652173913049]





🏃 View run suave-sheep-954 at: http://127.0.0.1:8100/#/experiments/608862416274110921/runs/501c708353944626bc45c18f7d5f489c

🧪 View experiment at: http://127.0.0.1:8100/#/experiments/608862416274110921    

 80%|████████  | 8/10 [00:25<00:06,  3.03s/trial, best loss: 0.08695652173913049]





🏃 View run luminous-cub-878 at: http://127.0.0.1:8100/#/experiments/608862416274110921/runs/b7377d9edb18403d952e55af4dd2089e

🧪 View experiment at: http://127.0.0.1:8100/#/experiments/608862416274110921    

 90%|█████████ | 9/10 [00:28<00:03,  3.01s/trial, best loss: 0.08695652173913049]





🏃 View run casual-stork-430 at: http://127.0.0.1:8100/#/experiments/608862416274110921/runs/e0f75a98e2e64ef49521aa6ea0cabfd5

🧪 View experiment at: http://127.0.0.1:8100/#/experiments/608862416274110921    

100%|██████████| 10/10 [00:31<00:00,  3.19s/trial, best loss: 0.08695652173913049]
Best hyperparameters: {'criterion': np.int64(0), 'max_depth': np.float64(9.0), 'min_samples_leaf': np.float64(3.0), 'min_samples_split': np.float64(9.0)}


In [24]:
# Mapping index to actual value
# fmin reports an hp.choice parameter as the index of the chosen option, so it
# has to be translated back. The guard keeps the cell idempotent: once the
# value has been replaced by its name, re-running the cell leaves it alone
# instead of failing on a string list index.
criterion_list = ['gini', 'entropy']
if not isinstance(best['criterion'], str):
    best['criterion'] = criterion_list[int(best['criterion'])]

# Cast hyperparameters to int where required (fmin returned them as floats)
for int_param in ('max_depth', 'min_samples_split', 'min_samples_leaf'):
    best[int_param] = int(best[int_param])

In [25]:
# Refit a decision tree with the best hyperparameters found by the search and
# score it on the held-out rows.
mod_dt = DecisionTreeClassifier(random_state=42, **best)
mod_dt.fit(train_X, train_Y)

y_pred = mod_dt.predict(test_X)
accuracy = metrics.accuracy_score(y_true=test_Y, y_pred=y_pred)

print(f"The accuracy of the Decision Tree is {accuracy}")

The accuracy of the Decision Tree is 0.9130434782608695


In [26]:
# Log model signature
# The signature is inferred from the full training inputs and the model's
# predictions on them.
signature = infer_signature(train_X, mod_dt.predict(train_X))

# Log the tuned tree and register it under "Iris-DT-Classifier".
# Only a handful of rows are attached as the input example: it is meant to
# illustrate the expected input, and storing the whole training frame would
# ship the training data alongside the model artifact.
# NOTE(review): no run is opened explicitly here, so the model is logged to
# whatever run MLflow considers active — confirm that is intended.
mlflow.sklearn.log_model(
    sk_model=mod_dt,
    artifact_path="iris_model",
    signature=signature,
    input_example=train_X.head(5),
    registered_model_name="Iris-DT-Classifier"
)

Successfully registered model 'Iris-DT-Classifier'.
2025/07/18 14:34:49 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Iris-DT-Classifier, version 1
Created version '1' of model 'Iris-DT-Classifier'.


<mlflow.models.model.ModelInfo at 0x7f2dd9d915d0>

## Materialize the feast store

In [27]:
# Run `feast materialize <start> <end>` from inside feast-store/ to load the
# feature values for that time window into the online store.
# NOTE(review): the end of the window (2026-08-01) is hard-coded well past the
# run date shown in the outputs — confirm this is deliberate.
!cd feast-store/ && feast materialize 2025-06-01T00:00:00 2026-08-01T00:00:00

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
Materializing [1m[32m1[0m feature views from [1m[32m2025-06-01 00:00:00+00:00[0m to [1m[32m2026-08-01 00:00:00+00:00[0m into the [1m[32msqlite[0m online store.

[1m[32miris_features[0m:


## Make prediction

In [28]:
import pandas as pd
import feast
from joblib import load

class IrisClassifier:
    """Iris species predictor backed by a saved model and the Feast online store."""

    def __init__(self):
        """Load the saved model and open the feature repository."""
        # Load trained model.
        # NOTE: joblib artifacts are pickle-based; only load files produced by
        # this notebook, never untrusted ones.
        self.model = load("artifacts/model.joblib")

        # Connect to feature store
        self.fs = feast.FeatureStore(repo_path="feast-store/")

    def predict(self, iris_classes):
        """Predict a species for every sample and return the most frequent one.

        Sample ids are read from ``data/iris.csv``; their online features are
        fetched from the feature store and passed to the model.

        Args:
            iris_classes: Accepted for interface compatibility; it is not used
                by the current implementation.

        Returns:
            The most frequent predicted species (first entry of the mode).
        """
        # Retrieve online features from Feast
        df = pd.read_csv("data/iris.csv", sep=",")
        features = self.fs.get_online_features(
            entity_rows=[{"sample_id": sample} for sample in df["sample_id"]],
            features=[
                "iris_features:sepal_length",
                "iris_features:petal_length",
                "iris_features:sepal_width",
                "iris_features:petal_width"
            ],
        )

        df = pd.DataFrame.from_dict(features.to_dict())
        print(df)

        # Feed the model exactly the columns it was fitted on, in that order,
        # when it recorded them. Otherwise fall back to all returned columns in
        # sorted order, which is how the training cell ordered them.
        fitted_names = getattr(self.model, "feature_names_in_", None)
        model_columns = list(fitted_names) if fitted_names is not None else sorted(df)

        # Predict species
        df["predicted_species"] = self.model.predict(df[model_columns])

        # Return most frequent predicted species
        common_flower_id = df["predicted_species"].mode()[0]

        return common_flower_id

In [29]:
def make_iris_prediction():
    """Run the classifier once and print the most common predicted species.

    An ``IrisClassifier`` is created, the unique species labels of the
    notebook-level ``data`` frame are handed to its ``predict`` method, and
    the value it returns is printed.
    """
    classifier = IrisClassifier()
    # e.g. ['setosa', 'virginica', 'versicolor']
    prediction = classifier.predict(data['species'].unique())

    print("Predicted most common flower species:", prediction)

In [30]:
# Run the prediction demo: prints the retrieved online features and the most
# common predicted species.
make_iris_prediction()

     sample_id  sepal_length  sepal_width  petal_width  petal_length
0            0           5.1          3.5          0.2           1.4
1            1           4.9          3.0          0.2           1.4
2            2           4.7          3.2          0.2           1.3
3            3           4.6          3.1          0.2           1.5
4            4           5.0          3.6          0.2           1.4
..         ...           ...          ...          ...           ...
145        145           6.7          3.0          2.3           5.2
146        146           6.3          2.5          1.9           5.0
147        147           6.5          3.0          2.0           5.2
148        148           6.2          3.4          2.3           5.4
149        149           5.9          3.0          1.8           5.1

[150 rows x 5 columns]
Predicted most common flower species: versicolor


In [103]:
import os

# Create the directory if it doesn't exist
os.makedirs(".github/workflows", exist_ok=True)

# Define the workflow YAML content.
# The two CML steps carry `if: always()`: a failing test step would otherwise
# skip every later step, so the report would never be posted for exactly the
# runs where it matters most.
workflow_content = """name: Sanity Test and Report

on:
  pull_request:
    branches: [main]
  workflow_dispatch:

permissions:
  pull-requests: write
  contents: write

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      - name: Install dependencies
        run: |
          pip install -r requirements.txt

      - name: Run model test
        run: |
          echo "## Test Results" >> report.md
          python test.py >> report.md 2>&1
          echo "Tests completed on $(date)" >> report.md

      - name: Run training
        run: |
          python train.py >> report.md 2>&1 || echo "Training failed with exit code $?" >> report.md

      - name: Setup CML
        if: always()
        uses: iterative/setup-cml@v2

      - name: Comment report with CML
        if: always()
        env:
          REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          cml comment create --publish report.md 
"""

# Write to the YAML file (explicit encoding so the result does not depend on
# the platform's default)
with open(".github/workflows/sanity_test.yaml", "w", encoding="utf-8") as f:
    f.write(workflow_content)

print("✅ sanity_test.yaml created in .github/workflows/")

✅ sanity_test.yaml created in .github/workflows/


In [102]:
# ! git add artifacts/ iris_pipeline.ipynb .github/
# ! git commit -m "Uploaded artifacts and pipeline"
# ! git push origin dev

In [None]:
# # To pull data from remote repo

# git branch --set-upstream-to=origin/dev dev
# git pull