In [1]:
# Reset the working directory so the notebook can be re-run from a clean slate.
# WARNING: destructive. This removes the local Git repository (.git), the DVC
# state (.dvc), the iris.csv.dvc file, the artifacts directory and .gitignore.
!rm -rf .git .dvc iris.csv.dvc artifacts __pycache__ .ipynb_checkpoints .gitignore

In [2]:
!pip install -q dvc[gs] scikit-learn pandas joblib mlflow
!pip install -q --upgrade google-cloud-aiplatform

In [3]:
# Create a fresh Git repository, then initialise DVC inside it.
# NOTE: in the recorded run Git named the initial branch 'master' (see the
# hint in the cell output below).
!git init
!dvc init
# Set the commit identity for this repository only (no --global flag is used).
!git config user.email "tarunarora6029@email.com"
!git config user.name "tarunarora6029"

[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /home/jupyter/practice/.git/
Initialized DVC repository.

You can now commit the changes to git.

[31m+---------------------------------------------------------------------+
[0m[31m|[0m                                                                     [31m|[0m
[31m|[0m        DVC has enabled anonymous aggregate usage analytics.         [31m|[0m
[31m|[0m     Read the analytics documentation (and how to opt-out) here:     [31m|[0m
[31m|[0m             <[36mh

In [4]:
from google.cloud import aiplatform
import os
import sys

In [5]:
# --- Google Cloud settings (passed to aiplatform.init) ---
PROJECT_ID = "verdant-nova-461606-f4"
LOCATION = "us-central1"
# Plain string literal: there is nothing to interpolate, so no f-prefix.
BUCKET_URI = "gs://ibdpractice"

# --- DVC / model storage ---
# Name under which the bucket is registered as the DVC remote.
DVC_REMOTE = "gcsremote"
# Path inside the bucket that the trained model file is copied to.
MODEL_ARTIFACT_DIR = "my-models/iris"

# --- Not referenced by the visible cells ---
# Presumably meant for a later container build / model deployment step — TODO confirm.
REPOSITORY = "iris-classifier-repo"
IMAGE = "iris-classifier-img"
MODEL_DISPLAY_NAME = "iris-classifier"

In [6]:
# Initialise the Vertex AI SDK with the configured project, region and
# staging bucket.
aiplatform.init(
    project=PROJECT_ID,
    location=LOCATION,
    staging_bucket=BUCKET_URI,
)

In [7]:
# Register the `dvcstore` prefix of the bucket as a DVC remote named
# DVC_REMOTE; the -d flag makes it the default remote.
!dvc remote add -d {DVC_REMOTE} {BUCKET_URI}/dvcstore

Setting 'gcsremote' as a default remote.
[0m

In [8]:
import mlflow
from mlflow import MlflowClient
from mlflow.models import infer_signature
from pprint import pprint

In [9]:
# Send all MLflow tracking calls to the server at this address and build a
# client bound to the same URI.
mlflow.set_tracking_uri("http://127.0.0.1:8100")
client = MlflowClient(tracking_uri=mlflow.get_tracking_uri())

In [10]:
# Fetch the experiments currently known to the tracking server and
# pretty-print them.
all_experiments = client.search_experiments()
print("Available experiments:")
pprint(all_experiments)

Available experiments:
[<Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1752978486617, experiment_id='0', last_update_time=1752978486617, lifecycle_stage='active', name='Default', tags={}>]


In [11]:
# Select the experiment that subsequent runs are logged to; as the recorded
# output shows, MLflow creates it when it does not exist yet. The returned
# Experiment object is the cell's last expression, so the notebook displays it.
mlflow.set_experiment("IRIS classifier: MLflow Hands-On")

2025/07/20 03:03:49 INFO mlflow.tracking.fluent: Experiment with name 'IRIS classifier: MLflow Hands-On' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/389631125268629828', creation_time=1752980629728, experiment_id='389631125268629828', last_update_time=1752980629728, lifecycle_stage='active', name='IRIS classifier: MLflow Hands-On', tags={}>

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pandas.plotting import parallel_coordinates
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import metrics
import joblib
import pickle

In [13]:
# Read the iris dataset from the working directory and print a quick overview:
# its shape, the distinct species labels and the first five rows.
df = pd.read_csv("iris.csv")
print("Dataset shape:", df.shape)
print("Unique species:", df["species"].unique())
print(df.head())

Dataset shape: (150, 5)
Unique species: ['setosa' 'versicolor' 'virginica']
   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa


In [14]:
# Hold out 40% of the rows for testing. Stratifying on the species column
# keeps the class proportions equal in both parts, and the fixed random_state
# makes the split reproducible.
train, test = train_test_split(df, test_size=0.4, stratify=df["species"], random_state=42)

In [15]:
# Separate the four measurement columns (model inputs) from the species
# label (target) for both the training and the test split.
feature_columns = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
X_train, y_train = train[feature_columns], train["species"]
X_test, y_test = test[feature_columns], test["species"]

In [16]:
# Keyword arguments for the DecisionTreeClassifier; the same dict is also
# logged to MLflow as the run's parameters.
params = dict(max_depth=3, random_state=1)

In [19]:
# Train a decision tree, evaluate it on the held-out split and record the
# parameters, the accuracy metric and the model itself in one MLflow run.
with mlflow.start_run() as run:
    model = DecisionTreeClassifier(**params)
    model.fit(X_train, y_train)

    # Accuracy on the test split.
    preds = model.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, preds)
    print(f"Accuracy: {accuracy:.3f}")

    mlflow.log_params(params)
    mlflow.log_metric("accuracy", accuracy)
    mlflow.set_tag("Training Info", "Decision tree model for IRIS data")

    # Model signature inferred from the training features and the model's
    # predictions on them.
    signature = infer_signature(X_train, model.predict(X_train))

    model_info = mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="iris_model",
        signature=signature,
        # A few rows are enough to illustrate valid input; passing X_train
        # itself would store the entire training split with the model.
        input_example=X_train.head(),
        registered_model_name="IRIS-classifier-decisiontrees",
    )
    # `run` is the run opened by this `with` block.
    print(f"Model logged to MLflow with run ID: {run.info.run_id}")



Accuracy: 0.983


Registered model 'IRIS-classifier-decisiontrees' already exists. Creating a new version of this model...
2025/07/20 03:05:36 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: IRIS-classifier-decisiontrees, version 2


Model logged to MLflow with run ID: 21fe001e72224902b74948e1ff4577a4
🏃 View run unique-turtle-618 at: http://127.0.0.1:8100/#/experiments/389631125268629828/runs/21fe001e72224902b74948e1ff4577a4
🧪 View experiment at: http://127.0.0.1:8100/#/experiments/389631125268629828


Created version '2' of model 'IRIS-classifier-decisiontrees'.


In [20]:
os.makedirs("artifacts", exist_ok=True)

joblib.dump(model, "artifacts/model.joblib")
print("Model saved to artifacts/model.joblib")

!gsutil cp artifacts/model.joblib {BUCKET_URI}/{MODEL_ARTIFACT_DIR}/
print(f"Model uploaded to {BUCKET_URI}/{MODEL_ARTIFACT_DIR}/")

Model saved to artifacts/model.joblib
Copying file://artifacts/model.joblib [Content-Type=application/octet-stream]...
/ [1 files][  2.5 KiB/  2.5 KiB]                                                
Operation completed over 1 objects/2.5 KiB.                                      
Model uploaded to gs://ibdpractice/my-models/iris/


In [21]:
# Put the dataset under DVC control. As the recorded output shows, this
# produces iris.csv.dvc, which is the file meant to be committed to Git.
!dvc add iris.csv
# Append a negation pattern so Git never ignores the iris.csv.dvc file.
!echo '!iris.csv.dvc' >> .gitignore

# Same for the trained model file.
!dvc add artifacts/model.joblib
# Negation pattern for .dvc files under artifacts/.
# NOTE(review): '**' that is not delimited by slashes should behave like a
# single '*' in gitignore syntax — confirm this is the intended pattern.
!echo '!artifacts/**.dvc' >> .gitignore

print("DVC tracking configured for dataset and model")

 [?25l[32m⠋[0m Checking graph
Adding...                                                                       
![A
Collecting files and computing hashes in iris.csv     |0.00 [00:00,     ?file/s][A
                                                                                [A
![A
  0% Checking cache in '/home/jupyter/practice/.dvc/cache/files/md5'| |0/? [00:0[A
                                                                                [A
![A
  0%|          |Adding iris.csv to cache              0/1 [00:00<?,     ?file/s][A
                                                                                [A
![A
  0%|          |Checking out /home/jupyter/practice/ir0/1 [00:00<?,    ?files/s][A
100% Adding...|████████████████████████████████████████|1/1 [00:00, 10.43file/s][A

To track the changes with git, run:

	git add iris.csv.dvc .gitignore

To enable auto staging, run:

	dvc config core.autostage true
 [0m[?25l[32m⠋[0m Checking graph
Adding...               

In [22]:
!git add .gitignore iris.csv.dvc artifacts/model.joblib.dvc
!git commit -m "Initial commit: iris dataset and model with DVC tracking"

!dvc push
print("Data and model pushed to DVC remote storage")

!git add .
!git commit -m "Complete MLOps setup: DVC + MLflow tracking"

!git remote add origin git@github.com:tarunarora6029/practice.git
!git push -u origin main --force

print("Pipeline complete! Data versioned with DVC, experiments tracked with MLflow")

[master (root-commit) 75804a5] Initial commit: iris dataset and model with DVC tracking
 6 files changed, 19 insertions(+)
 create mode 100644 .dvc/.gitignore
 create mode 100644 .dvc/config
 create mode 100644 .dvcignore
 create mode 100644 .gitignore
 create mode 100644 artifacts/model.joblib.dvc
 create mode 100644 iris.csv.dvc
Collecting                                            |3.00 [00:00,  171entry/s]
Pushing
![A
  0% Checking cache in 'ibdpractice/dvcstore/files/md5'| |0/? [00:00<?,    ?file[A
 50% Querying cache in 'ibdpractice/dvcstore/files/md5'|▌|1/2 [00:00<00:00,  5.2[A
Pushing                                                                         [A
Everything is up to date.
[0mData and model pushed to DVC remote storage
[master 7d865b7] Complete MLOps setup: DVC + MLflow tracking
 7 files changed, 1249 insertions(+)
 create mode 100644 .ipynb_checkpoints/Untitled-checkpoint.ipynb
 create mode 100644 Untitled.ipynb
 create mode 100644 artifacts/.gitignore
 create 

In [None]:
# Recap of the run: model accuracy, where the model file lives, and which
# tracking / versioning back-ends were used.
rule = "=" * 60
summary_lines = [
    "\n" + rule,
    "PIPELINE SUMMARY",
    rule,
    f"Model Accuracy: {accuracy:.3f}",
    "Model saved to: artifacts/model.joblib",
    f"GCS location: {BUCKET_URI}/{MODEL_ARTIFACT_DIR}/",
    f"MLflow tracking URI: {mlflow.get_tracking_uri()}",
    "Experiment: IRIS classifier: MLflow Hands-On",
    "DVC remote configured for data versioning",
    "Git repository updated with tracked artifacts",
    rule,
]
print("\n".join(summary_lines))