# Install Required Libraries 

In [0]:
# Install (or upgrade to the latest release of) torch, torchvision, scikit-learn, datasets and mlflow.
# NOTE(review): no versions are pinned and --upgrade always pulls the newest releases, so a later
# re-run may resolve different versions than the ones that produced the outputs recorded below.
# Consider pinning exact versions (pkg==x.y.z) for reproducibility.
%pip install torch torchvision scikit-learn datasets mlflow --upgrade --quiet



Note: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.


In [0]:
# Restart the Python process so the freshly installed/upgraded packages are the ones imported
# by the following cells (the %pip output recommends doing this).
dbutils.library.restartPython()

# Load the Dataset

In [0]:
from datasets import load_dataset
import pandas as pd

# Load the SMS Spam dataset and convert its "train" split to a pandas DataFrame
dataset = load_dataset("sms_spam")
df = pd.DataFrame(dataset["train"])

# Rename columns for clarity
df = df.rename(columns={"label": "target", "sms": "text"})

# Check the class distribution. A notebook cell only renders its LAST expression,
# so the counts must be printed explicitly or they are silently discarded.
print(df["target"].value_counts())

# Preview the first rows (rendered as the cell output)
df.head()




README.md:   0%|          | 0.00/4.98k [00:00<?, ?B/s]



train-00000-of-00001.parquet:   0%|          | 0.00/359k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5574 [00:00<?, ? examples/s]

Unnamed: 0,text,target
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...\n,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


# Basic Preprocessing and Splitting

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Hold out 20% of the messages for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["target"],
    test_size=0.2,
    random_state=42,
)

# Learn the TF-IDF vocabulary (capped at 1000 terms) from the training text only,
# then apply that same fitted vectorizer to the held-out text.
vectorizer = TfidfVectorizer(max_features=1000)
train_matrix = vectorizer.fit_transform(X_train)
test_matrix = vectorizer.transform(X_test)

# Densify the sparse matrices so they can be turned into torch tensors later
X_train_vec = train_matrix.toarray()
X_test_vec = test_matrix.toarray()


# Define a PyTorch Model

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim

# Features: dense TF-IDF matrices -> float32 tensors
X_train_tensor = torch.tensor(X_train_vec, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_vec, dtype=torch.float32)

# Labels: float32 column vectors of shape (n, 1), matching the model's single output unit
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.float32).reshape(-1, 1)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.float32).reshape(-1, 1)

# Simple model
class SpamClassifier(nn.Module):
    """Small feed-forward binary classifier.

    Architecture: Linear(input_size -> 64) -> ReLU -> Linear(64 -> 1) -> Sigmoid,
    so each input row yields a single value in (0, 1).
    """

    def __init__(self, input_size):
        """Build the layers for feature vectors of length ``input_size``."""
        super().__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid-activated output for the input batch ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.sigmoid(self.fc2(hidden))

# Size the input layer from the actual feature matrix instead of hard-coding 1000:
# TfidfVectorizer(max_features=1000) is only an upper bound, so a smaller vocabulary
# would otherwise cause a shape mismatch in the first Linear layer.
model = SpamClassifier(input_size=X_train_tensor.shape[1])


# Train the Model and Log with MLflow

In [0]:
import mlflow
import mlflow.pytorch
from sklearn.metrics import accuracy_score, f1_score

# Select the MLflow experiment that the runs started below are recorded under
EXPERIMENT_PATH = "/Users/gutsjts@gmail.com/spam-classification"
mlflow.set_experiment(EXPERIMENT_PATH)

def train_model(model, X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor,
                epochs=10, lr=0.001):
    """Train ``model`` with full-batch Adam, evaluate it, and log the run to MLflow.

    Args:
        model: ``nn.Module`` whose outputs are probabilities in [0, 1]
            (``nn.BCELoss`` is applied directly to them).
        X_train_tensor: Training features, fed to the model in a single batch.
        y_train_tensor: Training targets, compared against the model outputs by ``nn.BCELoss``.
        X_test_tensor: Evaluation features.
        y_test_tensor: Evaluation targets.
        epochs: Number of full-batch optimisation steps. Defaults to 10, the
            value that was previously hard-coded.
        lr: Adam learning rate. Defaults to 0.001, the value that was
            previously hard-coded.

    Returns:
        None. Within a single MLflow run this logs the hyperparameters, the
        per-epoch training loss, the test accuracy and F1 score, and the
        trained model; it also prints the two test metrics.
    """
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    with mlflow.start_run():
        # Record the hyperparameters so the logged metrics can be traced back to them
        mlflow.log_params({"epochs": epochs, "lr": lr, "optimizer": "Adam"})

        model.train()
        for epoch in range(epochs):
            optimizer.zero_grad()
            outputs = model(X_train_tensor)
            loss = criterion(outputs, y_train_tensor)
            loss.backward()
            optimizer.step()
            # Track the loss curve so convergence is visible in the MLflow UI
            mlflow.log_metric("train_loss", loss.item(), step=epoch)

        model.eval()
        with torch.no_grad():
            preds = model(X_test_tensor)
            # Threshold the probabilities at 0.5 to obtain hard class labels
            preds_binary = (preds > 0.5).float()

        # Hand scikit-learn plain 1-D NumPy arrays rather than (n, 1) torch tensors
        y_true = y_test_tensor.cpu().numpy().ravel()
        y_pred = preds_binary.cpu().numpy().ravel()
        acc = accuracy_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)

        # Log metrics and model
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("f1_score", f1)
        mlflow.pytorch.log_model(model, "model")

        print(f"Accuracy: {acc:.4f}, F1 Score: {f1:.4f}")

# Train on the training tensors, evaluate on the held-out test tensors, and log the run to MLflow
train_model(model, X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor)


2025/06/18 20:07:25 INFO mlflow.tracking.fluent: Experiment with name '/Users/gutsjts@gmail.com/spam-classification' does not exist. Creating a new experiment.
🔗 View Logged Model at: https://dbc-1fb8b820-d7ca.cloud.databricks.com/ml/experiments/4289177189276222/models/m-adab88afcc4e43f4a06b004ba813070a?o=166373381389351


Accuracy: 0.9363, F1 Score: 0.7437


![](/Workspace/Users/gutsjts@gmail.com/Spam_classification_mlops/images/mlflow_expirement.png)
![](/Workspace/Users/gutsjts@gmail.com/Spam_classification_mlops/images/metrics.png)

# Register Model to MLflow Model Registry

In [0]:
import mlflow.pytorch
from mlflow.models.signature import infer_signature

# Prepare the signature and input example from a small slice of the test set
example_input = X_test_tensor[:5]

# Make sure the model is in inference mode and skip autograd bookkeeping:
# this forward pass exists only to capture the output schema.
model.eval()
with torch.no_grad():
    pred_example = model(example_input)
signature = infer_signature(example_input.numpy(), pred_example.numpy())

with mlflow.start_run():
    mlflow.pytorch.log_model(
        pytorch_model=model,
        artifact_path="spam_model",
        signature=signature,
        # Store the example alongside the model so consumers can see a valid request shape
        input_example=example_input.numpy(),
        registered_model_name="SpamClassifierModel"
    )

print("✅ Model registered to MLflow.")


🔗 View Logged Model at: https://dbc-1fb8b820-d7ca.cloud.databricks.com/ml/experiments/4289177189276222/models/m-1d06ca461a834e9fb3638457d9515751?o=166373381389351
Successfully registered model 'workspace.default.spamclassifiermodel'.


Uploading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]

✅ Model registered to MLflow.


🔗 Created version '1' of model 'workspace.default.spamclassifiermodel': https://dbc-1fb8b820-d7ca.cloud.databricks.com/explore/data/models/workspace/default/spamclassifiermodel/version/1?o=166373381389351


![](/Workspace/Users/gutsjts@gmail.com/Spam_classification_mlops/images/serving_image.png)

# Testing the Model via the REST API in Databricks

In [0]:
import json
import os

import requests

# Sample email
input_email = ["Congratulations! You’ve won a free iPhone. Click here to claim now."]

# Vectorizing with same vectorizer used in training
input_vec = vectorizer.transform(input_email).toarray().tolist()

# Databricks token and model endpoint.
# The token is read from the environment so that no credential is stored in the notebook.
DATABRICKS_TOKEN = os.environ.get("DATABRICKS_TOKEN")
if not DATABRICKS_TOKEN:
    raise RuntimeError(
        "DATABRICKS_TOKEN is not set; export it (e.g. from a secret store) before running this cell."
    )
MODEL_ENDPOINT = "https://dbc-1fb8b820-d7ca.cloud.databricks.com/serving-endpoints/spamclassifier-endpoint/invocations"

headers = {
    "Authorization": f"Bearer {DATABRICKS_TOKEN}",
    "Content-Type": "application/json"
}

payload = {
    "inputs": input_vec
}

# A timeout prevents the cell from hanging forever if the endpoint is unreachable
response = requests.post(MODEL_ENDPOINT, headers=headers, data=json.dumps(payload), timeout=30)

# Surface HTTP errors (401, 404, 5xx, ...) instead of trying to parse an error page as JSON
response.raise_for_status()

print("📬 Prediction response:", response.json())


📬 Prediction response: {'predictions': [[0.5042845010757446]]}
