<a href="https://colab.research.google.com/github/uelkariuki/Data_science_and_Machine_Learning/blob/master/Uel_Kariuki_MLOPs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
'''
Name: Uel Kariuki
Date: 11/7/2025

This project is about building an end-to-end machine learning workflow to
predict housing prices using the California Housing dataset.
'''

'\nName: Uel Kariuki\nDate: 11/7/2025\n\nThis project is about building an end-to-end machine learning workflow to\npredict housing prices using the California Housing dataset.\n'

In [None]:
!pip install fastapi uvicorn pydantic nest_asyncio pyngrok -q
print("Libraries installed")

Libraries installed


In [None]:
# Importing required libraries
import pandas as pd
import pickle
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_error

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel # for defining request body schema
import uvicorn # ASGI server to run FastAPI
import nest_asyncio # to run async server in Colab
import os
import time
import getpass # facilitate secure password input
import threading # to run FastAPI in a separate thread
from pyngrok import ngrok # direct ngrok tunnel management

# Local port shared by the Uvicorn server (which binds to it) and the ngrok
# tunnel (which forwards public traffic to it).
LOCAL_PORT = 8000



**1. Load the dataset**

In [None]:
# Fetch the California Housing data.
# as_frame=True returns pandas objects (a DataFrame of features and a Series
# target); return_X_y=True hands them back as an (X, y) pair.
X, y = fetch_california_housing(as_frame=True, return_X_y=True)
# Last expression of the cell: render the feature frame for inspection
X


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


2. Splitting the dataset

In [None]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


3. Preprocessing

In [None]:
# Numeric preprocessing. Every column of X is numerical, so all of them go
# through the same two steps: fill missing values with the column mean, then
# standardise.
numeric_features = X.columns
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])


4. Combine Preprocessing using ColumnTransformer

In [None]:
# Wrap the numeric steps in a ColumnTransformer that applies them to the
# numeric feature columns.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
])

5. Build Pipeline: preprocessing + KNN

In [None]:
# Main pipeline: run the preprocessing first, then hand the transformed
# features to a KNeighborsRegressor for the regression task.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('knn', KNeighborsRegressor()),
])

6. Define hyperparameter grid

In [None]:
# Hyperparameter grid searched for the 'knn' step of the pipeline
# (the 'knn__' prefix routes each entry to that step):
#   n_neighbors - number of neighbours to consider
#   weights     - weight function used in prediction
#   p           - power of the Minkowski metric (1 = Manhattan, 2 = Euclidean)
param_grid = dict(
    knn__n_neighbors=[3, 5, 7, 9],
    knn__weights=['uniform', 'distance'],
    knn__p=[1, 2],
)

7. Apply GridSearchCV with 5-fold cross-validation

In [None]:
# Configure the exhaustive hyperparameter search:
#   pipeline / param_grid - the estimator to tune and the candidate values
#   scoring='r2'          - optimise the R^2 score
#   cv=5                  - 5-fold cross-validation
#   n_jobs=-1             - use all available CPU cores in parallel
#   verbose=1             - print progress messages

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

8. Fit the model

In [None]:
# Run the search on the training split: this performs the cross-validation
# and the hyperparameter tuning in one call.
grid_search.fit(X_train, y=y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


9. Evaluate on test set

In [None]:
# Take the best model found by GridSearchCV and predict on the unseen test data
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Evaluation metrics: squared error first (RMSE is the square root of MSE),
# then the R^2 goodness of fit
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

10. Print results

In [None]:
# Report the winning hyperparameters, the cross-validated score, and the
# held-out test metrics, one per line.
print(
    f"Best parameters:{grid_search.best_params_}",
    f"Best CV R^2 Score:{grid_search.best_score_}",
    f"Test R^2 Score: {r2:.4f}",
    f"Test MSE:{mse:.4f}",
    f"Test RMSE:{rmse:.4f}",
    sep="\n",
)



Best parameters:{'knn__n_neighbors': 9, 'knn__p': 1, 'knn__weights': 'distance'}
Best CV R^2 Score:0.731266870986164
Test R^2 Score: 0.7221
Test MSE:0.3642
Test RMSE:0.6034


11. Save the pipeline

In [None]:
# Persist the best trained pipeline with pickle so it can be loaded later to
# make new predictions without retraining.

with open('california_knn_pipeline.pkl', mode='wb') as model_file:
    pickle.dump(best_model, model_file)

print("Final pipeline saved to 'california_knn_pipeline.pkl' ")

Final pipeline saved to 'california_knn_pipeline.pkl' 


12. FastAPI app

In [None]:

# initialize FastAPI app
app = FastAPI()

# Path to the pickled model file. It is resolved against the current working
# directory -- the same place the training step writes
# 'california_knn_pipeline.pkl' -- rather than hard-coding Colab's '/content'
# directory, so the notebook also works outside Colab. (In Colab the working
# directory is /content, so the resolved path is unchanged there.)
MODEL_PATH = os.path.abspath('california_knn_pipeline.pkl')

# model stays None unless loading succeeds; the endpoints check for that
model = None
print(f"Attempting to load model from: {MODEL_PATH}")
# Load the model when the app starts
if os.path.exists(MODEL_PATH):
  try:
    # NOTE: unpickling can execute arbitrary code -- only load a file that
    # this notebook (or another trusted source) produced.
    with open(MODEL_PATH, 'rb') as f:
      model = pickle.load(f)
    print("Model loaded successfully")
  except Exception as e:
    # Best effort: report the failure and keep serving with model = None
    print(f"Error loading model: {e}")
else:
  print(f"Model file not found at {MODEL_PATH}")

# Define the Pydantic model for input data validation.
# The eight field names match the feature columns of the California Housing
# frame the pipeline is trained on, so a validated request body converts
# directly into a one-row DataFrame for prediction. Every field is required
# and must be coercible to float.
class HousingFeatures(BaseModel):
    MedInc: float
    HouseAge: float
    AveRooms: float
    AveBedrms: float
    Population: float
    AveOccup: float
    Latitude: float
    Longitude: float

# define home endpoint
@app.get('/')
async def read_root():
  '''
  Home endpoint to check if the API is running.

  Returns:
    dict: a single "message" entry that also reports whether the model
    was loaded.
  '''
  print("DEBUG: Home route accessed")
  # Compare against None explicitly, as the /predict endpoint does.
  # Truth-testing the estimator would consult its __len__/__bool__ rather
  # than answer "was a model loaded?".
  if model is not None:
    return {"message": "california housing prediction API is running and model is loaded"}
  return {"message": "california housing prediction API is running but the model failed to load"}
# define prediction endpoint
@app.post('/predict')
async def predict_house_value(features: HousingFeatures):
  '''
  Prediction endpoint.

  Expects a JSON payload with the housing features and returns a JSON
  response with the prediction.

  Args:
    features: request body validated against HousingFeatures.

  Returns:
    dict: {"predicted_house_value": <float>}.

  Raises:
    HTTPException: status 500 when no model is loaded or when the
      prediction itself fails.
  '''
  print("DEBUG: Prediction route accessed")
  if model is None:
    print("Error: Model is None when /predict was called")
    raise HTTPException(status_code=500, detail="Model not loaded. Ensure 'california_knn_pipeline.pkl' exists")
  try:
    # Convert the Pydantic model to a dictionary, then to a pandas DataFrame.
    # Pydantic v2 renamed .dict() to .model_dump() and deprecated the old
    # name; fall back to .dict() so the endpoint still works on Pydantic v1.
    if hasattr(features, "model_dump"):
      data = features.model_dump()
    else:
      data = features.dict()
    print(f"DEBUG: Received data for prediction {data}")

    # One-row frame whose column names are the feature names
    input_df = pd.DataFrame([data])

    # make predictions using the loaded model
    prediction = model.predict(input_df)

    # Return a plain Python float rather than a NumPy scalar so the JSON
    # encoding does not depend on NumPy types.
    return {"predicted_house_value": float(prediction[0])}
  except Exception as e:
    print(f"Error during prediction: {e}")
    raise HTTPException(status_code=500, detail=f"An unexpected error occured during prediction: {str(e)}")

# Run FastAPI with Uvicorn in Colab.
# nest_asyncio patches asyncio to permit nested event loops; it is applied
# here so the async server can be started from within the notebook.
nest_asyncio.apply()

def run_uvicorn_app():
  '''
  Serve the FastAPI app with Uvicorn on all interfaces at LOCAL_PORT.

  The call does not return while the server is running, which is why it is
  used as the target of a background thread.
  '''
  server_options = {'host': '0.0.0.0', 'port': LOCAL_PORT, 'log_level': "info"}
  uvicorn.run(app, **server_options)
# start uvicorn in a separate thread
print(f"Starting FastAPI app with Uvicorn on port {LOCAL_PORT} in a background thread")
uvicorn_thread = threading.Thread(target=run_uvicorn_app)
uvicorn_thread.start()

# give the server a few seconds to start before opening the tunnel
time.sleep(5)

# start ngrok tunnel to the Uvicorn port
print(f"Starting ngrok tunnel to local port {LOCAL_PORT}...")
try:
  # prompt user for the ngrok auth token securely
  # auth token is found at https://dashboard.ngrok.com/get-started/your-authtoken
  ngrok_auth_token = getpass.getpass(prompt="Enter your ngrok auth token:")
  ngrok.set_auth_token(ngrok_auth_token) # set the auth token

  # ngrok.connect returns an NgrokTunnel object, not a string. Interpolating
  # the object itself printed
  #   NgrokTunnel: "https://..." -> "http://localhost:8000"/predict
  # so read the bare URL from its .public_url attribute for display.
  # public_url keeps referring to the tunnel object, as before.
  public_url = ngrok.connect(LOCAL_PORT)
  api_url = public_url.public_url
  print(f"\n Your public ngrok.io API url is: {api_url}")
  print("Send a POST Request to the /predict endpoint using Postman or curl")
  print(f"POST URL: {api_url}/predict")
except Exception as e:
  print(f"Error starting ngrok tunnel: {e}")


INFO:     Started server process [230]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)


Attempting to load model from: /content/california_knn_pipeline.pkl
Model loaded successfully
Starting FastAPI app with Uvicorn on port 8000 in a background thread
Starting ngrok tunnel to local port 8000...
Enter your ngrok auth token:··········

 Your public ngrok.io API url is: NgrokTunnel: "https://9d8e9bb67967.ngrok-free.app" -> "http://localhost:8000"
Send a POST Request to the /predict endpoint using Postman or curl
POST URL: NgrokTunnel: "https://9d8e9bb67967.ngrok-free.app" -> "http://localhost:8000"/predict
