# Demo Collab with Bigquery and Google Storage

## Data Prep

In [1]:
# Dependencies for training the model
import pandas as pd
from google.cloud import bigquery
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Fetching data with google big query sdk

bq_client = bigquery.Client()
query = "SELECT * FROM `prolaio-data-testing.vpasquierdemo.heart`"
dataset = bq_client.query(query).to_dataframe()
dataset.head()

In [None]:
dataset_dummies = pd.get_dummies(
    dataset, columns=["sex", "cp", "fbs", "restecg", "exang", "slope", "ca", "thal"]
)
dataset_dummies.head()

In [None]:
standard_scaler = StandardScaler()
columns_to_scale = ["age", "trestbps", "chol", "thalach", "oldpeak"]
dataset_dummies[columns_to_scale] = standard_scaler.fit_transform(
    dataset_dummies[columns_to_scale]
)
dataset_dummies.head()

In [5]:
y = dataset_dummies["target"]
X = dataset_dummies.drop(["target"], axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Random Forest


In [37]:
# Training on Random Forest model
rf_param_grid = {
    "n_estimators": range(1, 100, 10),
    "max_depth": [None, 10, 20, 30],
}
rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(
    param_distributions=rf_param_grid,
    estimator=rf,
    scoring="accuracy",
    verbose=0,
    n_iter=100,
    cv=4,
)
rf_random.fit(X_train, y_train)
best_params = rf_random.best_params_
print(f"Best parameters: {best_params}")
print(f"Score: {rf_random.score(X_test, y_test)}")

KeyboardInterrupt: 

In [None]:
# Plot results
def feature_imp(df, model):
    fi = pd.DataFrame(columns=["feature", "importance"])
    fi["feature"] = df.columns
    fi["importance"] = model.best_estimator_.feature_importances_
    return fi.sort_values(by="importance", ascending=False)

feature_imp(X_train, rf_random).plot(
    "feature", "importance", "barh", figsize=(10, 7), legend=False
)

## Prolaio Toolkit Usage

## Saving Models

In [29]:
# Saving the model
import joblib
from google.cloud import aiplatform, storage

# Save via joblib
joblib.dump(rf_random, "saved_model.joblib")

# Set up GCP credentials
storage_client = storage.Client()

# Specify your GCP bucket and object paths
bucket_name = "configuration-pilot"
object_name = "saved_model.joblib"

# Upload the model to GCP Storage
bucket = storage_client.get_bucket(bucket_name)
blob = bucket.blob(f"models/{object_name}")
blob.upload_from_filename(object_name)

# Set up GCP credentials
aiplatform.init(project="prolaio-data-testing")

# Specify the model display name and description
model_display_name = "random-forest-v1"
model_description = "The best one"

# Specify the model URI in GCP Storage
model_uri = f"gs://{bucket_name}/models"

# Register the model in the Model Registry
model = aiplatform.Model(model_name=model_display_name)
model.update()



<google.cloud.aiplatform.models.Model object at 0x7adde2ebffa0> 
resource name: projects/283985719780/locations/us-central1/models/random-forest-v1