**Name:** 

**ID:**

In [None]:
# Authenticate the Colab user; the later BigQuery cells rely on these credentials.
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime — confirm before running this notebook elsewhere.
from google.colab import auth
auth.authenticate_user()
print('Authenticated')

**BigQuery DataFrames**

- bigframes.pandas provides a pandas-compatible API for analytics.

- bigframes.ml provides a scikit-learn-like API for ML.


> https://cloud.google.com/python/docs/reference/bigframes/latest



**Get data from BigQuery**

In [None]:
import bigframes.pandas as bf

In [None]:
# Google Cloud project ID used to configure the BigQuery DataFrames session.
PROJECT_ID = "ds-on-gcp-411105"

In [None]:
# BigQuery location used to configure the BigQuery DataFrames session.
REGION = "US"

In [None]:
# Point the BigQuery DataFrames session at the configured project and location.
bq_options = bf.options.bigquery
bq_options.project = PROJECT_ID
bq_options.location = REGION

In [None]:
# Load the public penguins table into a BigQuery DataFrame.
PENGUINS_TABLE = "bigquery-public-data.ml_datasets.penguins"
df = bf.read_gbq(PENGUINS_TABLE)

In [None]:
# Preview the first rows to check the column names and value formats.
df.head()

In [None]:
# (rows, columns) of the loaded table.
df.shape

**Check missing values**

In [None]:
# Number of missing values in each column.
df.isna().sum()

**Handle missing values**

In [None]:
# Keep only the rows that have a 'culmen_length_mm' measurement.
has_culmen_length = df['culmen_length_mm'].notnull()
df = df[has_culmen_length]

In [None]:
# Replace missing 'sex' values with the most frequent value of that column.
sex_column = df['sex']
# mode() can return several values on a tie, so take the first one.
mode_sex = sex_column.mode()[0]
df['sex'] = sex_column.fillna(mode_sex)

**Train/test splitting**

In [None]:
from bigframes import ml

In [None]:
import bigframes.ml.model_selection
import bigframes.ml.ensemble
import bigframes.ml.metrics

In [None]:
# Separate the label column from the feature columns.
TARGET_COLUMN = 'species'
y = df[TARGET_COLUMN]                 # Target
X = df.drop(columns=[TARGET_COLUMN])  # Features: every remaining column

In [None]:
# Hold out 20% of the rows as a test set using BigQuery DataFrames;
# random_state is fixed at 42.
X_train, X_test, y_train, y_test = ml.model_selection.train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


**Create model with Train data using bigframes.ml**

In [None]:
import time

In [None]:
# Create the classification model: a random forest with 5 parallel trees.
model = ml.ensemble.RandomForestClassifier(num_parallel_tree=5)

# Train the model and time only the fit itself.
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring elapsed time; time.time() is wall-clock time and can jump if the
# system clock is adjusted while training is running.
start_time = time.perf_counter()
model.fit(X_train, y_train)
training_time = time.perf_counter() - start_time

print("Training time:", training_time, "seconds")

**Predict Test data using the created model**

In [None]:
# Predict the test set and keep the predicted label column.
# The full prediction frame gets its own name instead of being overwritten by
# a slice of itself, and both steps live in one cell, so the cell can be
# re-run safely. (Previously, re-running `y_pred = y_pred['predicted_species']`
# failed because `y_pred` was no longer the prediction frame.)
predictions = model.predict(X_test)
y_pred = predictions['predicted_species']

**Evaluation**

**Confusion matrix**

In [None]:
# Confusion matrix of the true vs. predicted species on the test set.
conf_matrix = ml.metrics.confusion_matrix(y_test, y_pred)

In [None]:
# Display the confusion matrix computed in the previous cell.
conf_matrix

**Accuracy**,  **precision_score**, **recall_score**, **f1_score**

In [None]:
# Accuracy: compares the predicted species with the true species on the test set.
accuracy = ml.metrics.accuracy_score(y_test, y_pred)
print(accuracy)

In [None]:
#Precision


In [None]:
#Recall

In [None]:
#F1

**Save the model in BigQuery**

In [None]:
from google.cloud import bigquery

# Dataset that will hold the saved model.
# PROJECT_ID and REGION are the values defined near the top of the notebook
# and already applied to the BigQuery DataFrames session. They are reused here
# rather than redefined, so the dataset is always created in the same project
# and location that the session (and therefore model.to_gbq) uses.
DATASET_ID = "DemoSupervisedML"

client = bigquery.Client(project=PROJECT_ID)
dataset = bigquery.Dataset(f"{PROJECT_ID}.{DATASET_ID}")
dataset.location = REGION
# exists_ok=True makes this cell safe to re-run: an already existing dataset
# is returned instead of raising a conflict error.
dataset = client.create_dataset(dataset, exists_ok=True)

# The dataset may have existed already, so report it as ready rather than created.
print(f"Dataset {dataset.dataset_id} is ready.")

In [None]:
# Save the trained model to BigQuery as <dataset>.<model name>;
# replace=True is passed so an existing model with that name is replaced.
MODEL_NAME = 'RF_predict_penguin_species'
model_path = f"{DATASET_ID}.{MODEL_NAME}"
model.to_gbq(model_path, replace=True)
# Print a link to the BigQuery console for the project.
print(f'https://console.developers.google.com/bigquery?p={PROJECT_ID}')