# Testing SQL Syntax for Adding a TRUE/FALSE Column with BigQuery

In [None]:
"""
gbigquery/cleaner.py

Contains helper functions for Data Cleaning stage using BigQuery ML/AI methods.

"""

import os
from pathlib import Path

# Pick the working directory before anything resolves relative paths.
if "__file__" in globals():
    # Executed as a file: move five directory levels up from this file
    # (presumably the repository root -- TODO confirm).
    os.chdir(Path(__file__).resolve().parents[4])
else:
    # No __file__ (e.g. an interactive session).
    # NOTE(review): Path.cwd().root is the filesystem root ("/" on POSIX), not
    # the current directory -- confirm this is the intended target.
    os.chdir(Path.cwd().root)

from dotenv import load_dotenv
from google.cloud import bigquery
from google.oauth2 import service_account

# Read environment overrides from the local env file before looking up the
# credentials path.
load_dotenv('.env.local')

# Fail fast when the service-account key is not configured or does not exist.
credentials_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
if not credentials_path:
    raise EnvironmentError("GOOGLE_APPLICATION_CREDENTIALS environment variable not set.")
if not os.path.isfile(credentials_path):
    raise FileNotFoundError(f"Credentials file not found at {credentials_path}")

# Build the shared BigQuery client, scoped to the project named in the key file.
credentials = service_account.Credentials.from_service_account_file(credentials_path)
client = bigquery.Client(credentials=credentials, project=credentials.project_id)

print('GCP BigQuery client initialized and verified.')

# Project the shared client was created for; used to qualify the resources below.
PROJECT_ID = client.project
BQ_MODEL_ID = f"{PROJECT_ID}.open_source_db.base"
# Despite its name, this is the BigQuery *connection* resource that
# validate_data_field passes as `connection_id` to AI.GENERATE_BOOL.
# An earlier assignment set it to the publisher model path
# "projects/gaby-472309/locations/us-central1/publishers/google/models/gemini-2.5-flash-lite",
# but that value was overwritten on the very next line and never used, so only
# the effective value is kept.
BQ_MODEL_REMOTE_ENDPOINT = "projects/gaby-472309/locations/us/connections/__default_cloudresource_connection__"
# Model name that validate_data_field passes as `endpoint` to AI.GENERATE_BOOL.
BQ_MODEL_TYPE = "gemini-2.5-flash-lite"

# Fully qualified id of the table the helpers below read and (re)write by default.
DATASET_ID = "cafe_sales_db"
TABLE_ID = "data"
BQ_DATA_ID = f"{PROJECT_ID}.{DATASET_ID}.{TABLE_ID}"

def load_table(
    table_id: str = BQ_DATA_ID,
    client: bigquery.Client = client,
):
    """Fetch every row of a BigQuery table as a pandas DataFrame.

    Args:
        table_id: Table name placed between backticks in the ``FROM`` clause.
        client: BigQuery client that runs the query.

    Returns:
        The result of ``to_dataframe()`` on the submitted query job.
    """
    select_all = f"SELECT * FROM `{table_id}`"
    query_job = client.query(select_all)
    return query_job.to_dataframe()

def detect_datatype(
    file_path: str = "src/gaby_agent/data/input/dirty_cafe_sales.csv",
    table_id: str = BQ_DATA_ID,
    client: bigquery.Client = client,
    return_dataset: bool = True,
    skip_leading_rows: int = 0,
):
    """Load a CSV file into BigQuery with schema autodetection and report the schema.

    The destination table is overwritten (``WRITE_TRUNCATE``) on every call.

    Args:
        file_path: Path of the CSV file to upload.
        table_id: Destination table; also the table whose schema is returned.
        client: BigQuery client used for the load job, the schema lookup and
            the optional read-back of the loaded rows.
        return_dataset: When truthy, also return the loaded rows as a DataFrame.
        skip_leading_rows: Forwarded to ``LoadJobConfig.skip_leading_rows``.
            The default ``0`` keeps the previous behaviour: no row is skipped,
            so a header line in the file is loaded as data. Per the BigQuery
            load-job documentation, a positive value lets autodetect look for
            a header row -- pass ``1`` for a file whose first line is a header.

    Returns:
        ``(dataframe, schema)`` when ``return_dataset`` is truthy, otherwise
        just ``schema`` (the ``schema`` attribute of the destination table).
    """

    job_config = bigquery.LoadJobConfig(
        autodetect=True,
        source_format=bigquery.SourceFormat.CSV,
        # NOTE: 0 skips nothing (the old "Skip header row" comment was wrong).
        skip_leading_rows=skip_leading_rows,
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
    )

    with open(file_path, "rb") as source_file:
        load_job = client.load_table_from_file(
            source_file, table_id, job_config=job_config
        )

    load_job.result()  # Wait for the job to complete

    table = client.get_table(table_id)

    if not return_dataset:
        return table.schema

    # Read back through the same client that performed the load, rather than
    # silently falling back to the module-level default client.
    return load_table(table_id, client), table.schema

def validate_data_field_labels():
    """Print the name, data type and nullability of each column of the configured table.

    The columns are read from the dataset's ``INFORMATION_SCHEMA.COLUMNS`` view,
    filtered to ``TABLE_ID``. Nothing is returned.

    TODO: change this into a prompt so that the LLM can review the first 5
    subsets of the data columns and decipher the data label if it was not
    loaded properly.
    """

    schema_query = f"""
    SELECT
      column_name,
      data_type,
      is_nullable
    FROM
      `{PROJECT_ID}.{DATASET_ID}.INFORMATION_SCHEMA.COLUMNS`
    WHERE
      table_name = '{TABLE_ID}'
    """

    # One output line per column reported by the schema view.
    for column in client.query(schema_query).result():
        print(f"Column: {column.column_name}, Type: {column.data_type}, Nullable: {column.is_nullable}")

def validate_data_field(
    data_label: str = "string_field_3",
    data_type: str = "STRING",
    data_field_description: str = "This source dataset contains data on a Cafe Store total sales. This data field contains the prices per unit sold.",
    table_id: str = BQ_DATA_ID,
    client: bigquery.Client = client,
):
    """Ask the model, row by row, whether a column's value fits its expected type.

    Runs ``AI.GENERATE_BOOL`` over the first 10 rows of ``table_id`` and returns
    the column next to the model's boolean verdict.

    Previously the prompt contained only the instruction text and never the
    row's value, so the model was asked the identical question for every row.
    The value is now appended to the prompt as a second STRUCT field.

    Args:
        data_label: Column to check. Interpolated verbatim into the SQL, so it
            must be a trusted identifier.
        data_type: Expected data type named in the prompt.
        data_field_description: Free-text description of the table/column that
            is included in the prompt.
        table_id: Table to read from (placed between backticks in ``FROM``).
        client: BigQuery client that runs the query.

    Returns:
        A DataFrame with the ``data_label`` column and a ``result`` column
        holding the model's answer.
    """

    # The instruction is sent as a query parameter instead of being spliced
    # into a single-quoted SQL literal, so quotes or backslashes in the
    # description cannot break (or alter) the statement.
    instruction = (
        f"Check if this field aligns with expected data type: {data_type} "
        f"given its respective source data table description: {data_field_description} "
        "Field value: "
    )

    # The row value is cast to STRING so it can sit in the prompt STRUCT
    # regardless of the column's type; NULLs are spelled out for the model.
    query = f"""
    SELECT
        {data_label},
        AI.GENERATE_BOOL(
            (@instruction, IFNULL(CAST({data_label} AS STRING), 'NULL')),
            connection_id => '{BQ_MODEL_REMOTE_ENDPOINT}',
            endpoint => '{BQ_MODEL_TYPE}'
        ).result
    FROM
        `{table_id}`
    LIMIT 10;
    """
    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("instruction", "STRING", instruction),
        ]
    )

    print(f'running query: {query}')
    return client.query(query, job_config=job_config).to_dataframe()

# Run the check once with all default arguments and keep the returned DataFrame.
results = validate_data_field()

#gb = get_data_schema_from_existing_table()

GCP BigQuery client initialized and verified.
running query: 
    SELECT
        string_field_3,
        AI.GENERATE_BOOL(
            ('Check if this field aligns with expected data type: STRING given its respective source data table description: This source dataset contains data on a Cafe Store total sales. This data field contains the prices per unit sold.'),
            connection_id => 'projects/gaby-472309/locations/us/connections/__default_cloudresource_connection__',
            endpoint => 'gemini-2.5-flash-lite'
        ).result
    FROM
        `gaby-472309.cafe_sales_db.data`
    LIMIT 10;
    




In [None]:
results

Unnamed: 0,string_field_3,result
0,1.0,True
1,1.0,True
2,1.0,True
3,1.0,True
4,2.0,False
5,4.0,True
6,1.0,False
7,5.0,True
8,1.0,True
9,1.0,True
