# How to upload to BigQuery

In [None]:
"""
gbigquery/cleaner.py

Contains helper functions for Data Cleaning stage using BigQuery ML/AI methods.

"""

import os
from pathlib import Path

# Pin the working directory so the relative paths used below ('.env.local',
# the default CSV path) are resolved from a fixed location.
if "__file__" in globals():
    # Running as a file: move four directory levels above this file.
    os.chdir(Path(__file__).resolve().parents[4])
else:
    # No __file__ (e.g. an interactive/notebook session).
    # NOTE(review): Path.cwd().root is the filesystem root ("/" on POSIX), not
    # a project root -- confirm this is the intended fallback.
    os.chdir(Path.cwd().root)

from dotenv import load_dotenv
from google.cloud import bigquery
from google.oauth2 import service_account

# Load '.env.local' (relative to the current working directory) before the
# environment is consulted for the credentials path.
load_dotenv('.env.local')

credentials_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")

# Guard clauses: an unset or empty variable and a missing file both abort early.
if not credentials_path:
    raise EnvironmentError("GOOGLE_APPLICATION_CREDENTIALS environment variable not set.")
if not os.path.isfile(credentials_path):
    raise FileNotFoundError(f"Credentials file not found at {credentials_path}")

# Build the client from the service-account key; the project is taken from the
# key file itself.
credentials = service_account.Credentials.from_service_account_file(credentials_path)
client = bigquery.Client(project=credentials.project_id, credentials=credentials)

# NOTE(review): the message says "verified", but only the existence of the key
# file is checked here; no request is sent to BigQuery at this point.
print('GCP BigQuery client initialized and verified.')

# Identifier parts.
PROJECT_ID = client.project
DATASET_ID = "cafe_sales_db"
TABLE_ID = "data"

# Identifiers in "project.dataset.name" form.
BQ_MODEL_ID = f"{PROJECT_ID}.open_source_db.base"
BQ_DATA_ID = f"{PROJECT_ID}.{DATASET_ID}.{TABLE_ID}"

def detect_datatype(
    file_path: str = "src/gaby_agent/data/input/dirty_cafe_sales.csv",
    table_id: str = BQ_DATA_ID,
    client=client,
    return_pandas: bool = True,
    write_disposition: str = "WRITE_TRUNCATE",
):
    """Load a CSV file into a BigQuery table and report its column types.

    The file is uploaded through a load job with schema auto-detection
    enabled. Once the job has finished, the destination table is fetched and
    its schema is summarised as a ``{column name: field type}`` mapping.

    Args:
        file_path: Path of the local CSV file to upload. Its first row is
            skipped as a header.
        table_id: Destination table ID (the default has the form
            ``project.dataset.table``).
        client: BigQuery client used for the load job and the table lookup.
        return_pandas: When true, return ``(table, schema_info)``; otherwise
            return only ``schema_info``. Despite the name, no pandas object is
            produced: ``table`` is whatever ``client.get_table`` returns.
        write_disposition: What to do when the destination table already
            exists. Defaults to ``"WRITE_TRUNCATE"`` so that each call replaces
            the table with the freshly loaded file. Pass ``"WRITE_APPEND"`` to
            add the rows to the existing table instead.

    Returns:
        ``(table, schema_info)`` or ``schema_info``, depending on
        ``return_pandas``.

    Raises:
        OSError: If ``file_path`` cannot be opened.
        Errors raised by the BigQuery client (for example from
        ``load_job.result()`` or ``client.get_table``) are not caught here.
    """

    job_config = bigquery.LoadJobConfig(
        autodetect=True,
        source_format=bigquery.SourceFormat.CSV,
        skip_leading_rows=1,  # Skip header row
        # Load jobs append by default, so without an explicit disposition every
        # call would add the same rows again and the schema read back below
        # would be that of the pre-existing table, not the detected one.
        write_disposition=write_disposition,
    )

    with open(file_path, "rb") as source_file:
        load_job = client.load_table_from_file(
            source_file, table_id, job_config=job_config
        )

    load_job.result()  # Wait for the job to complete

    table = client.get_table(table_id)
    schema_info = {field.name: field.field_type for field in table.schema}

    # Plain truthiness instead of ``is True``: identical for real booleans and
    # also well-behaved for other truthy flag values.
    if return_pandas:
        return table, schema_info
    return schema_info

def get_data_schema_from_existing_table(
    table_id: str = BQ_DATA_ID,
    client=client
):
    """
    Return the column types of a table that already exists in BigQuery.

    Nothing is uploaded: the table is looked up with ``client.get_table`` and
    its schema is turned into a ``{column name: field type}`` dict.

    Currently unused in favour of detect_datatype(); planned for removal after
    submission.
    """

    existing_table = client.get_table(table_id)

    column_types = {}
    for column in existing_table.schema:
        column_types[column.name] = column.field_type

    return column_types

# Alternative that only reads the schema of the already-existing table
# (gb would then be the {column name: field type} dict):
#gb = get_data_schema_from_existing_table()
# Load the default CSV into the default table; gb is the table object returned
# by the client and schema is the {column name: field type} dict.
gb, schema = detect_datatype()
print(gb)

GCP BigQuery client initialized and verified.
gaby-472309.cafe_sales_db.data


In [None]:
gb

Table(TableReference(DatasetReference('gaby-472309', 'cafe_sales_db'), 'data'))

In [None]:
# Fetch every row of the loaded table into a DataFrame for inspection.
select_all_sql = f"SELECT * FROM {BQ_DATA_ID}"
df = client.query(select_all_sql).to_dataframe()



In [None]:
df

Unnamed: 0,string_field_0,string_field_1,string_field_2,string_field_3,string_field_4,string_field_5,string_field_6,string_field_7
0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
1,TXN_6434414,,1,2.0,,Digital Wallet,Takeaway,2023-03-10
2,TXN_5651348,,3,5.0,,Cash,Takeaway,2023-11-04
3,TXN_9601220,,3,1.0,,Digital Wallet,Takeaway,2023-04-20
4,TXN_5926025,,2,4.0,,Digital Wallet,,2023-07-29
...,...,...,...,...,...,...,...,...
29996,TXN_8669012,UNKNOWN,3,1.5,UNKNOWN,Digital Wallet,,2023-07-26
29997,TXN_3222612,UNKNOWN,3,3.0,UNKNOWN,Cash,Takeaway,2023-01-25
29998,TXN_1766527,UNKNOWN,4,5.0,UNKNOWN,Digital Wallet,,2023-02-07
29999,TXN_1318943,UNKNOWN,2,2.0,UNKNOWN,Cash,In-store,2023-06-13
