# First time Running BigQuery

In [None]:
# Agent's Thought Chain Reasoning Module served with Ollama
# (c) 2024 Gaby AI Inc. - https://www.gaby.mimeus.com
# Author: Mimi Phan

import os
from pathlib import Path

def _find_project_root(start: Path, markers: tuple[str, ...] = (".env.local", "src")) -> Path:
    """Return the nearest directory at or above ``start`` that holds a marker.

    Args:
        start: Directory to begin the upward search from.
        markers: File or directory names that identify the project root. The
            defaults are the two cwd-relative things this notebook relies on:
            the ``.env.local`` file and the ``src`` package directory.

    Returns:
        The first matching directory, or ``start`` itself when nothing matches
        (so the working directory is then left where it was).
    """
    for candidate in (start, *start.parents):
        if any((candidate / marker).exists() for marker in markers):
            return candidate
    return start


# A script knows its own location through __file__. A notebook kernel does not
# define __file__, so the root is discovered from the current directory instead.
# (The previous fallback, Path.cwd().root, is the filesystem root, e.g. "/".)
if "__file__" in globals():
    os.chdir(Path(__file__).resolve().parents[3])
else:
    os.chdir(_find_project_root(Path.cwd()))

import ollama
from google.cloud import bigquery
from google.oauth2 import service_account

import os
from abc import ABC, abstractmethod
from dotenv import load_dotenv

# Pull local overrides into the process environment before any setting is read.
load_dotenv('.env.local')

# Address of the Ollama server; falls back to a server on localhost:11434.
OLLAMA_HOST_URL = os.environ.get("OLLAMA_HOST_URL", "http://localhost:11434")

# Model identifier passed to Ollama for every chat request.
BASE_GUFF_LLM_MODEL = os.environ.get(
    "BASE_GUFF_LLM_MODEL",
    "hf.co/bartowski/Llama-3.2-3B-Instruct-GGUF:Q5_K_S",
)

class GabyBasement(ABC):
    """ Prompt Base Constructor. """

    _client = None

    def __new__(cls, *args, **kwargs):
        try:
            if cls._client is None:
                cls._client = ollama.Client(
                    base_url=OLLAMA_HOST_URL
                )
        except Exception as e:
            print(f"Error initializing GabyBasement: {e}")
        return super().__new__(cls, *args, **kwargs)

    def __init_subclass__(cls, prompt: str = "", **kwargs):
        super().__init_subclass__(**kwargs)
        cls.prompt = getattr(cls, "prompt", prompt)  # don’t overwrite if re-init
        cls.name = cls.__qualname__

    @property
    def client(self):
        return self._client

    @property
    def system_prompt(self):
        return [{"role": "system", "content": self.prompt}]

    def input_validator(self, **kwargs) -> None | str:
        """ Validates the user's input is what the subclassed instance's run method expects. Seperate function to add different input types by overriding this function in subclass, otherwise defaults to current function. """

        if kwargs.get("inputs", None) is None:
            raise ValueError("Missing required 'inputs' parameter.")

        return kwargs.get("inputs", None)

    @abstractmethod
    def post_process(self, response: str) -> str:
        """ Post-processes the response from the LLM before returning to the user. """
        pass

    def run(self, **kwargs) -> str:
        """ Main method to execute the thought chain. """

        if not hasattr(self, "client") or self.client is None:
            raise RuntimeError("Ollama client is not initialized."
                               " Ensure Ollama is running and OLLAMA_HOST_URL is correct.")

        print(f"Running thought Chain: {self.name}")

        user_inputs = self.input_validator(**kwargs)

        response = self.client.chat(
            model=BASE_GUFF_LLM_MODEL,
            messages=self.system_prompt + [{"role": "user", "content": user_inputs}],
            stream=False,
            options={"max_tokens": 500}
        )
        # TODO: ADD Post-processing later
        return response.message.get('content', None)

class BigQueryToolkit(ABC):
    """Base class for Gaby agent tools that talk to BigQuery.

    A ``bigquery.Client`` is built from the service-account file named by the
    ``GOOGLE_APPLICATION_CREDENTIALS`` environment variable the first time an
    instance is created, and is cached on the class being instantiated as
    ``bf``.
    """

    # Cached BigQuery client; stays None until one is built successfully.
    bf: bigquery.Client | None = None

    def __new__(cls, *args, **kwargs):
        """Create the instance, building the BigQuery client first if needed.

        Client creation is best-effort: any failure is printed and ``bf``
        stays None.
        """
        if cls.bf is None:
            try:
                cls.bf = cls._build_client()
                print('GCP BigQuery client initialized and verified.')
            except Exception as e:
                print(f"Error initializing BigQueryToolkit: {e}")

        # object.__new__ rejects extra arguments once __new__ is overridden;
        # constructor arguments are consumed by the subclass __init__ instead.
        return super().__new__(cls)

    @staticmethod
    def _build_client() -> bigquery.Client:
        """Build a BigQuery client from the configured service-account file.

        Raises:
            EnvironmentError: If ``GOOGLE_APPLICATION_CREDENTIALS`` is unset
                or empty.
            FileNotFoundError: If the variable points to a missing file.
        """
        credentials_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
        if not credentials_path:
            raise EnvironmentError("GOOGLE_APPLICATION_CREDENTIALS environment variable not set.")

        if not os.path.isfile(credentials_path):
            raise FileNotFoundError(f"Credentials file not found at {credentials_path}")

        credentials = service_account.Credentials.from_service_account_file(
            credentials_path
        )
        # The project is taken from the service-account file itself.
        return bigquery.Client(credentials=credentials, project=credentials.project_id)

# Instantiate the toolkit when this code runs as the main module.
# Creating the instance is what triggers the BigQuery client setup.
if __name__ == '__main__':
    gb = BigQueryToolkit()

GCP BigQuery client initialized and verified.


In [None]:
# Smoke-test query: ten rows from the public Shakespeare sample table.
TEST_QUERY = "SELECT * FROM `bigquery-public-data.samples.shakespeare` LIMIT 10"

In [None]:
# Submit the smoke-test query, then materialise its rows as a DataFrame.
test_query_job = gb.bf.query(TEST_QUERY)
sample = test_query_job.to_dataframe()



In [None]:
sample

Unnamed: 0,word,word_count,corpus,corpus_date
0,LVII,1,sonnets,0
1,augurs,1,sonnets,0
2,dimm'd,1,sonnets,0
3,plagues,1,sonnets,0
4,treason,1,sonnets,0
5,surmise,1,sonnets,0
6,heed,1,sonnets,0
7,Unthrifty,1,sonnets,0
8,quality,1,sonnets,0
9,wherever,1,sonnets,0


In [None]:
type(sample)

pandas.core.frame.DataFrame

In [None]:
from src.gaby_agent.core.data_cleaner_chain import DirtyDataInspector

In [None]:
# Run the data-quality checks on the query result.
# NOTE(review): the description says 'financial dataset', but `sample` holds
# rows from the Shakespeare word-count table; confirm the label is intended.
dt = DirtyDataInspector(sample, ['financial dataset'])

Ran checks on dataset with 10 rows and 4 fields for cleaning.


In [None]:
dt

DirtyDataInspector(data=        word  word_count   corpus  corpus_date
0       LVII           1  sonnets            0
1     augurs           1  sonnets            0
2     dimm'd           1  sonnets            0
3    plagues           1  sonnets            0
4    treason           1  sonnets            0
5    surmise           1  sonnets            0
6       heed           1  sonnets            0
7  Unthrifty           1  sonnets            0
8    quality           1  sonnets            0
9   wherever           1  sonnets            0, description=['financial dataset'], fields=['word', 'word_count', 'corpus', 'corpus_date'], num_fields=4, num_rows=10, num_missing_values={'word': np.int64(0), 'word_count': np.int64(0), 'corpus': np.int64(0), 'corpus_date': np.int64(0)}, num_duplicates=np.int64(0))

In [None]:
# give_summary is read without being called, so it is presumably a property
# that returns the text report; verify against DirtyDataInspector.
print(dt.give_summary)

Number of fields: 4
Number of rows: 10
Number of duplicates: 0
Missing values per field:
  - word: 0
  - word_count: 0
  - corpus: 0
  - corpus_date: 0


In [None]:
dt

DirtyDataInspector(data=        word  word_count   corpus  corpus_date
0       LVII           1  sonnets            0
1     augurs           1  sonnets            0
2     dimm'd           1  sonnets            0
3    plagues           1  sonnets            0
4    treason           1  sonnets            0
5    surmise           1  sonnets            0
6       heed           1  sonnets            0
7  Unthrifty           1  sonnets            0
8    quality           1  sonnets            0
9   wherever           1  sonnets            0, description=['financial dataset'], fields=['word', 'word_count', 'corpus', 'corpus_date'], num_fields=4, num_rows=10, num_missing_values={'word': np.int64(0), 'word_count': np.int64(0), 'corpus': np.int64(0), 'corpus_date': np.int64(0)}, num_duplicates=np.int64(0))

In [None]:
# list_datasets() hands back an iterator object, not a list; displaying it
# shows only the iterator's repr.
gb.bf.list_datasets()

<google.api_core.page_iterator.HTTPIterator at 0x135dd91b0>

In [None]:
# Consume the iterator to see the datasets visible to the client's project.
list(gb.bf.list_datasets())

[]

In [None]:

# Compact description of the inspected frame: size, dtypes, null counts and
# the first three rows as records.
frame = dt.data
summary = dict(
    shape=frame.shape,
    columns=frame.dtypes.to_dict(),
    missing=frame.isna().sum().to_dict(),
    sample=frame.head(3).to_dict(orient="records"),
)

print(summary)

{'shape': (10, 4), 'columns': {'word': dtype('O'), 'word_count': Int64Dtype(), 'corpus': dtype('O'), 'corpus_date': Int64Dtype()}, 'missing': {'word': 0, 'word_count': 0, 'corpus': 0, 'corpus_date': 0}, 'sample': [{'word': 'LVII', 'word_count': 1, 'corpus': 'sonnets', 'corpus_date': 0}, {'word': 'augurs', 'word_count': 1, 'corpus': 'sonnets', 'corpus_date': 0}, {'word': "dimm'd", 'word_count': 1, 'corpus': 'sonnets', 'corpus_date': 0}]}


In [None]:
# print() already converts its argument with str(), so no explicit cast is needed.
print(summary)

{'shape': (10, 4), 'columns': {'word': dtype('O'), 'word_count': Int64Dtype(), 'corpus': dtype('O'), 'corpus_date': Int64Dtype()}, 'missing': {'word': 0, 'word_count': 0, 'corpus': 0, 'corpus_date': 0}, 'sample': [{'word': 'LVII', 'word_count': 1, 'corpus': 'sonnets', 'corpus_date': 0}, {'word': 'augurs', 'word_count': 1, 'corpus': 'sonnets', 'corpus_date': 0}, {'word': "dimm'd", 'word_count': 1, 'corpus': 'sonnets', 'corpus_date': 0}]}


In [None]:
# BigQuery dataset IDs may contain only letters, digits and underscores,
# so the hyphenated form 'data-sample' is not a valid dataset name.
DATASET_ID = 'data_sample'

In [None]:
# The DDL below interpolates PROJECT_ID, which no earlier cell defines;
# take it from the authenticated client so the cell runs on a fresh kernel.
PROJECT_ID = gb.bf.project

# NOTE(review): BigQuery ML remote models are normally declared with a
# `REMOTE WITH CONNECTION` clause; confirm this statement against the
# CREATE MODEL reference before relying on it.
query = f"""
CREATE OR REPLACE MODEL `{PROJECT_ID}.{DATASET_ID}.llama`
OPTIONS(
    model_type='remote',
    remote_service_type='cloud_ai_large_language_model_v1',
    endpoint='projects/{PROJECT_ID}/locations/us-central1/publishers/meta/models/llama2-7b-chat'
);"""
# query() only submits the job; `qq` is the job handle.
qq = gb.bf.query(query)
