In [None]:
import vertexai
from google.cloud import aiplatform

import os
import google.generativeai as genai

from dotenv import load_dotenv
load_dotenv()

GOOGLE_API_KEY = os.getenv("GEMINI_API_KEY")
SEMANTIC_SCHOLAR_API_KEY = os.getenv("SEMANTIC_SCHOLAR_API_KEY")

genai.configure(api_key=os.environ["GEMINI_API_KEY"])



In [33]:
from vertexai.preview import tokenization

from dotenv import load_dotenv
load_dotenv()

# Initialize Vertex AI
PROJECT_ID = "rbio-p-datasharing"  # Replace with your project ID
LOCATION = "us-central1"  # Replace with your location
vertexai.init(project=PROJECT_ID, location=LOCATION)

model_name = "gemini-1.5-flash"
text = "this is a test"
tokenizer = tokenization.get_tokenizer_for_model(model_name)


response = tokenizer.compute_tokens(text)
response

PreviewComputeTokensResult(tokens_info=[TokensInfo(token_ids=[883, 603, 476, 2121], tokens=[b'this', b' is', b' a', b' test'], role='user')])

In [22]:
token_ids = response.tokens_info[0].token_ids

In [12]:
response.tokens_info[0].token_ids

[883, 603, 476, 2121]

In [29]:
import requests
import json
import os

LOCATION = "us-central1"
MODEL_NAME = "gemini-1.5-pro-latest"

def get_access_token():
    """Gets an access token using gcloud."""
    import subprocess
    try:
        output = subprocess.check_output(["gcloud", "auth", "print-access-token"])
        return output.strip().decode()
    except subprocess.CalledProcessError as e:
        print(f"Error getting access token: {e}")
        return None

def decode_with_tokenizer(token_ids: list):
    """Decodes token IDs using the REST API."""
    url = f"https://us-central1-aiplatform.googleapis.com/v1/projects/{PROJECT_ID}/locations/{LOCATION}/models/"
    headers = {
        "Authorization": f"Bearer {get_access_token()}",
        "Content-Type": "application/json"
    }
    data = {"instances": [{"token_ids": token_ids}]}
    response = requests.post(url, headers=headers, json=data)
    response.raise_for_status()
    return response.json()["predictions"][0]["text"]

decode_with_tokenizer(response.tokens_info[0].token_ids)

HTTPError: 404 Client Error: Not Found for url: https://us-central1-aiplatform.googleapis.com/v1/projects/rbio-p-datasharing/locations/us-central1/models/

In [3]:

def encode_with_tokenizer(text: str, model_name: str):
    """Encodes text using the Vertex AI API (which uses the model's tokenizer)."""
    try:
        endpoint = aiplatform.Endpoint.find(
            endpoint=model_name, location=LOCATION
        )[0]  # Find the endpoint associated with the model

        response = endpoint.predict(instances=[{"content": text}])  # Send text to the endpoint
        encoded_ids = response.predictions[0]["token_ids"]
        return encoded_ids
    except IndexError:
        print(f"Error: No endpoint found for model '{model_name}'. "
              f"Make sure the model is deployed or available in your project.")
        return None
    except Exception as e:
        print(f"An error occurred during encoding: {e}")
        return None


encode_with_tokenizer("test", model_name="gemini-1.5-pro")

An error occurred during encoding: type object 'Endpoint' has no attribute 'find'


In [None]:

def decode_with_tokenizer(token_ids: list, model_name: str):
    """Decodes token IDs using the Vertex AI API."""
    try:
        endpoint = aiplatform.Endpoint.find(
            endpoint=model_name, location=LOCATION
        )[0]  # Find the endpoint associated with the model

        response = endpoint.predict(instances=[{"token_ids": token_ids}])
        decoded_text = response.predictions[0]["text"]
        return decoded_text
    except IndexError:
        print(f"Error: No endpoint found for model '{model_name}'. "
              f"Make sure the model is deployed or available in your project.")
        return None
    except Exception as e:
        print(f"An error occurred during decoding: {e}")
        return None

if __name__ == "__main__":
    original_text = "This is a test sentence for encoding and decoding with Gemini Flash 1.5."
    model_name = "gemini-1.5-flash-001"

    encoded_ids = encode_with_tokenizer(original_text, model_name)

    if encoded_ids is not None: #Check if encode was successful
        print(f"Encoded IDs: {encoded_ids}")

        decoded_text = decode_with_tokenizer(encoded_ids, model_name)
        if decoded_text is not None: #Check if decode was successful
            print(f"Decoded Text: {decoded_text}")
            if original_text == decoded_text:
                print("Encode and decode successful! Text matches.")
            else:
                print("Encode and decode successful but text does not match. This is unusual.")
        else:
            print("Decoding failed.")
    else:
        print("Encoding failed.")

    #Test a different model
    model_name = "gemini-pro"
    encoded_ids = encode_with_tokenizer(original_text, model_name)

    if encoded_ids is not None: #Check if encode was successful
        print(f"\nEncoded IDs (Gemini Pro): {encoded_ids}")

        decoded_text = decode_with_tokenizer(encoded_ids, model_name)
        if decoded_text is not None: #Check if decode was successful
            print(f"Decoded Text (Gemini Pro): {decoded_text}")
            if original_text == decoded_text:
                print("Encode and decode successful! Text matches.")
            else:
                print("Encode and decode successful but text does not match. This is unusual.")
        else:
            print("Decoding failed.")
    else:
        print("Encoding failed.")