### What's up in the notebook?

This is the big one: all of the steps needed to get benchmarks.
#### 1. Function Definitions

#### 2.  Test the SQL functions

#### 3. Check the data in weaviate, just to be sure

#### 4. Create the baseline in SQLite
Run a Weaviate query with a cursor and, 
one object at a time, add a new row to tSample 
with the following fields:
    benchmark prompt 
    benchmark completion
    api_completion
    cosine similarity 

You can re-run the steps above on whatever schedule suits you: daily, weekly, and so on.

#### 5. Export SQL tSample table to CSV

#### N.B. Cohere API Trial key is limited to 5 API calls/minute. 
You can continue to use the Trial key for free or upgrade to a Production key with higher rate limits at 'https://dashboard.cohere.ai/api-keys'. 



In [None]:
# 1. Function Definitions

import datetime
import sqlite3

print("Connecting to sqlite driftDb")
# Relative, backslash-separated path to the SQLite database file.
DRIFT_DB_PATH = "..\\data\\driftDb.db"
sql_conxn = sqlite3.connect(DRIFT_DB_PATH)
# The cursor has to be created from the connection opened on the line above.
sql_cursor = sql_conxn.cursor()

# SQLite benchmark sample and completion functions
# Usage:
# insert_benchmark("chat_id00000000", "bmk_prompt", "bmk_completion")
# insert_api_completion("chat_id00000000", "bmk_prompt", "bmk_completion", 0.987654321, "api_completion")

# Timestamp (truncated to the minute) read by the insert functions below, so
# every row written during this session carries the same run_time.
run_time = ct = datetime.datetime.now().replace(second=0, microsecond=0)

# inserts default value of 1.0 for cos_sim
def insert_benchmark(chat_id, bmk_prompt, bmk_completion):
    """Insert a baseline row into tSample with cos_sim set to 1.0 and commit.

    Values are bound as SQL parameters instead of being formatted into the
    statement, so quotes inside the text can neither break nor alter the SQL.

    Args:
        chat_id: Identifier stored in the chat_id column.
        bmk_prompt: Benchmark prompt text. A doubled double quote ("") is
            stored as a single one, which is how text that was passed through
            escape() has always ended up in the table.
        bmk_completion: Benchmark completion text; same quote handling as
            bmk_prompt.

    Uses the module-level sql_cursor, sql_conxn and run_time.
    """
    sql = (
        "INSERT INTO tSample (chat_id, run_time, bmk_prompt, bmk_completion, cos_sim) "
        "VALUES (?, ?, ?, ?, ?);"
    )
    params = (
        chat_id,
        # Store the timestamp as the same text the formatted statement produced.
        str(run_time),
        # Callers double every '"' before calling; undo that so the stored
        # text is the original text.
        bmk_prompt.replace('""', '"'),
        bmk_completion.replace('""', '"'),
        1.0,
    )
    print(sql, params)
    sql_cursor.execute(sql, params)
    sql_conxn.commit()
    

def insert_api_completion(chat_id, bmk_prompt, bmk_completion, cos_sim, api_completion):
    """Insert a row holding a fresh API completion and its similarity, then commit.

    Values are bound as SQL parameters instead of being formatted into the
    statement, so quotes inside the text can neither break nor alter the SQL.

    Args:
        chat_id: Identifier stored in the chat_id column.
        bmk_prompt: Benchmark prompt text. A doubled double quote ("") is
            stored as a single one, which is how text that was passed through
            escape() has always ended up in the table.
        bmk_completion: Benchmark completion text; same quote handling.
        cos_sim: Cosine similarity value stored in the cos_sim column.
        api_completion: Completion returned by the API; same quote handling.

    Uses the module-level sql_cursor, sql_conxn and run_time.
    """
    sql = (
        "INSERT INTO tSample "
        "(chat_id, run_time, bmk_prompt, bmk_completion, cos_sim, api_completion) "
        "VALUES (?, ?, ?, ?, ?, ?);"
    )
    params = (
        chat_id,
        # Store the timestamp as the same text the formatted statement produced.
        str(run_time),
        # Callers double every '"' before calling; undo that so the stored
        # text is the original text.
        bmk_prompt.replace('""', '"'),
        bmk_completion.replace('""', '"'),
        cos_sim,
        api_completion.replace('""', '"'),
    )
    print(sql, params)
    sql_cursor.execute(sql, params)
    sql_conxn.commit()

def IsOkString(value):
    """Return True when *value* is truthy and has a non-zero length, else False."""
    # Falsy values (None, "", empty containers) short-circuit before len() runs.
    return bool(value) and len(value) != 0
    
def print_runtime():
    """Print the current local time, truncated to the minute."""
    now = datetime.datetime.now()
    # Drop seconds and microseconds so only minute precision is shown.
    stamp = now.replace(second=0, microsecond=0)
    print("Last run time: {}".format(stamp))
    
def escape(text):
    """Return *text* with every double-quote character doubled."""
    doubled_quote = '"' * 2
    # Splitting on '"' and re-joining with '""' doubles each occurrence.
    return doubled_quote.join(text.split('"'))
    
# Print the current time so the cell output shows when it was last executed.
print_runtime()



In [None]:
# 2.  Test the SQL functions

# json.dumps is used at the end of this cell; import it here so the cell runs
# even when no other cell has imported json yet.
import json

# check we have the right database
sql_cursor.execute("PRAGMA database_list")
rows = sql_cursor.fetchall()

# Print the first three columns of each row returned by the PRAGMA.
for row in rows:
    print(row[0], row[1], row[2])

# Insert one row through each helper to exercise them.
insert_benchmark("chat_id00000000", "bmk_prompt", "bmk_completion")
insert_api_completion("chat_id00000000", "bmk_prompt", "bmk_completion", 0.987654321, "api_completion")

# Dump the whole table so the inserted rows can be inspected.
res = sql_cursor.execute("SELECT * FROM tSample;")
rows = res.fetchall()
print(json.dumps(rows, indent=4))

In [None]:
# 2a Clean up SQL DB
# Deletes every row in tSample (this includes the test rows inserted by the
# cell above) and commits the deletion.
sql_cursor.execute("DELETE FROM tSample;")
sql_conxn.commit()

print("sqlite3 script completed.")

In [None]:
# 3. Check the data in weaviate
 
import weaviate
import json

print("Let's a go!")

print("Connecting to weaviate instance on localhost:8080...")
client = weaviate.Client("http://localhost:8080")
print("Client created")

# get the schema
# print(json.dumps(client.schema.get(), indent=4))

# Number of objects to fetch for this spot check.
batch_size = 5

# Properties requested from each DriftBenchmark object.
benchmark_properties = [
    "chat_id",
    "prompt_token_len",
    "prompt_sent_len",
    "prompt",
    "completion_token_len",
    "completion_sent_len",
    "completion",
]

sample_query = client.query.get("DriftBenchmark", benchmark_properties)
# Optional when checking
# Retrieve the vector embedding by adding `vector` to the _additional fields
# sample_query = sample_query.with_additional(["id vector"])
sample_query = sample_query.with_limit(batch_size)
result = sample_query.do()

# Show when the check ran, then the raw query response.
print_runtime()
print(json.dumps(result, indent=4))

In [None]:

# 4. Using a weaviate cursor, 
# send the benchmark prompt to cohere
# calculate the cosine similarity
# insert new row into db.

import weaviate
import cohere
import os
from dotenv import load_dotenv
# Load environment variables via python-dotenv (presumably from a local .env
# file — confirm where the key is kept).
load_dotenv()

# Cohere API key; os.getenv returns None when COHERE_API_KEY is not set.
api_key = os.getenv('COHERE_API_KEY')

from sentence_transformers import SentenceTransformer, util

print("Load the sentence transformer model: multi-qa-MiniLM-L6-cos-v1 ")
# Embedding model used below to encode each new API completion.
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

print("Connecting source_client to weaviate instance on localhost:8080...")
source_client = weaviate.Client("http://localhost:8080")
print("source_client created")

# Cohere trial key is limited to 5 requests per minute.
# limit: the loop below stops once aggregate_count reaches this value.
limit = 3
# result_count: incremented once per benchmark object processed by the loop.
result_count = 0
# aggregate_count: running total of objects fetched from Weaviate.
aggregate_count = 0
# batch_size: number of objects requested per Weaviate query.
batch_size = 3
# class_name: Weaviate class; also the key used to index the query response.
class_name = "DriftBenchmark"
# class_properties: passed to wv_get_batch_with_cursor.
class_properties = ["prompt"]
# cursor: id of the last object retrieved; None requests the first batch.
cursor = None

print("Creating cohere_client")
cohere_client = cohere.Client(api_key) # This is your trial API key


def wv_get_batch_with_cursor(client, class_name, class_properties, batch_size, cursor=None):
    """Fetch one batch of objects of *class_name* from Weaviate.

    Args:
        client: Weaviate client whose query builder is used.
        class_name: Name of the Weaviate class to query. The caller indexes
            the response by this same name, so the query must use it too.
        class_properties: Extra property names to request. "chat_id",
            "prompt" and "completion" are always requested; names given here
            are appended without duplicates. May be None or empty.
        batch_size: Maximum number of objects to return.
        cursor: Object id to continue after, or None for the first batch.

    Returns:
        Whatever the query's do() call returns (the raw query response).
    """
    # The benchmark loop reads these three fields from every object, so they
    # are requested regardless of what the caller passes.
    required_properties = ["chat_id", "prompt", "completion"]
    extra_properties = list(class_properties or [])
    # dict.fromkeys de-duplicates while keeping first-seen order.
    properties = list(dict.fromkeys(required_properties + extra_properties))

    query = (
        client.query.get(class_name, properties)
        # Also retrieve the object id and its vector embedding
        .with_additional(["id vector"])
        .with_limit(batch_size)
    )

    # Continue after the given object id when a cursor is supplied.
    if cursor is not None:
        query = query.with_after(cursor)
    return query.do()

# Walk the benchmark objects batch by batch. For each object: send its prompt
# to Cohere, embed the new completion, compare it with the vector stored in
# Weaviate and record the result in tSample.
while True:

    # From the SOURCE instance, get the next group of objects
    results = wv_get_batch_with_cursor(source_client, class_name, class_properties, batch_size, cursor)
    objects_list = results["data"]["Get"][class_name]

    # If empty, we're finished
    if len(objects_list) == 0:
        break

    # aggregate_count tracks how many objects have been fetched so far.
    aggregate_count += len(objects_list)

    print('aggregate_count = {}'.format(aggregate_count))

    for res in objects_list:
        # Enforce the cap per API call, not per batch, so a batch_size that
        # does not divide limit cannot push the number of Cohere requests
        # past limit.
        if result_count >= limit:
            break

        the_prompt = res['prompt']
        result_count += 1

        response = cohere_client.generate(
            model='command',
            prompt=the_prompt,
            max_tokens=300,
            temperature=0.9,
            k=0,
            stop_sequences=[],
            return_likelihoods='NONE')

        # cohere completion, flattened onto a single line
        api_completion = response.generations[0].text.replace("\n", " ")
        # encode as a vector
        vec_current_completion = model.encode(api_completion)

        # calculate the cosine similarity between the vector stored in
        # Weaviate and the vector of the new completion
        # NOTE(review): assumes the stored vector comes from the same
        # embedding model as `model` — confirm.
        cos_sim = util.cos_sim(res['_additional']['vector'], vec_current_completion)
        print("Orig completion: {}".format(res['completion']))
        print("API completion: {}".format(api_completion))
        print("Cosine-Similarity: {} \n".format(cos_sim.item()))
        print("------------------------------------------------------------------------------------------------")

        # Double the double quotes before handing the text to the insert helper.
        bmk_prompt = escape(res["prompt"])
        bmk_completion = escape(res["completion"])
        completion = escape(api_completion)
        insert_api_completion(res["chat_id"], bmk_prompt, bmk_completion, cos_sim.item(), completion)

    # Stop once at least `limit` objects have been fetched.
    if aggregate_count >= limit:
        break

    # Update the cursor to the id of the last retrieved object
    cursor = objects_list[-1]["_additional"]["id"]

sql_conxn.commit()

In [None]:
# have a look
# Fetch every row of tSample and pretty-print the rows as JSON.
res = sql_cursor.execute("SELECT * FROM tSample;")
rows = res.fetchall()
print(json.dumps(rows, indent=4)) 

In [None]:
# When you're ready you can close these
# Close the cursor first, then the connection it belongs to.
sql_cursor.close()
sql_conxn.close()

In [None]:
# 5. Export SQL tSample table to CSV

import csv
import sqlite3

print("Connecting to sqlite driftDb")
DRIFT_DB_PATH = "..\\data\\driftDb.db"
sql_conxn = sqlite3.connect(DRIFT_DB_PATH)
# The cursor has to be created from the connection opened on the line above.
sql_cursor = sql_conxn.cursor()
try:
    # If you want to have a look first:
    # res = sql_cursor.execute("SELECT * FROM tSample;")
    # rows = res.fetchall()
    # print(json.dumps(rows, indent=4)) 

    outFile = "..\\data\\tSample.csv"

    sql_cursor.execute("SELECT chat_id, run_time, cos_sim, bmk_prompt, bmk_completion, api_completion from tSample ORDER BY run_time ASC, cos_sim DESC;")
    # Column names of the SELECT become the CSV header.
    columns = [column[0] for column in sql_cursor.description]
    # One dict per row, keyed by column name, as csv.DictWriter expects.
    results = [dict(zip(columns, row)) for row in sql_cursor.fetchall()]

    # newline='' lets the csv module control line endings. UTF-8 is set
    # explicitly so completions containing non-ASCII characters do not fail
    # under a narrower platform default encoding.
    with open(outFile, "w", newline='', encoding="utf-8") as new_file:
        writer = csv.DictWriter(new_file, fieldnames=columns)
        writer.writeheader()
        writer.writerows(results)

    print("Finished writing to: {}".format(outFile))

except sqlite3.Error as error:
    print("sqlite3.Error: ", error)
finally:
    # Always release the cursor and the connection, even after an error.
    if sql_conxn:
        sql_cursor.close()
        sql_conxn.close()
