In [None]:
# Enable IPython's autoreload extension in mode 2 so imported modules are
# re-imported before each cell executes (edits to project code take effect
# without restarting the kernel).
%load_ext autoreload
%autoreload 2

In [None]:
from ai_data_formatter.copilot.model import PromptCodeChatSession
from ai_data_formatter.config import ModelConfig, DBClient
import pandas as pd
import json
from sqlalchemy.engine import URL
from configparser import ConfigParser

# Load settings from the "set-env" file. The file has no INI section header,
# so a synthetic "[DEFAULT]" header is prepended to make ConfigParser accept it.
# Interpolation is disabled: with the default BasicInterpolation, any value
# containing "%" (common in generated secrets) raises on .get().
config = ConfigParser(interpolation=None)
with open("set-env") as stream:
    config.read_string("[DEFAULT]\n" + stream.read())

# Model/prompt definitions; later code filters this collection by each entry's "tag".
with open("code_prompt_template.json", "r") as f:
    model_configs = json.load(f)

# All settings live in the synthetic DEFAULT section; .get() returns None for
# a missing key, which would then be rendered as the text "None" in the URLs below.
env_settings = config['DEFAULT']

# PostgreSQL connection string.
pg_host = env_settings.get("PG_HOST")
pg_uname = env_settings.get("PG_UNAME")
pg_secret = env_settings.get("PG_SECRET")
pg_db = env_settings.get("PG_DB")
conn_str_alchemy = f"postgresql://{pg_uname}:{pg_secret}@{pg_host}/{pg_db}"

# Redis connection string (password-only auth, database 0).
cache_secret = env_settings.get("CACHE_SECRET")
cache_host = env_settings.get("CACHE_HOST")
cache_port = env_settings.get("CACHE_PORT")
conn_str_redis = f"redis://:{cache_secret}@{cache_host}:{cache_port}/0"

# Sample data passed to the chat session later in the notebook.
test_data = pd.read_csv("data/pii_test.csv")

# Token used as the Bearer credential for GitHub API requests.
github_token = env_settings.get("GITHUB_SECRET")

import os

# Project used by the Google Cloud client libraries.
os.environ["GOOGLE_CLOUD_PROJECT"] = "docai-warehouse-demo"
# Respect a credentials path that is already set in the environment; the
# developer-specific absolute path is only a fallback so the notebook is not
# tied to one machine's home directory.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/Users/zjia/Workspace/gen-ai-data-transformer/sa_token.json",
)

# Build the DB/cache client from connection strings assembled out of set-env.
# The cache URL must come from configuration (conn_str_redis), never from an
# inline literal: a credential committed in a notebook is leaked to everyone
# with repository access.
# NOTE(review): the Redis access key that was previously hard-coded here
# should be rotated.
dbclient = DBClient.from_dict(
    {
        "db": {
            "url": conn_str_alchemy
        },
        "cache": {
            "url": conn_str_redis,
            # presumably the cache entry TTL in seconds — confirm against DBClient
            "expire_time_second": 120
        }
    }
)

# Keep only the entries tagged "copilot-cleansing" and take the first one
# (IndexError if none match), then build the ModelConfig from that dict.
cleansing_specs = [
    spec for spec in model_configs
    if spec.get("tag") == "copilot-cleansing"
]
model_config_spec = cleansing_specs[0]
model_config = ModelConfig.from_dict(model_config_spec)



In [None]:
import requests

# Headers for the GitHub REST API: JSON media type, bearer token auth, pinned API version.
headers = {
    "Accept": "application/vnd.github+json",
    "Authorization": f"Bearer {github_token}",
    "X-GitHub-Api-Version": "2022-11-28"
}
# Code search for "clean ssn" restricted to the ksmc organization.
url = "https://api.github.com/search/code?q=clean ssn+org:ksmc"

# A timeout prevents the cell from hanging indefinitely on a stalled connection.
res = requests.get(url, headers=headers, timeout=30)
# Fail loudly on auth/rate-limit/validation errors instead of hitting a
# confusing KeyError on "items" below.
res.raise_for_status()
# Show the first search hit.
res.json()["items"][0]

In [None]:
# Derive the raw-content URL of the first search hit from its html_url.
# Parse the response once and reuse the first item.
first_hit = res.json()["items"][0]
path_array = first_hit["html_url"].split("/")
repository_name = first_hit.get("repository", {}).get("name")
# Everything after the "blob" segment is "<ref>/<path inside the repo>".
# URL paths always use "/", so join explicitly rather than with os.path.join,
# which would produce backslashes on Windows.
raw_file_path = "/".join(path_array[path_array.index('blob')+1:])
raw_url = f"https://raw.githubusercontent.com/ksmc/{repository_name}/{raw_file_path}"
raw_url

In [None]:
# raw_url = "https://github.com/ksmc/p3rl-prototype-azure/raw/b576cb4fd0a20efbddf502e9844471bb0890a97b/notebooks/main-clean.py"
# raw_url = "https://raw.githubusercontent.com/ksmc/p3rl-prototype-azure/b576cb4fd0a20efbddf502e9844471bb0890a97b/notebooks/main-clean.py"

In [None]:
# Display the full parsed JSON body of the response currently held in `res`.
res.json()

In [None]:
# Download the raw file found by the search, reusing the authenticated headers.
# The timeout keeps the cell from hanging; raise_for_status surfaces 4xx/5xx
# responses instead of silently displaying an error page as if it were source code.
res = requests.get(raw_url, headers=headers, timeout=30)
res.raise_for_status()
res.text

In [None]:
# Collect the constructor arguments first, then create the chat session.
# No existing session id is supplied and the model is not pre-loaded.
session_kwargs = {
    "project_id": "docai-warehouse-demo",
    "location": "us-central1",
    "model_config": model_config,
    "session_id": None,
    "dbclient": dbclient,
    "pre_load_model": False,
}
session = PromptCodeChatSession(**session_kwargs)

In [None]:
# Pass the sample DataFrame (read from data/pii_test.csv) to the session and
# display whatever load_sample_data returns.
res = session.load_sample_data(test_data)
res

In [None]:
# Prompt that embeds five numbered web-search snippets, followed by citation
# instructions and the user query. It is sent through the session's underlying
# chat_session object, and the text of the reply is printed.
grounded_prompt = """
    Web search results:

[1] "Number and Keep Your Information Safe Social Security maintains a robust cybersecurity system, but you are the most important factor in helping us keep your information safe. You can help by: â€¢ Opening your personal my Social Security account. Create your account today and take away the risk of someone else trying to create one in your name ..." URL: https://www.ssa.gov/pubs/EN-05-10220.pdf

[2] "If documentation isn't required and you just need to share an ID number or some other details, you can provide the information over the phone. Again, do so only if you know the person is legitimate and trustworthy. Short of relying on an overnight courier or the postal service, your alternatives involve technology." URL: https://www.aarp.org/home-family/personal-technology/info-2021/online-ssn-security-tips.html

[3] "Social Security Number Format. The Social Security number format is a nine-digit number, generally separated by hyphens into sections of three digits, two digits, and four digits. For example, a typical SSN follows the format of "AAA-GG-SSSS," where A represents the Area number, G represents the Group number, and S represents the Serial number." URL: https://ssofficelocation.com/resources/social-security-number-format/

[4] "Never give them this information without verifying their identity. The best way to do this is by calling them back at a verified phone number or visiting them in person if possible. The phone call might be from a scammer who is looking to steal your information. #4." URL: https://ssofficelocation.com/resources/how-to-protect-your-social-security-number/

[5] "Social Security numbers, also known as SSNs, are allocated through a process called randomization that was introduced in June 2011. This system retains the long-standing nine-digit format but assigns a number to each new Social Security cardholder randomly, eliminating methods that date to the inception of Social Security in the mid-1930s." URL: https://www.aarp.org/retirement/social-security/questions-answers/how-are-SSNs-assigned.html

Current date: 12/17/2023

Instructions: Using the provided web search results, write a comprehensive reply to the given query. Make sure to cite results using [number] notation after the reference. If the provided search results refer to multiple subjects with the same name, write separate answers for each subject. Query: how to clean and standardize social security number
    """
res = session.chat_session.send_message(grounded_prompt)
print(res.text)

In [None]:
# Ask the same question through the session's own send_message wrapper (no
# hand-built search context) and print the returned value.
res = session.send_message("how to clean and standardize social security number")
print(res)

In [None]:
# Follow-up turn in the same session: report a failing example and request a
# rewrite, then display the returned value.
res = session.send_message("The code doesn't work on the last example. Rewrite the logic.")
res

In [None]:
import redis
import pickle
# Inspect the logged messages of one specific session straight from Redis.
session_id = "d70bd6b639a84753866836cb9ba7c110"
# Connect with the URL assembled from set-env (conn_str_redis); credentials
# must never be written inline in a notebook.
# NOTE(review): the Redis access key that was previously hard-coded here
# should be rotated.
cache = redis.Redis.from_url(conn_str_redis)
raw_messages = cache.get(f"session_logging_queue:{session_id}")
# NOTE(security): pickle.loads can execute arbitrary code; this is only
# acceptable while the cache is written exclusively by trusted services.
# A missing key (None) is treated as an empty message list.
messages = [] if raw_messages is None else pickle.loads(raw_messages)
messages