In [10]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
from ai_data_formatter.cleansing.model import PromptTextGenModel
from ai_data_formatter.config import ModelConfig, DBClient
import pandas as pd
import json
from sqlalchemy.engine import URL
from configparser import ConfigParser

config = ConfigParser()
with open("set-env") as stream:
    config.read_string("[DEFAULT]\n" + stream.read())  # This line does the trick.

with open("cleansing_prompt_template.json", "r") as f:
    model_configs = json.load(f)

pg_host=config['DEFAULT'].get("PG_HOST")
pg_uname=config['DEFAULT'].get("PG_UNAME")
pg_secret=config['DEFAULT'].get("PG_SECRET")
pg_db=config['DEFAULT'].get("PG_DB")
conn_str_alchemy = f"postgresql://{pg_uname}:{pg_secret}@{pg_host}/{pg_db}"
cache_secret=config['DEFAULT'].get("CACHE_SECRET")
cache_host=config['DEFAULT'].get("CACHE_HOST")
cache_port=config['DEFAULT'].get("CACHE_PORT")
conn_str_redis = f"redis://:{cache_secret}@{cache_host}:{cache_port}/0"
test_data = pd.read_csv("data/pii_org.csv")

import os
os.environ["GOOGLE_CLOUD_PROJECT"] = "docai-warehouse-demo"
# os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/zjia/Workspace/gen-ai-data-transformer/sa_token.json"

dbclient = DBClient.from_dict(
    {
        "db": {
            "url": conn_str_alchemy
        },
        "cache": {
            # "url": conn_str_redis,
            "expire_time_second": 120
        }
    }
)

In [16]:
import sys
import os
import logging
import math
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

import time
from tqdm import tqdm
from multiprocessing import cpu_count
from concurrent.futures import ThreadPoolExecutor, as_completed
from ratelimiter import RateLimiter
from pyspark.sql import SparkSession
from pyspark import SparkConf
import findspark
findspark.init(edit_rc=False)

rate_limiter = RateLimiter(max_calls=50, period=60)
config = SparkConf().setAll([('spark.sql.autoBroadcastJoinThreshold', '-1')])
spark = SparkSession.builder\
    .config("spark.driver.memory", "4g") \
    .appName('aidf-app') \
    .config(conf=config).getOrCreate()

def clean(column, tag):
    """
    Apply cleansing
    """
    model_config_spec = list(filter(lambda config: config.get("tag")==tag, model_configs))[0]
    model_config = ModelConfig.from_dict(model_config_spec)
    model = PromptTextGenModel(
        project_id="docai-warehouse-demo", 
        location="us-central1",
        model_config=model_config
    )
    if not spark:
        ## Parallel in single server with multiprocessing
        data = pd.read_csv("data/pii_org.csv")
        values = data[column].unique()

        parallem = cpu_count() - 1
        batch_size = round(math.ceil(len(values) / parallem), 0)
        batches = [values[(min((i)*batch_size, len(values)-1)):min((i+1)*batch_size, len(values)-1)] for i in range(0, parallem)]
        
        values_cln = []
        with ThreadPoolExecutor() as executor:
            futures = []
            for values in batches:
                with rate_limiter:
                    futures.append(
                        executor.submit(
                            # function name
                            model.batch_predict,
                            # parameters
                            values
                        )
                    )
            for f in tqdm(
                as_completed(futures),
                desc=f"Predicting the model output",
                total=len(futures),
            ):
                values_cln.append(list(f.result()))
        return pd.DataFrame([item for sublist in values_cln for item in sublist], columns=[column, f"{column}_cln"])
    
    else:
        from pyspark.sql.functions import col, pandas_udf
        from pyspark.sql.types import ArrayType, StringType
        ## Parallel in cluster using Apache Spark
        data = spark.read.format("csv").option("header", "true").load("data/pii_org.csv")
        
        @pandas_udf(ArrayType(StringType()))
        def cln_udf(inputs):
            return pd.Series(model.batch_predict(inputs))
        
        data = data.withColumn(f"{column}_cln", cln_udf(col(column)))
        return data.select(f"{column}_cln").toPandas()

In [18]:
spark = None
result = clean("r_org", "cleansing-race")

Predicting the model output: 100%|██████████| 7/7 [00:02<00:00,  2.57it/s]


In [14]:
def evaluate(pii):
    evaluation_data = test_data.merge(result, on=f"{pii}_org")
    print(evaluation_data.groupby(f"{pii}_dl").apply(lambda x: sum(x[pii]==x[f"{pii}_org_cln"])/x["id"].count()))
evaluate("r")

r_dl
-1    0.762152
 0    0.762609
 1    0.858779
dtype: float64


In [9]:
# result.to_csv("fn_org_cln.csv", index=False)