In [0]:
# Name of the database to scan, read from the notebook argument "par_database".
# It is used below in the USE statement and when building the fully-qualified table names.
par_database = getArgument("par_database")

In [0]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
from pyspark.sql.functions import collect_set,expr,col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [0]:
# Make the requested database current so the unqualified SHOW TABLES below lists its tables.
spark.sql(f"USE `{par_database}`")

# List the tables, drop the entry named `_sqldf`, and gather the distinct
# 'hive_metastore.<database>.<table>' names into a single Python list.
listed_tables = spark.sql("SHOW TABLES")
tables_without_sqldf = listed_tables.filter("tableName != '_sqldf'")
fqn_row = tables_without_sqldf.selectExpr(
    f"collect_set(concat('hive_metastore', '.', '{par_database}', '.', tableName)) as table_list"
).first()
table_list = fqn_row.table_list

In [0]:
# Lock guarding the shared row buffer: describe_and_accumulate holds it while
# extending, flushing, and resetting `accumulated_rows`.
write_lock = threading.Lock()

# Shared buffer of collected rows awaiting a write; it is flushed to the target
# table once it holds at least `batch_size` rows.
accumulated_rows = []
batch_size = 100

# Maximum number of worker threads given to the ThreadPoolExecutor.
concurrency_level = 25

In [0]:
def describe_table(table_fqn):
  """Summarise the jobs recorded in a table's history, one row per operation/job.

  Runs DESCRIBE HISTORY (limited to 100 entries) on the table, drops the
  FSCK, CONVERT, OPTIMIZE, CLONE, RESTORE and VACUUM operations, expands the
  `job` struct, and returns the latest timestamp for each distinct
  (operation, jobId, jobName) combination.

  Args:
    table_fqn: Table name of the form "catalog.database.table". Only the
      database and table parts are used in the query; the catalog part is
      ignored, so the table is resolved against the session's current catalog.

  Returns:
    The DataFrame produced by spark.sql with the columns database, tableName,
    operation, jobId, jobName and timestamp.

  Raises:
    ValueError: If `table_fqn` does not split into exactly three dot-separated parts.
  """
  _catalog, database, tableName = table_fqn.split('.')

  # Backtick-quote both identifiers (doubling any embedded backtick), matching the
  # quoting already applied to the database in the USE statement. Without it, names
  # containing characters such as '-' make DESCRIBE HISTORY fail to parse.
  quoted_table = ".".join(
      "`" + part.replace("`", "``") + "`" for part in (database, tableName)
  )

  # The aggregate is aliased so the result column is named `timestamp`, the same
  # name the write schema uses, instead of the generated `max(timestamp)`.
  return spark.sql(f"""
            WITH history AS (
                DESCRIBE HISTORY {quoted_table} LIMIT 100
            ),
            job_exp AS (
                SELECT operation, job.*, timestamp FROM history where operation not in ("FSCK","CONVERT","OPTIMIZE","CLONE","RESTORE","VACUUM")
            ),
            dist_job AS (
                SELECT DISTINCT operation, jobId, jobName, timestamp FROM job_exp
            )
            SELECT '{database}' AS database, '{tableName}' AS tableName, operation, jobId, jobName, max(timestamp) AS timestamp FROM dist_job group by all
        """)

In [0]:
def write_to_table(rows):
  """Append `rows` to the table dataops_prd.migration.tables_jobs_temp.

  Args:
    rows: Records with six values each, in the order database, tableName,
      operation, jobId, jobName, timestamp.
  """
  column_names = ("database", "tableName", "operation", "jobId", "jobName", "timestamp")

  # Every column is declared as a nullable string.
  string_schema = StructType(
      [StructField(name, StringType(), True) for name in column_names]
  )

  batch_df = spark.createDataFrame(rows, schema=string_schema)
  appender = batch_df.write.mode("append")
  appender.saveAsTable("dataops_prd.migration.tables_jobs_temp")

In [0]:
def describe_and_accumulate(table_fqn):
    """Collect one table's history summary into the shared buffer, flushing when full.

    Calls describe_table, casts every result column to string, collects the
    rows, and appends them to the module-level `accumulated_rows` while holding
    `write_lock`. Once the buffer holds at least `batch_size` rows it is written
    with write_to_table and replaced by a fresh empty list.

    Args:
        table_fqn: Table name passed straight through to describe_table.
    """
    global accumulated_rows

    history_df = describe_table(table_fqn)

    # Cast all columns to string in one projection, keeping names and order.
    string_columns = [
        col(name).cast("string").alias(name) for name in history_df.columns
    ]
    collected = history_df.select(*string_columns).collect()

    # The buffer is shared between worker threads, so touch it only under the lock.
    with write_lock:
        accumulated_rows.extend(collected)
        if len(accumulated_rows) < batch_size:
            return
        write_to_table(accumulated_rows)
        accumulated_rows = []

In [0]:
# Fan the per-table history lookups out across a pool of worker threads.
with ThreadPoolExecutor(max_workers=concurrency_level) as executor:
    # Map each submitted future back to its table name for error reporting.
    future_to_table = {executor.submit(describe_and_accumulate, table): table for table in table_list}

    # Handle tasks as they finish; a failure on one table must not stop the rest.
    for future in as_completed(future_to_table):
        item = future_to_table[future]
        try:
            future.result()  # Re-raises any exception raised inside the worker
        except Exception as exc:
            # Best-effort run: keep going, but report the cause so the failure can be diagnosed.
            print(f'{item} generated an exception: {exc!r}')

# Write any remaining rows, then empty the buffer so that re-running this cell
# does not append the same leftover rows a second time.
if accumulated_rows:
    write_to_table(accumulated_rows)
    accumulated_rows = []