In [None]:
from pyhive import hive
import psycopg2
import pandas as pd
import time
import math

# Connection to the LIMS data mart (PostgreSQL).
# The password is read from the environment so that no credential is stored in
# the notebook. Database, user, host and port keep their previous values as
# defaults and can be overridden through the environment as well.
import os

_mart_password = os.environ.get("LIMS_MART_PASSWORD")
if not _mart_password:
    raise RuntimeError(
        "Set the LIMS_MART_PASSWORD environment variable before running this notebook."
    )

conn_mart = psycopg2.connect(
    dbname=os.environ.get("LIMS_MART_DBNAME", "LIMSDATA"),
    user=os.environ.get("LIMS_MART_USER", "postgres"),
    password=_mart_password,
    host=os.environ.get("LIMS_MART_HOST", "127.0.0.1"),
    port=int(os.environ.get("LIMS_MART_PORT", "5431")),
)

# Autocommit: every statement issued through cur_mart is committed immediately,
# so the loaders below never call commit().
conn_mart.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
cur_mart = conn_mart.cursor()

# Function to establish a Hive connection
def create_hive_connection(host, port, username, password, database, auth_mode):
    """Open a Hive connection with the given settings.

    Returns the connection object, or None when the connection could not be
    created (the error is printed, not raised).
    """
    connection_settings = dict(
        host=host,
        port=port,
        username=username,
        password=password,
        database=database,
        auth=auth_mode,
    )
    try:
        return hive.Connection(**connection_settings)
    except Exception as connect_error:
        print(f"Error creating connection: {connect_error}")
    return None

# Function to execute a query and fetch results
def execute_hive_query(conn, query):
    """Run ``query`` on ``conn`` and return ``(rows, column_names)``.

    ``rows`` is whatever ``cursor.fetchall()`` returns and ``column_names`` is
    the first element of every entry in ``cursor.description``.

    On any error the message is printed and ``None`` is returned, so callers
    must check the result before unpacking it.
    """
    # Bind the name up front: if conn.cursor() itself fails, the finally
    # clause must not trip over an unbound variable and mask the real error.
    cursor = None
    try:
        cursor = conn.cursor()
        cursor.execute(query)
        columns = [desc[0] for desc in cursor.description]
        results = cursor.fetchall()
        return results, columns
    except Exception as e:
        print(f"Error executing query: {e}")
        return None
    finally:
        if cursor is not None:
            cursor.close()


# Function to get all data from the fact_lab_request_orders
def get_all_patient_data(conn):
    """Fetch lab request orders for facility ZW090A17 authored on or after 2025-02-18.

    Returns whatever ``execute_hive_query`` returns for the query.
    """
    # Both literals must be quoted: unquoted, 2025-02-18 is evaluated as the
    # integer expression 2025 - 2 - 18 and ZW090A17 is parsed as an identifier.
    query = (
        "SELECT * FROM fact_lab_request_orders "
        "where cast(task_authored_on as date) >= '2025-02-18' "
        "AND encounter_facility_id = 'ZW090A17'"
    )
    return execute_hive_query(conn, query)

# Function to get new or updated patient records (incremental pull)
def get_new_patient_data(conn, last_timestamp, facility_id='ZW090A17'):
    """Fetch lab request orders changed after ``last_timestamp``.

    Args:
        conn: Open Hive connection.
        last_timestamp: Only rows with ``last_updated`` greater than this value
            are returned.
        facility_id: ``encounter_facility_id`` to restrict the pull to. The
            default matches the facility used by the batch load in this file,
            so polling and the initial load look at the same rows.

    Returns:
        Whatever ``execute_hive_query`` returns; rows are ordered by
        ``last_updated`` ascending.
    """
    # The values are interpolated straight into the SQL text, so only trusted,
    # program-generated values may be passed in.
    query = (
        f"SELECT * FROM fact_lab_request_orders WHERE last_updated > '{last_timestamp}' "
        f"and encounter_facility_id = '{facility_id}' order by last_updated asc"
    )
    return execute_hive_query(conn, query)

# Function to close the Hive connection
def close_hive_connection(conn):
    """Close ``conn``; a failure while closing is printed and otherwise ignored."""
    try:
        conn.close()
    except Exception as close_error:
        failure_message = f"Error closing connection: {close_error}"
        print(failure_message)

# Polling logic to listen for new or updated records
def listen_for_changes(conn, last_timestamp, polling_interval=300):
    """Poll Hive for rows changed after ``last_timestamp`` until an error occurs.

    Every ``polling_interval`` seconds the new rows are fetched and printed,
    and the watermark is moved to the ``last_updated`` value of the last row
    (rows arrive ordered by ``last_updated`` ascending). The loop ends, and
    the function returns, as soon as a poll fails.

    Args:
        conn: Open Hive connection.
        last_timestamp: Initial watermark.
        polling_interval: Seconds to sleep between polls.
    """
    while True:
        try:
            print(f"Polling for changes after {last_timestamp}...")
            fetched = get_new_patient_data(conn, last_timestamp)

            # A failed query is reported as None (or as a (None, None) pair).
            if not fetched or fetched[0] is None:
                print("Error during polling: query failed")
                break

            rows, columns = fetched

            if rows:
                print("New/Updated records found:")
                # Locate the watermark column by name (Hive may prefix it with
                # the table name); fall back to the last column if not found.
                timestamp_index = next(
                    (
                        position
                        for position, name in enumerate(columns or [])
                        if str(name).rsplit(".", 1)[-1] == "last_updated"
                    ),
                    -1,
                )
                for row in rows:
                    print(row)
                    last_timestamp = row[timestamp_index]
            else:
                print("No new records found.")

            # Wait for the polling interval before checking again
            time.sleep(polling_interval)
        except Exception as e:
            print(f"Error during polling: {e}")
            break  # Exit polling loop if an error occurs

def check_if_snapshot_done():
    """Return ``dw_date_created`` of the most recently loaded mart row, or None.

    "Most recently loaded" is decided by ``dm_date_created``. None means the
    mart table returned no row at all.
    """
    cur_mart.execute("SELECT dw_date_created FROM marts.dm_lab_request_orders order by dm_date_created desc limit 1") 
    latest_row = cur_mart.fetchone()
    return latest_row[0] if latest_row else None

def get_all_patient_data_in_batches(conn, batch_size=50000):
    """Load lab request orders for facility ZW090A17 from Hive into the mart.

    Starting from the watermark returned by ``check_if_snapshot_done()`` (or
    '1900-01-01 00:00:00' when the mart is empty), rows of
    ``fact_lab_request_orders`` are fetched ordered by ``last_updated``,
    ``batch_size`` rows at a time. Each row is inserted into
    ``marts.dm_lab_request_orders`` or, when a row with the same
    ``encounter_id`` (the Hive ``task_id``) already exists, updated in place.

    Args:
        conn: Open Hive connection.
        batch_size: Maximum number of rows fetched per Hive query.

    Returns:
        The highest ``last_updated`` value that was processed, or the starting
        watermark when there was nothing to load.

    Raises:
        RuntimeError: When a Hive query fails, so the caller's retry logic
            can reconnect instead of mistaking the failure for "no data".
    """
    batch_number = 1
    snapshot_date = check_if_snapshot_done()

    if snapshot_date:
        last_processed_value = snapshot_date  # resume from the last loaded source timestamp
        print(">>> Detected Snapshot done", last_processed_value)
    else:
        last_processed_value = '1900-01-01 00:00:00'  # start of time: full initial load
        print(">>> Initial snapshot")

    print("...................................................")

    # One summary query for everything that is about to be loaded.
    query = f"""
    SELECT 
        COUNT(*) AS total_records, 
        COUNT(DISTINCT encounter_facility_id) AS total_facilities, 
        COUNT(DISTINCT patient_id) AS distinct_people, 
        COUNT(DISTINCT encounter_id) AS distinct_encounters, 
        MAX(last_updated) AS last_update_time
    FROM fact_lab_request_orders where last_updated > '{last_processed_value}' and encounter_facility_id = 'ZW090A17'
    """
    summary = execute_hive_query(conn, query)
    if not summary or not summary[0]:
        raise RuntimeError("Summary query against fact_lab_request_orders failed")
    result = summary[0][0]

    print("Total Number of Records to be fetched:", result[0])
    print("Total Number of Facilities:", result[1])
    print("Number of Distinct People:", result[2])
    print("Number of Distinct Encounters:", result[3])
    print("Last time update:", result[4])

    # Use the actual batch size, not a hard-coded 50000.
    print("Total Number of Batches ", str(math.ceil(int(result[0]) / batch_size)))

    # Both statements take the same 20 parameters in the same order:
    # the 19 column values below followed by encounter_id.
    update_query = """
    UPDATE marts.dm_lab_request_orders
    SET event_date = %s, dedupe_id = %s, lab_request_number = %s, birthdate = %s, gender = %s,
        shr_date = %s, impilo_registration_date = %s, date_sample_taken = %s, lab_order_status = %s,
        status_reason = %s, note = %s, sample_code = %s, sample_type = %s, test_type = %s,
        facility_id_code = %s, lab = %s, test_results = %s, dw_date_created = %s,
        person_id = %s, dm_date_created = NOW()
    WHERE encounter_id = %s
    """
    insert_query = """
    INSERT INTO marts.dm_lab_request_orders (
        event_date, dedupe_id, lab_request_number, birthdate, gender, shr_date,
        impilo_registration_date, date_sample_taken, lab_order_status, status_reason, note, sample_code,
        sample_type, test_type, facility_id_code, lab, test_results, dw_date_created,
        person_id, dm_date_created, encounter_id
    ) VALUES (%s, %s, %s, %s, %s, %s,
              %s, %s, %s, %s, %s, %s,
              %s, %s, %s, %s, %s, %s,
              %s, NOW(), %s)
    """

    while True:
        print("..................................................")
        print("working on Batch ", batch_number)

        # NOTE(review): paging on "last_updated > watermark" skips rows that
        # share the exact last_updated value of the last row of a full batch.
        query = f"SELECT * FROM fact_lab_request_orders where last_updated > '{last_processed_value}' and encounter_facility_id = 'ZW090A17'  order by last_updated asc LIMIT {batch_size}"

        fetched = execute_hive_query(conn, query)
        if not fetched or fetched[0] is None:
            raise RuntimeError(f"Batch {batch_number} query against fact_lab_request_orders failed")
        batch_data, columns = fetched

        if not batch_data:  # No more data
            print(f"All data fetched. Total batches: {batch_number - 1}")
            break

        print(f"Batch {batch_number}: Fetched {len(batch_data)} rows.")

        df = pd.DataFrame(batch_data, columns=columns)
        print("Shape of dataframe created ", df.shape)

        # ')' is meant literally; without regex=False it may be read as a pattern.
        df['test_type'] = df['test_type'].str.replace(')', '', regex=False)
        df.rename(columns={'patient_id': 'person_id',
                           'encounter_facility_id': 'facility_id_code',
                           'birth_date': 'birthdate',
                           'last_updated': 'date_created',
                           'result': 'test_results',
                           'task_authored_on': 'shr_date',
                           'task_execution_start_date': 'impilo_registration_date',
                           'task_status': 'lab_order_status'}, inplace=True)

        df['event_date'] = df['shr_date']
        print(df.shape)

        for _, row in df.iterrows():
            # The mart keys its rows on the Hive task id.
            encounter_id = row['task_id']
            values = (
                row['event_date'], row['dedupe_id'], row['lab_request_number'], row['birthdate'],
                row['gender'], row['shr_date'], row['impilo_registration_date'],
                row['date_sample_taken'], row['lab_order_status'], row['status_reason'],
                row['note'], row['sample_code'], row['sample_type'], row['test_type'],
                row['facility_id_code'], row['lab'], row['test_results'], row['date_created'],
                row['person_id'], encounter_id,
            )

            cur_mart.execute("SELECT 1 FROM marts.dm_lab_request_orders WHERE encounter_id = %s", (encounter_id,))
            if cur_mart.fetchone():
                print(f"Updating record for encounter_id: {encounter_id}")
                cur_mart.execute(update_query, values)
            else:
                print(f"Inserting new record for encounter_id: {encounter_id}")
                cur_mart.execute(insert_query, values)

        # Move the watermark to the latest source timestamp in this batch.
        last_processed_value = df['date_created'].max()
        print("max", last_processed_value)
        print("min", df['date_created'].min())
        batch_number += 1
    return last_processed_value


# Main function 
def main():
    """Run the initial batch load and then poll Hive for changes, with retries.

    Hive connection settings come from the environment (HIVE_HOST, HIVE_PORT,
    HIVE_USERNAME, HIVE_PASSWORD, HIVE_DATABASE, HIVE_AUTH_MODE). Only the
    password has no default, so no credential is stored in the notebook.

    Each attempt opens a connection, optionally runs the full batch load,
    then polls for changes. A failed connection or any exception counts as a
    failed attempt and is retried after ``retry_delay`` seconds, up to
    ``max_retries`` attempts. The connection is always closed before the next
    attempt.
    """
    import os  # local import: only needed to read the connection settings

    host = os.environ.get("HIVE_HOST", "197.221.242.150")
    port = int(os.environ.get("HIVE_PORT", "17251"))
    username = os.environ.get("HIVE_USERNAME", "tnhema")
    password = os.environ.get("HIVE_PASSWORD")
    database = os.environ.get("HIVE_DATABASE", "default")
    auth_mode = os.environ.get("HIVE_AUTH_MODE", "LDAP")

    if not password:
        print("HIVE_PASSWORD is not set. Exiting.")
        return

    # Variable to control initial full load or change listening
    start = "on"  # Set to "on" for initial load or "off" to only listen for changes

    # Retry mechanism variables
    retry_attempts = 0
    max_retries = 5000000  # Maximum number of retries before stopping
    retry_delay = 10  # Delay between retries in seconds
    last_processed_value = '1900-01-01 00:00:00'

    while max_retries is None or retry_attempts < max_retries:
        conn = None
        try:
            print(f"Attempting to connect (Attempt {retry_attempts + 1})...")

            conn = create_hive_connection(host, port, username, password, database, auth_mode)
            if not conn:
                # create_hive_connection reports failure by returning None;
                # turn that into a retry instead of a silent "success".
                raise ConnectionError("could not connect to Hive")

            if start == "on":
                print("Initial load: Fetching all data from the fact_lab_request_orders table in batches of 50000.")
                last_processed_value = get_all_patient_data_in_batches(conn, batch_size=50000)

            # After initial load, listen for changes (or if start is "off")
            print("Listening for changes in the fact_lab_request_orders table...")
            listen_for_changes(conn, last_processed_value, polling_interval=300)

            break  # Polling returned without raising: stop retrying

        except Exception as e:
            retry_attempts += 1
            if max_retries is not None and retry_attempts >= max_retries:
                print(f"An error occurred: {e}.")
                print(f"Max retries reached ({max_retries}). Exiting.")
                break

            print(f"An error occurred: {e}. Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)

        finally:
            # Close on every path so failed attempts do not leak connections.
            if conn:
                close_hive_connection(conn)

# Run the main function
if __name__ == "__main__":
    main()


Attempting to connect (Attempt 1)...
Initial load: Fetching all data from the fact_hts_dedup table in batches of 50000.
>>> Initial snapshot
...................................................
Total Number of Records to be fetched: 9300
Total Number of Facilities: 1
Number of Distinct People: 4137
Number of Distinct Encounters: 5039
Last time update: 2025-09-24T09:58:00.022+00:00
Total Number of Batches  1
..................................................
working on Batch  1


In [None]:
from pyhive import hive
import psycopg2
import pandas as pd
import time
import math
import logging
from datetime import datetime

# Configure root logging once for the notebook (INFO and above, timestamped),
# then create the module-level logger used by every function below.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

def safe_convert_value(value):
    """Convert a pandas/Python value into a plain value for a SQL parameter.

    * Series / Index: the first element is converted (None when empty).
    * Missing scalars (None, NaN, NaT, pd.NA): None.
    * Timestamps / datetimes: 'YYYY-MM-DD HH:MM:SS' string.
    * Anything else: ``str(value)``.
    """
    # Containers must be handled before pd.isna(): on a Series or Index,
    # pd.isna() is element-wise and its result cannot be used in an ``if``.
    if isinstance(value, pd.Series):
        return safe_convert_value(value.iloc[0]) if len(value) > 0 else None
    if isinstance(value, pd.Index):
        # An Index has no .iloc; plain positional indexing is the equivalent.
        return safe_convert_value(value[0]) if len(value) > 0 else None
    # Restrict the missing-value test to scalars so lists/arrays fall through
    # to str() instead of raising on an ambiguous truth value.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None
    if isinstance(value, (pd.Timestamp, datetime)):
        return value.strftime('%Y-%m-%d %H:%M:%S')
    return str(value)

# Connection to the LIMS data mart (PostgreSQL).
# The password is read from the environment so that no credential is stored in
# the notebook. Database, user, host and port keep their previous values as
# defaults and can be overridden through the environment as well.
import os

_mart_password = os.environ.get("LIMS_MART_PASSWORD")
if not _mart_password:
    raise RuntimeError(
        "Set the LIMS_MART_PASSWORD environment variable before running this notebook."
    )

conn_mart = psycopg2.connect(
    dbname=os.environ.get("LIMS_MART_DBNAME", "LIMSDATA"),
    user=os.environ.get("LIMS_MART_USER", "postgres"),
    password=_mart_password,
    host=os.environ.get("LIMS_MART_HOST", "127.0.0.1"),
    port=int(os.environ.get("LIMS_MART_PORT", "5431")),
)
# Autocommit: every statement issued through cur_mart is committed immediately.
conn_mart.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
cur_mart = conn_mart.cursor()

def create_hive_connection(host, port, username, password, database, auth_mode):
    """Open a Hive connection with the given settings.

    Success and failure are both logged; on failure None is returned instead
    of raising.
    """
    connection_settings = dict(
        host=host,
        port=port,
        username=username,
        password=password,
        database=database,
        auth=auth_mode,
    )
    try:
        connection = hive.Connection(**connection_settings)
        logger.info("Hive connection established successfully")
    except Exception as connect_error:
        logger.error(f"Error creating Hive connection: {connect_error}")
        return None
    return connection

def execute_hive_query(conn, query):
    """Run ``query`` on ``conn`` and return ``(rows, column_names)``.

    When anything goes wrong the error and the query text are logged and
    ``(None, None)`` is returned. The cursor is closed in every case.
    """
    cursor = None
    try:
        cursor = conn.cursor()
        cursor.execute(query)
        column_names = []
        for description in cursor.description:
            column_names.append(description[0])
        rows = cursor.fetchall()
    except Exception as query_error:
        logger.error(f"Error executing query: {query_error}")
        logger.error(f"Query: {query}")
        return None, None
    else:
        return rows, column_names
    finally:
        if cursor:
            cursor.close()

def get_new_patient_data(conn, last_timestamp, facility_id='ZW090A17'):
    """Fetch lab request orders changed after ``last_timestamp``.

    Args:
        conn: Open Hive connection.
        last_timestamp: Only rows with ``last_updated`` greater than this value
            are returned.
        facility_id: ``encounter_facility_id`` to restrict the pull to;
            defaults to the facility this loader has always used.

    Returns:
        The ``(rows, column_names)`` result of ``execute_hive_query``; rows
        are ordered by ``last_updated`` ascending.
    """
    # The values are interpolated straight into the SQL text (no bound
    # parameters), so only trusted, program-generated values may be passed in.
    query = f"""
    SELECT * FROM fact_lab_request_orders 
    WHERE last_updated > '{last_timestamp}' 
    AND encounter_facility_id = '{facility_id}'
    ORDER BY last_updated ASC
    """
    return execute_hive_query(conn, query)

def close_hive_connection(conn):
    """Close ``conn`` if there is one; a failure while closing is only logged."""
    try:
        if not conn:
            return
        conn.close()
        logger.info("Hive connection closed")
    except Exception as close_error:
        logger.error(f"Error closing connection: {close_error}")

def check_if_snapshot_done():
    """Return the newest ``dm_date_created`` stored in the mart, or None.

    None is returned both when the table yields no row and when the lookup
    fails (the failure is logged).
    """
    # NOTE(review): the loader in this file writes dm_date_created as NOW(),
    # i.e. load time, while the caller compares the returned value against the
    # source's last_updated. Confirm that this is the intended watermark.
    try:
        cur_mart.execute("""
            SELECT dm_date_created 
            FROM marts.dm_lab_request_orders 
            ORDER BY dm_date_created DESC 
            LIMIT 1
        """) 
        latest_row = cur_mart.fetchone()
        if not latest_row:
            logger.info("No previous snapshot found")
            return None
        logger.info(f"Found last snapshot date: {latest_row[0]}")
        return latest_row[0]
    except Exception as lookup_error:
        logger.error(f"Error checking snapshot status: {lookup_error}")
        return None

def get_all_patient_data_in_batches(conn, batch_size=50000):
    """Copy new lab request orders for facility ZW090A17 from Hive into the mart.

    Starting from the watermark returned by ``check_if_snapshot_done()`` (or
    '2025-01-01 00:00:00' when there is none), rows of
    ``fact_lab_request_orders`` are fetched ordered by ``last_updated``,
    ``batch_size`` rows at a time, and upserted into
    ``marts.dm_lab_request_orders``. The mart's ``encounter_id`` is the Hive
    ``task_id``. Row-level failures are logged and counted; they do not stop
    the batch.

    Args:
        conn: Open Hive connection.
        batch_size: Maximum number of rows fetched per Hive query.

    Returns:
        The highest ``last_updated`` value processed, or the starting
        watermark when nothing was loaded or processing stopped early.
    """
    batch_number = 1
    total_processed = 0
    snapshot_date = check_if_snapshot_done()

    if snapshot_date:
        # The mart may hand back a datetime or an already formatted string.
        if hasattr(snapshot_date, 'strftime'):
            last_processed_value = snapshot_date.strftime('%Y-%m-%d %H:%M:%S')
        else:
            last_processed_value = str(snapshot_date)
        logger.info(f">>> Detected Snapshot done: {last_processed_value}")
    else:
        last_processed_value = '2025-01-01 00:00:00'
        logger.info(">>> Initial snapshot")

    logger.info("=" * 50)

    # Fail fast when either side is unreachable.
    if not execute_hive_query(conn, "SELECT 1 as test_value")[0]:
        logger.error("✗ Hive connection test failed")
        return last_processed_value
    logger.info("✓ Hive connection test successful")

    try:
        cur_mart.execute("SELECT 1 as test_value")
        if not cur_mart.fetchone():
            logger.error("✗ Data mart connection test failed")
            return last_processed_value
        logger.info("✓ Data mart connection test successful")
    except Exception as e:
        logger.error(f"✗ Data mart connection error: {e}")
        return last_processed_value

    count_query = f"""
    SELECT 
        COUNT(*) AS total_records, 
        COUNT(DISTINCT encounter_facility_id) AS total_facilities, 
        COUNT(DISTINCT patient_id) AS distinct_people, 
        COUNT(DISTINCT encounter_id) AS distinct_encounters, 
        MAX(last_updated) AS last_update_time
    FROM fact_lab_request_orders 
    WHERE last_updated > '{last_processed_value}' 
    AND encounter_facility_id = 'ZW090A17'
    """

    logger.info(f"DEBUG: Executing count query: {count_query}")
    count_rows = execute_hive_query(conn, count_query)[0]

    if not count_rows:
        logger.error("✗ Failed to get count data from Hive")
        return last_processed_value

    result = count_rows[0]
    total_records = int(result[0])

    logger.info("✓ Count query successful")
    logger.info(f"Total Number of Records to be fetched: {total_records}")
    logger.info(f"Total Number of Facilities: {result[1]}")
    logger.info(f"Number of Distinct People: {result[2]}")
    logger.info(f"Number of Distinct Encounters: {result[3]}")
    logger.info(f"Last time update: {result[4]}")
    logger.info(f"Total Number of Batches: {math.ceil(total_records / batch_size)}")

    if total_records == 0:
        logger.info("⚠ No new records to process - this might be why no data is entering")
        # Show what the table holds so the watermark can be sanity-checked.
        debug_query = """
        SELECT COUNT(*) as total_in_table,
               MIN(last_updated) as earliest_date,
               MAX(last_updated) as latest_date
        FROM fact_lab_request_orders 
        WHERE encounter_facility_id = 'ZW090A17'
        """
        logger.info(f"DEBUG: Checking overall data availability: {debug_query}")
        debug_rows = execute_hive_query(conn, debug_query)[0]
        if debug_rows:
            debug_data = debug_rows[0]
            logger.info(f"DEBUG: Total records in table for facility: {debug_data[0]}")
            logger.info(f"DEBUG: Earliest date: {debug_data[1]}")
            logger.info(f"DEBUG: Latest date: {debug_data[2]}")
            logger.info(f"DEBUG: Current filter date: {last_processed_value}")
        return last_processed_value

    while True:
        logger.info("=" * 50)
        logger.info(f"Working on Batch {batch_number}")

        # NOTE(review): paging on "last_updated > watermark" skips rows that
        # share the exact last_updated value of the last row of a full batch.
        batch_query = f"""
        SELECT * FROM fact_lab_request_orders 
        WHERE last_updated > '{last_processed_value}' 
        AND encounter_facility_id = 'ZW090A17'
        ORDER BY last_updated ASC 
        LIMIT {batch_size}
        """

        logger.info(f"DEBUG: Executing batch query: {batch_query[:200]}...")
        batch_data, columns = execute_hive_query(conn, batch_query)

        if not batch_data:
            logger.info(f"All data fetched. Total batches processed: {batch_number - 1}")
            logger.info(f"Total records processed: {total_processed}")
            break

        logger.info(f"✓ Batch {batch_number}: Fetched {len(batch_data)} rows from Hive")
        logger.info(f"DEBUG: First row sample: {batch_data[0][:5]}")
        logger.info(f"DEBUG: Columns: {columns[:10] if columns else 'No columns'}")

        try:
            df = pd.DataFrame(batch_data, columns=columns)
            logger.info(f"✓ DataFrame created - Shape: {df.shape}")

            df = _prepare_lab_order_frame(df)
            if df is None:
                break
            logger.info(f"✓ After transformation shape: {df.shape}")
            logger.info(f"DEBUG: Sample encounter_id values: {df['encounter_id'].head(3).tolist()}")

            successful_inserts = 0
            successful_updates = 0
            errors = 0

            for idx, row in df.iterrows():
                try:
                    raw_id = row['encounter_id']
                    encounter_id = str(raw_id) if pd.notna(raw_id) else None

                    if not encounter_id or encounter_id == 'nan':
                        logger.warning(f"⚠ Skipping row {idx} - empty encounter_id")
                        continue

                    action = _upsert_lab_order(row, encounter_id)
                    if action == 'insert':
                        successful_inserts += 1
                    else:
                        successful_updates += 1

                    if idx < 3:  # Log only the first few rows in detail
                        logger.info(f"DEBUG: {action} succeeded for encounter_id: {encounter_id}")

                except Exception as e:
                    errors += 1
                    logger.error(f"✗ Error processing row {idx}: {e}")
                    if idx < 5:  # Show detailed error for first few rows
                        logger.error(f"Encounter ID type: {type(row.get('encounter_id'))}")
                        logger.error(f"Row keys: {list(row.keys())}")
                    continue

            logger.info(f"✓ Batch {batch_number} completed:")
            logger.info(f"  - Successful inserts: {successful_inserts}")
            logger.info(f"  - Successful updates: {successful_updates}")
            logger.info(f"  - Errors: {errors}")
            logger.info(f"  - Total processed in this batch: {successful_inserts + successful_updates}")

            total_processed += successful_inserts + successful_updates

            # Cross-check against the mart that rows really arrived.
            if successful_inserts > 0 or successful_updates > 0:
                try:
                    cur_mart.execute("""
                        SELECT COUNT(*) FROM marts.dm_lab_request_orders 
                        WHERE dm_date_created::date = CURRENT_DATE
                    """)
                    today_count = cur_mart.fetchone()
                    logger.info(f"DEBUG: Total records in mart created today: {today_count[0] if today_count else 'Unknown'}")
                except Exception as e:
                    logger.warning(f"Could not verify today's inserts: {e}")

            # Advance the watermark. If it cannot advance, stop: looping
            # would fetch exactly the same batch again, forever.
            if df['date_created'].isna().all():
                logger.error("✗ Could not update last_processed_value - date_created is empty; stopping")
                break
            new_last_value = df['date_created'].max()
            logger.info(f"✓ Updated last_processed_value from {last_processed_value} to {new_last_value}")
            last_processed_value = new_last_value

            batch_number += 1

        except Exception as e:
            logger.error(f"✗ Error processing batch {batch_number}: {e}")
            logger.error(f"Exception details: {type(e).__name__}: {str(e)}")
            break

    return last_processed_value


# Hive column name -> name used for the mart load.
_LAB_ORDER_RENAMES = {
    'patient_id': 'person_id',
    'encounter_facility_id': 'facility_id_code',
    'birth_date': 'birthdate',
    'last_updated': 'date_created',
    'result': 'test_results',
    'task_authored_on': 'shr_date',
    'task_execution_start_date': 'impilo_registration_date',
    'task_status': 'lab_order_status',
    'task_id': 'encounter_id',
}

# Frame columns supplying the 19 value parameters, in the order expected by
# both SQL statements below (encounter_id is appended as the 20th).
_LAB_ORDER_VALUE_COLUMNS = (
    'event_date', 'dedupe_id', 'lab_request_number', 'birthdate', 'gender',
    'shr_date', 'impilo_registration_date', 'date_sample_taken',
    'lab_order_status', 'status_reason', 'note', 'sample_code',
    'sample_type', 'test_type', 'facility_id_code', 'lab', 'test_results',
    'date_created', 'person_id',
)

_LAB_ORDER_UPDATE_SQL = """
UPDATE marts.dm_lab_request_orders
SET event_date = %s, dedupe_id = %s, lab_request_number = %s,
    birthdate = %s, gender = %s, shr_date = %s,
    impilo_registration_date = %s, date_sample_taken = %s,
    lab_order_status = %s, status_reason = %s, note = %s,
    sample_code = %s, sample_type = %s, test_type = %s,
    facility_id_code = %s, lab = %s, test_results = %s,
    dw_date_created = %s, person_id = %s, dm_date_created = NOW()
WHERE encounter_id = %s
"""

_LAB_ORDER_INSERT_SQL = """
INSERT INTO marts.dm_lab_request_orders (
    event_date, dedupe_id, lab_request_number, birthdate, gender,
    shr_date, impilo_registration_date, date_sample_taken,
    lab_order_status, status_reason, note, sample_code,
    sample_type, test_type, facility_id_code, lab, test_results,
    dw_date_created, person_id, dm_date_created, encounter_id
) VALUES (%s, %s, %s, %s, %s, %s,
          %s, %s, %s, %s, %s, %s,
          %s, %s, %s, %s, %s, %s,
          %s, NOW(), %s)
"""


def _prepare_lab_order_frame(df):
    """Clean and rename a raw Hive batch; return None when key columns are missing."""
    required_cols = ['task_id', 'patient_id', 'last_updated']
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        logger.error(f"✗ Missing required columns: {missing_cols}")
        logger.info(f"Available columns: {list(df.columns)}")
        return None

    if 'test_type' in df.columns:
        df['test_type'] = df['test_type'].str.replace(')', '', regex=False)

    existing_renames = {old: new for old, new in _LAB_ORDER_RENAMES.items() if old in df.columns}
    missing_renames = {old: new for old, new in _LAB_ORDER_RENAMES.items() if old not in df.columns}
    if missing_renames:
        logger.warning(f"⚠ Missing columns for renaming: {missing_renames}")

    # The source already has its own encounter_id column. Renaming task_id
    # onto it would leave two columns with the same label, and
    # row['encounter_id'] would then be a Series instead of a scalar. Drop
    # any column a rename is about to replace.
    clashes = [new for new in existing_renames.values() if new in df.columns]
    if clashes:
        logger.warning(f"⚠ Dropping source columns replaced by renamed ones: {clashes}")
        df = df.drop(columns=clashes)

    df = df.rename(columns=existing_renames)
    logger.info(f"✓ Renamed columns: {existing_renames}")

    df['event_date'] = df.get('shr_date')
    return df


def _upsert_lab_order(row, encounter_id):
    """Insert or update one mart row; return 'insert' or 'update'."""
    values = tuple(safe_convert_value(row.get(col)) for col in _LAB_ORDER_VALUE_COLUMNS)
    params = values + (encounter_id,)

    cur_mart.execute(
        "SELECT 1 FROM marts.dm_lab_request_orders WHERE encounter_id = %s",
        (encounter_id,),
    )
    if cur_mart.fetchone():
        cur_mart.execute(_LAB_ORDER_UPDATE_SQL, params)
        return 'update'

    cur_mart.execute(_LAB_ORDER_INSERT_SQL, params)
    return 'insert'

def listen_for_changes(conn, last_timestamp, polling_interval=300):
    """Poll the Hive database for changes at a regular interval, forever.

    Each cycle calls ``get_new_patient_data(conn, last_timestamp)``. When
    rows come back, ``last_timestamp`` is advanced to the largest non-null
    value found in the last column of those rows. The rows themselves are
    only counted and logged here; nothing is written to the data mart.

    Args:
        conn: Hive connection handed to ``get_new_patient_data``.
        last_timestamp: Watermark used for the first poll.
        polling_interval: Seconds to sleep between polls; the same delay is
            applied after a failed poll.

    Returns:
        Does not return normally: any ``Exception`` raised during a cycle is
        logged and polling continues.
    """
    while True:
        try:
            logger.info(f"Polling for changes after {last_timestamp}...")
            new_data_result = get_new_patient_data(conn, last_timestamp)

            if new_data_result is None:
                # Presumably the query helper returns None when the Hive query
                # fails (verify against get_new_patient_data). Report that
                # explicitly instead of failing on the subscript below.
                logger.error("Error during polling: query returned no result")
            else:
                rows = new_data_result[0]
                if rows:
                    logger.info(f"New/Updated records found: {len(rows)}")
                    # Process the new data using the same batch processing logic
                    # For simplicity, we'll just update the timestamp here
                    # In practice, you'd want to process this data similarly
                    # Assumes the last column carries the change timestamp -- TODO confirm.
                    # Null values are skipped so they cannot break max() or
                    # reset the watermark to None.
                    timestamps = [row[-1] for row in rows if row[-1] is not None]
                    if timestamps:
                        last_timestamp = max(timestamps)
                    else:
                        logger.warning(
                            "New records carry no timestamp in the last column; "
                            "keeping previous watermark"
                        )
                else:
                    logger.info("No new records found.")

        except Exception as e:
            # Continue polling even after errors
            logger.error(f"Error during polling: {e}")

        # One sleep per cycle, whether the poll succeeded or failed.
        time.sleep(polling_interval)

def main():
    """Connect to Hive, run the initial batch load, then poll for changes.

    The whole connect / load / poll sequence is retried when it raises, up
    to ``max_retries`` times with ``retry_delay`` seconds between attempts.
    The Hive connection opened for an attempt is closed exactly once, in the
    ``finally`` clause, before any retry back-off.
    """
    # Connection details
    # NOTE(review): credentials are hard-coded in source; consider loading
    # them from the environment or a secrets store.
    host = "197.221.242.150"
    port = 17251
    username = "tnhema"
    password = "ZFCG9ZSGksEMpSpA"
    database = "default"
    auth_mode = "LDAP"

    start = "on"
    retry_attempts = 0
    max_retries = 5
    retry_delay = 10
    last_processed_value = '2025-01-01 00:00:00'

    while retry_attempts < max_retries:
        conn = None
        failed = False
        try:
            logger.info(f"Attempting to connect (Attempt {retry_attempts + 1})...")

            conn = create_hive_connection(host, port, username, password, database, auth_mode)
            if not conn:
                raise Exception("Failed to establish Hive connection")

            if start == "on":
                logger.info("Initial load: Fetching all data from the fact_lab_request_orders table in batches of 50000.")
                last_processed_value = get_all_patient_data_in_batches(conn, batch_size=50000)

            logger.info("Starting polling for changes...")
            listen_for_changes(conn, last_processed_value, polling_interval=300)

        except Exception as e:
            failed = True
            retry_attempts += 1
            logger.error(f"An error occurred: {e}. Retry {retry_attempts}/{max_retries} in {retry_delay} seconds...")

        finally:
            # Single close point: runs on success, failure and interruption,
            # so the connection is never closed twice.
            if conn:
                close_hive_connection(conn)

        if failed:
            if retry_attempts >= max_retries:
                logger.error(f"Max retries reached ({max_retries}). Exiting.")
                break

            # Back off only after the failed connection has been released.
            time.sleep(retry_delay)

# Run the loader only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

2025-09-24 15:16:43,089 - INFO - Attempting to connect (Attempt 1)...
2025-09-24 15:16:43,371 - INFO - USE `default`
2025-09-24 15:16:43,544 - INFO - Hive connection established successfully
2025-09-24 15:16:43,547 - INFO - Initial load: Fetching all data from the fact_lab_request_orders table in batches of 50000.
2025-09-24 15:16:43,554 - INFO - No previous snapshot found
2025-09-24 15:16:43,555 - INFO - >>> Initial snapshot
2025-09-24 15:16:43,560 - INFO - SELECT 1 as test_value
2025-09-24 15:16:43,887 - INFO - ✓ Hive connection test successful
2025-09-24 15:16:43,889 - INFO - ✓ Data mart connection test successful
2025-09-24 15:16:43,891 - INFO - DEBUG: Executing count query: 
    SELECT 
        COUNT(*) AS total_records, 
        COUNT(DISTINCT encounter_facility_id) AS total_facilities, 
        COUNT(DISTINCT patient_id) AS distinct_people, 
        COUNT(DISTINCT encounter_id) AS distinct_encounters, 
        MAX(last_updated) AS last_update_time
    FROM fact_lab_request_order