In [12]:
import getpass
import hashlib
import math
import os
import time

import pandas as pd
import psycopg2
import thrift_sasl
from pyhive import hive

class HiveConnection:
    """Holds Hive connection settings and runs queries through PyHive.

    Failures are reported with ``print`` and signalled through return values
    instead of being raised, so callers must check what each method returns.
    """

    def __init__(self, host, port, username, password, database, auth_mode):
        """Stores the connection settings; nothing is opened until connect().

        Args:
            host: Hive server host name or address.
            port: Hive server port.
            username: Login name passed to PyHive.
            password: Password passed to PyHive.
            database: Database selected on connect.
            auth_mode: Value forwarded as PyHive's ``auth`` argument.
        """
        self.host = host
        self.port = port
        self.username = username
        self.password = password
        self.database = database
        self.auth_mode = auth_mode
        self.connection = None

    def connect(self):
        """Creates a connection to the Hive database.

        Returns:
            True when the connection was opened, False when PyHive raised
            (the error is printed).
        """
        try:
            self.connection = hive.Connection(
                host=self.host,
                port=self.port,
                username=self.username,
                password=self.password,
                database=self.database,
                auth=self.auth_mode
            )
            return True
        except Exception as e:
            print(f"Error creating connection: {e}")
            return False

    def close(self):
        """Closes the connection to the Hive database, if one is open.

        Safe to call repeatedly: the handle is dropped afterwards so a second
        call (or a call before connect()) is a no-op.
        """
        if self.connection:
            try:
                self.connection.close()
            except Exception as e:
                print(f"Error closing connection: {e}")
            finally:
                self.connection = None

    def execute_query(self, query):
        """Executes the given query and returns the results.

        Args:
            query: SQL text to run.

        Returns:
            ``(rows, columns)`` where ``rows`` is the list returned by
            ``fetchall()`` and ``columns`` the names taken from the cursor
            description; ``(None, None)`` when anything fails (the error is
            printed).
        """
        if self.connection is None:
            print("Error executing query: not connected to Hive (call connect() first)")
            return None, None

        # Bound before the try block so the finally clause never touches an
        # undefined name when cursor creation itself fails.
        cursor = None
        try:
            cursor = self.connection.cursor()
            cursor.execute(query)
            columns = [desc[0] for desc in cursor.description]
            results = cursor.fetchall()
            return results, columns
        except Exception as e:
            print(f"Error executing query: {e}")
            return None, None
        finally:
            if cursor is not None:
                cursor.close()


class LocalConnection:
    """Wraps a psycopg2 connection and a single reusable cursor.

    Failures are reported with ``print`` and signalled through return values
    instead of being raised, so callers must check what each method returns.
    """

    def __init__(self, dbname, user, password, host, port):
        """Stores the connection settings; nothing is opened until connect().

        Args:
            dbname: Database name.
            user: Login name.
            password: Login password.
            host: Server host name or address.
            port: Server port.
        """
        self.dbname = dbname
        self.user = user
        self.password = password
        self.host = host
        self.port = port
        self.connection = None
        self.cursor = None

    def connect(self):
        """Creates a connection to the Local database.

        The connection is switched to autocommit and one cursor is opened and
        kept on ``self.cursor`` for all later statements.

        Returns:
            True on success, False when psycopg2 raised (the error is printed).
        """
        try:
            self.connection = psycopg2.connect(
                dbname=self.dbname,
                user=self.user,
                password=self.password,
                host=self.host,
                port=self.port
            )
            self.connection.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
            self.cursor = self.connection.cursor()
            return True
        except Exception as e:
            print(f"Error creating Local Database connection: {e}")
            return False

    def execute_query(self, query, params=None):
        """Executes a statement with optional parameters.

        Args:
            query: SQL text, using ``%s`` placeholders for ``params``.
            params: Optional sequence of values bound to the placeholders.

        Returns:
            The list from ``fetchall()`` when the statement produced a result
            set, True when it did not, and None when it failed (the error is
            printed).
        """
        try:
            self.cursor.execute(query, params)

            # Ask the driver whether rows came back instead of guessing from
            # the SQL text: a description is present for any statement that
            # returns rows (SELECT, WITH ... SELECT, ... RETURNING) and absent
            # otherwise.
            rows = self.cursor.fetchall() if self.cursor.description is not None else None

            self.connection.commit()
            return True if rows is None else rows
        except Exception as e:
            print(f"Error executing LocalDB query: {e}")
            return None

    def close(self):
        """Closes the cursor and the Local database connection, if open.

        Safe to call repeatedly: both handles are dropped afterwards.
        """
        if self.cursor:
            try:
                self.cursor.close()
            except Exception as e:
                print(f"Error closing Local database cursor: {e}")
            self.cursor = None
        if self.connection:
            try:
                self.connection.close()
            except Exception as e:
                print(f"Error closing Local database connection: {e}")
            self.connection = None

    def create_lab_datamart_table(self):
        """Creates the lab data mart table and its indexes if they don't exist.

        Returns:
            True when the DDL ran without error, False otherwise (the error is
            printed).
        """
        # NOTE(review): record_hash is declared both UNIQUE and PRIMARY KEY;
        # the PRIMARY KEY alone already enforces uniqueness - confirm whether
        # the extra constraint is wanted before changing deployed schemas.
        create_table_query = """
        CREATE TABLE IF NOT EXISTS marts.dm_lab (
            lab_request_number VARCHAR(100),
            patient_id VARCHAR(100),
            task_id VARCHAR(100),
            encounter_id VARCHAR(100),
            servicerequest_id VARCHAR(100),
            specimen_id VARCHAR(100),
            task_status VARCHAR(50),
            status_reason_display TEXT,
            task_execution_start_date TIMESTAMP,
            encounter_period_start TIMESTAMP,
            sample_code VARCHAR(50),
            specimen_last_updated TIMESTAMP,
            diagnosticreport_last_updated TIMESTAMP,
            encounter_last_updated TIMESTAMP,
            observations_last_updated TIMESTAMP,
            servicerequest_last_updated TIMESTAMP,
            task_last_updated TIMESTAMP,
            task_output_reference TEXT,
            task_output_type VARCHAR(100),
            test_type VARCHAR(100),
            test_code VARCHAR(50),
            service_request_category VARCHAR(100),
            service_request_authored_on TIMESTAMP,
            task_authored_on TIMESTAMP,
            service_request_reason TEXT,
            sample_type VARCHAR(100),
            date_sample_taken TIMESTAMP,
            diagnostic_report_date_issued TIMESTAMP,
            diagnostic_report_effective_date TIMESTAMP,
            result TEXT,
            result_interpretation TEXT,
            dedupe_id VARCHAR(100),
            gender VARCHAR(10),
            birth_date DATE,
            patient_managing_organization_id VARCHAR(100),
            patient_managing_organization VARCHAR(200),
            lab_id VARCHAR(100),
            encounter_facility VARCHAR(200),
            encounter_facility_id VARCHAR(100),
            dw_date_created TIMESTAMP,
            dm_date_created TIMESTAMP DEFAULT NOW(),
            record_hash VARCHAR(64) UNIQUE,
            PRIMARY KEY (record_hash)
        );

        -- Create indexes for better performance
        CREATE INDEX IF NOT EXISTS idx_dm_lab_patient_id ON marts.dm_lab(patient_id);
        CREATE INDEX IF NOT EXISTS idx_dm_lab_encounter_id ON marts.dm_lab(encounter_id);
        CREATE INDEX IF NOT EXISTS idx_dm_lab_task_id ON marts.dm_lab(task_id);
        CREATE INDEX IF NOT EXISTS idx_dm_lab_last_updated ON marts.dm_lab(dw_date_created);
        CREATE INDEX IF NOT EXISTS idx_dm_lab_lab_request_number ON marts.dm_lab(lab_request_number);
        """

        try:
            self.cursor.execute(create_table_query)
            self.connection.commit()
            print("Lab data mart table created successfully.")
            return True
        except Exception as e:
            print(f"Error creating lab data mart table: {e}")
            return False


class LabDataFetcher:
    """Copies lab records from a Hive view into ``marts.dm_lab`` incrementally.

    Rows are pulled in ``last_updated`` order in batches of ``batch_size``;
    ``last_processed_value`` is the watermark used to resume.
    """

    # Columns that exist under the same name in the Hive view and in
    # marts.dm_lab, in the order used by the INSERT statement.
    _MART_COLUMNS = (
        'lab_request_number', 'patient_id', 'task_id', 'encounter_id',
        'servicerequest_id', 'specimen_id', 'task_status', 'status_reason_display',
        'task_execution_start_date', 'encounter_period_start', 'sample_code',
        'specimen_last_updated', 'diagnosticreport_last_updated', 'encounter_last_updated',
        'observations_last_updated', 'servicerequest_last_updated', 'task_last_updated',
        'task_output_reference', 'task_output_type', 'test_type', 'test_code',
        'service_request_category', 'service_request_authored_on', 'task_authored_on',
        'service_request_reason', 'sample_type', 'date_sample_taken',
        'diagnostic_report_date_issued', 'diagnostic_report_effective_date',
        'result', 'result_interpretation', 'dedupe_id', 'gender', 'birth_date',
        'patient_managing_organization_id', 'patient_managing_organization',
        'lab_id', 'encounter_facility', 'encounter_facility_id',
    )

    # Columns selected from Hive. last_updated sits in fifth position so the
    # value order fed to hash_record() stays the same as the SELECT list.
    _SOURCE_COLUMNS = _MART_COLUMNS[:4] + ('last_updated',) + _MART_COLUMNS[4:]

    # Separator placed between field values before hashing (ASCII unit separator).
    _HASH_SEPARATOR = '\x1f'

    def __init__(self, hive_conn, pg_conn, batch_size=50000, polling_interval=600,
                 source_view='your_lab_view_name'):
        """Stores collaborators and settings; no queries are run here.

        Args:
            hive_conn: Object whose ``execute_query(query)`` returns
                ``(rows, columns)``, or ``(None, None)`` on failure.
            pg_conn: Object whose ``execute_query(query, params)`` returns rows
                for queries, a truthy value for writes, and None on failure.
            batch_size: Maximum number of rows requested from Hive per query.
            polling_interval: Seconds to sleep between fetch cycles.
            source_view: Name of the Hive view/table to read from. It is
                interpolated into the SQL text, so it must be a trusted
                identifier.
        """
        self.hive_conn = hive_conn
        self.pg_conn = pg_conn
        self.batch_size = batch_size
        self.polling_interval = polling_interval
        self.source_view = source_view
        self.last_processed_value = '1900-01-01 00:00:00'

    def check_if_snapshot_done(self):
        """Returns the newest ``dw_date_created`` in the data mart, or None.

        None is returned both when the table is empty and when the query fails.
        """
        query = "SELECT MAX(dw_date_created) FROM marts.dm_lab"
        result = self.pg_conn.execute_query(query)
        if result and result[0][0]:
            return result[0][0]
        return None

    def hash_record(self, record):
        """
        Hash the combined values of a record using SHA-256.
        If a value is None or NaN, replace it with an empty string before hashing.

        Values are joined with a separator so that field boundaries take part
        in the hash: without it ('ab', 'c') and ('a', 'bc'), or a missing value
        moving between neighbouring fields, would produce the same key.
        NOTE: hashes therefore differ from ones produced by joining the values
        with no separator.

        Args:
            record: Iterable of field values (for example a DataFrame row).

        Returns:
            64-character hexadecimal digest.
        """
        parts = ['' if value is None or pd.isna(value) else str(value) for value in record]
        combined = self._HASH_SEPARATOR.join(parts)
        return hashlib.sha256(combined.encode('utf-8')).hexdigest()

    @staticmethod
    def _normalise_columns(df):
        """Returns df with column names lower-cased and any 'table.' prefix dropped."""
        return df.rename(columns=lambda name: str(name).rsplit('.', 1)[-1].lower())

    def _build_upsert_query(self):
        """Builds the INSERT ... ON CONFLICT statement used for every record."""
        value_columns = self._MART_COLUMNS + ('dw_date_created',)
        placeholders = ', '.join(['%s'] * len(value_columns))
        assignments = ', '.join(f"{name} = EXCLUDED.{name}" for name in value_columns)
        return (
            f"INSERT INTO marts.dm_lab ({', '.join(value_columns)}, dm_date_created, record_hash) "
            f"VALUES ({placeholders}, NOW(), %s) "
            "ON CONFLICT (record_hash) DO UPDATE SET "
            f"{assignments}, dm_date_created = NOW()"
        )

    def fetch_data_in_batches(self):
        """Fetch data from Hive in batches and process them.

        Resumes from the newest ``dw_date_created`` in the mart when there is
        one, then loops until Hive returns no rows or a query fails.
        """
        snapshot_date = self.check_if_snapshot_done()
        if snapshot_date:
            self.last_processed_value = snapshot_date
            print(f">>> Detected Snapshot done: {self.last_processed_value}")
        else:
            print(">>> Initial snapshot")

        select_list = ', '.join(self._SOURCE_COLUMNS)

        while True:
            print(f"Fetching batch data where last_updated > {self.last_processed_value}")

            query = f"""
            SELECT {select_list}
            FROM {self.source_view}
            WHERE last_updated > '{self.last_processed_value}'
            ORDER BY last_updated ASC
            LIMIT {self.batch_size}
            """

            batch_data, columns = self.hive_conn.execute_query(query)

            if batch_data is None:
                # A failed query is not the same thing as an empty result.
                print("Hive query failed. Ending batch fetching for this cycle.")
                break

            if not batch_data:  # No more data
                print("No more data to fetch. Ending batch fetching.")
                break

            df = self._normalise_columns(pd.DataFrame(batch_data, columns=columns))
            print(f"Fetched batch data: {df.shape}")

            newest = df['last_updated'].max()
            if len(df) >= self.batch_size:
                # A full batch may have cut a group of rows that share the
                # newest timestamp. Advancing the watermark to that timestamp
                # would skip the rest of the group, so hold the group back and
                # let the next query (last_updated > watermark) fetch it whole.
                complete = df[df['last_updated'] < newest]
                if complete.empty:
                    print(
                        f"Warning: a full batch shares one last_updated value ({newest}); "
                        "rows beyond batch_size with that value cannot be fetched. "
                        "Increase batch_size."
                    )
                else:
                    df = complete
                    newest = df['last_updated'].max()

            # Add hash column to the DataFrame
            df = df.assign(record_hash=df.apply(self.hash_record, axis=1))
            self.process_data(df)

            # Update the last_processed_value to the latest processed timestamp
            self.last_processed_value = newest
            print(f"Updated last_processed_value to: {self.last_processed_value}")

    def process_data(self, df):
        """Writes every row of df to ``marts.dm_lab``, inserting or updating.

        One ``INSERT ... ON CONFLICT (record_hash) DO UPDATE`` is issued per
        row, which needs the unique constraint on ``record_hash``. Missing
        values (None/NaN/NaT) are sent as NULL.

        Args:
            df: DataFrame holding the source columns plus ``last_updated`` and
                ``record_hash``; column names are matched case-insensitively.

        Raises:
            KeyError: if a required column is absent.
        """
        if df is None or df.empty:
            return

        frame = self._normalise_columns(df)
        ordered_columns = list(self._MART_COLUMNS) + ['last_updated', 'record_hash']
        upsert_query = self._build_upsert_query()

        failed = 0
        for values in frame[ordered_columns].itertuples(index=False, name=None):
            params = tuple(None if value is None or pd.isna(value) else value for value in values)
            if self.pg_conn.execute_query(upsert_query, params) is None:
                failed += 1

        print(f"Upserted {len(frame) - failed} of {len(frame)} records into marts.dm_lab.")
        if failed:
            print(f"Warning: {failed} records could not be written (see errors above).")

    def start_polling(self):
        """Starts polling to fetch data every polling_interval seconds.

        Loops forever; it only stops when an exception propagates.
        """
        while True:
            print("Starting lab data fetch cycle.")
            self.fetch_data_in_batches()
            print(f"Waiting for {self.polling_interval} seconds before next fetch.")
            time.sleep(self.polling_interval)


def main():
    """Connects to Hive and the local database and runs the lab ETL loop.

    Connection settings are read from environment variables so that secrets
    are not stored in the notebook:

        HIVE_HOST, HIVE_PORT, HIVE_USERNAME, HIVE_PASSWORD, HIVE_DATABASE,
        HIVE_AUTH_MODE, LOCAL_DB_NAME, LOCAL_DB_USER, LOCAL_DB_PASSWORD,
        LOCAL_DB_HOST, LOCAL_DB_PORT

    Non-secret settings fall back to the previous defaults; passwords are
    prompted for when their variable is not set. Both connections are closed
    on the way out, including when polling is interrupted or fails.
    """
    # Initialize Hive connection
    hive_conn = HiveConnection(
        host=os.environ.get("HIVE_HOST", "197.221.242.150"),
        port=int(os.environ.get("HIVE_PORT", "17251")),
        username=os.environ.get("HIVE_USERNAME", "tjima"),
        password=os.environ.get("HIVE_PASSWORD") or getpass.getpass("Hive password: "),
        database=os.environ.get("HIVE_DATABASE", "default"),
        auth_mode=os.environ.get("HIVE_AUTH_MODE", "LDAP")
    )

    # Initialize LocalDB connection
    pg_conn = LocalConnection(
        dbname=os.environ.get("LOCAL_DB_NAME", "health_db"),
        user=os.environ.get("LOCAL_DB_USER", "tynash"),
        password=os.environ.get("LOCAL_DB_PASSWORD") or getpass.getpass("Local database password: "),
        host=os.environ.get("LOCAL_DB_HOST", "localhost"),
        port=int(os.environ.get("LOCAL_DB_PORT", "3307"))
    )

    print("Connecting to Hive and Local databases...")

    try:
        if hive_conn.connect() and pg_conn.connect():
            print("Successfully connected to both databases.")

            # Create the lab data mart table
            print("Creating lab data mart table...")
            pg_conn.create_lab_datamart_table()

            # Initialize and start the data fetcher
            print("Starting lab data ETL process...")
            lab_data_fetcher = LabDataFetcher(hive_conn, pg_conn, batch_size=10000, polling_interval=300)
            lab_data_fetcher.start_polling()
        else:
            print("Failed to establish database connections.")
            print("Please check your connection parameters and network connectivity.")
    except KeyboardInterrupt:
        # start_polling() never returns on its own; interrupting the kernel is
        # the normal way to stop it.
        print("Interrupted. Stopping lab data ETL process.")
    finally:
        # Close connections
        print("Closing database connections...")
        hive_conn.close()
        pg_conn.close()

if __name__ == "__main__":
    main()

Connecting to Hive and Local databases...
Error creating Local Database connection: connection to server at "localhost" (127.0.0.1), port 3307 failed: Connection refused
	Is the server running on that host and accepting TCP/IP connections?

Failed to establish database connections.
Please check your connection parameters and network connectivity.
Closing database connections...


In [8]:
!pip install thrift_sasl

Defaulting to user installation because normal site-packages is not writeable
Collecting thrift_sasl
  Using cached thrift_sasl-0.4.3-py2.py3-none-any.whl (8.3 kB)
Collecting pure-sasl>=0.6.2
  Using cached pure-sasl-0.6.2.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: pure-sasl
  Building wheel for pure-sasl (setup.py) ... [?25ldone
[?25h  Created wheel for pure-sasl: filename=pure_sasl-0.6.2-py3-none-any.whl size=11443 sha256=7c0c416a5812532f4fb0cfc5de59b49d747688dd22d34652ae3037ccfb253a09
  Stored in directory: /home/tynash/.cache/pip/wheels/57/7c/93/062238b0a68efe214024ca178233f248971045db1033c96a52
Successfully built pure-sasl
Installing collected packages: pure-sasl, thrift_sasl
Successfully installed pure-sasl-0.6.2 thrift_sasl-0.4.3
