# In [None]:
from pyhive import hive
import psycopg2
import pandas as pd
import hashlib
import time
import math

class MobileConnection:
    """Wrapper around a psycopg2 connection to the Postgres "Mobile" database.

    Nothing is opened at construction time; ``connection`` and ``cursor`` stay
    ``None`` until :meth:`connect` succeeds.
    """

    def __init__(self, dbname, user, password, host, port):
        """Store the connection settings without opening a connection."""
        self.dbname = dbname
        self.user = user
        self.password = password
        self.host = host
        self.port = port
        self.connection = None
        self.cursor = None

    def connect(self):
        """Creates a connection to the Postgres Mobile database.

        Returns:
            True when the connection was opened and switched to autocommit,
            False when anything raised (the error is printed, not re-raised).
        """
        try:
            self.connection = psycopg2.connect(
                dbname=self.dbname,
                user=self.user,
                password=self.password,
                host=self.host,
                port=self.port
            )
            self.connection.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
            self.cursor = self.connection.cursor()
            return True
        except Exception as e:
            print(f"Error creating PostgreSQL Mobile connection: {e}")
            return False

    def close(self):
        """Closes the connection to the Mobile database."""
        if self.connection:
            try:
                self.connection.close()
            except Exception as e:
                print(f"Error closing connection: {e}")

    def execute_query(self, query):
        """Executes the given query and returns the results.

        A fresh cursor is opened for every call and always closed afterwards.

        Args:
            query: SQL text to execute; it is passed to the cursor unchanged.

        Returns:
            ``(rows, columns)`` on success, where ``rows`` is the
            ``fetchall()`` result and ``columns`` the list of column names.
            ``(None, None)`` when anything fails (the error is printed), so
            callers can always unpack two values.
        """
        # Bound before the try so the finally clause is safe even when
        # creating the cursor itself fails (e.g. connect() was never called).
        cursor = None
        try:
            cursor = self.connection.cursor()
            cursor.execute(query)
            columns = [desc[0] for desc in cursor.description]
            results = cursor.fetchall()
            return results, columns
        except Exception as e:
            print(f"Error executing query: {e}")
            return None, None
        finally:
            if cursor is not None:
                cursor.close()


class PostgresConnection:
    """Connection settings plus the live psycopg2 handles for the PostgreSQL database."""

    def __init__(self, dbname, user, password, host, port):
        # Handles stay empty until connect() is called.
        self.connection = None
        self.cursor = None
        self.dbname, self.user, self.password = dbname, user, password
        self.host, self.port = host, port

    def connect(self):
        """Open the PostgreSQL connection in autocommit mode.

        Returns True on success; on any exception the error is printed and
        False is returned.
        """
        settings = {
            "dbname": self.dbname,
            "user": self.user,
            "password": self.password,
            "host": self.host,
            "port": self.port,
        }
        try:
            self.connection = psycopg2.connect(**settings)
            autocommit = psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT
            self.connection.set_isolation_level(autocommit)
            self.cursor = self.connection.cursor()
        except Exception as e:
            print(f"Error creating PostgreSQL connection: {e}")
            return False
        return True

    def execute_query(self, query, params=None):
        """Run a statement on the shared cursor, with optional bound parameters.

        Returns the fetched rows when the statement text starts with SELECT
        (case-insensitive, leading whitespace ignored), True for any other
        statement after committing, and None when an exception occurred
        (the error is printed).
        """
        try:
            self.cursor.execute(query, params)
            is_select = query.strip().upper().startswith("SELECT")
            if not is_select:
                # Anything that is not a SELECT is committed and reported as success.
                self.connection.commit()
                return True
            return self.cursor.fetchall()
        except Exception as e:
            print(f"Error executing PostgreSQL query: {e}")
            return None

    def close(self):
        """Close the PostgreSQL connection if one exists; close errors are printed."""
        if not self.connection:
            return
        try:
            self.connection.close()
        except Exception as e:
            print(f"Error closing PostgreSQL connection: {e}")


class DataFetcher:
    """Copies rows from ``report.viral_load`` into ``marts.dm_viral_load_test``.

    Rows are read through ``mob_conn`` in batches ordered by ``event_date`` and
    written through ``pg_conn`` one at a time: a row whose ``encounter_id``
    already exists in the target is updated, otherwise it is inserted.
    """

    # Ordered (target column, candidate source columns) pairs shared by the
    # INSERT and the UPDATE so the two statements cannot drift apart.
    # The first candidate present in the batch is used; ``None`` means the
    # target column is filled with the constant ''.
    # The sample-collection date was read under two different names by the
    # original insert and update code, so both spellings are accepted.
    _COLUMN_MAP = (
        ("art_number", ("art_number",)),
        ("lab_request_number", ("laboratory_request_number",)),
        ("age", ("age",)),
        ("sex", ("sex",)),
        ("facility_id_code", ("facility_id",)),
        ("client_profile", ("client_profile",)),
        ("current_art_regimen", ("current_art_regimen",)),
        ("date_of_art_initiation", ("date_of_art_initiation",)),
        ("reason_for_viral_load_testing", ("reason_for_viral_load_test",)),
        ("date_of_viral_load_sample_collection",
         ("date_of_viral_load_sample_collection", "date_of_viral_sample_collection")),
        ("type_of_viral_load_sample", ("type_of_viral_load_sample",)),
        ("date_of_viral_load_results", ("date_of_viral_load_results",)),
        ("viral_load_results", ("viral_load_results",)),
        ("date_viral_load_results_issued_to_client",
         ("date_viral_load_results_issued_to_client",)),
        ("reason_for_hiv_drug_resistance_testing", None),
        ("date_of_hiv_drug_resistance_sample_collection",
         ("date_of_hiv_drug_resistance_sample_collection",)),
        ("type_of_hiv_drug_resistance_sample_collected",
         ("type_of_hiv_drug_resistance_sample_collected",)),
        ("date_hiv_drug_resistance_result_received",
         ("date_hiv_drug_resistance_result_received",)),
        ("hiv_drug_resistance_results", ("hiv_drug_resistance_results",)),
        ("switched_to_3rd_line", ("switched_to_third_line_regimen",)),
        ('"3rd_line_regimen"', ("third_line_regimen",)),
        ("event_date", ("event_date",)),
    )

    def __init__(self, mob_conn, pg_conn, batch_size=50000, polling_interval=600):
        """Store the two connections and the batching/polling settings.

        Args:
            mob_conn: source connection; ``execute_query(query)`` must return
                ``(rows, columns)``.
            pg_conn: target connection; ``execute_query(query, params)`` must
                return rows for SELECT, True for writes and None on error.
            batch_size: maximum number of source rows fetched per query.
            polling_interval: seconds to sleep between fetch cycles.
        """
        self.mob_conn = mob_conn
        self.pg_conn = pg_conn
        self.batch_size = batch_size
        self.polling_interval = polling_interval
        # Lower bound used for the very first fetch, before any row was copied.
        self.last_processed_value = '1900-01-01 00:00:00'

    def check_if_snapshot_done(self):
        """Return the latest ``event_date`` already in the target table.

        Returns None when the table is empty or the query failed.
        """
        # MAX() ignores NULLs, whereas ORDER BY ... DESC LIMIT 1 would return a
        # NULL event_date first in PostgreSQL and hide the real latest value.
        query = "SELECT MAX(event_date) FROM marts.dm_viral_load_test"
        result = self.pg_conn.execute_query(query)
        if not result:
            return None
        return result[0][0]

    def fetch_data_in_batches(self):
        """Fetch and process source rows newer than the last processed ``event_date``.

        Resumes from the latest ``event_date`` found in the target table when
        there is one, then loops until the source returns no more rows or a
        source query fails.

        NOTE(review): the cursor is ``event_date > last`` combined with LIMIT,
        so rows sharing the ``event_date`` at a batch boundary look like they
        can be skipped -- confirm whether ``event_date`` is unique enough or
        add a tie-breaker column.
        """
        snapshot_date = self.check_if_snapshot_done()
        if snapshot_date:
            self.last_processed_value = snapshot_date
            print(">>> Detected Snapshot done:", self.last_processed_value)
        else:
            print(">>> Initial snapshot")

        while True:
            print(f"Fetching batch data where event_date > {self.last_processed_value}")
            # The interpolated values come from this object's own state (a
            # timestamp read back from the databases and an integer), and
            # mob_conn.execute_query() accepts no bound parameters.
            query = f"""
            SELECT * FROM report.viral_load
            WHERE event_date > '{self.last_processed_value}'
            ORDER BY event_date ASC LIMIT {int(self.batch_size)}
            """

            result = self.mob_conn.execute_query(query)
            if not result:
                # The source connection reports failures by returning None.
                print("Failed to fetch batch from source. Ending batch fetching.")
                break

            batch_data, columns = result
            if not batch_data:  # No more data
                print("No more data to fetch. Ending batch fetching.")
                break

            df = pd.DataFrame(batch_data, columns=columns)
            print("Fetched batch data:", df.shape)

            self.process_data(df)

            # Advance the cursor to the latest timestamp seen in this batch.
            self.last_processed_value = df['event_date'].max()
            print("Updated last_processed_value to:", self.last_processed_value)

    def _resolve_source_columns(self, available):
        """Return, per mapped target column, the source column to read (or None).

        Raises:
            KeyError: when ``encounter_id`` or any mapped column has no
                candidate in ``available``; the message lists what is missing
                and what the batch actually contains.
        """
        resolved = []
        missing = [] if "encounter_id" in available else ["encounter_id"]
        for _, candidates in self._COLUMN_MAP:
            if candidates is None:
                resolved.append(None)
                continue
            match = next((name for name in candidates if name in available), None)
            if match is None:
                missing.append(" / ".join(candidates))
            resolved.append(match)
        if missing:
            raise KeyError(
                f"Source batch is missing column(s): {missing}; "
                f"available columns: {sorted(map(str, available))}"
            )
        return resolved

    @classmethod
    def _build_queries(cls):
        """Build the (update_query, insert_query) pair from ``_COLUMN_MAP``."""
        targets = [target for target, _ in cls._COLUMN_MAP]
        assignments = ", ".join(f"{target} = %s" for target in targets)
        placeholders = ", ".join(["%s"] * len(targets))
        update_query = (
            "UPDATE marts.dm_viral_load_test "
            f"SET {assignments}, dm_date_created = NOW() "
            "WHERE encounter_id = %s"
        )
        insert_query = (
            "INSERT INTO marts.dm_viral_load_test "
            f"({', '.join(targets)}, dm_date_created, encounter_id) "
            f"VALUES ({placeholders}, NOW(), %s)"
        )
        return update_query, insert_query

    @staticmethod
    def _to_db_value(value):
        """Map pandas missing-value markers (NaN, NaT, NA) to None so they are stored as NULL."""
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return None
        return value

    def process_data(self, df):
        """Insert or update every row of the batch in ``marts.dm_viral_load_test``.

        Args:
            df: batch of source rows; must contain ``encounter_id`` and every
                source column named in ``_COLUMN_MAP``.

        Raises:
            KeyError: when a required source column is absent. The check runs
                before the first write, so a bad batch changes nothing.
        """
        if df.empty:
            return

        sources = self._resolve_source_columns(set(df.columns))
        update_query, insert_query = self._build_queries()
        check_query = "SELECT 1 FROM marts.dm_viral_load_test WHERE encounter_id = %s"

        for _, row in df.iterrows():
            encounter_id = self._to_db_value(row['encounter_id'])
            values = tuple(
                '' if source is None else self._to_db_value(row[source])
                for source in sources
            )
            # encounter_id is the last placeholder in both statements.
            params = values + (encounter_id,)

            existing = self.pg_conn.execute_query(check_query, (encounter_id,))
            if existing is None:
                # None signals a failed lookup, not "row absent"; inserting
                # here could create a duplicate of an existing record.
                print(f"Skipping encounter_id {encounter_id}: existence check failed")
                continue

            if existing:
                print(f"Updating record for encounter_id: {encounter_id}")
                self.pg_conn.execute_query(update_query, params)
            else:
                print(f"Inserting new record for encounter_id: {encounter_id}")
                self.pg_conn.execute_query(insert_query, params)

    def start_polling(self):
        """Run fetch cycles forever, sleeping ``polling_interval`` seconds between them."""
        while True:
            print("Starting data fetch cycle.")
            self.fetch_data_in_batches()
            print(f"Waiting for {self.polling_interval} seconds before next fetch.")
            time.sleep(self.polling_interval)

def main():
    """Connect to both databases and run the polling loop.

    Connection settings can be overridden with ``MOBILE_DB_*`` and ``PG_DB_*``
    environment variables (NAME, USER, PASSWORD, HOST, PORT); the previous
    hard-coded values remain as fallbacks so existing deployments keep working.
    Both connections are closed on the way out, even when polling raises.
    """
    import os  # local import: only main() reads configuration from the environment

    # NOTE(review): the fallback passwords are committed to source control;
    # rotate them and drop the fallbacks once the environment is configured.
    mob_conn = MobileConnection(
        dbname=os.environ.get("MOBILE_DB_NAME", "master"),
        user=os.environ.get("MOBILE_DB_USER", "postgres"),
        password=os.environ.get("MOBILE_DB_PASSWORD", "wGMCAE6zFHcyrBmXtus97JPanxvkY4fb"),
        host=os.environ.get("MOBILE_DB_HOST", "127.0.0.1"),
        port=int(os.environ.get("MOBILE_DB_PORT", "5434"))
    )

    pg_conn = PostgresConnection(
        dbname=os.environ.get("PG_DB_NAME", "HTSDATA"),
        user=os.environ.get("PG_DB_USER", "postgres"),
        password=os.environ.get("PG_DB_PASSWORD", "wGMCAE6zFHcyrBmXtus97JPanxvkY4fb"),
        host=os.environ.get("PG_DB_HOST", "127.0.0.1"),
        port=int(os.environ.get("PG_DB_PORT", "5431"))
    )

    try:
        if mob_conn.connect() and pg_conn.connect():
            data_fetcher = DataFetcher(mob_conn, pg_conn)
            data_fetcher.start_polling()
    finally:
        # start_polling() only returns by raising (it loops forever), so the
        # cleanup must live in a finally block to run at all.
        mob_conn.close()
        pg_conn.close()


# Start the job only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    main()


# Captured notebook output of the run above:
# Starting data fetch cycle.
# >>> Initial snapshot
# Fetching batch data where event_date > 1900-01-01 00:00:00
# Fetched batch data: (18306, 26)
# Inserting new record for encounter_id: 0847e050-1167-48d1-b1a1-82eb0be67ca9
#
#
# KeyError: 'date_of_viral_load_results'