In [2]:
from pyhive import hive
import psycopg2
import pandas as pd
import hashlib
import time
import math

class MobileConnection:
    def __init__(self, dbname, user, password, host, port):
        self.dbname = dbname
        self.user = user
        self.password = password
        self.host = host
        self.port = port
        self.connection = None
        self.cursor = None

    def connect(self):
        """Creates a connection to the Postgres Mobile database."""
        try:
            self.connection = psycopg2.connect(
                dbname=self.dbname,
                user=self.user,
                password=self.password,
                host=self.host,
                port=self.port
            )
            self.connection.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
            self.cursor = self.connection.cursor()
            return True
        except Exception as e:
            print(f"Error creating PostgreSQL Mobile connection: {e}")
            return False

    def close(self):
        """Closes the connection to the Mobile database."""
        if self.connection:
            try:
                self.connection.close()
            except Exception as e:
                print(f"Error closing connection: {e}")

    def execute_query(self, query):
        """Executes the given query and returns the results."""
        try:
            cursor = self.connection.cursor()
            cursor.execute(query)
            columns = [desc[0] for desc in cursor.description]
            results = cursor.fetchall()
            return results, columns
        except Exception as e:
            print(f"Error executing query: {e}")
            return None
        finally:
            cursor.close()


class PostgresConnection:
    def __init__(self, dbname, user, password, host, port):
        self.dbname = dbname
        self.user = user
        self.password = password
        self.host = host
        self.port = port
        self.connection = None
        self.cursor = None

    def connect(self):
        """Creates a connection to the PostgreSQL database."""
        try:
            self.connection = psycopg2.connect(
                dbname=self.dbname,
                user=self.user,
                password=self.password,
                host=self.host,
                port=self.port
            )
            self.connection.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
            self.cursor = self.connection.cursor()
            return True
        except Exception as e:
            print(f"Error creating PostgreSQL connection: {e}")
            return False

    def execute_query(self, query, params=None):
        """Executes a query with optional parameters for SELECT, INSERT, or UPDATE operations."""
        try:
            # Execute the query with parameters
            self.cursor.execute(query, params)

            # Check if the query is a SELECT statement
            if query.strip().upper().startswith("SELECT"):
                # Fetch and return results for SELECT queries
                return self.cursor.fetchall()
            else:
                # Commit transaction for non-SELECT queries (e.g., INSERT, UPDATE)
                self.connection.commit()
                return True  # Return True to indicate success for INSERT/UPDATE queries
        except Exception as e:
            print(f"Error executing PostgreSQL query: {e}")
            return None

    def close(self):
        """Closes the PostgreSQL database connection."""
        if self.connection:
            try:
                self.connection.close()
            except Exception as e:
                print(f"Error closing PostgreSQL connection: {e}")


class DataFetcher:
    def __init__(self, mob_conn, pg_conn, batch_size=50000, polling_interval=600):
        self.mob_conn = mob_conn
        self.pg_conn = pg_conn
        self.batch_size = batch_size
        self.polling_interval = polling_interval
        self.last_processed_value = '1900-01-01 00:00:00'

    def check_if_snapshot_done(self):
        query = "SELECT event_date FROM marts.dm_hts_test ORDER BY dm_date_created DESC LIMIT 1"
        result = self.pg_conn.execute_query(query)
        if result:
            return result[0][0]
        else:
            return None

    # def hash_record(self, record):
    #     """
    #     Hash the combined values of a record using SHA-256.
    #     If a value is None or NaN, replace it with an empty string before hashing.
    #     """
    #     combined = ''.join([str(value) if not pd.isna(value) else '' for value in record])
    #     return hashlib.sha256(combined.encode('utf-8')).hexdigest()

    def fetch_data_in_batches(self):
        snapshot_date = self.check_if_snapshot_done()
        if snapshot_date:
            self.last_processed_value = snapshot_date
            print(">>> Detected Snapshot done:", self.last_processed_value)
        else:
            print(">>> Initial snapshot")

        while True:
            print(f"Fetching batch data where event_date > {self.last_processed_value}")
            query = f"""
            SELECT * FROM report.hts
            WHERE event_date > '{self.last_processed_value}' 
            ORDER BY event_date ASC LIMIT {self.batch_size}
            """

            
            batch_data, columns = self.mob_conn.execute_query(query)

            if not batch_data:  # No more data
                print("No more data to fetch. Ending batch fetching.")
                break
            
            df = pd.DataFrame(batch_data, columns=columns)
            print("Fetched batch data:", df.shape)

            # Add hash column to the DataFrame
            # df['record_hash'] = df.apply(lambda row: self.hash_record(row), axis=1)

            df.to_csv("outputwithdps.csv", index=False)
            
            # Process and update the last_processed_value
            self.process_data(df)

            # Update the last_processed_value to the latest timestamp in the batch

            self.last_processed_value = df['event_date'].max()
            print("Updated last_processed_value to:", self.last_processed_value)




    def process_data(self, df):
        """Processes the data batch by cleaning and then inserting/updating it in PostgreSQL."""
      
        for _, row in df.iterrows():
            encounter_id = row['encounter_id']
            check_query = "SELECT 1 FROM marts.dm_hts_test WHERE encounter_id = %s"
            if self.pg_conn.execute_query(check_query, (encounter_id,)):
                # Update existing record
                print(f"Updating record for encounter_id: {encounter_id}")
                update_query = """
                UPDATE marts.dm_hts_test
                SET hts_model= %s, person_id = %s, birthdate = %s, sex = %s, date_of_hiv_test = %s, 
                reason_for_hiv_testing =%s, hts_test_result = %s, hts_approach = %s,
                mobile_client = %s, hts_number = %s, hts_type = %s, age_at_visit = %s,
                event_date = %s, consent_to_index_testing = %s, client_profile = %s, index_case_hts_number = %s,
                contact_of_index_case = %s, dedupe_id = %s, rtri_result_testhts = %s,
                entry_point = %s, first_test_ever_in_life = %s, first_test_for_this_pregnancy = %s, hts_sub_model= %s,
                reasonfor_not_performingrecency_test = %s, timing_of_hiv_diagnosis = %s, "hts_test_A1" = %s, "hts_test_A2" = %s, 
                "hts_test_A3" = %s, opted_out_of_hiv_testing =%s, pre_test_information_given = %s, rtri_test_done = %s,
                reason_for_not_issuing_result = %s, received_hiv_test_results = %s, received_post_test_counselling = %s,
                "retesting_before_art_Initiation" = %s, self_identified_gender = %s, verification_test_done = %s, hts_hiv_positive = %s,
                number_of_hiv_tests_done = %s, facility_id_code = %s, dm_date_created = NOW()
                WHERE encounter_id = %s
                """
                self.pg_conn.execute_query(update_query, (
                    row['htc_model'], row['person_id'], row['birthdate'], row['sex'], row['date'], row['test_purpose'], row['result'], row['approach'],
                    '', row['hts_number'], row['test_type'], row['age_at_visit'], row['event_date'], row['consent_to_index_testing'], row['client_profile'],
                    row['index_case_hts_number'], '', '', row['rtri_test_results'], row['refered_service'], row['first_test_ever'], row['pregnancytest'], row['hts_model_sub_type'], row['reason_for_not_performing_test'], 
                    row['care_giver_result_date'], row['hts_test_a1'],row['hts_test_a2'], row['hts_test_a3'], row['opt'], row['pre_test_counselling'], row['rtri_test_done'], 
                    row['reason_for_not_issuing_result'],
                    row['results_issued'], row['post_test_counselling'], row['retest_before_art_initiation'], row['self_identified_gender'], row['verification_test_done'], row['hts_hiv_positive'], row['test_count'], row['tenant_id'], 
                    encounter_id
                ))
            else:
                # Insert new record
                print(f"Inserting new record for encounter_id: {encounter_id}")
                insert_query = """
                INSERT INTO marts.dm_hts_test (hts_model, person_id, birthdate, sex, date_of_hiv_test,
                reason_for_hiv_testing, hts_test_result, hts_approach, mobile_client, hts_number, hts_type, age_at_visit,
                event_date, consent_to_index_testing, client_profile, index_case_hts_number,
                contact_of_index_case, dedupe_id, rtri_result_testhts,
                entry_point, first_test_ever_in_life, first_test_for_this_pregnancy, hts_sub_model,
                reasonfor_not_performingrecency_test, timing_of_hiv_diagnosis, "hts_test_A1", "hts_test_A2",
                "hts_test_A3", opted_out_of_hiv_testing, pre_test_information_given, rtri_test_done,
                reason_for_not_issuing_result, received_hiv_test_results, received_post_test_counselling,
                "retesting_before_art_Initiation", self_identified_gender, verification_test_done, hts_hiv_positive,
                number_of_hiv_tests_done, facility_id_code, dm_date_created, encounter_id 
                ) VALUES (%s, %s, %s, %s, 
                            %s, %s, %s,%s,
                            %s, %s, %s, %s,
                            %s, %s, %s, %s,
                            %s, %s, %s, %s,
                            %s, %s, %s, %s,
                            %s, %s, %s, %s,
                            %s, %s, %s, %s,
                            %s, %s, %s, %s,
                            %s, %s, %s, %s,
                            NOW(), %s) 
                """
                self.pg_conn.execute_query(insert_query, (
                    row['htc_model'], row['person_id'], row['birthdate'], row['sex'], row['date'], row['test_purpose'], row['result'], row['approach'],
                    '', row['hts_number'], row['test_type'], row['age_at_visit'], row['event_date'], row['consent_to_index_testing'], row['client_profile'],
                    row['index_case_hts_number'], '', '', row['rtri_test_results'], row['refered_service'], row['first_test_ever'], row['pregnancytest'], row['hts_model_sub_type'], row['reason_for_not_performing_test'], 
                    row['care_giver_result_date'], row['hts_test_a1'],row['hts_test_a2'], row['hts_test_a3'], row['opt'], row['pre_test_counselling'], row['rtri_test_done'], 
                    row['reason_for_not_issuing_result'],
                    row['results_issued'], row['post_test_counselling'], row['retest_before_art_initiation'], row['self_identified_gender'], row['verification_test_done'], row['hts_hiv_positive'],row['test_count'], row['tenant_id'], 
                    encounter_id
                ))

    def start_polling(self):
        """Starts polling to fetch data every 5 minutes."""
        while True:
            print("Starting data fetch cycle.")
            self.fetch_data_in_batches()
            print(f"Waiting for {self.polling_interval} seconds before next fetch.")
            time.sleep(self.polling_interval)


def main():
    # Initialize Mobile connection
    mob_conn = MobileConnection(
        dbname="master",
        user="postgres",
        password="wGMCAE6zFHcyrBmXtus97JPanxvkY4fb",
        host="127.0.0.1",
        port=5434
    )

    # Initialize PostgreSQL connection
    pg_conn = PostgresConnection(
        dbname="HTSDATA",
        user="postgres",
        password="wGMCAE6zFHcyrBmXtus97JPanxvkY4fb",
        host="127.0.0.1",
        port=5431
    )

    if mob_conn.connect() and pg_conn.connect():
        data_fetcher = DataFetcher(mob_conn, pg_conn)
        data_fetcher.start_polling()

    mob_conn.close()
    pg_conn.close()


if __name__ == "__main__":
    main()


Starting data fetch cycle.
>>> Detected Snapshot done: 2024-11-05 17:32:48
Fetching batch data where event_date > 2024-11-05 17:32:48
No more data to fetch. Ending batch fetching.
Waiting for 600 seconds before next fetch.
Starting data fetch cycle.
>>> Detected Snapshot done: 2024-11-05 17:32:48
Fetching batch data where event_date > 2024-11-05 17:32:48
No more data to fetch. Ending batch fetching.
Waiting for 600 seconds before next fetch.
Starting data fetch cycle.
>>> Detected Snapshot done: 2024-11-05 17:32:48
Fetching batch data where event_date > 2024-11-05 17:32:48
No more data to fetch. Ending batch fetching.
Waiting for 600 seconds before next fetch.
Starting data fetch cycle.
>>> Detected Snapshot done: 2024-11-05 17:32:48
Fetching batch data where event_date > 2024-11-05 17:32:48
No more data to fetch. Ending batch fetching.
Waiting for 600 seconds before next fetch.
Starting data fetch cycle.
>>> Detected Snapshot done: 2024-11-05 17:32:48
Fetching batch data where event_d

TypeError: cannot unpack non-iterable NoneType object