In [7]:
import os
import re
import pandas as pd
import time
import datetime
from csv import reader
from dateutil import parser
import glob
from pathlib import Path
from dateutil.parser import parse
import warnings
import pandas_dedupe
import sqlite3
import sys 
from datetime import datetime
import json
import subprocess
import mysql.connector
import configparser
import numpy as np
warnings.filterwarnings("ignore")

class Extraction:

    def __init__(self, folder_path ) -> None:
        self.folder_path = folder_path

    def get_faility_db_list(self):
        facility_db_list = [f for f in os.listdir(self.folder_path) if f.endswith('.sql') and not f.startswith("modified")]
        return facility_db_list
    
    
    # Get database connection
    def get_connection(self):
         user, password, host, port  = self.get_db_params()
         connection = mysql.connector.connect(user=user, password=password, host=host, port=port)
         return connection
    

    def get_db_params(self):
        config = configparser.ConfigParser()
        config.read('config.ini') 
        user = config.get('Database', 'user')
        password = config.get('Database', 'password')
        host = config.get('Database', 'host')
        port = config.get('Database', 'port')
        return user, password, host, port  
    
    
    def restore_database(self, backup_file, time_):
        backup_file = backup_file.replace("\\", "/")
        # Helper function for deleting existing schemas
        def drop_database(cursor, schema):
            schemas_to_drop = ['client', 'consultation', 'deduplication', 'facility', 'mrs', 'provider', 'report', 'terminology', 'zimepms']
            if schema in schemas_to_drop:
                cursor.execute(f'DROP DATABASE {schema}')
                print(f"    >>> DROPPED [{schema}]")
        # Establish connection to the server
        connection = self.get_connection()
        # Drop existing EHR schemas before restoring new database
        with connection.cursor() as cursor:
            cursor.execute('SET foreign_key_checks = 0')
            cursor.execute('SELECT SCHEMA_NAME FROM information_schema.schemata;')
            schemas = [row[0] for row in cursor.fetchall()]
            for schema in schemas:
                drop_database(cursor, schema)
        # Get DB credentials 
        user, password , host, port = self.get_db_params()
        restore_command = f"mysql -u {user} -p{password} -h {host} -P {port} < {backup_file}"
        try:
            print(" >>> Restoring database: "+restore_command)
            subprocess.run(restore_command, shell=True, check=True)
            print(' >>> DATABASE RESTORE [SUCCESSFUL>>>]')
        except Exception as e:
            log_file = os.path.join(os.getcwd(), 'logs.txt')
            with open(log_file, 'a') as f:
                f.write(f'{time_} {backup_file} {str(e)}\n')
    

    # remove database mrs from sql file 
    def trim_database(self, database_path_and_filename):
        with open(database_path_and_filename, "r", encoding='ISO-8859-1') as f:
            content = f.read()

        start_text = "USE `mrs`;"
        updated_text = content.split(start_text)[0]

        directory, filename = os.path.split(database_path_and_filename)
        new_filename = f"modified_{filename}"
        file_path = os.path.join(directory, new_filename)

        with open(file_path, "w",encoding='utf-8') as f:
            f.write(updated_text)

        return file_path
    

    def get_facility_details(self,mapping_file,latest_site_id):
        facility_name = mapping_file.loc[mapping_file['Facility ID'] == latest_site_id] ["Facility"].values
        if facility_name.size > 0:
            facility_name = facility_name[0]
            district_name = mapping_file.loc[mapping_file['Facility ID'] == latest_site_id] ["District"].values
            if district_name.size > 0:
                district_name = district_name[0]
            else:
                district_name = ""
            province_name = mapping_file.loc[mapping_file['Facility ID'] == latest_site_id] ["Province"].values
            if province_name.size > 0:
                province_name = province_name[0]
            else:
                province_name = ""
        else:
            facility_name = ""
            district_name = ""
            province_name = ""
        return facility_name,district_name,province_name
    

    def get_reason_for_not_eiligible1(self,row):
            if row["Eligible for TPT at enrolment"] == "No":
                if row["sign_and_symptoms_of_active_tb"] == "Yes":
                        return "Sign and symptoms of active tb"
                elif row["patient_currently_on_tb_treatment"] =="Yes":
                        return "Patient currently on tb treatment"
                elif row["completed_ipt_in_the_last_three_years"] == "Yes":
                        return "Completed ipt in the last three years"
                elif row["signs_of_active_liver_disease"] == "Yes":
                        return "Signs of active liver disease"
                elif row["heavy_alcohol_use"] == "Yes":
                        return "Heavy alcohol use"
                elif row["severe_peripheral_neuropathy"] == "Yes":
                        return "Severe peripheral neuropathy"
            else:
                return ""

    def get_reason_for_not_eiligible(self,row):
            if row["Eligible for TPT"] == "No":
                if row["sign_and_symptoms_of_active_tb"] == "Yes":
                        return "Sign and symptoms of active tb"
                elif row["patient_currently_on_tb_treatment"] =="Yes":
                        return "Patient currently on tb treatment"
                elif row["completed_ipt_in_the_last_three_years"] == "Yes":
                        return "Completed ipt in the last three years"
                elif row["signs_of_active_liver_disease"] == "Yes":
                        return "Signs of active liver disease"
                elif row["heavy_alcohol_use"] == "Yes":
                        return "Heavy alcohol use"
                elif row["severe_peripheral_neuropathy"] == "Yes":
                        return "Severe peripheral neuropathy"
            else:
                return ""
            
    # def get_first_tpt_completion_date(self,row):
    #         if row["TPT Start Regimen"].startswith("3"):
    #             return row["TPT start date"] + 50
    #         else:
    #             return ""
   

    def get_mapping_file(self):
        mapping_file = pd.read_csv("mapping_file.csv")
        mapping_file['Facility ID'] = mapping_file['Facility ID'].str.strip()
        return mapping_file
    
    
    def extracting_data(self,sql,connection):
        try:
            connection = self.get_connection()
            df = pd.read_sql(sql,connection)
            connection.close()
            return df
        except Exception as e:  # noqa: E722
            print(e)
            return None
    
    
if __name__ == '__main__':
    processing_time = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    extraction  = Extraction("./")
    mapping_file = extraction.get_mapping_file()
    facilities = extraction.get_faility_db_list()
    folder_path = extraction.folder_path
    connection = extraction.get_connection()
  
    for filename in facilities:
        print()
        print("........................................................................")
        print(">>> working on ", filename)
        print(">>> Trimming database")
        trimmed_database = extraction.trim_database(filename)
        # # Restoring Database
        extraction.restore_database(trimmed_database, processing_time)

        # ----- get facility id,version and last time stamp
        df_facility = extraction.extracting_data("""
                                SELECT facility_id, time FROM consultation.patient
                                where time <= now()
                                order by time desc
                                limit 1               
                            """, connection)
        if df_facility.empty:
             print(">>> database is empty")
             continue
        
        latest_site_id, latest_timestamp = df_facility.values.tolist()[0]
        version= ""
        first_time_stamp = ""

        facility_name , district_name, province_name =  extraction.get_facility_details(mapping_file,latest_site_id)

        print("   >>> Facility Id",latest_site_id)
        print("   >>> Facility Name",facility_name)
  

        # ART ----------------
        art_df = extraction.extracting_data("""
                                        select 
                                            p.person_id,
                                            p.firstname,
                                            p.lastname,
                                            p.birthdate,
                                            p.sex,
                                            pp.client_profile,
                                            a.art_id, 
                                            pt.patient_id,
                                            a.art_number ,
                                            a.date_of_hiv_test as 'Date of HIV diagnosis',
                                            a.date_enrolled as 'Date of enrolment in care',
                                            pt.time as 'Date of ART visit',
                                            v.lactating_status as 'Pregnancy Status',
                                            w.value as 'Weight',
                                            pt.back_captured,
                                            pt.back_captured_by
                                        from consultation.art a
                                            left join  client.person p
                                            on a.person_id = p.person_id
                                            
                                            left join consultation.patient_client_profile pp
                                            on p.person_id = pp.person_id
                                            
                                            left join consultation.art_visit v
                                            on a.art_id = v.art_id
                                            
                                            left join consultation.patient pt
                                            on v.patient_id = pt.patient_id
                                            
                                            left join consultation.weight w
                                            on v.patient_id = w.patient_id
    
                                                            """, connection)
        art_df['Date of ART visit'] = pd.to_datetime(art_df['Date of ART visit'],errors='coerce',format='mixed')
        art_df['Date of ART visit'] = art_df['Date of ART visit'].dt.date
        

        art_start_date = extraction.extracting_data("""
                                                    select 
                                                        art_id, date as 'Art Initiation Date', state,
                                                        art_initiation_category
                                                    from consultation.art_current_status
                                                        order by art_id, date
                                                    """,connection
                                                    )
        art_start_date_unique = art_start_date.drop_duplicates(subset=['art_id'],keep='first').drop('art_initiation_category', axis=1)

        patient_tb_screening = extraction.extracting_data("""
                                                        select 
                                                           p.patient_id,
                                                           p.time as 'Date of TB screening 4WSS',
                                                           tb.presumptive as 'Screening results (4WSS)'
                                                        from consultation.patient_tb_screening tb
                                                           left join consultation.patient p
                                                           on tb.patient_id = p.patient_id
                                                          """, connection)
        patient_tb_screening['Screening results (4WSS)'] = patient_tb_screening['Screening results (4WSS)'].replace(
             {0:'Not Presumptive',1:'Presumptive'})
    
        art_who_stage = extraction.extracting_data("""
                                                    select 
                                                        art_id ,
                                                        date as 'Date of ART visit',
                                                        stage as 'WHO stage at TB screening'
                                                    from 
                                                        consultation.art_who_stage 
                                                   """, connection)
        
        art_tpt = extraction.extracting_data("""
                                            select 
                                             art_id,
                                             date as 'Date of ART visit',
                                             status as 'TPT status at TB screening'
                                             from consultation.art_ipt
                                             """,connection)
        

        tb_lam = extraction.extracting_data("""
                                            select
                                            person_id, 
                                            date,
                                            test,
                                            result,
                                            from consultation.person_investigation 
                                            where test = "tb lam"
                                            """,connection)
        






        merge = pd.merge(art_df,art_start_date_unique, on =['art_id'],how = 'left')
        
        merge1 = pd.merge(merge,patient_tb_screening, on =['patient_id'],how = 'left')

        merge2 = pd.merge(merge1, art_who_stage, on = ['art_id','Date of ART visit'], how = 'left')

        # merge tpt
        merge3 = pd.merge(merge2,art_tpt , on = ['art_id','Date of ART visit'], how = 'left')
        
        merge = pd.merge()


........................................................................
>>> working on  Nyamutumbu09April24.sql
>>> Trimming database


AttributeError: __enter__

[31mERROR: Could not find a version that satisfies the requirement 1.20 (from versions: none)[0m
[31mERROR: No matching distribution found for 1.20[0m
Note: you may need to restart the kernel to use updated packages.
