In [2]:
import subprocess
import sys

print("--- Installing ultralytics package ---")
try:
    # Corrected command for PyTorch installation
    install_command_torch = [sys.executable, "-m", "pip", "install", "torch", "torchvision", "torchaudio", "--index-url", "https://download.pytorch.org/whl/cpu"] # For CPU
    print("Installing PyTorch (CPU version)... This might take a while.")
    torch_result = subprocess.run(install_command_torch, capture_output=True, text=True, check=True)
    print(torch_result.stdout)
    if torch_result.stderr:
        print("Torch installation warnings/errors:\n", torch_result.stderr)
    print("PyTorch installed.")

    install_command_ultralytics = [sys.executable, "-m", "pip", "install", "ultralytics"]
    print("Installing ultralytics...")
    ultralytics_result = subprocess.run(install_command_ultralytics, capture_output=True, text=True, check=True)
    print(ultralytics_result.stdout)
    if ultralytics_result.stderr:
        print("Ultralytics installation warnings/errors:\n", ultralytics_result.stderr)
    print("Ultralytics installed successfully.")

except subprocess.CalledProcessError as e:
    print(f"Failed to install package: {e.returncode}")
    print(f"Stdout:\n{e.stdout}")
    print(f"Stderr:\n{e.stderr}")
    print("\n!!! Installation Failed. Please check the error messages. !!!")
    print("If PyTorch fails, consider installing manually for your specific CPU/CUDA setup from https://pytorch.org/get-started/locally/")
except Exception as e:
    print(f"An unexpected error occurred during installation: {e}")

print("\n--- Ultralytics Installation Finished ---")

--- Installing ultralytics package ---
Installing PyTorch (CPU version)... This might take a while.
Looking in indexes: https://download.pytorch.org/whl/cpu
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cpu/torchaudio-2.7.1%2Bcpu-cp313-cp313-win_amd64.whl.metadata (6.8 kB)
Downloading https://download.pytorch.org/whl/cpu/torchaudio-2.7.1%2Bcpu-cp313-cp313-win_amd64.whl (2.5 MB)
   ---------------------------------------- 0.0/2.5 MB ? eta -:--:--
   ---- ----------------------------------- 0.3/2.5 MB ? eta -:--:--
   ---------------- ----------------------- 1.0/2.5 MB 4.1 MB/s eta 0:00:01
   ---------------- ----------------------- 1.0/2.5 MB 4.1 MB/s eta 0:00:01
   --------------------- ------------------ 1.3/2.5 MB 1.6 MB/s eta 0:00:01
   --------------------- ------------------ 1.3/2.5 MB 1.6 MB/s eta 0:00:01
   ------------------------- -------------- 1.6/2.5 MB 1.4 MB/s eta 0:00:01
   ----------------------------- ---------- 1.8/2.5 MB 1.3 MB/s eta 0:00:01
  

In [11]:
import subprocess
import sys
import os

print("--- Installing ultralytics package ---")
try:
    # Corrected command for PyTorch installation (CPU version)
    install_command_torch = [sys.executable, "-m", "pip", "install", "torch", "torchvision", "torchaudio", "--index-url", "https://download.pytorch.org/whl/cpu"]
    print("Installing PyTorch (CPU version)... This might take a while.")
    torch_result = subprocess.run(install_command_torch, capture_output=True, text=True, check=True)
    print(torch_result.stdout)
    if torch_result.stderr:
        print("Torch installation warnings/errors:\n", torch_result.stderr)
    print("PyTorch installed.")

    install_command_ultralytics = [sys.executable, "-m", "pip", "install", "ultralytics"]
    print("Installing ultralytics...")
    ultralytics_result = subprocess.run(install_command_ultralytics, capture_output=True, text=True, check=True)
    print(ultralytics_result.stdout)
    if ultralytics_result.stderr:
        print("Ultralytics installation warnings/errors:\n", ultralytics_result.stderr)
    print("Ultralytics installed successfully.")

except subprocess.CalledProcessError as e:
    print(f"Failed to install package: {e.returncode}")
    print(f"Stdout:\n{e.stdout}")
    print(f"Stderr:\n{e.stderr}")
    print("\n!!! Installation Failed. Please check the error messages. !!!")
    print("If PyTorch fails, consider installing manually for your specific CPU/CUDA setup from https://pytorch.org/get-started/locally/")
except Exception as e:
    print(f"An unexpected error occurred during installation: {e}")

print("\n--- Ultralytics Installation Finished ---")


# Ensure project_root is defined
current_notebook_dir = os.getcwd()
if os.path.basename(current_notebook_dir) == 'src':
    project_root = os.path.abspath(os.path.join(current_notebook_dir, '..'))
else:
    temp_path = current_notebook_dir
    while os.path.basename(temp_path) != '' and not os.path.exists(os.path.join(temp_path, 'dbt_project')):
        temp_path = os.path.dirname(temp_path)
    if os.path.exists(os.path.join(temp_path, 'dbt_project')):
        project_root = temp_path
    else:
        raise FileNotFoundError("Could not determine project root. Please ensure 'dbt_project' is in a parent directory.")

yolo_analyzer_script_path = os.path.join(project_root, "src", "yolo_image_analyzer.py")

# The content of the yolo_image_analyzer.py script
# The regex string for filename_pattern uses 'r' prefix internally, which is correct for the generated file.
# The SyntaxWarning you see in the Jupyter output is due to the Jupyter interpreter's parsing of this
# multi-line string literal itself, and it's harmless for the functionality of the saved script.
yolo_analyzer_script_content = """
import os
import sys
import psycopg2
from dotenv import load_dotenv
from ultralytics import YOLO 
import pandas as pd
from datetime import datetime
import re 
import hashlib 

# Add project root to sys.path to allow importing utils.config
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
sys.path.insert(0, project_root)
from utils import config as config_module

# Load environment variables
load_dotenv(os.path.join(project_root, '.env'), override=True)

POSTGRES_DB = os.getenv("POSTGRES_DB")
POSTGRES_USER = os.getenv("POSTGRES_USER")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD")
POSTGRES_HOST = os.getenv("POSTGRES_HOST")
POSTGRES_PORT = os.getenv("POSTGRES_PORT")

BASE_IMAGE_DIR = os.path.join(project_root, 'data', 'raw', 'telegram_images')


# Initialize YOLOv8 model (using 'yolov8n.pt' for nano version, smaller and faster)
model = YOLO('yolov8n.pt') 

def get_db_connection():
    return psycopg2.connect(
        dbname=POSTGRES_DB,
        user=POSTGRES_USER,
        password=POSTGRES_PASSWORD,
        host=POSTGRES_HOST,
        port=POSTGRES_PORT,
        sslmode="require" 
    )

def get_existing_message_ids(conn):
    cursor = conn.cursor()
    cursor.execute("SELECT message_id FROM public.stg_telegram_messages;") 
    message_ids = {row[0] for row in cursor.fetchall()}
    cursor.close()
    return message_ids

def scan_local_images_and_link_to_messages(base_dir, existing_message_ids):
    image_files_to_process = []
    
    # List of specific subdirectories to scan
    target_subdirs = [
        os.path.join(base_dir, '2025-07-13', 'lobelia4cosmetics'),
        os.path.join(base_dir, '2025-07-13', 'tikvahpharma'),
        os.path.join(base_dir, '2025-07-14', 'lobelia4cosmetics'),
        os.path.join(base_dir, '2025-07-14', 'tikvahpharma'),
        os.path.join(base_dir, '2025-07-14', 'CheMed123'),
    ]

    # Regex to match 'message_[message_id]_photo.ext' and capture message_id
    # Using 'r' prefix for raw string, which is correct for the Python script.
    # The SyntaxWarning in Jupyter output is a known quirk when embedding this.
    filename_pattern = re.compile(r'^message_(\d+)_photo\.(jpg|jpeg|png|gif)$', re.IGNORECASE)

    for subdir in target_subdirs:
        if not os.path.isdir(subdir):
            print(f"Warning: Directory not found: {subdir}. Skipping.")
            continue
        
        for filename in os.listdir(subdir):
            match = filename_pattern.match(filename)
            if match:
                message_id_str = match.group(1)
                try:
                    message_id = int(message_id_str)
                    filepath = os.path.join(subdir, filename)
                    if message_id in existing_message_ids:
                        image_files_to_process.append({'message_id': message_id, 'filepath': filepath})
                    else:
                        print(f"Skipping image {filename}: message_id {message_id} not found in stg_telegram_messages.")
                except ValueError:
                    print(f"Skipping image {filename}: Could not parse message_id from '{message_id_str}'.")
            else:
                print(f"Skipping image {filename}: Does not match 'message_[id]_photo.ext' pattern.")
    
    return image_files_to_process

def store_detections_to_db(detections_data, conn):
    if not detections_data:
        print("No detections to store.")
        return

    cursor = conn.cursor()
    # Create table if not exists - dbt will manage this better, but good for initial script
    create_table_sql = f\"\"\"
    CREATE TABLE IF NOT EXISTS public.fct_image_detections (
        message_id BIGINT NOT NULL,
        detected_object_class VARCHAR(255) NOT NULL,
        confidence_score NUMERIC(5,4) NOT NULL,
        detection_timestamp TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
        # Add a hash of the detection to make it uniquely identifiable for ON CONFLICT
        detection_hash VARCHAR(64) NOT NULL,
        PRIMARY KEY (detection_hash)
    );
    \"\"\"
    cursor.execute(create_table_sql)
    conn.commit()
    print("Ensured public.fct_image_detections table exists.")

    insert_sql = \"\"\"
    INSERT INTO public.fct_image_detections (message_id, detected_object_class, confidence_score, detection_timestamp, detection_hash)
    VALUES (%s, %s, %s, %s, %s)
    ON CONFLICT (detection_hash) DO NOTHING;
    \"\"\"
    
    data_to_insert = []
    # Use a set to prevent duplicate insertions within this run based on content hash
    processed_hashes = set()
    
    for det in detections_data:
        # Create a unique hash for each detection entry to handle UPSERT
        # Including all key fields to make the hash truly representative
        detection_string = f"{det['message_id']}-{det['detected_object_class']}-{det['confidence_score']:.4f}"
        detection_hash = hashlib.sha256(detection_string.encode('utf-8')).hexdigest()
        
        if detection_hash not in processed_hashes:
            data_to_insert.append((
                det['message_id'],
                det['detected_object_class'],
                det['confidence_score'],
                det['detection_timestamp'], # Use the timestamp from the detection dict
                detection_hash
            ))
            processed_hashes.add(detection_hash)
    
    if data_to_insert:
        cursor.executemany(insert_sql, data_to_insert)
        conn.commit()
        print(f"Stored {len(data_to_insert)} new unique object detections.")
    else:
        print("No new unique detections to insert (possibly already processed).")
    cursor.close()

def main():
    conn = None
    try:
        conn = get_db_connection()
        existing_message_ids = get_existing_message_ids(conn)
        
        images_to_process = scan_local_images_and_link_to_messages(BASE_IMAGE_DIR, existing_message_ids)
        print(f"Found {len(images_to_process)} local images to process linked to known message_ids.")

        all_detections = []
        for img_info in images_to_process:
            message_id = img_info['message_id']
            local_image_path = img_info['filepath']

            if os.path.exists(local_image_path):
                try:
                    print(f"Processing image {local_image_path} for message {message_id}...")
                    results = model(local_image_path) # Perform object detection
                    
                    detected_objects_for_message = []
                    current_detection_timestamp = datetime.now() # Timestamp for this batch of detections

                    for r in results:
                        for box in r.boxes:
                            class_id = int(box.cls[0])
                            confidence = float(box.conf[0])
                            class_name = model.names[class_id]
                            
                            detected_objects_for_message.append({
                                'message_id': message_id,
                                'detected_object_class': class_name,
                                'confidence_score': confidence,
                                'detection_timestamp': current_detection_timestamp # Use the same timestamp for all detections from this image
                            })
                    
                    if detected_objects_for_message:
                        all_detections.extend(detected_objects_for_message)
                        print(f"Detected {len(detected_objects_for_message)} objects in image for message {message_id}.")
                    else:
                        print(f"No objects detected in image {local_image_path} for message {message_id}.")

                except Exception as e:
                    print(f"Error processing image {local_image_path} for message {message_id} with YOLO: {e}")
            else:
                print(f"Image file not found: {local_image_path}, skipping detection.")

        store_detections_to_db(all_detections, conn)
        print("Image analysis and storage complete.")

    except Exception as e:
        print(f"An error occurred in the main process: {e}")
    finally:
        if conn:
            conn.close()
            print("Database connection closed.")

if __name__ == "__main__":
    main()
"""

with open(yolo_analyzer_script_path, "w") as f:
    f.write(yolo_analyzer_script_content)
print(f"Created/Updated YOLO Image Analyzer script: {yolo_analyzer_script_path}")

  filename_pattern = re.compile(r'^message_(\d+)_photo\.(jpg|jpeg|png|gif)$', re.IGNORECASE)


--- Installing ultralytics package ---
Installing PyTorch (CPU version)... This might take a while.
Looking in indexes: https://download.pytorch.org/whl/cpu

PyTorch installed.
Installing ultralytics...

Ultralytics installed successfully.

--- Ultralytics Installation Finished ---
Created/Updated YOLO Image Analyzer script: D:\10academy\week7\kara_solutions_ethiopian_medical_insights\src\yolo_image_analyzer.py


In [1]:
# The same Python cell you've been running, just re-execute it.
import os
import subprocess
import sys
from dotenv import load_dotenv

# --- Refined project_root and dbt_project_dir definition ---
current_notebook_dir = os.getcwd()

project_root = os.path.abspath(os.path.join(current_notebook_dir, '..'))

dbt_project_dir = os.path.join(project_root, "medical_insights_dwh", "dbt_project")

if not os.path.isdir(dbt_project_dir):
    raise FileNotFoundError(
        f"Error: The 'dbt_project' directory was not found at the expected path: {dbt_project_dir}\n"
        "Please ensure your project structure is correct and the 'dbt_project' folder exists directly under 'medical_insights_dwh'."
    )

load_dotenv(os.path.join(project_root, '.env'), override=True)
env_with_vars = os.environ.copy()


# --- DEBUGGING PRINTS ---
print(f"Current Working Directory (Jupyter): {current_notebook_dir}")
print(f"Determined Project Root: {project_root}")
print(f"Determined dbt Project Directory: {dbt_project_dir}")
print(f"Does dbt Project Directory Exist? {os.path.isdir(dbt_project_dir)}")
print(f"POSTGRES_HOST in current Python env: {os.getenv('POSTGRES_HOST')}")
# --- END DEBUGGING PRINTS ---

print("--- Re-running dbt deps to ensure all packages (including dbt_utils for new tests) are installed ---")
try:
    result = subprocess.run(
        ["dbt", "deps"],
        capture_output=True,
        text=True,
        check=True,
        cwd=dbt_project_dir,
        env=env_with_vars
    )
    print(result.stdout)
    if result.stderr:
        print("Errors/Warnings from dbt deps:\n", result.stderr)
    print("dbt deps completed successfully.")
except subprocess.CalledProcessError as e:
    print(f"dbt deps failed: {e.returncode}")
    print(f"Stdout:\n{e.stdout}")
    print(f"Stderr:\n{e.stderr}")
    print("\n!!! DBT DEPS FAILED. PLEASE REVIEW THE ERROR ABOVE. !!!")
    sys.exit(1)
except FileNotFoundError:
    print("Error: dbt command not found. Make sure dbt is correctly installed and in your PATH.")
    sys.exit(1)


print("\n--- Running dbt run to build (or re-build) all models, including fct_image_detections ---")
try:
    result = subprocess.run(
        ["dbt", "run"],
        capture_output=True,
        text=True,
        check=True,
        cwd=dbt_project_dir,
        env=env_with_vars
    )
    print(result.stdout)
    if result.stderr:
        print("Errors/Warnings from dbt run:\n", result.stderr)
    print("dbt run completed successfully.")
except subprocess.CalledProcessError as e:
    print(f"dbt run failed: {e.returncode}")
    print(f"Stdout:\n{e.stdout}")
    print(f"Stderr:\n{e.stderr}")
    print("\n!!! DBT RUN FAILED. PLEASE REVIEW THE ERROR ABOVE AND YOUR SQL MODELS !!!")
    sys.exit(1)
except FileNotFoundError:
    print("Error: dbt command not found. Make sure dbt is correctly installed and in your PATH.")
    sys.exit(1)


print("\n--- Running dbt test to check data quality for all models ---")
try:
    result = subprocess.run(
        ["dbt", "test"],
        capture_output=True,
        text=True,
        check=False,
        cwd=dbt_project_dir,
        env=env_with_vars
    )
    print(result.stdout)
    if result.stderr:
        print("Errors/Warnings from dbt test:\n", result.stderr)
    print("dbt test completed. Check the output for test results (PASS/FAIL).")
except subprocess.CalledProcessError as e:
    print(f"dbt test command failed: {e.returncode}")
    print(f"Stdout:\n{e.stdout}")
    print(f"Stderr:\n{e.stderr}")
    print("\n!!! DBT TEST COMMAND FAILED. PLEASE REVIEW THE ERROR ABOVE !!!")
except FileNotFoundError:
    print("Error: dbt command not found. Make sure dbt is correctly installed and in your PATH.")


print("\n--- Generating dbt documentation with updated models ---")
try:
    result = subprocess.run(
        ["dbt", "docs", "generate"],
        capture_output=True,
        text=True,
        check=True,
        cwd=dbt_project_dir,
        env=env_with_vars
    )
    print(result.stdout)
    if result.stderr:
        print("Errors/Warnings from dbt docs generate:\n", result.stderr)
    print("dbt documentation generated successfully.")
except subprocess.CalledProcessError as e:
    print(f"dbt docs generate failed: {e.returncode}")
    print(f"Stdout:\n{e.stdout}")
    print(f"Stderr:\n{e.stderr}")
    print("\n!!! DBT DOCS GENERATE FAILED. PLEASE REVIEW THE ERROR ABOVE !!!")
except FileNotFoundError:
    print("Error: dbt command not found. Make sure dbt is correctly installed and in your PATH.")

print("\n--- All dbt commands finished for Task 3 ---")

Current Working Directory (Jupyter): D:\10academy\week7\kara_solutions_ethiopian_medical_insights\src
Determined Project Root: D:\10academy\week7\kara_solutions_ethiopian_medical_insights
Determined dbt Project Directory: D:\10academy\week7\kara_solutions_ethiopian_medical_insights\medical_insights_dwh\dbt_project
Does dbt Project Directory Exist? True
POSTGRES_HOST in current Python env: ep-dry-bonus-a2r95tms-pooler.eu-central-1.aws.neon.tech
--- Re-running dbt deps to ensure all packages (including dbt_utils for new tests) are installed ---
[0m06:05:33  Running with dbt=1.10.4
[0m06:05:35  Installing dbt-labs/dbt_utils
[0m06:05:36  Installed from version 1.1.1
[0m06:05:36  Updated version available: 1.3.0
[0m06:05:36  Installing dbt-labs/dbt_external_tables
[0m06:05:37  Installed from version 0.8.0
[0m06:05:37  Updated version available: 0.11.1
[0m06:05:37  
[0m06:05:37  Updates available for packages: ['dbt-labs/dbt_utils', 'dbt-labs/dbt_external_tables']                 
U