### Load Data

In [53]:
import pandas as pd
import re
import emoji

# Read the raw messages export into `df`, the frame the following cells work on
df = pd.read_csv(filepath_or_buffer='../data/raw/messages.csv')


In [54]:
# Preview the first rows to check which columns were loaded
df.head()

Unnamed: 0,message_id,sender_id,message_text,channel,date
0,67881c6f-1ed4-4c2f-aed7-1e37d4d13bfe,-1001102021238,https://youtu.be/5DBoEm-8kmA?si=LDLuEecNfULJVD...,DoctorsET,2023-12-18 17:04:02+00:00
1,5df99ca4-a74a-43b0-a2db-8b8a9989d594,-1001102021238,ዶክተርስ ኢትዮጵያ በ አዲስ አቀራረብ በ ቴሌቪዥን ፕሮግራሙን ለመጀመር ከ...,DoctorsET,2023-11-03 16:14:39+00:00
2,ee3ca8d8-7494-4676-a207-f3aab446fa81,-1001102021238,ሞት በስኳር \n\nለልጆቻችን የምናሲዘው ምሳቃ ሳናቀው እድሚያቸውን ይቀን...,DoctorsET,2023-10-02 16:37:39+00:00
3,93443f80-59fb-416e-8da6-06c9b9a7d78a,-1001102021238,ከ HIV የተፈወሰ ሰው አጋጥሟችሁ ያቃል ? ፈውስ እና ህክምና ?\n\nሙ...,DoctorsET,2023-09-16 07:54:32+00:00
4,d812334d-6306-459e-b451-d90e065dc33d,-1001102021238,በቅርብ ጊዜ በሃገራችን ላይ እየተስተዋለ ያለ የተመሳሳይ ፆታ ( Homos...,DoctorsET,2023-09-01 16:16:15+00:00


In [55]:
# Function to remove punctuation
def remove_punctuation(text):
    """Return ``text`` without characters that are neither word characters nor whitespace.

    Word characters follow the regex ``\\w`` class, so letters, digits and the
    underscore are kept; whitespace (spaces, tabs, newlines) is kept as well.
    """
    non_word_non_space = re.compile(r'[^\w\s]')
    return non_word_non_space.sub('', text)

# Function to remove emojis
def remove_emojis(text):
    """Return ``text`` with every emoji replaced by an empty string.

    The work is delegated to ``emoji.replace_emoji`` from the ``emoji`` package.
    """
    without_emojis = emoji.replace_emoji(text, replace='')
    return without_emojis

# Function to clean message text
def clean_message_text(text):
    """Clean a single ``message_text`` value.

    Strings are returned with their emojis stripped (via ``remove_emojis``).
    Any non-string value (e.g. NaN or another float) yields an empty string.
    Punctuation is left untouched: ``remove_punctuation`` is not applied here.
    """
    if isinstance(text, str):
        return remove_emojis(text)
    # Not a string, so there is no text to clean
    return ''

### Removing Duplicates and Missing Values

In [56]:
# Keep the first row for each message_id and discard later repeats
df = df.drop_duplicates(subset='message_id')


In [57]:

# Keep only the rows that have a 'message_text' value
df = df.dropna(subset=['message_text'])



### Standardizing Formats

In [58]:
# Parse 'date' into timezone-aware UTC timestamps.
# - errors='coerce' turns unparseable values into NaT instead of raising.
# - utc=True guarantees a tz-aware (UTC) column even if some values carry no
#   offset, so the column can always be compared with a tz-aware "now" timestamp.
df['date'] = pd.to_datetime(df['date'], errors='coerce', utc=True)


In [59]:
# Show column dtypes and non-null counts for the frame at this stage
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2377 entries, 0 to 2525
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   message_id    2377 non-null   object             
 1   sender_id     2377 non-null   int64              
 2   message_text  2377 non-null   object             
 3   channel       2377 non-null   object             
 4   date          2377 non-null   datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), int64(1), object(3)
memory usage: 111.4+ KB


### Data Validation

In [60]:
# Every message_id must appear exactly once
assert df['message_id'].is_unique, "Duplicate message IDs found!"

# Reference "now" as a tz-aware UTC timestamp so it is comparable with the
# tz-aware 'date' column
now_utc = pd.Timestamp.now(tz='UTC')

# Keep rows dated at or before now_utc. NaT never compares as <=, so rows whose
# date is NaT are removed by this filter as well.
df = df.loc[df['date'] <= now_utc]



In [61]:
# Run clean_message_text over every value of the message_text column
df['message_text'] = df['message_text'].map(clean_message_text)

In [62]:
# Persist the cleaned frame; the DataFrame index is not written to the file
df.to_csv(path_or_buf='../data/cleaned/cleaned_data.csv', index=False)


# Task 3: Object Detection with YOLOv5

In [1]:
import os
import logging
import torch
import cv2
from datetime import datetime
import pandas as pd
from sqlalchemy import create_engine

In [3]:
# Setup logging
# Create the log directory first: logging.basicConfig(filename=...) opens the
# file immediately and raises FileNotFoundError if the directory is missing.
os.makedirs('../logs', exist_ok=True)
logging.basicConfig(filename='../logs/yolo_detection.log', level=logging.INFO, format='%(asctime)s - %(message)s')

In [4]:
# Fetch the pre-trained 'yolov5s' model from the ultralytics/yolov5 hub repository.
# The repository and weights are downloaded on first use (see the output below).
model = torch.hub.load(repo_or_dir='ultralytics/yolov5', model='yolov5s')

Downloading: "https://github.com/ultralytics/yolov5/zipball/master" to C:\Users\zelalem.wubet/.cache\torch\hub\master.zip


Creating new Ultralytics Settings v0.0.6 file  
View Ultralytics Settings with 'yolo settings' or at 'C:\Users\zelalem.wubet\AppData\Roaming\Ultralytics\settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


YOLOv5  2024-10-12 Python-3.12.3 torch-2.4.1+cpu CPU

Downloading https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5s.pt to yolov5s.pt...
100%|██████████| 14.1M/14.1M [00:03<00:00, 4.05MB/s]

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 


In [5]:
# PostgreSQL configuration
# Each setting can be overridden with an environment variable (DB_USER,
# DB_PASSWORD, DB_HOST, DB_PORT, DB_NAME) so real credentials do not have to be
# written into the notebook. The fallbacks are the original local-development
# values, so behaviour is unchanged when no variable is set.
db_config = {
    'user': os.environ.get('DB_USER', 'postgres'),
    'password': os.environ.get('DB_PASSWORD', 'password'),
    'host': os.environ.get('DB_HOST', 'localhost'),
    'port': int(os.environ.get('DB_PORT', '5432')),  # kept as an int, as before
    'database': os.environ.get('DB_NAME', 'medical_data_warehouse'),
}


In [6]:
# Function to store detection data to the database
def store_detections_to_db(data):
    """Append detection records to the ``object_detections`` PostgreSQL table.

    The connection settings are read from the module-level ``db_config`` dict.
    This is a best-effort operation: any failure is logged (with traceback)
    and swallowed, so the function always returns ``None``.

    Args:
        data: Records in any form accepted by ``pd.DataFrame`` (for example a
            list of dicts). Empty input is skipped without opening a
            database connection.
    """
    # Standard-library helper, imported locally so this cell stays self-contained
    from urllib.parse import quote_plus

    engine = None
    try:
        detections_df = pd.DataFrame(data)
        if detections_df.empty:
            logging.info("No detection records to store.")
            return

        # URL-escape the credentials so characters such as '@', ':' or '/' in
        # the user name or password cannot corrupt the connection URL.
        user = quote_plus(str(db_config["user"]))
        password = quote_plus(str(db_config["password"]))
        engine = create_engine(
            f'postgresql+psycopg2://{user}:{password}'
            f'@{db_config["host"]}:{db_config["port"]}/{db_config["database"]}'
        )
        detections_df.to_sql('object_detections', engine, if_exists='append', index=False)
        logging.info(f"Stored {len(detections_df)} detection records to the database.")
    except Exception as e:
        logging.error(f"Error storing detection data to the database: {e}", exc_info=True)
    finally:
        # A new engine is created on every call; release its connections
        if engine is not None:
            engine.dispose()


In [8]:
# Function to process images and detect objects
def detect_objects(image_folder):
    """Run the YOLOv5 model on every image in ``image_folder`` and store the results.

    Args:
        image_folder: Directory to scan (non-recursively). Files ending in
            ``.jpg``, ``.jpeg`` or ``.png`` (case-insensitive) are processed.

    Returns:
        list[dict]: One record per detected object with the keys
        ``image_name``, ``class_name``, ``confidence``, ``xmin``, ``ymin``,
        ``xmax``, ``ymax`` and ``detection_time``. Empty when nothing was
        detected. Non-empty results are also passed to
        ``store_detections_to_db``.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``image_folder`` cannot
            be listed (for example, it does not exist).
    """
    image_extensions = ('.jpg', '.jpeg', '.png')
    detections = []

    # Sorted so the processing order (and therefore the log) is deterministic
    for image_file in sorted(os.listdir(image_folder)):
        if not image_file.lower().endswith(image_extensions):
            continue

        image_path = os.path.join(image_folder, image_file)
        logging.info(f"Processing image: {image_path}")

        # cv2.imread returns None (it does not raise) when a file cannot be read
        img = cv2.imread(image_path)
        if img is None:
            logging.warning(f"Could not read image, skipping: {image_path}")
            continue

        # OpenCV loads pixels as BGR, while the YOLOv5 hub model expects RGB
        # for array inputs, so convert before running detection.
        results = model(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

        # results.xyxy[0] holds one row per detection:
        # xmin, ymin, xmax, ymax, confidence, class id
        for detection in results.xyxy[0]:
            xmin, ymin, xmax, ymax, confidence, class_id = detection[:6]
            class_name = model.names[int(class_id)]

            logging.info(f"Detected {class_name} with confidence {confidence:.2f} in {image_file}")

            detections.append({
                'image_name': image_file,
                'class_name': class_name,
                'confidence': confidence.item(),
                'xmin': xmin.item(),
                'ymin': ymin.item(),
                'xmax': xmax.item(),
                'ymax': ymax.item(),
                'detection_time': datetime.now()
            })

    # Store detection results to the database
    if detections:
        store_detections_to_db(detections)

    return detections

In [9]:






# Main function to trigger the object detection
def main(image_folder='../data/raw/images/'):
    """Run object detection over ``image_folder`` and log completion.

    Args:
        image_folder: Folder containing the images to process. Defaults to
            ``'../data/raw/images/'``, relative to the working directory.
    """
    detect_objects(image_folder)
    logging.info("Object detection completed.")

if __name__ == '__main__':
    try:
        main()
    except Exception as e:
        # Failures are only recorded in the log; exc_info=True keeps the
        # traceback so the cause can be diagnosed from the log file.
        logging.error(f"An error occurred during object detection: {e}", exc_info=True)