In [2]:
import pandas as pd
import os
import logging
from sqlalchemy import create_engine
from PIL import Image


In [None]:

# Configure logging: records at INFO level and above go to this file.
# Note: basicConfig does nothing if the root logger already has handlers.
logging.basicConfig(filename='data_cleaning_task2.log', level=logging.INFO)

# Directory containing the JSON files
json_dir = r'C:\Users\Administrator\Documents\kifiya\Week_7\Data'

# Database connection URL. Taken from the DATABASE_URL environment variable so that
# real credentials never have to be written into the notebook; the placeholder
# below is used only when that variable is not set.
database_url = os.environ.get(
    'DATABASE_URL',
    'postgresql://user:password@localhost:5432/mydatabase',
)

# List to store cleaned data (process_json_files appends one DataFrame per file)
cleaned_data_frames = []

# Load and clean all JSON files
def process_json_files(json_dir):
    """Load, clean and store every JSON file found directly inside ``json_dir``.

    Each file is read with ``pd.read_json``, passed through ``clean_data``,
    appended to the module-level ``cleaned_data_frames`` list and handed to
    ``store_in_database`` together with the module-level ``database_url``.
    An exception raised while handling one file is logged and the loop moves
    on to the next file.

    Files are visited in sorted name order (``os.listdir`` returns entries in
    arbitrary order), the ``.json`` extension is matched case-insensitively,
    and entries that are not regular files are skipped.

    Note: frames are appended to ``cleaned_data_frames`` on every call, so
    calling this twice for the same folder adds each frame twice.

    Args:
        json_dir: Path of the directory to scan (sub-directories are not searched).

    Returns:
        None.

    Raises:
        OSError: If ``json_dir`` cannot be listed.
    """
    for file_name in sorted(os.listdir(json_dir)):
        file_path = os.path.join(json_dir, file_name)
        if not file_name.lower().endswith('.json') or not os.path.isfile(file_path):
            continue
        logging.info(f"Processing file: {file_name}")
        try:
            df = pd.read_json(file_path, encoding='utf-8')
            logging.info(f"Loaded data from {file_name}")
            df = clean_data(df)
            cleaned_data_frames.append(df)
            store_in_database(df, file_name, database_url)  # Store the cleaned data in a database
        except Exception as e:
            # Best-effort batch: record the failure and continue with the next file.
            logging.error(f"Error processing file {file_name}: {e}")

# Cleaning functions
def clean_data(df):
    """Run the cleaning pipeline on ``df`` and return the resulting frame.

    The steps run in this fixed order, each receiving the frame returned by
    the previous one: ``remove_duplicates``, ``handle_missing_values``,
    ``standardize_formats``, ``validate_data``.
    """
    pipeline = (
        remove_duplicates,
        handle_missing_values,
        standardize_formats,
        validate_data,
    )
    cleaned = df
    for step in pipeline:
        cleaned = step(cleaned)
    return cleaned

def remove_duplicates(df):
    """Drop fully duplicated rows from ``df`` and log how many were removed.

    Returns the frame produced by ``DataFrame.drop_duplicates()``; the first
    occurrence of each duplicated row is the one that is kept.
    """
    rows_before = len(df)
    deduplicated = df.drop_duplicates()
    rows_dropped = rows_before - len(deduplicated)
    logging.info(f"Removed {rows_dropped} duplicates")
    return deduplicated

def handle_missing_values(df):
    """Fill missing ``content`` values and drop rows that have no ``date``.

    Missing entries in the ``content`` column are replaced with the string
    ``'No content'``; rows whose ``date`` is missing are removed. The frame
    passed in is not modified: a new frame is built with ``assign`` instead of
    writing the filled column back into the caller's object.

    Args:
        df: DataFrame containing at least ``content`` and ``date`` columns.

    Returns:
        A new DataFrame with the fill and drop applied.

    Raises:
        KeyError: If the ``content`` or ``date`` column is absent.
    """
    filled = df.assign(content=df['content'].fillna('No content'))
    result = filled.dropna(subset=['date'])
    logging.info("Handled missing values")
    return result

def standardize_formats(df):
    """Convert the ``date`` column to pandas datetimes.

    Values that cannot be parsed become ``NaT`` (``errors='coerce'``) rather
    than raising. The frame passed in is not modified: the converted column is
    attached to a new frame with ``assign``.

    Args:
        df: DataFrame containing a ``date`` column.

    Returns:
        A new DataFrame whose ``date`` column holds the parsed datetimes.

    Raises:
        KeyError: If the ``date`` column is absent.
    """
    parsed_dates = pd.to_datetime(df['date'], errors='coerce')
    result = df.assign(date=parsed_dates)
    logging.info("Standardized date formats")
    return result

def validate_data(df):
    """Log a warning when the ``date`` column contains null values.

    The frame itself is returned as-is; this step only reports.
    """
    null_date_count = df['date'].isna().sum()
    if null_date_count <= 0:
        return df
    logging.warning(f"Found {null_date_count} invalid dates")
    return df

# Function to store DataFrame in a database
def store_in_database(df, json_file, database_url):
    """Write ``df`` to a database table named after ``json_file``.

    The table name is ``json_file`` with its extension removed. An existing
    table with that name is replaced (``if_exists='replace'``) and the
    DataFrame index is not written. Any exception is logged and swallowed, so
    this function never raises to its caller.

    The engine created for the write is disposed before returning so that its
    connection pool is not left open after every file.

    Args:
        df: DataFrame to store.
        json_file: File name the data came from; used to derive the table name.
        database_url: SQLAlchemy connection URL.

    Returns:
        None.
    """
    engine = None
    try:
        engine = create_engine(database_url)
        table_name = os.path.splitext(json_file)[0]  # Use the file name as the table name
        df.to_sql(table_name, engine, if_exists='replace', index=False)
        logging.info(f"Data stored in the '{table_name}' table.")
    except Exception as e:
        logging.error(f"Error storing data from {json_file} in the database: {e}")
    finally:
        # create_engine itself may have failed, in which case there is nothing to release.
        if engine is not None:
            engine.dispose()

# Example usage: process and clean all JSON files in json_dir.
# Each cleaned frame is appended to cleaned_data_frames and passed to store_in_database,
# so running this cell again appends the same frames a second time.
process_json_files(json_dir)


In [4]:
# Function to resize images for object detection (YOLO)
def resize_image(image_path, output_size=(416, 416)):
    """Resize the image at ``image_path`` in place to ``output_size``.

    The file is overwritten with the resized image. If the image already has
    exactly ``output_size`` it is left untouched, so running the notebook cell
    again does not re-encode (and, for lossy formats, further degrade) files
    that were already processed.

    WARNING: this is destructive. If opening, decoding, resizing or saving
    fails for any reason, the error is logged and the file at ``image_path``
    is deleted. A failure to delete is logged as well; nothing is raised.

    Args:
        image_path: Path of the image file to resize.
        output_size: Target ``(width, height)`` in pixels.

    Returns:
        None.
    """
    try:
        with Image.open(image_path) as img:
            # Decode up front so an undecodable file is still detected (and removed
            # below) even when no resize turns out to be necessary.
            img.load()
            if img.size == tuple(output_size):
                logging.info(f"Image already at {output_size}, skipping: {image_path}")
                return
            resized = img.resize(output_size)
            resized.save(image_path)  # Overwrite with resized image
            logging.info(f"Resized image: {image_path}")
    except Exception as e:
        logging.error(f"Error resizing image {image_path}: {e}")
        try:
            os.remove(image_path)  # Delete the image if resizing fails
            logging.info(f"Deleted non-resizable image: {image_path}")
        except Exception as delete_error:
            logging.error(f"Error deleting image {image_path}: {delete_error}")

# Apply the resize function to all images in the folder
def transform_images(
    image_folder,
    output_size=(416, 416),
    extensions=('.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tif', '.tiff', '.webp'),
):
    """Resize every image file directly inside ``image_folder``.

    Only regular files whose name ends with one of ``extensions``
    (case-insensitive) are passed to ``resize_image``. This matters because
    ``resize_image`` deletes any file it cannot process: without the filter,
    sub-directories and unrelated files (labels, notes, thumbnails caches)
    sitting in the folder would be handed to it as well.

    Files are visited in sorted name order, since ``os.listdir`` returns
    entries in arbitrary order.

    Args:
        image_folder: Directory to scan (sub-directories are not searched).
        output_size: Target ``(width, height)`` forwarded to ``resize_image``.
        extensions: Iterable of accepted file-name suffixes, or ``None`` to
            process every regular file regardless of its name.

    Returns:
        None.

    Raises:
        OSError: If ``image_folder`` cannot be listed.
    """
    suffixes = None if extensions is None else tuple(ext.lower() for ext in extensions)
    for image_file in sorted(os.listdir(image_folder)):
        image_path = os.path.join(image_folder, image_file)
        if not os.path.isfile(image_path):
            continue
        if suffixes is not None and not image_file.lower().endswith(suffixes):
            continue
        resize_image(image_path, output_size)

# Transform all images in the 'images' directory.
# Files are overwritten in place, and resize_image attempts to delete any file it fails to resize.
transform_images('C:/Users/Administrator/Documents/kifiya/Week_7/Data/images/')

ERROR:root:Error resizing image C:/Users/Administrator/Documents/kifiya/Week_7/Data/images/62.jpg: cannot identify image file 'C:\\Users\\Administrator\\Documents\\kifiya\\Week_7\\Data\\images\\62.jpg'
ERROR:root:Error resizing image C:/Users/Administrator/Documents/kifiya/Week_7/Data/images/73.jpg: cannot identify image file 'C:\\Users\\Administrator\\Documents\\kifiya\\Week_7\\Data\\images\\73.jpg'
ERROR:root:Error resizing image C:/Users/Administrator/Documents/kifiya/Week_7/Data/images/89.jpg: cannot identify image file 'C:\\Users\\Administrator\\Documents\\kifiya\\Week_7\\Data\\images\\89.jpg'
