In [1]:
# importing libraries 

import os
import pandas as pd 
import numpy as np 
import hashlib
import re

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# data folders: every split path is built from one dataset root so the location is written only once

DATASET_DIR = '/Users/vschuckar/Desktop/data_analytics/Week_9/final_project/dataset'
train_path = f'{DATASET_DIR}/train'
valid_path = f'{DATASET_DIR}/valid'
test_path = f'{DATASET_DIR}/test'

### Data "cleaning" and preprocessing 
Data source: https://www.kaggle.com/datasets/pkdarabi/bone-fracture-detection-computer-vision-project/data

Data structure: 
* Three folders: train, valid, and test.
  * Inside each folder, two folders: images and labels.
    * images folder: x-ray images of different fracture sites.
    * labels folder: .txt files with the following structure -> class number and bounding box specifications.
  
Problems with the dataset: 
* Each image file and its .txt label file shared the same name, but the names did not carry any useful information.
* The images were classified with the following numbers: 0, 1, 2, 4, 5, and 6, which can lead to confusion because 3 is missing.
* The fracture names are: Elbow Positive, Fingers Positive, Forearm Fracture, Humerus Fracture, Shoulder Fracture, and Wrist Positive, but they were not 
  clearly connected in the data source with the class numbers. 
* Many of the .txt files were empty. 

Solutions: 
1. Extract the class number from the txt file and put it on the image file name and the txt file name, obviously matching image/txt file names for each file.
2. Change the number of class 6 to class 3. 
3. Inspect the images and confirm which fracture name each class belongs to. 
4. Skip the empty .txt files while renaming, then delete them to avoid confusion.

Solution 3. 
* Class 0 - Elbow Positive 
* Class 1 - Fingers Positive 
* Class 2 - Forearm Fracture 
* Class 3 - Wrist Positive 
* Class 4 - Humerus Fracture
* Class 5 - Shoulder Fracture 



In [None]:
# function to generate a short name for the image and .txt files, with the first character of the name being the class

def generate_short_name(file_name):
    '''
    Build a short, reproducible identifier from a file name.

    The name is UTF-8 encoded, hashed with MD5, and the first 8 hexadecimal
    characters of the digest are returned. The result is deterministic (the
    same input always yields the same output), not random; because only 8 hex
    characters are kept, two different names could in principle share a result.

    Input: File name (str)
    Output: 8-character lowercase hexadecimal string
    '''
    hasher = hashlib.md5()
    hasher.update(file_name.encode('utf-8'))
    full_digest = hasher.hexdigest()
    return full_digest[:8]

def _read_class_id(label_file_path):
    '''
    Return the first whitespace-separated token of a label file, or '' when the
    file is empty or contains only whitespace.
    '''
    with open(label_file_path, 'r') as label_file:
        for line in label_file:
            tokens = line.split()
            if tokens:
                return tokens[0]
    return ''


def rename_images_with_classification(folder_path, rename_labels=False):
    '''
    Prefix every labelled .jpg image with the class found in its label file.

    For each sub-folder of folder_path that contains an 'images' folder, every
    .jpg file is matched with the .txt file of the same base name in the sibling
    'labels' folder. The first token of that label file becomes the prefix of the
    new image name: "<class>_<short name>.jpg", where the short name comes from
    generate_short_name(original file name).

    Images are left untouched (and a message is printed) when:
      * no label file exists for the image,
      * the label file is empty or whitespace-only,
      * a file with the new name already exists (avoids overwriting on a name clash).

    Input:
      folder_path   - dataset root whose sub-folders hold 'images' and 'labels'
      rename_labels - when True, the matching .txt file is renamed to the same new
                      base name so image and label stay paired (default False keeps
                      the label files as they are)
    Output: None; files are renamed on disk
    '''
    for subdir in os.listdir(folder_path):
        images_path = os.path.join(folder_path, subdir, 'images')
        labels_path = os.path.join(folder_path, subdir, 'labels')

        # Skip plain files and folders without an images/ directory
        # (e.g. .ipynb_checkpoints) instead of crashing in os.listdir.
        if not os.path.isdir(images_path):
            continue

        for file_name in os.listdir(images_path):
            if not file_name.endswith('.jpg'):
                continue

            image_path = os.path.join(images_path, file_name)

            # splitext swaps only the final extension; str.replace would also
            # rewrite any '.jpg' that appears in the middle of the name.
            base_name = os.path.splitext(file_name)[0]
            label_file_path = os.path.join(labels_path, base_name + '.txt')

            if not os.path.exists(label_file_path):
                print(f"Label file not found for {file_name}. Skipping.")
                continue

            # Whole first token rather than one character: tolerates leading
            # whitespace/blank lines and class ids with more than one digit.
            classification = _read_class_id(label_file_path)
            if not classification:
                print(f"Skipping {label_file_path} as it is empty.")
                continue

            new_base_name = f"{classification}_{generate_short_name(file_name)}"
            new_image_path = os.path.join(images_path, new_base_name + '.jpg')

            # os.rename can silently replace an existing file, so never rename onto one.
            if os.path.exists(new_image_path):
                print(f"Skipping {file_name} as {new_base_name}.jpg already exists.")
                continue

            os.rename(image_path, new_image_path)

            if rename_labels:
                os.rename(label_file_path, os.path.join(labels_path, new_base_name + '.txt'))

In [None]:
# renaming the labelled images of every split folder found under the dataset root

rename_images_with_classification(
    folder_path='/Users/vschuckar/Desktop/data_analytics/Week_9/final_project/dataset'
)

In [5]:
# in the function above, I asked to skip all the empty .txt and there were a lot of them, so I will delete them to avoid confusion
# function to delete the empty txt files

def delete_empty_txt_files(folder_path, recursive=False):
    '''
    Delete the zero-byte .txt files found in a folder.

    Only regular files whose name ends with ".txt" and whose size is exactly
    0 bytes are removed; directories (even if named "*.txt") are never touched.
    A line is printed for every deleted file, and for every file that could not
    be inspected or deleted.

    Input:
      folder_path - folder to clean
      recursive   - when True, sub-folders are scanned as well; the default
                    (False) looks only at the files directly inside folder_path
    Output: None; empty .txt files are deleted from disk
    '''
    if not os.path.exists(folder_path):
        print(f"Folder '{folder_path}' does not exist.")
        return

    if recursive:
        candidates = [
            os.path.join(current_dir, filename)
            for current_dir, _, filenames in os.walk(folder_path)
            for filename in filenames
        ]
    else:
        candidates = [os.path.join(folder_path, filename) for filename in os.listdir(folder_path)]

    for file_path in candidates:
        if not file_path.endswith(".txt"):
            continue

        # The size check sits inside the try block so that a file which vanishes
        # or cannot be stat-ed is reported instead of aborting the whole run.
        try:
            if not os.path.isfile(file_path) or os.path.getsize(file_path) != 0:
                continue
            os.remove(file_path)
            print(f"Deleted: {file_path}")
        except OSError as e:
            print(f"Error deleting {file_path}: {e}")

In [None]:
# applying the function above
# The dataset root itself only contains the split folders; the .txt label files sit in
# <dataset>/<split>/labels, so each labels folder is passed to the function explicitly.

folder_path = "/Users/vschuckar/Desktop/data_analytics/Week_9/final_project/dataset"
for split in ('train', 'valid', 'test'):
    delete_empty_txt_files(os.path.join(folder_path, split, 'labels'))

In [None]:
# creating a function to change the class to another number 

def rename_images_with_new_class(folder_path, old_class, new_class):
    '''
    Swap the numeric class prefix of the .jpg files in a folder.

    Every .jpg file directly inside folder_path is expected to be named
    "<digits>_<rest>". When the digits equal old_class, the file is renamed to
    "<new_class>_<rest>". Files with a different prefix are left alone, and a
    message is printed for .jpg files that have no "<digits>_" prefix at all.

    Input: Folder path, class prefix to replace, class prefix to use instead
    Output: None; matching .jpg files are renamed on disk
    '''
    prefix_pattern = re.compile(r'^(\d+)_')
    wanted_prefix = str(old_class)

    for entry in os.listdir(folder_path):
        if not entry.endswith('.jpg'):
            continue

        found = prefix_pattern.match(entry)
        if found is None:
            print(f"Skipping {entry} as it does not follow the expected pattern.")
            continue

        if found.group(1) != wanted_prefix:
            continue

        # found.end() is the index just past "<digits>_", i.e. where the rest of the name starts
        remainder = entry[found.end():]
        source = os.path.join(folder_path, entry)
        target = os.path.join(folder_path, f"{new_class}_{remainder}")
        os.rename(source, target)

In [None]:
# applying the function to the images folder of each split: class 6 becomes class 3

for split_name in ('train', 'test', 'valid'):
    rename_images_with_new_class(
        f'/Users/vschuckar/Desktop/data_analytics/Week_9/final_project/dataset/{split_name}/images',
        old_class=6,
        new_class=3,
    )