In [3]:
import pandas as pd
from PIL import Image, ImageEnhance
import numpy as np
import os

# Load the CSV file
df = pd.read_csv('../data/train.csv')

# Analyze the distribution of PCI values and decide which ones to augment
pci_counts = df['pci'].value_counts()
threshold = pci_counts.median() # Or any other logic to decide the threshold

# Define augmentation functions
def flip_image(image):
    return image.transpose(Image.FLIP_LEFT_RIGHT)

def change_contrast(image, factor=1.5):
    enhancer = ImageEnhance.Contrast(image)
    return enhancer.enhance(factor)

def add_noise(image):
    np_image = np.array(image)
    noise = np.random.randint(5, size=np_image.shape, dtype='uint8')
    for i in range(image.size[1]):  # For each row
        for j in range(image.size[0]):  # For each column
            for k in range(3):  # For each color channel
                if np_image[i, j, k] != 0:
                    np_image[i, j, k] += noise[i, j, k]
    return Image.fromarray(np_image)

# Directory containing the images
image_dir = '../data/train'
output_dir = '../data/augment_images'
augmented_images_records = []

# Augment images for selected PCI values
for pci, count in pci_counts.items():
    if count < threshold:
        # Find images with this PCI value
        images_to_augment = df[df['pci'] == pci]['image_name']
        for image_name in images_to_augment:
            image_path = os.path.join(image_dir, image_name)
            image = Image.open(image_path)
            
            # Perform augmentations
            flipped_image = flip_image(image)
            contrast_image = change_contrast(image)
            noisy_image = add_noise(image)
            
            # Save augmented images and record their information
            flipped_image_path = f'flipped_{image_name}'
            contrast_image_path = f'contrast_{image_name}'
            noisy_image_path = f'noisy_{image_name}'
            
            flipped_image.save(os.path.join(output_dir, flipped_image_path))
            contrast_image.save(os.path.join(output_dir, contrast_image_path))
            noisy_image.save(os.path.join(output_dir, noisy_image_path))
            
            augmented_images_records.extend([
                [flipped_image_path, pci],
                [contrast_image_path, pci],
                [noisy_image_path, pci],
            ])

# Create a DataFrame for the augmented images
augmented_df = pd.DataFrame(augmented_images_records, columns=['image_name', 'pci'])

# Concatenate with the original DataFrame
final_df = pd.concat([df, augmented_df], ignore_index=True)

# Export to Excel
excel_path = '../data/augmented_train.xlsx'
final_df.to_excel(excel_path, index=False)

print(f"Excel file has been saved to {excel_path}")

  np_image[i, j, k] += noise[i, j, k]


ModuleNotFoundError: No module named 'openpyxl'

In [6]:
augmented_df
excel_path = '../data/augmented_train.csv'
final_df.to_csv(excel_path)

print(f"Excel file has been saved to {excel_path}")

Excel file has been saved to ../data/augmented_train.csv
