In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
from PIL import Image, ImageFilter

In [None]:
# Read the dataset CSV from the local project folder into a DataFrame.
csv_path = r"F:\Semester 5\Manajemen Big Data\Final Project\Food Ingredients and Recipe Dataset with Image Name Mapping.csv"
table_data = pd.read_csv(csv_path)

# Preview the first rows as the cell output.
table_data.head()

In [None]:
# Drop the last ROWS_TO_DROP rows of the table and report the new shape.
# The removed-row count in the message is computed from the actual lengths so it
# cannot disagree with the slice (the previous message said 10000 while 12501
# rows were dropped).
# NOTE: this rebinds `table_data`, so re-running this cell drops another batch of
# rows; re-run the loading cell first to start again from the full table.
ROWS_TO_DROP = 12501
rows_before = len(table_data)
table_data = table_data[:-ROWS_TO_DROP]
print(f'New shape after removing {rows_before - len(table_data)} rows:', table_data.shape)

# IMAGE

In [None]:
# Distinct values of the Image_Name column, plus a short preview of them.
image_names = table_data['Image_Name'].unique()
n_unique_names = len(image_names)
names_preview = image_names[:10]

print('Number of unique Image Names:', n_unique_names)
print('First 10 Image Names:', names_preview)

## Set Path

In [None]:
# Folder that holds the source images.
folder_path = r"F:\Semester 5\Manajemen Big Data\Final Project\Food Images\Food Images"

# Names of every entry in that folder with the extension stripped
# (os.path.splitext returns a (stem, extension) pair; only the stem is kept).
file_names = [stem for stem, _ext in map(os.path.splitext, os.listdir(folder_path))]

## Checking

In [None]:
# Compare the image names referenced by the table with the names found on disk.
image_names_set = set(table_data['Image_Name'].unique())
file_names_set = set(file_names)

# Names present in both places.
matches = image_names_set.intersection(file_names_set)
# Names the table references that have no file in the folder.
no_matches = image_names_set.difference(file_names_set)

In [None]:
# Print the size of each set computed by the comparison, one line per label.
match_summary = (
    ('Number of matches:', len(matches)),
    ('Number of no matches:', len(no_matches)),
    ('Total unique Image Names:', len(image_names_set)),
    ('Total files in folder:', len(file_names_set)),
)
for label, count in match_summary:
    print(label, count)

## Fungsi Preprocessing

Fungsi untuk memproses tiap gambar dan menyimpan hasil setiap tahap:  
- Convert semua gambar ke RGB  
- Resize (default 256x256)  
- Center Crop (default 224x224)  
- Gaussian Blur (radius 2)

In [None]:
def preprocess_image(image_path, processed_folder, image_name, resize_size=(256, 256),
                     crop_size=(224, 224), blur_radius=2):
    """
    Preprocess one image and save the output of every stage as a JPEG.

    Stages, in order: convert to RGB (only if the image is in another mode),
    resize to exactly ``resize_size``, center-crop the resized image to
    ``crop_size``, then apply a Gaussian blur to the crop. Each stage is saved
    to ``<processed_folder>/<stage>/<image_name>.jpg`` where ``<stage>`` is
    ``resized``, ``cropped`` or ``blurred``.

    Args:
        image_path: Path of the source image file.
        processed_folder: Root output folder. The three stage subfolders are
            created here if they do not exist yet.
        image_name: Output file name without extension.
        resize_size: ``(width, height)`` passed to ``Image.resize``.
        crop_size: ``(width, height)`` of the centered crop box.
        blur_radius: Radius passed to ``ImageFilter.GaussianBlur``.

    Returns:
        True when all three outputs were written. False when any step raised;
        the error is printed and not re-raised, so a batch run can continue.
    """
    try:
        # Make sure every stage folder exists so that the function does not
        # depend on a separate setup cell having been run first.
        stage_paths = {}
        for stage in ('resized', 'cropped', 'blurred'):
            stage_dir = os.path.join(processed_folder, stage)
            os.makedirs(stage_dir, exist_ok=True)
            stage_paths[stage] = os.path.join(stage_dir, image_name + '.jpg')

        with Image.open(image_path) as img:
            # Convert to RGB
            if img.mode != 'RGB':
                img = img.convert('RGB')

            # Resize first (to the larger size, e.g. 256x256)
            img_resized = img.resize(resize_size, Image.Resampling.LANCZOS)
            img_resized.save(stage_paths['resized'])

            # Center crop: the box is offset by half the size difference on each axis.
            width, height = img_resized.size
            crop_width, crop_height = crop_size
            left = (width - crop_width) // 2
            top = (height - crop_height) // 2
            img_cropped = img_resized.crop((left, top, left + crop_width, top + crop_height))
            img_cropped.save(stage_paths['cropped'])

            # Gaussian blur on the cropped image
            img_blurred = img_cropped.filter(ImageFilter.GaussianBlur(radius=blur_radius))
            img_blurred.save(stage_paths['blurred'])

            return True
    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller carry on.
        print(f"Error processing {image_path}: {e}")
        return False

## Set Path Tujuan

In [None]:
# Destination root for the processed images
processed_folder = r"F:\Semester 5\Manajemen Big Data\Final Project\Processed Images"
os.makedirs(processed_folder, exist_ok=True)

# Create one subfolder per preprocessing step
for stage_name in ('cropped', 'resized', 'blurred'):
    os.makedirs(os.path.join(processed_folder, stage_name), exist_ok=True)

## Pipeline

In [None]:
# pipeline function
def run_preprocessing_pipeline(table_data, image_folder, processed_folder):
    """
    Run ``preprocess_image`` for every row of ``table_data``.

    For each value in the ``Image_Name`` column the file
    ``<image_folder>/<Image_Name>.jpg`` is looked up. Existing files are passed
    to ``preprocess_image``; missing files are reported. Rows whose
    ``Image_Name`` is not a string (for example a missing value) are reported
    and skipped instead of aborting the run.

    Args:
        table_data: DataFrame with an ``Image_Name`` column.
        image_folder: Folder containing the source ``.jpg`` files.
        processed_folder: Output root forwarded to ``preprocess_image``.

    Returns:
        None. Progress and the final ``success/total`` summary are printed;
        ``total`` is the number of rows in ``table_data``.
    """
    success_count = 0
    total = len(table_data)
    # Only the Image_Name column is needed, so iterate it directly rather than
    # building a full row object per record with iterrows().
    for idx, image_name in table_data['Image_Name'].items():
        if not isinstance(image_name, str):
            # Concatenating '.jpg' to a non-string would raise TypeError.
            print(f"Skipping row {idx}: invalid Image_Name {image_name!r}")
            continue

        image_path = os.path.join(image_folder, image_name + '.jpg')

        if os.path.exists(image_path):
            if preprocess_image(image_path, processed_folder, image_name):
                success_count += 1
        else:
            print(f"Image not found: {image_path}")

    print(f"Preprocessing completed. {success_count}/{total} images processed successfully.")

In [None]:
# Execute the preprocessing over the whole table, reading from the source image folder.
image_folder = r"F:\Semester 5\Manajemen Big Data\Final Project\Food Images\Food Images"
run_preprocessing_pipeline(
    table_data=table_data,
    image_folder=image_folder,
    processed_folder=processed_folder,
)

## Tampilkan Hasil Akhir

In [None]:
# Pick the first row's image from the last preprocessing stage (blurred)
first_image_name = table_data['Image_Name'].iloc[0]
blurred_image_path = os.path.join(processed_folder, 'blurred', first_image_name + '.jpg')

# Load the blurred image inside a context manager so the file handle is released,
# and convert it to a numpy array while the file is still open.
with Image.open(blurred_image_path) as img_blurred:
    img_array = np.array(img_blurred)

# Show the image
plt.imshow(img_array)
plt.title(f'Blurred Image: {first_image_name}')
plt.axis('off')
plt.show()

# Show the numeric shape of the pixel array
print(f'Shape of the image array: {img_array.shape}')

# TEXT

In [None]:
# Build a new table that contains only the Instructions column.
# The explicit copy makes it an independent frame, so adding columns to it later
# does not write into a selection of `table_data` (avoids SettingWithCopyWarning).
instructions_table = table_data[['Instructions']].copy()

In [None]:
# Show the instructions-only table as the cell output.
instructions_table

In [None]:
# Store a lowercased version of the instructions in a new column.
# `assign` returns a new frame instead of setting a column on a frame that was
# selected from `table_data`, and gives the same result when the cell is re-run.
instructions_table = instructions_table.assign(
    Instructions_Lower=instructions_table['Instructions'].str.lower()
)
instructions_table