##### 1. Install OpenCV

In [None]:
!pip3 install opencv-python

##### 2. Import the libraries needed below

In [25]:
# importing required libraries
import pandas as pd
import cv2
import requests # request img from web
import shutil # save img locally
import os
from PIL import Image
from io import BytesIO
import hashlib
from matplotlib import pyplot as plt

##### 3. Load the PubFig URL list and convert it to a pandas DataFrame

In [3]:
def read_data(file):
    """Read a tab-separated text file into a DataFrame of raw string fields.

    Every line is stripped of surrounding whitespace and split on tab
    characters. No header handling or type conversion is performed.

    Args:
        file: Path of the text file to read.

    Returns:
        pd.DataFrame with one row per line and default integer column labels.
    """
    # The context manager closes the file handle deterministically; the
    # previous bare open() in the loop header never closed it.
    with open(file) as handle:
        rows = [line.strip().split("\t") for line in handle]
    return pd.DataFrame(rows)

# Parse the tab-separated URL listing into a frame of raw string fields.
df = read_data("dev.csv")
# Label the five fields in file order.
df.columns = [
    "name",
    "number",
    "url",
    "rect",
    "class",
]

##### 4. Download the images from the URLs in the DataFrame loaded above

In [4]:
def remove_files(upper_dir="dev"):
    """Empty every sub-folder of ``upper_dir`` and delete its loose files.

    Sub-folders are removed with all their content and re-created empty;
    anything that is not a directory is deleted.

    Args:
        upper_dir: Directory whose direct entries are processed.
    """
    for entry_name in os.listdir(upper_dir):
        entry_path = os.path.join(upper_dir, entry_name)
        if not os.path.isdir(entry_path):
            # Loose file directly under upper_dir: just delete it.
            os.remove(entry_path)
            continue
        # Directory: wipe it, then put an empty folder back in its place.
        shutil.rmtree(entry_path)
        os.makedirs(entry_path)
                    
def download_image(row, upper_dir="dev"):
    """Download one image and store it as ``<upper_dir>/<name>/<class>.jpg``.

    The folder name is the row's "name" with spaces replaced by underscores.
    Any failure while fetching, decoding or saving the image is reported
    with print and not raised.

    Args:
        row: Mapping (for example a DataFrame row) providing the keys
            "name", "class" and "url".
        upper_dir: Root directory that holds one folder per name.
    """
    folder_path = os.path.join(upper_dir, '_'.join(row["name"].split(" ")))
    # Create the folder when it is missing and leave an existing one untouched.
    # The previous code called shutil.rmtree on the missing folder, which
    # raised FileNotFoundError before anything could be downloaded.
    os.makedirs(folder_path, exist_ok=True)

    filename = os.path.join(folder_path, row['class'] + '.jpg')
    try:
        response = requests.get(row['url'], timeout=20)
        # Decoding through PIL rejects payloads that are not images.
        img = Image.open(BytesIO(response.content))
        img.save(filename)
        print('Image sucessfully Downloaded: ',filename)
    except Exception as e:
        # Best effort: report the failure and carry on with the next row.
        print('Image (%s), url (%s) Couldn\'t be retrieved because of %s' % (filename, row['url'], e))

# Download the images from the URLs in the DataFrame:
# df.apply(download_image, axis=1)
# Downloading is time-consuming, so the images were saved locally once and are loaded from disk from here on.

##### 5. Find and delete duplicate images based on their MD5 hashes

In [48]:
def calculate_md5(file_path):
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is read in binary mode in 4096-byte pieces, so it is never
    held in memory as a whole.
    """
    digest = hashlib.md5()
    with open(file_path, 'rb') as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for piece in iter(lambda: stream.read(4096), b''):
            digest.update(piece)
    return digest.hexdigest()

def delete_image(file_path):
    """Delete ``file_path`` and print the outcome.

    An OSError raised by the removal is printed, not propagated.
    """
    try:
        os.remove(file_path)
    except OSError as err:
        print(f"Error deleting {file_path}: {err}")
    else:
        print(f"Deleted: {file_path}")
        
def find_and_delete_duplicated_images(directory):
    """Delete every file under ``directory`` whose content occurs more than once.

    Files are compared by MD5 digest while walking the tree. When a digest
    is seen again, the current file is deleted and so is the first file
    recorded for that digest (if it is still on disk), so no copy of a
    duplicated image is kept.

    Args:
        directory: Root of the directory tree to scan.
    """
    first_seen = {}
    for dirpath, _subdirs, filenames in os.walk(directory):
        for name in filenames:
            candidate = os.path.join(dirpath, name)
            digest = calculate_md5(candidate)
            if digest in first_seen:
                delete_image(candidate)
                earlier = first_seen[digest]
                # The first copy is already gone from the third copy onwards.
                if os.path.isfile(earlier):
                    delete_image(earlier)
                continue
            first_seen[digest] = candidate
               
# Purge every image under "dev" whose content hash occurs more than once.
find_and_delete_duplicated_images(directory="dev")

##### 6. Crop the head area from the images processed above

In [51]:
def get_head_path(row, upper_dir="dev"):
    """Build the image path ``<upper_dir>/<name>/<class>.jpg`` for a row.

    Spaces in the row's "name" become underscores in the folder name.

    Args:
        row: Mapping (for example a DataFrame row) with the keys "name" and "class".
        upper_dir: Root directory that holds one folder per name.

    Returns:
        The path as a string; nothing is read from or written to disk.
    """
    person_folder = row["name"].replace(" ", "_")
    image_file = row['class'] + '.jpg'
    return os.path.join(upper_dir, person_folder, image_file)

def get_head_image(image_path, rect_str, save_path):
    """Crop the rectangle described by ``rect_str`` out of an image file.

    Args:
        image_path: Path of the image, loaded with cv2.imread.
        rect_str: Comma-separated integers; the first four are read as
            "x,y,width,height" of the region to keep.
        save_path: Currently unused; the crop is returned, not written to disk.

    Returns:
        The cropped region of the loaded image (rows y..y+height,
        columns x..x+width, limited to the image bounds).

    Raises:
        ValueError: If the image cannot be read, or rect_str does not
            contain at least four integers.
    """
    # NOTE(review): the PubFig URL lists appear to document rect as
    # "x0,y0,x1,y1" (two corners) -- confirm that reading it as
    # x,y,width,height is what is intended here.
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals an unreadable file by returning None; fail with
        # a clear message instead of a TypeError on the slice below.
        raise ValueError(f"Could not read image: {image_path}")

    rect = [int(st) for st in rect_str.split(",")]
    if len(rect) < 4:
        raise ValueError(f"Expected four comma-separated integers, got: {rect_str!r}")
    x, y, width, height = rect[:4]

    # Clamp to the top-left image border: a negative index would otherwise
    # wrap around to the opposite side and yield a wrong or empty crop.
    top, bottom = max(y, 0), max(y + height, 0)
    left, right = max(x, 0), max(x + width, 0)
    return img[top:bottom, left:right]
    
def show_cv2_image(img):
    """Display an image with matplotlib, axes hidden.

    The image is converted from BGR to RGB channel order before display.
    """
    plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    plt.axis('off')
    plt.show()
    
def show_image_with_path(image_path):
    """Load the image at ``image_path`` with cv2.imread and display it."""
    loaded = cv2.imread(image_path)
    show_cv2_image(loaded)

In [None]:
# Make sure the column meant to hold the cropped-head paths exists.
# NOTE(review): the column is created here but never filled in this cell.
column_name = 'head_image_path'
if column_name not in df.columns:
    df[column_name] = ''
# Walk the rows and display the head crop of every image found on disk.
for index, row in df.iterrows():
    original_image_path = get_head_path(row)
    print(original_image_path)
    if not os.path.isfile(original_image_path):
        # No local file for this row: nothing to crop.
        continue
    # The third argument of get_head_image is a save path. The function
    # object itself used to be passed here by mistake; pass a real path.
    base, extension = os.path.splitext(original_image_path)
    head_image_path = base + '_head' + extension
    show_cv2_image(get_head_image(original_image_path, row['rect'], head_image_path))