In [6]:
# Bước 1: Tìm ảnh chưa được gán nhãn bằng Linear Search.
def find_unlabeled_images(data):
    unlabeled_images = []
    for image in data:
        if image['label'] is None:
            unlabeled_images.append(image)
    return unlabeled_images

# Bước 2: Sắp xếp ảnh theo độ mờ (blurriness) tăng dần bằng Bubble Sort
def sort_by_blurriness(data):
    n = len(data)
    for i in range(n):
        for j in range(0, n - i - 1):
            if data[j]["blurriness"] > data[j+1]["blurriness"]:
                data[j], data[j+1] = data[j+1], data[j]
    return data

# Bước 3: Phát hiện ảnh tương tự (similarity > threshold) bằng Binary Search.
def has_high_similarity(data, threshold=0.9):
    sims = sorted([item['similarity'] for item in data], reverse=True)
    left, right = 0, len(data) -1
    while left <= right:
        mid =(left+right) //2
        if sims[mid] > threshold:
            return True
        elif sims[mid] <= threshold:
            right = mid - 1
    return False

# Run example
dataset = [
  {"name": "img1.png", "label": "benign", "blurriness": 0.4, "similarity": 0.92},
  {"name": "img2.png", "label": None, "blurriness": 0.6, "similarity": 0.95},
  {"name": "img3.png", "label": "malignant", "blurriness": 0.3, "similarity": 0.84},
  {"name": "img4.png", "label": None, "blurriness": 0.5, "similarity": 0.88}
]

print("\nUnlabeled images:", find_unlabeled_images(dataset))
print("\nSorted by blurriness:", [img['name'] for img in sort_by_blurriness(dataset)])
print("\nHas high similarity > 0.9:", has_high_similarity(dataset))


Unlabeled images: [{'name': 'img2.png', 'label': None, 'blurriness': 0.6, 'similarity': 0.95}, {'name': 'img4.png', 'label': None, 'blurriness': 0.5, 'similarity': 0.88}]

Sorted by blurriness: ['img3.png', 'img1.png', 'img4.png', 'img2.png']

Has high similarity > 0.9: True


In [9]:
def write_log_file(data, file_name='log.txt'):
    with open(file_name, "w") as f:
        f.write("Unlabeled Image: \n")
        for item in data:
            if item['label'] is None:
                f.write(f"- {item['name']}\n")

        f.write("\n Blurry images (blurriness > 0.5): \n")
        for item in data:
            if item['blurriness'] > 0.5:
                f.write(f"- {item['name']} (blurriness: {item['blurriness']})\n")

        f.write("\nHighly similar images (similarity > 0.9):\n")
        for item in data:
            if item['blurriness'] > 0.9:
                f.write(f"- {item['name']} (blurriness: {item['blurriness']})\n")

In [10]:
import difflib

def find_duplicate_images(data, similarity_threshold=0.9, name_similarity=0.8):
    duplicates = []
    n = len(data)
    for i in range(n):
        for j in range(i+1, n):
            name_sim = difflib.SequenceMatcher(None, data[i]['name'], data[j]['name']).ratio()
            if (data[i]['similarity'] > similarity_threshold and 
                data[j]['similarity'] > similarity_threshold and 
                name_sim > name_similarity):
                duplicates.append((data[i]['name'], data[j]['name']))
    return duplicates

def generate_summary(data):
    total = len(data)
    unlabeled = sum(1 for item in data if item['label'] is None)
    blurry = sum(1 for item in data if item['blurriness'] > 0.5)
    high_sim = sum(1 for item in data if item['similarity'] > 0.9)
    good = total - (unlabeled + blurry)

    return {
        "Total images": total,
        "Unlabeled images": unlabeled,
        "Blurry images": blurry,
        "Highly similar images": high_sim,
        "Good images": good
    }


In [11]:
dataset = [
    {"name": "img1.png", "label": "benign", "blurriness": 0.4, "similarity": 0.92},
    {"name": "img2.png", "label": None, "blurriness": 0.6, "similarity": 0.95},
    {"name": "img3.png", "label": "malignant", "blurriness": 0.3, "similarity": 0.84},
    {"name": "img4.png", "label": None, "blurriness": 0.5, "similarity": 0.88}
]
write_log_file(dataset)
print("🔁 Duplicate images:", find_duplicate_images(dataset))
print("📊 Summary report:", generate_summary(dataset))


🔁 Duplicate images: [('img1.png', 'img2.png')]
📊 Summary report: {'Total images': 4, 'Unlabeled images': 2, 'Blurry images': 1, 'Highly similar images': 2, 'Good images': 1}
