In [None]:
import os
from bs4 import BeautifulSoup

# One record per scanned HTML file: its path and how many <img> tags carry
# neither a `width` nor a `height` attribute.
results = []

for dirpath, _subdirs, filenames in os.walk('data/'):
    for filename in filenames:
        # Only files whose name ends in 'utf8.html' are scanned.
        if not filename.endswith('utf8.html'):
            continue

        file_path = os.path.join(dirpath, filename)
        with open(file_path, 'r', encoding='utf-8') as handle:
            soup = BeautifulSoup(handle, 'html.parser')
            # An image is counted only when BOTH attributes are absent;
            # having just one of them is enough to exclude it.
            missing_both = sum(
                1
                for img in soup.find_all('img')
                if not (img.has_attr('width') or img.has_attr('height'))
            )
            record = {
                'file_path': file_path,
                'img_element_without_width_height': missing_both,
            }
            results.append(record)

In [3]:
# Emit the per-file counts as comma-separated lines, followed by the totals.
print('File Path, Images without width/height')
for entry in results:
    path = entry['file_path']
    missing = entry['img_element_without_width_height']
    print(f"{path}, {missing}")

total_missing = sum(entry['img_element_without_width_height'] for entry in results)
print(f'Total files processed: {len(results)}')
print(f'Total images without width/height: {total_missing}')
print('Done.')

File Path, Images without width/height
data/nikkeibp.co.jp/20000511101011/FMDZXCV5UUUJLBFRHGL2A5634X7U5UH2_utf8.html, 0
data/nikkeibp.co.jp/20000511101011/resources/biztech.nikkeibp.co.jp_biztech_topics.shtml/UMQC3POOJZPAA5WFE6KXTU5LH6L2BZTR_utf8.html, 0
data/nikkeibp.co.jp/20000511101011/resources/bizad.nikkeibp.co.jp_image_ad.shtml/ZTGZ5YUCRMDLHO353SNWHYZMRFWVZ433_utf8.html, 0
data/nikkeibp.co.jp/20000510064435/LHFT7VAGPBHL7IPBWCSGR6YYWDK7XTUA_utf8.html, 0
data/nikkeibp.co.jp/20000520102928/ET2SBQZWKRQ7D6IS27M3CEBODZH52YAY_utf8.html, 0
data/nikkeibp.co.jp/20000510045128/ET2SBQZWKRQ7D6IS27M3CEBODZH52YAY_utf8.html, 0
data/nikkeibp.co.jp/20000528235554/E3QHSGE7MSIHHSBVKUS3VEJQRMSMWYOC_utf8.html, 0
data/asahi.com/20000511104501/JFLKG5GRGT4KUGRVOF7YOOXG4H2R7HSL_utf8.html, 6
data/asahi.com/20000520005747/SDOFFMLOX6PVBYOGZZSO6I6RA6DJ5JT6_utf8.html, 1
data/asahi.com/20000512001328/UFS4BUM6TCKILXPCYP4XDUD52YJ3CGRW_utf8.html, 6
data/asahi.com/20000510231842/ZMP5WPQGUSWGSHQHSPLEJ2WSGQBV42ZC_utf

In [4]:
# Maps a (width, height) pair to the name of the matching ad unit.
# Presumably these are the IAB (Interactive Advertising Bureau) standard
# sizes in pixels -- confirm against the source the list was taken from.
# The table below is written label-first; insertion order is significant
# only insofar as it defines the dict's iteration order.
IAB_SIZES = {
    (width, height): label
    for label, width, height in (
        ("Medium Rectangle", 300, 250),
        ("Square Pop-Up", 250, 250),
        ("Vertical Rectangle", 240, 400),
        ("Large Rectangle", 336, 280),
        ("Rectangle", 180, 150),
        ("Full Banner", 468, 60),
        ("Half Banner", 234, 60),
        # NOTE(review): IAB material usually calls 88x31 "Micro Bar" -- confirm label.
        ("Micro Button", 88, 31),
        ("Button 1", 120, 90),
        ("Button 2", 120, 60),
        ("Vertical Banner", 120, 240),
        ("Square Button", 125, 125),
        ("Leaderboard", 728, 90),
        ("Wide Skyscraper", 160, 600),
        ("Skyscraper", 120, 600),
        ("Half Page Ad", 300, 600),
    )
}

# Maps a (width, height) pair to the name of the matching ad unit.
# Presumably these are the JIAA (Japan Interactive Advertising Association)
# standard sizes in pixels -- confirm against the source the list was taken from.
# The table below is written label-first; insertion order is significant
# only insofar as it defines the dict's iteration order.
JIAA_SIZES = {
    (width, height): label
    for label, width, height in (
        ("Small Banner", 224, 33),
        ("Regular Banner", 468, 60),
        ("Large Banner", 728, 90),
        ("Small Badge", 120, 60),
        ("Regular Badge", 120, 90),
        ("Large Badge", 125, 125),
        ("Small Rectangle", 200, 200),
        ("Regular Rectangle", 300, 250),
        ("Large Rectangle", 336, 280),
        ("Regular Skyscraper", 120, 600),
        ("Wide Skyscraper", 160, 600),
        ("Large Skyscraper", 148, 800),
    )
}

In [6]:
# Every distinct (width, height) pair that appears in either size table;
# pairs listed in both tables collapse to a single set member.
all_ad_sizes = set(IAB_SIZES) | set(JIAA_SIZES)

# Running total of <img> tags whose declared size is one of the ad sizes.
matched_img_count = 0

for dirpath, _subdirs, filenames in os.walk('data/'):
    for filename in filenames:
        # Only files whose name ends in 'utf8.html' are scanned.
        if not filename.endswith('utf8.html'):
            continue

        file_path = os.path.join(dirpath, filename)
        with open(file_path, 'r', encoding='utf-8') as handle:
            soup = BeautifulSoup(handle, 'html.parser')
            for img in soup.find_all('img'):
                # A size pair needs both attributes; an image missing either
                # one can never match.
                if not (img.has_attr('width') and img.has_attr('height')):
                    continue
                try:
                    size = (int(img['width']), int(img['height']))
                except ValueError:
                    # Values that int() rejects are skipped, not counted.
                    continue
                if size in all_ad_sizes:
                    matched_img_count += 1

print(f"Number of <img> elements matching JIAA_SIZES or IAB_SIZES: {matched_img_count}")

Number of <img> elements matching JIAA_SIZES or IAB_SIZES: 170
