# Prepare all directories

In [10]:
import os
import json

# Resolve the working directories relative to where the notebook is run.
cwd = os.getcwd()
articles_dir = os.path.join(cwd, "articles")
images_dir = os.path.join(cwd, "images")

# Full path of every entry found directly inside the articles directory.
article_files = [
    os.path.join(articles_dir, entry) for entry in os.listdir(articles_dir)
]

# Make sure the image output directory exists and report which case applied.
if os.path.exists(images_dir):
    print("Folder %s already exists" % images_dir)
else:
    os.mkdir(images_dir)
    print("Folder %s created!" % images_dir)

Folder /Users/workspace/Projects/dot_mechanical_turk/data/scripts/data_crawl/jsParsers/myntra/images already exists


# Preprocess image ids and their urls

In [None]:
# Name of a file that could list the product ids still to fetch. It is not
# read here: the ids to process are hard-coded in `remainingIds` below.
remaining_products = "remaining_products"
remainingIds = ["15100708", "13624176", "11841316", "13623056", "10328163", "23885636", "20185308", "22426762", "20348594", "20064030", "15050578", "22595066", "11885560"]

# Set copy of the ids so the membership test done for every article file is a
# hash lookup instead of a linear scan of the list.
remaining_id_lookup = set(remainingIds)

# One (product image directory, [image URLs]) tuple per matched article.
product_images_map = []
for file in article_files:
    # The article file's base name (without extension) is compared to the ids.
    filename = os.path.splitext(os.path.basename(file))[0]
    if filename not in remaining_id_lookup:
        continue
    # Explicit encoding so parsing does not depend on the platform default.
    with open(file, encoding="utf-8") as f:
        try:
            data = json.load(f)
            # Only the first album's images are used.
            images = [
                image["imageURL"]
                for image in data["media"]["albums"][0]["images"]
            ]
            product_img_dir = os.path.join(images_dir, filename)
            product_images_map.append((product_img_dir, images))
        except Exception as e:
            # Best effort: report the unparsable / unexpected article and
            # carry on with the remaining files.
            print(e, file)
print(len(product_images_map))

# Check and Fetch all images

In [None]:
from threading import Lock
import urllib
from urllib.request import urlopen
import requests
from PIL import Image as ImageP

from concurrent.futures import ThreadPoolExecutor

def pil_check(filename):
    """Raise if the image at ``filename`` cannot be verified or processed.

    The file is opened twice: once for ``verify()`` and once more to run a
    real operation on the pixel data, because some defects only show up when
    the image is actually manipulated.

    Args:
        filename: Path of the image file to check.

    Raises:
        Exception: Whatever Pillow raises while opening, verifying or
            transposing the image; callers treat any exception as "corrupted".
    """
    # Context managers guarantee the file handle is released even when the
    # check fails, which matters when many threads run this concurrently.
    with ImageP.open(filename) as img:
        img.verify()  # verify that it is a good image, without decoding it.. quite fast

    # Image manipulation is mandatory to detect few defects; the image has to
    # be reopened because it was already consumed by verify().
    with ImageP.open(filename) as img:
        img.transpose(ImageP.FLIP_LEFT_RIGHT)

# load a file from a URL, returns content of downloaded file
def download_url(urlpath, image_path, timeout=30):
    """Fetch ``urlpath`` and return the response body.

    Args:
        urlpath: URL to download.
        image_path: Local destination path; only used in the log messages.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block its worker thread
            indefinitely.

    Returns:
        The response content as bytes when the status code is 200,
        otherwise ``None``.
    """
    # open a connection to the server
    response = requests.get(urlpath, timeout=timeout)
    if response.status_code == 200:
        print("Download success", urlpath, image_path)
        return response.content
    print("Download error", urlpath, image_path)
    return None

# save provided content to the local path
def save_file(path, data):
    """Write ``data`` to ``path`` in binary mode, replacing any existing file."""
    sink = open(path, 'wb')
    try:
        # dump the whole payload in a single write
        sink.write(data)
    finally:
        sink.close()

def downloadAndSave(image, path):
    """Download ``image`` and store it at ``path``.

    Args:
        image: URL of the image to download.
        path: Local file path the image is written to.

    Nothing is written when the download yields no data, so a failed request
    neither truncates an existing file nor leaves an empty one behind.
    """
    data = download_url(image, path)
    if data is None:
        # download_url already logged the failure; keep whatever is on disk.
        return
    save_file(path=path, data=data)

def fetch_images(file):
    """Download one product's images, re-fetching missing or unreadable ones.

    Args:
        file: ``(product_dir, image_urls)`` tuple. Image number ``idx`` of the
            list is stored as ``<product_dir>/<idx>.jpg``.
    """
    product_dir, image_urls = file[0], file[1]
    # exist_ok avoids the separate exists()/mkdir() steps and lets a single
    # loop serve both new and already-present product directories.
    os.makedirs(product_dir, exist_ok=True)
    for idx, image in enumerate(image_urls):
        path = os.path.join(product_dir, "{idx}.jpg".format(idx=idx))
        if os.path.exists(path):
            try:
                pil_check(path)
                continue  # file is present and readable, nothing to do
            except Exception as e:
                # Existing file is unreadable: report it and download again.
                print("Image error:", e, path)
        downloadAndSave(image, path)
                
import concurrent.futures

# Download the images of every product in parallel (50 worker threads).
with ThreadPoolExecutor(50) as ex:
    count = 0
    # Remember which product directory each future belongs to so that a
    # failure can be reported against it.
    future_to_dir = {ex.submit(fetch_images, data): data[0] for data in product_images_map}
    futures = list(future_to_dir)
    for future in concurrent.futures.as_completed(futures):
        try:
            # result() re-raises any exception from the worker; without this
            # call failed products would be counted as completed silently.
            future.result()
        except Exception as e:
            print("Fetch failed:", future_to_dir[future], e)
        count+=1
        print("Completed", count)


# Validate and check how many images are corrupted

In [None]:
from PIL import Image as ImageP

from concurrent.futures import ThreadPoolExecutor

def pil_check(filename):
    """Raise if the image at ``filename`` cannot be verified or processed.

    The file is opened twice: once for ``verify()`` and once more to run a
    real operation on the pixel data, because some defects only show up when
    the image is actually manipulated.

    Args:
        filename: Path of the image file to check.

    Raises:
        Exception: Whatever Pillow raises while opening, verifying or
            transposing the image; callers treat any exception as "corrupted".
    """
    # Context managers guarantee the file handle is released even when the
    # check fails, which matters when many threads run this concurrently.
    with ImageP.open(filename) as img:
        img.verify()  # verify that it is a good image, without decoding it.. quite fast

    # Image manipulation is mandatory to detect few defects; the image has to
    # be reopened because it was already consumed by verify().
    with ImageP.open(filename) as img:
        img.transpose(ImageP.FLIP_LEFT_RIGHT)
    
def checkImages(file):
    """Collect the image URLs of one product whose local file fails the check.

    Args:
        file: ``(product_dir, image_urls)`` tuple; image number ``idx`` is
            expected at ``<product_dir>/<idx>.jpg``.

    Returns:
        ``(product_dir, failed_urls)`` where ``failed_urls`` holds the URL of
        every image for which ``pil_check`` raised.
    """
    product_dir, image_urls = file[0], file[1]
    failed_urls = []
    for idx, url in enumerate(image_urls):
        candidate = os.path.join(product_dir, "{idx}.jpg".format(idx=idx))
        try:
            pil_check(candidate)
        except Exception:
            # any failure (missing file included) counts as corrupted
            failed_urls.append(url)
    return (product_dir, failed_urls)
                
import concurrent.futures

# Maps a product directory to the list of its image URLs that failed the check.
corrupted_image = {}
count = 0
with ThreadPoolExecutor(100) as ex:
    # one validation task per product
    futures = [ex.submit(checkImages, data) for data in product_images_map]
    for future in concurrent.futures.as_completed(futures):
        result = future.result()
        product_dir, failed_urls = result[0], result[1]
        count += 1
        # only products with at least one bad image are recorded
        if failed_urls:
            corrupted_image[product_dir] = failed_urls
        print("Completed", count)
    print("Total failed product", corrupted_image.keys())

In [None]:
# Check images
# Re-run the check on every file present in each product directory that was
# reported as containing corrupted images, printing the failures.
for product_dir in corrupted_image:
    for image_name in os.listdir(product_dir):
        try:
            pil_check(os.path.join(product_dir, image_name))
        except Exception as err:
            print("Error", err, product_dir, image_name)

# Get all data sizes

In [13]:
from pathlib import Path
# Total size in bytes of every file found under images_dir (recursive walk).
images_size = sum(
    os.path.getsize(os.path.join(root, name))
    for root, _, names in os.walk(images_dir)
    for name in names
)
print(images_size)

434679655991


In [14]:
# Use the articles directory resolved at the top of the notebook rather than a
# path relative to the current working directory, so this cell measures the
# same directory as the other cells even if the cwd changes.
articles_directory = Path(articles_dir)
# Total size in bytes of every regular file below the articles directory.
articles_size = sum(
    entry.stat().st_size
    for entry in articles_directory.rglob('*')
    if entry.is_file()
)
print(articles_size)

7322285984


In [15]:
# Combined on-disk footprint of the articles and the images, in bytes.
total = images_size + articles_size
_KB = 1024  # bytes per (binary) kilobyte

# bytes -> GB: divide by 1024 three times over
total_GB = total / (_KB * _KB * _KB)
print("Total data size in GB is: ", total_GB)

Total data size in GB is:  411.64638658519834


In [16]:
# Validate if we have equal articles and images set

# Number of sub-directories found directly under images_dir.
imageDirsCount = sum(
    1
    for entry in os.listdir(images_dir)
    if os.path.isdir(os.path.join(images_dir, entry))
)

# Number of regular files found directly under articles_dir.
articlesCount = sum(
    1
    for name in os.listdir(articles_dir)
    if os.path.isfile(os.path.join(articles_dir, name))
)

print("Articles:", articlesCount)
print("ImagesSet:", imageDirsCount)
print("Have equal images set and articles:", articlesCount == imageDirsCount)

Articles: 328385
ImagesSet: 328385
Have equal images set and articles: True


In [17]:
# Count all images of all articles

# Add up the number of files in every directory visited below images_dir.
image_count = sum(len(names) for _, _, names in os.walk(images_dir))

print('All images:', image_count)

All images: 1899884


In [18]:
# Average image size, derived from the measured total instead of a number
# copied from an earlier output, so it stays correct when the data changes.
size_per_image = images_size/image_count
size_per_image_kb = size_per_image/_KB
print("Average size per image: ", size_per_image_kb)

Average size per image:  223.43040498983672


In [19]:
# Average number of images per article, using the counted number of article
# files rather than a value copied from an earlier output.
images_per_article = image_count/articlesCount
print("Average number of images per article:", images_per_article)

Average number of images per article: 5.785538316305556


In [3]:
# Full path of every file found below images_dir, in os.walk order.
all_images = []
for dir_path, dir_names, files in os.walk(images_dir):
    all_images.extend(os.path.join(dir_path, f) for f in files)

In [None]:
from PIL import Image as ImageP

from concurrent.futures import ThreadPoolExecutor

from PIL import ImageFile
# NOTE(review): presumably this makes Pillow load truncated files without
# raising, in which case the manipulation step of pil_check below would no
# longer flag them as corrupted — confirm that keeping truncated images is
# the intended outcome of this scan.
ImageFile.LOAD_TRUNCATED_IMAGES = True

def pil_check(filename):
    """Raise if the image at ``filename`` cannot be verified or processed.

    The file is opened twice: once for ``verify()`` and once more to run a
    real operation on the pixel data, because some defects only show up when
    the image is actually manipulated.

    Args:
        filename: Path of the image file to check.

    Raises:
        Exception: Whatever Pillow raises while opening, verifying or
            transposing the image; callers treat any exception as "corrupted".
    """
    # Context managers guarantee the file handle is released even when the
    # check fails, which matters when many threads run this concurrently.
    with ImageP.open(filename) as img:
        img.verify()  # verify that it is a good image, without decoding it.. quite fast

    # Image manipulation is mandatory to detect few defects; the image has to
    # be reopened because it was already consumed by verify().
    with ImageP.open(filename) as img:
        img.transpose(ImageP.FLIP_LEFT_RIGHT)
    
def checkImagesByPath(image_path):
    """Check a single image file.

    Args:
        image_path: Path of the image file to check.

    Returns:
        ``None`` when ``pil_check`` succeeds, otherwise ``image_path`` (the
        exception is printed).
    """
    try:
        pil_check(image_path)
    except Exception as err:
        print("Exception", err)
        return image_path
    return None
        
                
import concurrent.futures

# Paths of all image files for which the check failed.
all_corrupted_images = []
count = 0
with ThreadPoolExecutor(100) as ex:
    # one task per file, skipping the .DS_Store entries picked up by the walk
    futures = [
        ex.submit(checkImagesByPath, image)
        for image in all_images
        if not image.endswith(".DS_Store")
    ]
    for future in concurrent.futures.as_completed(futures):
        result = future.result()
        count += 1
        # a non-None result is the path of a file that failed the check
        if result is not None:
            all_corrupted_images.append(result)
    print("Total failed product", len(all_corrupted_images))

# Delete all corrupted images

In [None]:
# Remove every file flagged as corrupted; a failed removal is printed and the
# loop moves on to the next path.
for path in all_corrupted_images:
    try:
        os.unlink(path)
    except Exception as e:
        print(e)