In [7]:

import pandas as pd
import urllib.request
import os
import urllib.error
import queue
import threading
import urllib.request
import glob

#path to raw data set = "../Datasets/Master_Dataset_Raw.csv"

# Note: I have downloaded 80,581 images, but there are a total of 80,585 images --> 4 images returned errors downloading. Here is one:
# Error downloading image: https://image.pixstory.com/optimized/Pixstory-image-1605712733.jpg - HTTP Error 403: Forbidden


# Creating a DF, Modifying URLs, and Filtering out duplicate links

In [8]:
# Show full cell contents (e.g. long URLs) whenever a DataFrame is displayed
pd.set_option('display.max_colwidth', None)

# Load the raw dataset and keep only one row per distinct value in the
# "Media" column (drop_duplicates keeps the first occurrence by default)
df = pd.read_csv('../Datasets/Master_Dataset_Raw.csv').drop_duplicates(subset='Media')

# Rewrite an image URL so that it points at the host's "/optimized/" variant
def modify_url(url):
    """Insert ``optimized/`` right after the first ``.com/`` in ``url``.

    Parameters:
        url: the value of one "Media" cell. Normally a string URL; any
            non-string value (for example a missing value read as NaN) is
            returned unchanged instead of raising.

    Returns:
        The rewritten URL, or ``url`` itself when it is not a string, already
        contains ``.com/optimized/``, or contains no ``.com/`` at all.
    """
    # Missing cells are floats (NaN) and have no .replace(); leave them alone.
    # URLs that were already rewritten are left alone so the call is idempotent.
    if not isinstance(url, str) or ".com/optimized/" in url:
        return url
    # Only the first ".com/" (the host boundary) is rewritten; a later
    # ".com/" inside the path must not gain an extra "optimized/" segment.
    return url.replace(".com/", ".com/optimized/", 1)

# Rewrite every URL in the "Media" column by passing it through modify_url
df['Media'] = df['Media'].map(modify_url)

# Downloading Images Via Threading

In [10]:

# Number of worker threads that download concurrently
NUM_THREADS = 10

# Position in the URL list at which downloading resumes
start_index =  21206

# Running count of URLs handled so far, plus the lock that guards updates to it
current_index = 0
index_lock = threading.Lock()

# Shared work queue, pre-filled with every URL from the "Media" column
url_queue = queue.Queue()
for url in df['Media']:
    url_queue.put(url)

# Worker: drain the shared URL queue, skipping the first `start_index` URLs
# and downloading the rest into the 95k_Images/ folder
def download_image():
    """Download queued image URLs until the shared queue is empty.

    Uses the module-level ``url_queue``, ``index_lock`` and ``start_index``,
    and updates the module-level ``current_index``. Takes no arguments and
    returns ``None``, so it can be used directly as a ``threading.Thread``
    target.

    Each URL is saved as ``95k_Images/<last path segment of the URL>``. A URL
    whose download raises is reported on stdout and the worker moves on to the
    next URL, so one bad URL (e.g. an HTTP 403) no longer terminates the
    thread.

    ``current_index`` counts every URL taken from the queue (skipped,
    downloaded, or failed).
    """
    global current_index
    while True:
        # Take a URL and claim its position under the same lock. Doing both
        # atomically guarantees that exactly the first `start_index` queued
        # URLs are skipped, even when several workers dequeue at once.
        with index_lock:
            try:
                url = url_queue.get_nowait()
            except queue.Empty:
                break
            position = current_index
            current_index += 1

        # URLs before the resume point are skipped
        if position < start_index:
            continue

        # Download the image; log and continue on failure, using the same
        # message format as the sequential downloader in this notebook
        print(f"Downloading image: {url}")
        try:
            urllib.request.urlretrieve(url, f"95k_Images/{url.split('/')[-1]}")
        except Exception as e:
            print(f"Error downloading image: {url} - {str(e)}")

# Make sure the output folder exists before any worker tries to write into it;
# without it every urlretrieve call fails with FileNotFoundError
os.makedirs("95k_Images", exist_ok=True)

# Build the worker threads; each one runs download_image until the queue is empty
threads = [threading.Thread(target=download_image) for _ in range(NUM_THREADS)]

# Start the worker threads
for t in threads:
    t.start()

# Wait for all worker threads to finish
for t in threads:
    t.join()

# Carry the final counter over as the resume point for a later run in this kernel
start_index = current_index

Downloading image: https://image.pixstory.com/optimized/Pixstory-image-164604195933917.pngDownloading image: https://image.pixstory.com/optimized/Pixstory-image-164266843248029.png
Downloading image: https://image.pixstory.com/optimized/Pixstory-image-164732518732375.png
Downloading image: https://image.pixstory.com/optimized/Pixstory-image-164787649871909.png
Downloading image: https://image.pixstory.com/optimized/Pixstory-image-164250098711324.png
Downloading image: https://image.pixstory.com/optimized/Pixstory-image-164250142625513.png
Downloading image: https://image.pixstory.com/optimized/Pixstory-image-164727272898406.png
Downloading image: https://image.pixstory.com/optimized/Pixstory-image-164302145410772.png
Downloading image: https://image.pixstory.com/optimized/Pixstory-image-164319196325397.png
Downloading image: https://image.pixstory.com/optimized/Pixstory-image-164863776936168.png

Downloading image: https://image.pixstory.com/optimized/Pixstory-image-164448325957858.png

# Old Method - Downloading images

In [7]:
#check how many files in a folder: 
#cd /Users/daniilabbruzzese/Documents/Senior\ Year/DSCI\ 550/assignment\ 2/DSCI550-PixstoryMediaExtractionAndAnalysis/4_Tika\ Image\ Dockers/95k_Images
#ls -1 | wc -l

# Rows whose index label is below this value are skipped when downloading resumes
start_index = 29_944

# Sequential downloader: fetch one image unless its row lies before the resume point
def download_image(url):
    """Download ``url`` into ``95k_Images/`` unless its row precedes ``start_index``.

    The row is located by finding the first entry of the module-level ``df``
    whose "Media" value equals ``url``; that row's index label is compared with
    the module-level ``start_index``.

    Parameters:
        url: image URL as stored in ``df['Media']``.

    Returns:
        ``None`` in every case. On a successful download ``start_index`` is
        advanced to the label just after this row; on a failed download the
        error is printed and ``start_index`` is left unchanged.
    """
    global start_index
    matching_rows = df[df['Media'] == url]
    row_label = matching_rows.index[0]

    if row_label >= start_index:
        print(f"Downloading image: {url}")
        try:
            urllib.request.urlretrieve(url, f"95k_Images/{url.split('/')[-1]}")
        except Exception as e:
            print(f"Error downloading image: {url} - {str(e)}")
        else:
            # Only a successful download moves the resume point forward
            start_index = row_label + 1
    else:
        print(f"Skipping image: {url}")

# Call download_image on every URL in the "Media" column, in row order.
# A plain loop is used because download_image is called only for its side
# effects; Series.apply would build (and the cell would display) a Series of
# None values with one entry per row.
for url in df['Media']:
    download_image(url)


# Verifying Downloading Process is Correct

In [7]:
#check how many files in a folder: 
#cd /Users/daniilabbruzzese/Documents/Senior\ Year/DSCI\ 550/assignment\ 2/DSCI550-PixstoryMediaExtractionAndAnalysis/4_Tika\ Image\ Dockers/95k_Images
#ls -1 | wc -l

# True when the DataFrame's index labels never decrease from one row to the next
index_is_sorted = df.index.is_monotonic_increasing
print(index_is_sorted)

True


In [11]:

# Total number of rows in the DataFrame
num_rows = df.shape[0]

# Number of distinct URLs in the "Media" column and of distinct index labels
num_unique_urls = df['Media'].nunique()
num_unique_index = len(df.index.unique())

# Report the counts; all three are equal when every row has a distinct,
# non-missing URL and a distinct index label
print(f"Number of unique URLs: {num_unique_urls}")
print(f"Total number of rows: {num_rows}")
print("Number of unique index numbers:", num_unique_index)


Number of unique URLs: 80585
Total number of rows: 80585
Number of unique index numbers: 80585


In [12]:


# Folder that holds the downloaded images
dir_path = '95k_Images'

# Every non-hidden entry directly inside the folder
all_entries_pattern = os.path.join(dir_path, '*')
files_list = glob.glob(all_entries_pattern)

# Number of paths found, and number of distinct paths among them
total_files, unique_files = len(files_list), len(set(files_list))

print(f"Total files: {total_files}")
print(f"Unique files: {unique_files}")


Total files: 80584
Unique files: 80584
