In [1]:
import pandas as pd
import os
import requests
import numpy as np
import time
from fake_useragent import UserAgent
from help_functions import is_file_saved

# Load the scraping manifest. The first CSV column is dropped by position
# (presumably a saved index column -- confirm against the file).
df = pd.read_csv("images_to_scrape.csv").iloc[:, 1:]

#=====================
# CREATE THE FOLDERS 
#=====================
# Target layout: <cwd>/<OUT_FOLDER>/<split>/<art_movement>/
cwd = os.getcwd()
OUT_FOLDER = "raw_data"

main_dirs = ['train', 'validation', 'test']
categories = df.art_movement.unique().tolist()

# os.makedirs(..., exist_ok=True) creates any missing parent (including
# OUT_FOLDER itself) and tolerates directories that already exist, so neither
# exists() checks nor changing the working directory is needed. The process
# stays in `cwd` throughout, even if a directory cannot be created.
for m in main_dirs:
    newdir = os.path.join(cwd, OUT_FOLDER, m)
    os.makedirs(newdir, exist_ok=True)
    for cat in categories:
        os.makedirs(os.path.join(newdir, str(cat)), exist_ok=True)


#=========================
# SCRAPING THE IMAGES
#=========================

# Keep only rows assigned to train/validation/test. A `folder` value containing
# "sorry" marks a row that is not part of any split.
SCRAPE_COLUMNS = ['folder', 'art_movement', 'jpeg', 'image_name']
my_df = df[~df.folder.str.contains("sorry")].reset_index(drop=True)[SCRAPE_COLUMNS].copy()
# Per-row status, updated once the image has been downloaded AND found on disk.
my_df['scrape_state'] = 'not_scraped'

CHANGE_RATE = 100      # pick a new random User-Agent every CHANGE_RATE rows
SLEEP_TIME = 1.5       # seconds to wait after each row
REQUEST_TIMEOUT = 30   # seconds; without a timeout a stalled server blocks forever

# initialize fake user agent
fake_agent = UserAgent().random
headers = {'User-Agent': fake_agent}

# Iterate over a snapshot of the input columns so that writing `scrape_state`
# into my_df inside the loop cannot interfere with the iteration.
for i, row in enumerate(my_df[SCRAPE_COLUMNS].itertuples(index=False)):
    # split folder + art movement identify the directory the image belongs to
    newdir = f"{cwd}/{OUT_FOLDER}/{row.folder}/{row.art_movement}"
    image_name = row.image_name + '.jpg'  # file name used on disk

    # changing the user agent every CHANGE_RATE rows
    if i % CHANGE_RATE == 0:
        fake_agent = UserAgent().random
        headers = {'User-Agent': fake_agent}
        print(f"{i} images done, changing agent")

    # One request per image: the same response supplies both the status code
    # and the bytes. The headers are actually sent, so the rotating
    # User-Agent takes effect.
    try:
        response = requests.get(row.jpeg, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        # connection error, timeout, malformed URL, ...: record it and move on
        # instead of aborting the whole run
        response = None

    if response is None or response.status_code != 200:  # request not accepted
        my_df.loc[i, "scrape_state"] = "invalid request"
        print("error request")
    else:  # request was successful
        try:
            # Write by full path; no os.chdir needed, and `with` closes the file.
            with open(os.path.join(newdir, image_name), 'wb') as handle:
                handle.write(response.content)
            # NOTE(review): is_file_saved receives the directory explicitly, so
            # it is assumed not to rely on the current working directory --
            # confirm in help_functions.
            saved = is_file_saved(newdir, image_name)
        except OSError:
            saved = False

        if saved:
            my_df.loc[i, "scrape_state"] = 'saved'
        else:
            my_df.loc[i, "scrape_state"] = 'unable to save'
            print("error saving")

    time.sleep(SLEEP_TIME)

n_failed = int((my_df['scrape_state'] != "saved").sum())
print("unable to get", n_failed, "images.")
# Explicit path: the results land in the starting directory regardless of the
# process's current working directory.
my_df.to_csv(os.path.join(cwd, "scraping_results.csv"))

0 images done, changing agent
100 images done, changing agent
200 images done, changing agent
300 images done, changing agent
400 images done, changing agent
500 images done, changing agent
600 images done, changing agent
700 images done, changing agent
800 images done, changing agent
900 images done, changing agent
1000 images done, changing agent
1100 images done, changing agent
1200 images done, changing agent
1300 images done, changing agent
1400 images done, changing agent
1500 images done, changing agent
1600 images done, changing agent
1700 images done, changing agent
1800 images done, changing agent
1900 images done, changing agent
2000 images done, changing agent
2100 images done, changing agent
2200 images done, changing agent
2300 images done, changing agent
2400 images done, changing agent
2500 images done, changing agent
2600 images done, changing agent
2700 images done, changing agent
2800 images done, changing agent
2900 images done, changing agent
3000 images done, chan