# Get Links

In [2]:
import os
import pandas as pd
import threading
import concurrent.futures
import numpy as np
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# Load the CSV file
file_path = './data/combined_tabular_data.csv'
checkpoint_path = './data/checkpoint_data.csv'

write_lock = threading.Lock()

# Initialize or load the checkpoint file
if os.path.exists(checkpoint_path):
    file = pd.read_csv(checkpoint_path, low_memory=False)
else:
    file = pd.read_csv(file_path, low_memory=False)
    file['image_url'] = None
    file.to_csv(checkpoint_path, index=False)

def scrape_page(data_chunk):
    options = Options()
    options.add_argument('--headless=new')

    updates_buffer = []
    update_count = 0

    for index, row in data_chunk.iterrows():

        if pd.notna(row['image_url']):
            continue

        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

        page_url = row['link']
        driver.get(page_url)

        try:
            image_tag = WebDriverWait(driver, 60).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'img.leaflet-litter-img'))
            )
            image_url = image_tag.get_attribute('src')
            updates_buffer.append((row['id'], image_url))
            
            print(f'Found image URL: {image_url}')

            update_count += 1
            if update_count >= 15:
                with write_lock:
                    temp_file = pd.read_csv(checkpoint_path, low_memory=False)
                    print(temp_file.image_url.isna().sum())
                    for id, url in updates_buffer:
                        temp_file.loc[temp_file['id'] == id, 'image_url'] = url
                    temp_file.to_csv(checkpoint_path, index=False)
                    updates_buffer = []
                    update_count = 0

        except Exception as e:
            print(f'No image found on page or error: {page_url}')

        driver.quit()

    # Ensure any remaining updates are written out after loop completion
    if updates_buffer:
        with write_lock:
            temp_file = pd.read_csv(checkpoint_path, low_memory=False)
            for id, url in updates_buffer:
                temp_file.loc[temp_file['id'] == id, 'image_url'] = url
            temp_file.to_csv(checkpoint_path, index=False)

# # Split DataFrame into chunks
# num_chunks = 4
# chunks = np.array_split(file, num_chunks)

# # Process chunks in parallel
# with concurrent.futures.ThreadPoolExecutor() as executor:
#     executor.map(scrape_page, chunks)

scrape_page(file)

# Save the final DataFrame to a file
final_data = pd.read_csv(checkpoint_path)
final_data.to_excel('./data/final_output_with_images.xlsx', index=False)

No image found on page or error: https://openlittermap.com/global?lat=32.757176519402&lon=-117.14407081727&zoom=17&photo=309226
No image found on page or error: https://openlittermap.com/global?lat=32.757182302915&lon=-117.14453215722&zoom=17&photo=309760
Found image URL: https://olm-s3.s3.eu-west-1.amazonaws.com/2021/04/25/IMG_3893.HEIC


  final_data = pd.read_csv(checkpoint_path)
