# Instagram Multi-User Scraper (DOM-based, Mobile View)
Item-by-item downloading with smart scroll estimation
Downloads ALL valid images/videos at each carousel position (URL-tracked globally to avoid duplicates)

In [None]:
import hashlib
import json
import os
import time
from datetime import datetime, timedelta
from pathlib import Path

import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [None]:
# Configuration
# Credentials are read from the environment so they never have to be saved in
# this file (notebooks are easily shared or committed). When the variables are
# unset both values fall back to "", which skips the login step.
USERNAME = os.environ.get("INSTAGRAM_USERNAME", "")  # Your Instagram username
PASSWORD = os.environ.get("INSTAGRAM_PASSWORD", "")  # Your Instagram password

# List of usernames to scrape
USERS_TO_SCRAPE = []  # e.g., ["user1", "user2", "user3"]

# Cutoff date - only scrape posts from this date forward.
# Naive datetime: it is compared against the naive dates read from post pages.
CUTOFF_DATE = datetime(2023, 8, 17, 0, 0, 0)

# Sleep multiplier - every time.sleep() in the scraper is scaled by this value
SLEEP_MULTIPLIER = 1  # Change to 2 or 3 if you need slower execution

# Root folder for downloads; each post gets its own sub-folder
DOWNLOAD_DIR = Path("instagram_downloads")
DOWNLOAD_DIR.mkdir(exist_ok=True)

# Global tracking (shared by all users scraped in one run)
downloaded_hashes = set()  # MD5 digests of file contents already saved
seen_urls = set()  # Track image URLs globally to avoid re-downloading
stats_log = []  # One dict per scraped post, written to JSON at the end

In [None]:
# Start Chrome with mobile emulation enabled (pages render as on a phone)
options = webdriver.ChromeOptions()
options.add_experimental_option(
    'mobileEmulation',
    dict(deviceName='iPhone 12 Pro'),
)

# Single browser session shared by every function in this notebook
driver = webdriver.Chrome(options=options)

print("✓ Browser opened with mobile emulation")
print(f"Sleep multiplier: {SLEEP_MULTIPLIER}x")

In [None]:
# Login to Instagram
def _click_if_present(xpath, timeout=10):
    """Click the element matching *xpath* if it becomes clickable in time.

    Used for optional page elements (login link, "Not now" popups) whose
    absence is not an error.

    Returns:
        True if the element was clicked, False if it never appeared or the
        click failed.
    """
    try:
        WebDriverWait(driver, timeout).until(
            EC.element_to_be_clickable((By.XPATH, xpath))
        ).click()
        return True
    except Exception:
        # Best-effort by design. ``Exception`` (rather than a bare except)
        # lets Ctrl-C / SystemExit still stop the notebook.
        return False


def login_instagram(username, password):
    """Submit the Instagram login form and dismiss the follow-up popups.

    Args:
        username: Account name typed into the ``username`` field.
        password: Password typed into the ``password`` field.

    Returns:
        True when the form was submitted without a Selenium error, False
        otherwise. Note that True does not prove Instagram accepted the
        credentials; the function does not inspect the resulting page.
    """
    driver.get('https://www.instagram.com/')

    try:
        # The homepage may show a "Log in" link before the form itself
        _click_if_present("//a[contains(@href, '/accounts/login')]")

        # Enter username
        username_input = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.NAME, "username"))
        )
        username_input.send_keys(username)

        # Enter password and submit
        password_input = driver.find_element(By.NAME, "password")
        password_input.send_keys(password)
        password_input.send_keys(Keys.RETURN)

        # Handle "Save Your Login Info" popup
        _click_if_present(
            "//button[contains(text(), 'Not now') or contains(text(), 'Not Now')]"
        )

        # Handle "Turn on Notifications" popup
        _click_if_present("//button[contains(text(), 'Not Now')]")

        print("✓ Logged in successfully")
        return True

    except Exception as e:
        print(f"Login failed: {e}")
        return False

# Log in only when both credentials are configured; otherwise just warn
if not (USERNAME and PASSWORD):
    print("⚠ No login credentials provided")
else:
    login_instagram(USERNAME, PASSWORD)

In [None]:
# Extract post date from a post page
def extract_post_date():
    """Return the publication time of the currently loaded post.

    Reads the ``datetime`` attribute of the first ``time.x1p4m5qa`` element
    on the page and parses it as ISO 8601.

    Returns:
        A naive ``datetime`` (normalised to UTC when the attribute carries an
        offset) so it can be compared with the naive ``CUTOFF_DATE``, or
        ``None`` when no usable timestamp is found.
    """
    from datetime import timezone

    try:
        time_elements = driver.find_elements(By.CSS_SELECTOR, 'time.x1p4m5qa')
        if time_elements:
            datetime_str = time_elements[0].get_attribute('datetime')
            if datetime_str:
                # fromisoformat() only accepts a trailing "Z" from Python 3.11
                post_date = datetime.fromisoformat(datetime_str.replace('Z', '+00:00'))
                if post_date.tzinfo is not None:
                    # Convert before dropping the offset; stripping tzinfo
                    # alone would keep the wall-clock time of a non-UTC offset
                    post_date = post_date.astimezone(timezone.utc)
                return post_date.replace(tzinfo=None)
    except Exception:
        # Best-effort: a missing element or malformed value means "unknown".
        # ``Exception`` keeps Ctrl-C / SystemExit working, unlike bare except.
        pass
    return None

In [None]:
# Extract post caption/text
def extract_post_caption():
    """Return the caption text of the currently loaded post.

    Returns:
        The text of the first ``h1`` matching the caption selector, or ``""``
        when no such element exists or the lookup fails.
    """
    try:
        # Look for h1 with caption text
        caption_elements = driver.find_elements(By.CSS_SELECTOR, 'h1._ap3a._aaco._aacu._aacx._aad7._aade')
        if caption_elements:
            return caption_elements[0].text
    except Exception:
        # Best-effort lookup; ``Exception`` (not a bare except) so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        pass
    return ""

In [None]:
# (sample_post_dates function removed - no longer needed)

In [None]:
# Estimate scrolls needed to reach cutoff date
def estimate_scrolls_needed(username):
    """Estimate how many profile scrolls are needed to reach ``CUTOFF_DATE``.

    The profile is sampled twice, once after one scroll and once after three
    scrolls. For each sample the last collected link is opened to read its
    date, and the days spanned by the two extra scrolls are extrapolated
    linearly down to the cutoff.

    Args:
        username: Instagram handle whose profile page is sampled.

    Returns:
        Number of scrolls to perform: 50 when sampling fails or the two
        samples do not span at least one full day, 3 when the three-scroll
        sample already reaches the cutoff, otherwise the extrapolated count
        (plus 5 scrolls of margin) capped at 200.
    """
    fallback_scrolls = 50
    profile_url = f"https://www.instagram.com/{username}/"

    def open_profile():
        # Load the profile grid and give it time to render
        driver.get(profile_url)
        time.sleep(6 * SLEEP_MULTIPLIER)

    def scroll_and_harvest(collected, known):
        # One scroll to the bottom, then append post/reel links not seen yet
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(4 * SLEEP_MULTIPLIER)
        for anchor in driver.find_elements(By.CSS_SELECTOR, 'a[href*="/p/"], a[href*="/reel/"]'):
            href = anchor.get_attribute('href')
            if not href or href in known:
                continue
            if '/p/' in href or '/reel/' in href:
                collected.append(href)
                known.add(href)

    def read_post_date(url):
        # Open a post page and read its timestamp (None if unavailable)
        driver.get(url)
        time.sleep(3 * SLEEP_MULTIPLIER)
        return extract_post_date()

    print("\nEstimating scrolls needed...")
    open_profile()

    # --- Sample 1: a single scroll ---
    all_links_1_scroll = []
    scroll_and_harvest(all_links_1_scroll, set())
    print(f"After 1 scroll: {len(all_links_1_scroll)} posts collected")

    if not all_links_1_scroll:
        print("No posts found, using default scroll count")
        return fallback_scrolls

    # The last collected link is taken as the oldest visible post
    print("Checking date of last post after 1 scroll...")
    earliest_1_scroll = read_post_date(all_links_1_scroll[-1])

    if not earliest_1_scroll:
        print("Could not extract date, using default scroll count")
        return fallback_scrolls

    print(f"Earliest date after 1 scroll: {earliest_1_scroll.strftime('%Y-%m-%d')}")

    # --- Sample 2: three scrolls, harvesting after each one ---
    open_profile()
    all_links_3_scroll = []
    known_links = set()
    for _ in range(3):
        scroll_and_harvest(all_links_3_scroll, known_links)

    print(f"After 3 scrolls: {len(all_links_3_scroll)} posts collected")

    if not all_links_3_scroll:
        print("No posts found, using default scroll count")
        return fallback_scrolls

    print("Checking date of last post after 3 scrolls...")
    earliest_3_scroll = read_post_date(all_links_3_scroll[-1])

    if not earliest_3_scroll:
        print("Could not extract date, using default scroll count")
        return fallback_scrolls

    print(f"Earliest date after 3 scrolls: {earliest_3_scroll.strftime('%Y-%m-%d')}")

    # Whole days covered by the two additional scrolls
    days_per_2_scrolls = (earliest_1_scroll - earliest_3_scroll).days

    if days_per_2_scrolls <= 0:
        print("Time difference too small, using default scroll count")
        return fallback_scrolls

    print(f"Time covered by 2 scrolls: {days_per_2_scrolls} days")

    days_to_cutoff = (earliest_3_scroll - CUTOFF_DATE).days

    if days_to_cutoff <= 0:
        print(f"Already reached cutoff date!")
        return 3

    # Linear extrapolation plus a 5-scroll safety margin
    additional_scrolls = int((days_to_cutoff / days_per_2_scrolls) * 2) + 5
    total_scrolls = 3 + additional_scrolls

    print(f"Days to cutoff: {days_to_cutoff}")
    print(f"Estimated total scrolls needed: {total_scrolls}")

    if total_scrolls > 200:
        print("Capping at 200 scrolls")
        return 200

    return total_scrolls

In [None]:
# Extract post stats
def extract_post_stats():
    """Read like count, comment count and paid-partnership flag from the page.

    Each of the three lookups is independent and best-effort: if one fails,
    its default is kept and the others are still attempted.

    Returns:
        Tuple ``(likes, comments, is_paid)`` where ``likes`` and ``comments``
        are the on-page strings (default ``"0"``) and ``is_paid`` is True
        when the page source contains "Paid partnership with ".
    """
    likes = "0"
    comments = "0"
    is_paid = False

    # ``except Exception`` (not a bare except) so that Ctrl-C / SystemExit
    # still interrupt a long scraping run.
    try:
        like_spans = driver.find_elements(By.CSS_SELECTOR, 'span.x1ypdohk.x1s688f.x2fvf9.xe9ewy2[role="button"]')
        if like_spans:
            likes = like_spans[0].text
    except Exception:
        pass

    try:
        comment_spans = driver.find_elements(
            By.CSS_SELECTOR,
            'span.xdj266r.x14z9mp.xat24cr.x1lziwak.xexx8yu.xyri2b.x18d9i69.x1c1uobl.x1hl2dhg.x16tdsg8.x1vvkbs'
        )
        for span in comment_spans:
            text = span.text
            # The selector matches many spans; take the first purely numeric one
            if text.replace(',', '').isdigit():
                comments = text
                break
    except Exception:
        pass

    try:
        if "Paid partnership with " in driver.page_source:
            is_paid = True
    except Exception:
        pass

    return likes, comments, is_paid

# Extract posting account (subbrand)
def extract_posting_account():
    """Return the account name shown as the poster of the current post.

    Useful when a brand profile shows posts from a sub-account
    (e.g., nikerunning as subbrand of nike).

    Returns:
        The stripped text of the first matching span that is non-empty and
        not purely numeric, or ``None`` when nothing qualifies or the lookup
        fails.
    """
    try:
        account_spans = driver.find_elements(
            By.CSS_SELECTOR,
            'span.x193iq5w.xeuugli.x1fj9vlw.x13faqbe.x1vvkbs.xt0psk2.x1i0vuye.xvs91rp.x1s688f.x5n08af.x10wh9bi.xpm28yp.x8viiok.x1o7cslx'
        )
        for span in account_spans:
            text = span.text.strip()
            # Skip blanks and counters; the first remaining text is the name
            if text and not text.isdigit():
                return text
    except Exception:
        # Best-effort lookup; ``Exception`` rather than a bare except so
        # KeyboardInterrupt / SystemExit are not swallowed.
        pass
    return None

In [None]:
# Extract current carousel item media URLs (returns ALL valid images/videos)
def extract_current_item_urls():
    """Collect the not-yet-downloaded media URLs visible on the current page.

    Videos are listed first, then images. Only URLs on the Instagram CDN
    hosts are kept; thumbnail-sized or static images and 150x150 profile
    pictures are filtered out, as is anything already in ``seen_urls``.
    Each URL appears at most once in the result, even if several elements
    share it, so the caller never downloads the same URL twice per pass.

    Returns:
        List of ``(url, media_type)`` tuples with ``media_type`` being
        ``'video'`` or ``'image'``. Empty on error (the error is printed).
    """
    media_urls = []
    queued = set()  # URLs already added during this call

    def wanted(src):
        """True for a CDN URL that is neither downloaded nor queued yet."""
        if not src or src in seen_urls or src in queued:
            return False
        return 'cdninstagram.com' in src or 'fbcdn.net' in src

    try:
        # Check for video first (priority)
        for video in driver.find_elements(By.TAG_NAME, 'video'):
            src = video.get_attribute('src')
            if wanted(src):
                queued.add(src)
                media_urls.append((src, 'video'))

        # Get ALL valid images (not just first one)
        for img in driver.find_elements(By.TAG_NAME, 'img'):
            src = img.get_attribute('src')
            if not wanted(src):
                continue

            # Filter out unwanted images
            if any(x in src for x in ['/s150x150/', '/s320x320/', 's640x640', 'static']):
                continue

            # Filter out 150x150 profile pics by dimension
            try:
                width = img.get_attribute('width')
                height = img.get_attribute('height')
                if width and height and int(width) == 150 and int(height) == 150:
                    continue
            except Exception:
                # Unreadable or non-numeric dimensions: keep the image
                # (better to have more than miss content)
                pass

            queued.add(src)
            media_urls.append((src, 'image'))

    except Exception as e:
        print(f"  Error extracting media: {e}")

    return media_urls

In [None]:
# Check if next button exists
def has_next_button():
    """Return the carousel "Next" button if one is clickable, else ``None``.

    Despite the name, the return value is the button element itself (truthy)
    so the caller can click it directly. Each selector is tried in order and
    given up to 2 seconds to become clickable.
    """
    selectors = (
        'button[aria-label="Next"]',
        'button[aria-label="next"]',
        'button._afxw._al46._al47',
    )

    for selector in selectors:
        try:
            return WebDriverWait(driver, 2).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
            )
        except Exception:
            # Timed out or lookup failed for this selector; try the next.
            # ``Exception`` (not bare except) keeps Ctrl-C working.
            continue
    return None

In [None]:
# Download single file
def download_file(url, filepath):
    """Download *url* to *filepath* unless identical content was saved before.

    Args:
        url: Media URL to fetch.
        filepath: ``pathlib.Path`` to write the response body to.

    Returns:
        ``(True, None)`` when the file was written;
        ``(False, "duplicate")`` when the content's MD5 digest is already in
        ``downloaded_hashes``;
        ``(False, message)`` on any error, with the message cut to 40
        characters.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        # Check for duplicates
        content_hash = hashlib.md5(response.content).hexdigest()
        if content_hash in downloaded_hashes:
            return False, "duplicate"

        # Write first, register the hash second: if the write fails, a later
        # retry must not be mistaken for a duplicate of a file that was
        # never saved.
        filepath.write_bytes(response.content)
        downloaded_hashes.add(content_hash)
        return True, None

    except Exception as e:
        return False, str(e)[:40]

In [None]:
# Scrape a single post (item-by-item download, all images per position)
def scrape_post(post_url):
    """Scrape one post: metadata, caption and every carousel item.

    Loads *post_url*, reads its date, stats, posting account and caption,
    then walks the carousel by clicking the "Next" button until none is
    found, downloading every new media URL at each position into
    ``DOWNLOAD_DIR/<post_id>/``. A summary dict is appended to ``stats_log``.

    Args:
        post_url: Full URL of the post or reel.

    Returns:
        Number of media files saved for this post.
    """
    print(f"\n{'='*60}")
    print(f"Scraping: {post_url}")
    print('='*60)

    driver.get(post_url)
    time.sleep(6 * SLEEP_MULTIPLIER)

    # Extract date
    post_date = extract_post_date()
    if post_date:
        print(f"Date: {post_date.strftime('%Y-%m-%d %H:%M:%S')}")

    # Extract stats
    likes, comments, is_paid = extract_post_stats()

    # Extract posting account (subbrand)
    posting_account = extract_posting_account()
    if posting_account:
        print(f"Posted by: {posting_account}")

    print(f"Stats: Likes={likes}, Comments={comments}, Paid={is_paid}")

    # Extract caption
    caption = extract_post_caption()

    # Create directory for this post (last URL path segment is the post id)
    post_id = post_url.rstrip('/').split('/')[-1]
    post_dir = DOWNLOAD_DIR / post_id
    post_dir.mkdir(exist_ok=True)

    # Save caption
    if caption:
        caption_file = post_dir / "caption.txt"
        caption_file.write_text(caption, encoding='utf-8')
        print(f"Saved caption ({len(caption)} chars)")

    # Download carousel items one by one
    carousel_position = 1
    total_items_downloaded = 0

    while True:
        # Extract ALL media URLs at current position
        media_urls = extract_current_item_urls()

        if media_urls:
            print(f"\n  Carousel position {carousel_position}: Found {len(media_urls)} items")

            # Download each media item
            for url, media_type in media_urls:
                ext = '.mp4' if media_type == 'video' else '.jpg'

                # Find next available filename
                item_num = 1
                while True:
                    filepath = post_dir / f"item_{item_num}{ext}"
                    if not filepath.exists():
                        break
                    item_num += 1

                success, error = download_file(url, filepath)
                if success:
                    print(f"    Downloaded item_{item_num}{ext} ({media_type})")
                    seen_urls.add(url)  # Mark URL as seen
                    total_items_downloaded += 1
                else:
                    if error == "duplicate":
                        print(f"    Skipped item_{item_num}: Duplicate content")
                        seen_urls.add(url)  # Still mark as seen
                    else:
                        # Not marked as seen, so it is retried if the URL is
                        # still present at the next carousel position
                        print(f"    Failed item_{item_num}: {error}")

                time.sleep(5 * SLEEP_MULTIPLIER)

        # Try to click next
        next_btn = has_next_button()
        if not next_btn:
            break

        try:
            next_btn.click()
        except Exception as e:
            # A stale or covered button ends the carousel walk. Keep what was
            # downloaded and still record the stats below instead of letting
            # the error abort the caller's whole run.
            print(f"  Could not advance carousel ({type(e).__name__}); stopping here")
            break

        time.sleep(1 * SLEEP_MULTIPLIER)
        carousel_position += 1

    print(f"\nProcessed {carousel_position} carousel positions")

    # Log stats
    stats_log.append({
        'timestamp': datetime.now().isoformat(),
        'post_url': post_url,
        'post_id': post_id,
        'post_date': post_date.isoformat() if post_date else None,
        'posting_account': posting_account,
        'likes': likes,
        'comments': comments,
        'paid_partnership': is_paid,
        'caption_length': len(caption) if caption else 0,
        'carousel_positions': carousel_position,
        'media_downloaded': total_items_downloaded
    })

    print(f"✓ Downloaded {total_items_downloaded} unique items to '{post_dir}'")
    return total_items_downloaded

In [None]:
# Scrape user profile
def scrape_user(username, max_posts=None):
    """Scrape every post of *username* published on or after ``CUTOFF_DATE``.

    Estimates the number of profile scrolls needed, scrolls that many times
    while collecting post/reel links, then visits each link: posts dated
    before the cutoff are skipped, the rest are handed to ``scrape_post``.

    Args:
        username: Instagram handle to scrape.
        max_posts: If truthy, only the first ``max_posts`` collected links
            (in the order the profile grid showed them) are processed.

    Returns:
        Total number of media files downloaded for this user. A post that
        raises is reported and counted as failed; it no longer discards the
        totals of the posts that succeeded.
    """
    print(f"\n{'='*60}")
    print(f"Scraping user: {username}")
    print(f"Cutoff date: {CUTOFF_DATE.strftime('%Y-%m-%d')}")
    print('='*60)

    # Estimate scrolls needed
    estimated_scrolls = estimate_scrolls_needed(username)

    # Scroll profile and collect links incrementally
    print(f"\nScrolling {estimated_scrolls} times...")
    profile_url = f"https://www.instagram.com/{username}/"
    driver.get(profile_url)
    time.sleep(6 * SLEEP_MULTIPLIER)

    # A dict keeps insertion order while de-duplicating, so links stay in the
    # order they were encountered and ``max_posts`` truncation is
    # deterministic (a set would yield an arbitrary subset).
    all_post_links = {}

    for i in range(estimated_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(4 * SLEEP_MULTIPLIER)

        # Collect links after each scroll
        links = driver.find_elements(By.CSS_SELECTOR, 'a[href*="/p/"], a[href*="/reel/"]')
        for link in links:
            href = link.get_attribute('href')
            if href and ('/p/' in href or '/reel/' in href):
                all_post_links.setdefault(href, None)

        if (i + 1) % 10 == 0:
            print(f"  Scrolled {i + 1}/{estimated_scrolls} times... ({len(all_post_links)} posts collected so far)")

    print(f"Completed {estimated_scrolls} scrolls")

    post_links = list(all_post_links)

    if max_posts:
        post_links = post_links[:max_posts]

    print(f"Found {len(post_links)} unique posts (including reels)")

    # Scrape each post
    total_downloaded = 0
    skipped_old = 0
    failed = 0

    for i, post_url in enumerate(post_links, 1):
        print(f"\nPost {i}/{len(post_links)}")

        try:
            # Check post date before downloading
            driver.get(post_url)
            time.sleep(3 * SLEEP_MULTIPLIER)

            post_date = extract_post_date()
            if post_date and post_date < CUTOFF_DATE:
                print(f"⏩ Skipping post from {post_date.strftime('%Y-%m-%d')} (before cutoff)")
                skipped_old += 1
                continue

            # scrape_post() reloads the URL itself and does the full scrape
            total_downloaded += scrape_post(post_url)
            time.sleep(4 * SLEEP_MULTIPLIER)
        except Exception as e:
            # One bad post must not abort the remaining posts of this user
            print(f"Error scraping post {post_url}: {e}")
            failed += 1

    print(f"\n✓ User '{username}' complete:")
    print(f"  Items downloaded: {total_downloaded}")
    print(f"  Posts skipped (before cutoff): {skipped_old}")
    if failed:
        print(f"  Posts failed: {failed}")
    return total_downloaded

In [None]:
# Main scraping loop: process each configured account in turn, pausing
# between accounts
if not USERS_TO_SCRAPE:
    print("⚠ No users to scrape. Add usernames to USERS_TO_SCRAPE list.")
else:
    for username in USERS_TO_SCRAPE:
        scrape_user(username, max_posts=None)
        time.sleep(6 * SLEEP_MULTIPLIER)

In [None]:
# Write the per-post stats collected during the run to JSON and print totals
if not stats_log:
    print("No stats to save")
else:
    stats_file = DOWNLOAD_DIR / "scrape_stats.json"
    with open(stats_file, 'w') as f:
        f.write(json.dumps(stats_log, indent=2))
    print(f"\n✓ Stats saved to {stats_file}")
    print(f"\nTotal stats:")
    print(f"  Posts scraped: {len(stats_log)}")
    print(f"  Items downloaded: {sum(entry['media_downloaded'] for entry in stats_log)}")
    print(f"  Paid partnerships: {len([entry for entry in stats_log if entry['paid_partnership']])}")

In [None]:
# Shut down the shared browser session and report completion
driver.quit()
for message in ("\n✓ Browser closed", "✓ All done!"):
    print(message)