# Instagram Multi-User Scraper (DOM-based, Mobile View)
Item-by-item downloading with smart scroll estimation
Downloads each carousel item sequentially to capture all videos

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import time
from bs4 import BeautifulSoup as bs
import requests
from pathlib import Path
import json
import hashlib
from datetime import datetime, timedelta

In [None]:
# Configuration
# Account used for the login step; when either value is blank the login is skipped
USERNAME = ""  # Your Instagram username
PASSWORD = ""  # Your Instagram password

# Profiles to process, in order
USERS_TO_SCRAPE = list()  # e.g., ["user1", "user2", "user3"]

# Cutoff date - posts from this date forward are the target of the scrape
CUTOFF_DATE = datetime(year=2023, month=8, day=17)

# Root folder that receives one sub-folder per post; created up front
DOWNLOAD_DIR = Path("instagram_downloads")
DOWNLOAD_DIR.mkdir(exist_ok=True)

# Global tracking
downloaded_hashes = set()  # MD5 digests of media already saved in this session
stats_log = list()         # one dict per scraped post

In [None]:
# Launch Chrome configured to emulate a mobile device
options = webdriver.ChromeOptions()
options.add_experimental_option(
    'mobileEmulation',
    dict(deviceName='iPhone 12 Pro'),
)

# Single browser session shared by every function below
driver = webdriver.Chrome(options=options)
print("✓ Browser opened with mobile emulation")

In [None]:
# Login to Instagram
def login_instagram(username, password):
    """Log in to Instagram through the shared ``driver`` session.

    Opens the home page, follows the login link when one is shown, submits
    the credentials, and then tries to dismiss up to two "Not now" dialogs.
    The login link and the dialogs are optional: when one does not appear
    within its wait window the step is skipped.

    Args:
        username: Text typed into the form field named ``username``.
        password: Text typed into the form field named ``password``.

    Returns:
        True when the form was submitted without an exception, False when a
        required step raised. The page is not inspected afterwards, so True
        does not prove that Instagram accepted the credentials.
    """
    driver.get('https://www.instagram.com/')
    time.sleep(6)

    # Optional dialogs shown after submitting, dismissed in this order:
    # "Save Your Login Info", then "Turn on Notifications".
    dismiss_xpaths = (
        "//button[contains(text(), 'Not now') or contains(text(), 'Not Now')]",
        "//button[contains(text(), 'Not Now')]",
    )

    try:
        # Click the login link if we landed on the homepage.
        # `except Exception` (not a bare except) keeps the step best-effort
        # while still letting Ctrl-C / SystemExit stop the run.
        try:
            login_link = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//a[contains(@href, '/accounts/login')]"))
            )
            login_link.click()
            time.sleep(4)
        except Exception:
            pass

        # Enter username
        username_input = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.NAME, "username"))
        )
        username_input.send_keys(username)

        # Enter password and submit the form with RETURN
        password_input = driver.find_element(By.NAME, "password")
        password_input.send_keys(password)
        password_input.send_keys(Keys.RETURN)

        time.sleep(10)

        for xpath in dismiss_xpaths:
            try:
                not_now = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, xpath))
                )
                not_now.click()
                time.sleep(4)
            except Exception:
                # Dialog did not show up (or could not be clicked): carry on.
                pass

        print("✓ Logged in successfully")
        return True

    except Exception as e:
        print(f"Login failed: {e}")
        return False

# Log in only when both credentials have been filled in
if not (USERNAME and PASSWORD):
    print("⚠ No login credentials provided")
else:
    login_instagram(USERNAME, PASSWORD)

In [None]:
# Extract post date from a post page
def extract_post_date():
    """Read the timestamp of the post currently open in ``driver``.

    Uses the ``datetime`` attribute of the first ``<time class="x1p4m5qa">``
    element on the page.

    Returns:
        A naive ``datetime`` (the UTC offset is dropped, not converted), or
        None when no such element exists, the attribute is empty, or the
        lookup/parsing fails.
    """
    try:
        time_elements = driver.find_elements(By.CSS_SELECTOR, 'time.x1p4m5qa')
        if not time_elements:
            return None

        datetime_str = time_elements[0].get_attribute('datetime')
        if not datetime_str:
            return None

        # fromisoformat() on older Python versions rejects a trailing "Z",
        # so spell the UTC offset out explicitly.
        post_date = datetime.fromisoformat(datetime_str.replace('Z', '+00:00'))
        return post_date.replace(tzinfo=None)
    except Exception:
        # Best-effort: a missing or unparsable date is reported as None.
        # Unlike a bare except, this lets KeyboardInterrupt propagate.
        return None

In [None]:
# Extract post caption/text
def extract_post_caption():
    """Return the caption text of the post currently open in ``driver``.

    The caption is taken from the first ``<h1>`` matching the class list
    ``_ap3a _aaco _aacu _aacx _aad7 _aade``.

    Returns:
        The element's text, or an empty string when no element matches or
        the lookup fails.
    """
    try:
        caption_elements = driver.find_elements(
            By.CSS_SELECTOR, 'h1._ap3a._aaco._aacu._aacx._aad7._aade'
        )
        if caption_elements:
            return caption_elements[0].text
    except Exception:
        # Best-effort: fall through to the empty caption. Unlike a bare
        # except, this lets KeyboardInterrupt propagate.
        pass
    return ""

In [None]:
# Sample posts to get their dates
def sample_post_dates(post_links, sample_count=3):
    """Visit the last few post links and collect their dates.

    Args:
        post_links: Sequence of post URLs; only the final ``sample_count``
            entries are visited (all of them when the list is shorter).
        sample_count: How many trailing links to sample. Zero or a negative
            value samples nothing.

    Returns:
        List of the dates returned by ``extract_post_date`` for the sampled
        posts, in visiting order. Posts whose date could not be read, or
        whose page load raised, are left out.
    """
    # Guard first: post_links[-0:] would be the WHOLE list, not an empty one.
    if sample_count <= 0:
        return []

    dates = []
    for post_url in post_links[-sample_count:]:
        try:
            driver.get(post_url)
            time.sleep(3)
            post_date = extract_post_date()
            if post_date:
                dates.append(post_date)
                print(f"  Sampled: {post_date.strftime('%Y-%m-%d')}")
        except Exception:
            # Skip posts that fail to load; Ctrl-C still interrupts the loop.
            continue

    return dates

In [None]:
# Estimate scrolls needed to reach cutoff date
def _open_profile_and_scroll(profile_url, scroll_count):
    """Load ``profile_url`` and scroll to the bottom ``scroll_count`` times."""
    driver.get(profile_url)
    time.sleep(6)
    for _ in range(scroll_count):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(4)


def _collect_post_links():
    """Return the unique ``/p/`` links present in the page, in DOM order."""
    anchors = driver.find_elements(By.CSS_SELECTOR, 'a[href*="/p/"]')
    hrefs = (anchor.get_attribute('href') for anchor in anchors)
    return list(dict.fromkeys(href for href in hrefs if href and '/p/' in href))


def estimate_scrolls_needed(username):
    """Estimate how many profile scrolls are needed to reach ``CUTOFF_DATE``.

    The profile is loaded twice: once with 1 scroll and once with 3 scrolls.
    After each load the last three loaded posts are sampled for their dates.
    The difference between the earliest sampled dates is treated as the time
    span covered by 2 scrolls and extrapolated linearly down to the cutoff,
    with 5 extra scrolls added as margin.

    Args:
        username: Profile name appended to ``https://www.instagram.com/``.

    Returns:
        The estimated total number of scrolls, capped at 200. Returns 50 when
        no dates could be sampled or the measured span is not at least one
        whole day, and 3 when the posts loaded after 3 scrolls already reach
        the cutoff date.
    """
    print("\nEstimating scrolls needed...")

    profile_url = f"https://www.instagram.com/{username}/"

    # First measurement: a single scroll
    _open_profile_and_scroll(profile_url, 1)
    links_1_scroll = _collect_post_links()

    print(f"After 1 scroll: {len(links_1_scroll)} posts loaded")
    print("Sampling posts after 1 scroll...")
    dates_1_scroll = sample_post_dates(links_1_scroll, sample_count=3)

    if not dates_1_scroll:
        print("Could not sample dates, using default scroll count")
        return 50

    earliest_1_scroll = min(dates_1_scroll)
    latest_1_scroll = max(dates_1_scroll)
    print(f"Date range after 1 scroll: {latest_1_scroll.strftime('%Y-%m-%d')} to {earliest_1_scroll.strftime('%Y-%m-%d')}")

    # Second measurement: sampling navigated away, so reload and scroll 3 times
    _open_profile_and_scroll(profile_url, 3)
    links_3_scroll = _collect_post_links()

    print(f"After 3 scrolls: {len(links_3_scroll)} posts loaded")
    print("Sampling posts after 3 scrolls...")
    dates_3_scroll = sample_post_dates(links_3_scroll, sample_count=3)

    if not dates_3_scroll:
        print("Could not sample dates, using default scroll count")
        return 50

    earliest_3_scroll = min(dates_3_scroll)
    print(f"Earliest date after 3 scrolls: {earliest_3_scroll.strftime('%Y-%m-%d')}")

    # Whole days gained by the two additional scrolls
    days_per_2_scrolls = (earliest_1_scroll - earliest_3_scroll).days

    if days_per_2_scrolls <= 0:
        print("Time difference too small, using default scroll count")
        return 50

    print(f"Time covered by 2 scrolls: {days_per_2_scrolls} days")

    days_to_cutoff = (earliest_3_scroll - CUTOFF_DATE).days

    if days_to_cutoff <= 0:
        print("Already reached cutoff date!")
        return 3

    # Linear extrapolation plus a safety margin of 5 scrolls
    additional_scrolls = int((days_to_cutoff / days_per_2_scrolls) * 2) + 5
    total_scrolls = 3 + additional_scrolls

    print(f"Days to cutoff: {days_to_cutoff}")
    print(f"Estimated total scrolls needed: {total_scrolls}")

    if total_scrolls > 200:
        print("Capping at 200 scrolls")
        return 200

    return total_scrolls

In [None]:
# Extract post stats
def extract_post_stats():
    """Collect like count, comment count and paid-partnership flag.

    Operates on the post currently open in ``driver``. Each of the three
    lookups is independent and best-effort: a failure leaves that value at
    its default.

    Returns:
        Tuple ``(likes, comments, is_paid)`` where ``likes`` and ``comments``
        are the raw strings read from the page (default ``"0"``) and
        ``is_paid`` is True when the page source contains the text
        ``"Paid partnership with "``.
    """
    likes = "0"
    comments = "0"
    is_paid = False

    # `except Exception` rather than a bare except, so Ctrl-C still works.
    try:
        like_spans = driver.find_elements(
            By.CSS_SELECTOR,
            'span.x1ypdohk.x1s688f.x2fvf9.xe9ewy2[role="button"]'
        )
        if like_spans:
            likes = like_spans[0].text
    except Exception:
        pass

    try:
        comment_spans = driver.find_elements(
            By.CSS_SELECTOR,
            'span.xdj266r.x14z9mp.xat24cr.x1lziwak.xexx8yu.xyri2b.x18d9i69.x1c1uobl.x1hl2dhg.x16tdsg8.x1vvkbs'
        )
        for span in comment_spans:
            text = span.text
            # The selector matches several spans; take the first one that is
            # purely a number (thousands separators allowed).
            if text.replace(',', '').isdigit():
                comments = text
                break
    except Exception:
        pass

    try:
        is_paid = "Paid partnership with " in driver.page_source
    except Exception:
        pass

    return likes, comments, is_paid

In [None]:
# Extract current carousel item media URL
def extract_current_item_url():
    """Find the media URL of the carousel item currently shown in ``driver``.

    Videos take priority: the first ``<video>`` whose ``src`` points at an
    Instagram/Facebook CDN host wins. Otherwise the CDN-hosted ``<img>`` with
    the largest rendered area is chosen, ignoring URLs that contain one of
    the thumbnail/static markers.

    Returns:
        Tuple ``(url, media_type)`` with ``media_type`` being ``'video'`` or
        ``'image'``, or ``(None, None)`` when nothing suitable was found or
        the lookup raised (the error is printed).
    """
    cdn_hosts = ('cdninstagram.com', 'fbcdn.net')
    skip_markers = ('/s150x150/', '/s320x320/', 's640x640', 'static')

    def hosted_on_cdn(src):
        return bool(src) and any(host in src for host in cdn_hosts)

    try:
        # Check for video first (priority)
        for video in driver.find_elements(By.TAG_NAME, 'video'):
            src = video.get_attribute('src')
            if hosted_on_cdn(src):
                return src, 'video'

        # If no video, pick the largest image (skipping thumbnails and static assets)
        best_image = None
        best_size = 0

        for img in driver.find_elements(By.TAG_NAME, 'img'):
            src = img.get_attribute('src')
            if not hosted_on_cdn(src):
                continue
            if any(marker in src for marker in skip_markers):
                continue

            try:
                dimensions = img.size  # fetched once; each access queries the browser
                size = dimensions.get('width', 0) * dimensions.get('height', 0)
            except Exception:
                # Size unavailable: keep this image only as a fallback.
                if not best_image:
                    best_image = src
                continue

            if size > best_size:
                best_size = size
                best_image = src

        if best_image:
            return best_image, 'image'

    except Exception as e:
        print(f"  Error extracting media: {e}")

    return None, None

In [None]:
# Check if next button exists
def has_next_button():
    """Return the carousel "next" button if one is clickable, else None.

    Tries each known selector in turn, waiting up to 2 seconds per selector
    for a clickable match, and returns the first element found. When no
    selector matches the call therefore takes about 6 seconds.
    """
    selectors = (
        'button[aria-label="Next"]',
        'button[aria-label="next"]',
        'button._afxw._al46._al47',
    )

    for selector in selectors:
        try:
            return WebDriverWait(driver, 2).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
            )
        except Exception:
            # Timed out (or lookup failed) for this selector; try the next.
            # Unlike a bare except, this lets KeyboardInterrupt propagate.
            continue
    return None

In [None]:
# Download single file
def download_file(url, filepath):
    """Download ``url`` to ``filepath`` unless the same content was saved before.

    Duplicate detection uses the MD5 digest of the response body against the
    module-level ``downloaded_hashes`` set.

    Args:
        url: Address of the media file to fetch (10 second timeout).
        filepath: ``pathlib.Path`` the bytes are written to.

    Returns:
        ``(True, None)`` on success, ``(False, "duplicate")`` when the
        content was already downloaded, or ``(False, message)`` with the
        first 40 characters of the error text when the request or the write
        failed.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        payload = response.content

        # Check for duplicates
        content_hash = hashlib.md5(payload).hexdigest()
        if content_hash in downloaded_hashes:
            return False, "duplicate"

        filepath.write_bytes(payload)
    except Exception as e:
        return False, str(e)[:40]

    # Register the digest only after the file is on disk; otherwise a failed
    # write would make every later attempt look like a duplicate.
    downloaded_hashes.add(content_hash)
    return True, None

In [None]:
# Scrape a single post (item-by-item download)
def scrape_post(post_url):
    """Scrape one post: metadata, caption, and every carousel item.

    Loads the post, reads its date, stats and caption, then walks the
    carousel with the "next" button and downloads the media shown at each
    step into ``DOWNLOAD_DIR/<post_id>/item_<n>.<ext>``. A summary dict is
    appended to the module-level ``stats_log``.

    Posts whose date could be read and is earlier than ``CUTOFF_DATE`` are
    skipped: nothing is downloaded or logged for them. Posts whose date
    cannot be determined are still scraped.

    Args:
        post_url: Full URL of the post page.

    Returns:
        Number of media files saved for this post (0 for a skipped post).
    """
    print(f"\n{'='*60}")
    print(f"Scraping: {post_url}")
    print('='*60)

    driver.get(post_url)
    time.sleep(6)

    # Extract date and enforce the configured cutoff
    post_date = extract_post_date()
    if post_date:
        print(f"Date: {post_date.strftime('%Y-%m-%d %H:%M:%S')}")
        if post_date < CUTOFF_DATE:
            print(f"Skipped: post is older than cutoff date {CUTOFF_DATE.strftime('%Y-%m-%d')}")
            return 0

    # Extract stats
    likes, comments, is_paid = extract_post_stats()
    print(f"Stats: Likes={likes}, Comments={comments}, Paid={is_paid}")

    # Extract caption
    caption = extract_post_caption()

    # Create directory for this post. Strip any query string / fragment
    # first so ".../p/ABC/?img_index=1" still yields "ABC".
    clean_url = post_url.split('?', 1)[0].split('#', 1)[0]
    post_id = clean_url.rstrip('/').split('/')[-1]
    post_dir = DOWNLOAD_DIR / post_id
    post_dir.mkdir(parents=True, exist_ok=True)

    # Save caption
    if caption:
        caption_file = post_dir / "caption.txt"
        caption_file.write_text(caption, encoding='utf-8')
        print(f"Saved caption ({len(caption)} chars)")

    # Download carousel items one by one
    item_number = 1
    success_count = 0

    while True:
        # Extract current item
        url, media_type = extract_current_item_url()

        if url:
            ext = '.mp4' if media_type == 'video' else '.jpg'
            filepath = post_dir / f"item_{item_number}{ext}"

            success, error = download_file(url, filepath)
            if success:
                print(f"  Downloaded item_{item_number}{ext} ({media_type})")
                success_count += 1
            elif error == "duplicate":
                print(f"  Skipped item_{item_number}: Duplicate")
            else:
                print(f"  Failed item_{item_number}: {error}")

            time.sleep(5)  # 5 second pause

        # Try to click next
        next_btn = has_next_button()
        if not next_btn:
            break

        try:
            next_btn.click()
        except Exception as e:
            # A click that fails (stale or covered element) ends this post's
            # carousel instead of aborting the whole user run.
            print(f"  Could not advance carousel: {str(e)[:40]}")
            break

        time.sleep(1)
        item_number += 1

    print(f"Processed {item_number} carousel items")

    # Log stats
    stats_log.append({
        'timestamp': datetime.now().isoformat(),
        'post_url': post_url,
        'post_id': post_id,
        'post_date': post_date.isoformat() if post_date else None,
        'likes': likes,
        'comments': comments,
        'paid_partnership': is_paid,
        'caption_length': len(caption) if caption else 0,
        'carousel_items': item_number,
        'media_downloaded': success_count
    })

    print(f"✓ Downloaded {success_count} unique items to '{post_dir}'")
    return success_count

In [None]:
# Scrape user profile
def scrape_user(username, max_posts=None):
    """Scrape every post reachable from one profile.

    Scrolls the profile as many times as ``estimate_scrolls_needed``
    suggests, gathers the unique ``/p/`` links in the page, and passes each
    one to ``scrape_post``.

    Args:
        username: Profile name appended to ``https://www.instagram.com/``.
        max_posts: When truthy, only the first ``max_posts`` links are
            scraped.

    Returns:
        Sum of the values returned by ``scrape_post``, or 0 when link
        collection or a post scrape raised (the error is printed).
    """
    print(f"\n{'='*60}")
    print(f"Scraping user: {username}")
    print(f"Cutoff date: {CUTOFF_DATE.strftime('%Y-%m-%d')}")
    print('='*60)

    # Work out how far to scroll before touching the profile for real
    estimated_scrolls = estimate_scrolls_needed(username)

    print(f"\nScrolling {estimated_scrolls} times...")
    driver.get(f"https://www.instagram.com/{username}/")
    time.sleep(6)

    for i in range(estimated_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(4)

        # Progress line every tenth scroll
        if (i + 1) % 10 == 0:
            print(f"  Scrolled {i + 1}/{estimated_scrolls} times...")

    print(f"Completed {estimated_scrolls} scrolls")

    try:
        # Gather hrefs first, then filter and de-duplicate while keeping order
        anchors = driver.find_elements(By.CSS_SELECTOR, 'a[href*="/p/"]')
        hrefs = [anchor.get_attribute('href') for anchor in anchors]
        post_links = list(dict.fromkeys(h for h in hrefs if h and '/p/' in h))

        if max_posts:
            post_links = post_links[:max_posts]

        print(f"Found {len(post_links)} posts")

        # Visit the posts one after another, pausing between them
        total_downloaded = 0
        for i, post_url in enumerate(post_links, 1):
            print(f"\nPost {i}/{len(post_links)}")
            total_downloaded += scrape_post(post_url)
            time.sleep(4)

        print(f"\n✓ User '{username}' complete: {total_downloaded} total items downloaded")
        return total_downloaded

    except Exception as e:
        print(f"Error scraping user: {e}")
        return 0

In [None]:
# Main scraping loop: warn when nothing is configured, otherwise process
# each profile in turn with a pause after every one
if not USERS_TO_SCRAPE:
    print("⚠ No users to scrape. Add usernames to USERS_TO_SCRAPE list.")

for username in USERS_TO_SCRAPE:
    scrape_user(username, max_posts=None)
    time.sleep(6)

In [None]:
# Write the collected per-post stats to JSON and print a short summary
if not stats_log:
    print("No stats to save")
else:
    stats_file = DOWNLOAD_DIR / "scrape_stats.json"
    with stats_file.open('w') as f:
        json.dump(stats_log, f, indent=2)

    print(f"\n✓ Stats saved to {stats_file}")
    print(f"\nTotal stats:")
    print(f"  Posts scraped: {len(stats_log)}")
    print(f"  Items downloaded: {sum(s['media_downloaded'] for s in stats_log)}")
    print(f"  Paid partnerships: {sum(1 for s in stats_log if s['paid_partnership'])}")

In [None]:
# Shut the browser session down and report completion
driver.quit()
for message in ("\n✓ Browser closed", "✓ All done!"):
    print(message)