# Instagram Multi-User Scraper (DOM-based)
Uses BeautifulSoup to extract media URLs directly from the DOM, so there is no need to parse messy browser network logs.

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import time
from bs4 import BeautifulSoup as bs
import requests
from pathlib import Path
import hashlib
import json
from datetime import datetime

In [None]:
# Settings
# Output folder for all downloads; created up front if it does not exist yet.
DOWNLOAD_DIR = Path("instagram_downloads")
DOWNLOAD_DIR.mkdir(exist_ok=True)

# Your Instagram username and password, in that order.
USERNAME, PASSWORD = "", ""

# Profiles to scrape, e.g. ["user1", "user2", "user3"]
USERS_TO_SCRAPE = list()

# Shared state: content hashes of files already saved, and per-post stats records.
downloaded_hashes, stats_log = set(), list()

In [None]:
# Launch a desktop Chrome session (maximized window) that every later cell
# drives through the module-level `driver`.
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")

driver = webdriver.Chrome(options=options)
print("✓ Browser opened")

In [None]:
# Login to Instagram
def login_instagram(username, password):
    """Log in to Instagram through the module-level Selenium ``driver``.

    Opens the Instagram home page, types the credentials into the login
    form and submits it, then makes a best-effort attempt to dismiss the
    "Save Your Login Info" and "Turn on Notifications" dialogs.

    Args:
        username: Text typed into the form field named ``username``.
        password: Text typed into the form field named ``password``.

    Returns:
        True if the form was filled in and submitted without raising,
        False if any of those steps raised (the error is printed).
        NOTE(review): the resulting page is not inspected, so True does
        not prove that Instagram accepted the credentials.
    """
    driver.get('https://www.instagram.com/')
    time.sleep(3)

    try:
        # Enter username
        username_input = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.NAME, "username"))
        )
        username_input.send_keys(username)

        # Enter password and submit the form with RETURN
        password_input = driver.find_element(By.NAME, "password")
        password_input.send_keys(password)
        password_input.send_keys(Keys.RETURN)

        time.sleep(5)

        # Dismiss, in order, the "Save Your Login Info" popup and then the
        # "Turn on Notifications" popup. Each one is optional.
        popup_xpaths = (
            "//button[contains(text(), 'Not now') or contains(text(), 'Not Now')]",
            "//button[contains(text(), 'Not Now')]",
        )
        for xpath in popup_xpaths:
            try:
                not_now = WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable((By.XPATH, xpath))
                )
                not_now.click()
                time.sleep(2)
            except Exception:
                # Best effort: the popup may simply not appear. Catching
                # Exception (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate so the run can still be aborted.
                pass

        print("✓ Logged in successfully")
        return True

    except Exception as e:
        print(f"Login failed: {e}")
        return False

# Log in only when both credentials have been filled in above.
if not (USERNAME and PASSWORD):
    print("⚠ No credentials - add USERNAME and PASSWORD above")
else:
    login_instagram(USERNAME, PASSWORD)

In [None]:
# Extract post stats
def extract_post_stats():
    """Read like count, comment count and paid-partnership flag from the page.

    Each of the three lookups is independent and best-effort: if one of
    them raises, its default value is kept and the others still run.

    Returns:
        A ``(likes, comments, is_paid)`` tuple. ``likes`` and ``comments``
        are strings (default ``"0"``) taken from the first
        whitespace-separated token of the matching element text;
        ``is_paid`` is True when the page source contains
        ``"Paid partnership with "``.
    """
    likes = "0"
    comments = "0"
    is_paid = False

    try:
        # Get likes - look for button with "like" text
        like_buttons = driver.find_elements(
            By.XPATH,
            "//button[contains(@class, 'x1i10hfl')]//span[contains(text(), 'like')]",
        )
        if like_buttons:
            like_text = like_buttons[0].text
            # Extract number from text like "123 likes"
            likes = like_text.split()[0] if like_text else "0"
    except Exception:
        # Keep the default; Exception (not bare except) so that
        # KeyboardInterrupt/SystemExit still stop the scrape.
        pass

    try:
        # Get comments - first element whose leading token is a number
        comment_elements = driver.find_elements(By.XPATH, "//span[contains(text(), 'comment')]")
        for elem in comment_elements:
            text = elem.text
            if 'comment' in text.lower():
                parts = text.split()
                # Allow thousands separators such as "1,234"
                if parts and parts[0].replace(',', '').isdigit():
                    comments = parts[0]
                    break
    except Exception:
        pass

    try:
        # Check for paid partnership
        is_paid = "Paid partnership with " in driver.page_source
    except Exception:
        pass

    return likes, comments, is_paid

In [None]:
# Extract media URL from current view using BeautifulSoup
def extract_media_url():
    """Return the media URL found in the first ``<article>`` on the page.

    The article's inner HTML is parsed with BeautifulSoup. A ``<video>``
    with a ``src`` attribute wins; otherwise the first ``<img>`` that is
    hosted on an Instagram/Facebook CDN and is not a 150x150 or 320x320
    thumbnail is used.

    Returns:
        The media URL as a string, or None when nothing suitable is found
        or an error occurs (the error is printed).
    """
    try:
        # Find the main article/post container
        article = driver.find_element(By.TAG_NAME, 'article')
        html = article.get_attribute('innerHTML')
        soup = bs(html, 'html.parser')

        # Try to find video first, then image
        video = soup.find('video')
        if video and video.get('src'):
            return video['src']

        # Check every <img>, not just the first one: the first image in the
        # article may be a profile picture or icon that the filter rejects,
        # and the real media then comes later in the markup.
        for img in soup.find_all('img', src=True):
            src = img['src']
            if not src:
                continue
            on_cdn = 'cdninstagram.com' in src or 'fbcdn.net' in src
            is_thumbnail = '/s150x150/' in src or '/s320x320/' in src
            if on_cdn and not is_thumbnail:
                return src

    except Exception as e:
        print(f"  Error extracting media: {e}")

    return None

In [None]:
# Download media file with duplicate detection
def download_media(url, filepath):
    """Download ``url`` to ``filepath`` unless the same content was already saved.

    Duplicates are detected by the MD5 digest of the response body, tracked
    in the module-level ``downloaded_hashes`` set.

    Args:
        url: Address fetched with ``requests.get`` (15 second timeout).
        filepath: Destination; must provide ``write_bytes`` (e.g. a
            ``pathlib.Path``).

    Returns:
        ``(True, None)`` when the file was written,
        ``(False, "duplicate")`` when the content hash was seen before,
        ``(False, message)`` on any error, with the message truncated to
        40 characters.
    """
    try:
        response = requests.get(url, timeout=15)
        response.raise_for_status()

        # Check for duplicates (MD5 is used as a content fingerprint only)
        content_hash = hashlib.md5(response.content).hexdigest()
        if content_hash in downloaded_hashes:
            return False, "duplicate"

        # Write file first, and only then remember the hash: if the write
        # fails, a later retry must not be misreported as a duplicate.
        filepath.write_bytes(response.content)
        downloaded_hashes.add(content_hash)
        return True, None

    except Exception as e:
        return False, str(e)[:40]

In [None]:
# Check if carousel has next button
def has_next_in_carousel():
    """Return the carousel's visible "next" button, or None if there is none.

    Several CSS selectors are tried in order; the first matching element
    that reports ``is_displayed()`` is returned.

    Returns:
        The Selenium element for the next button, or None when no selector
        yields a displayed element.
    """
    selectors = (
        'button[aria-label="Next"]',
        'button[aria-label="next"]',
        'button._afxw._al46._al47',
    )

    for selector in selectors:
        try:
            next_btn = driver.find_element(By.CSS_SELECTOR, selector)
            if next_btn.is_displayed():
                return next_btn
        except Exception:
            # Selector not present (or element went stale): try the next one.
            # Exception rather than a bare except, so KeyboardInterrupt and
            # SystemExit are not swallowed.
            continue
    return None

In [None]:
# Scrape all media from current post (handles single image, video, or carousel)
def _download_post_item(url, post_dir, post_num, item_num):
    """Download one media URL of a post and print the outcome.

    Returns True only when a new file was actually written.
    """
    # URLs mentioning "video" or ".mp4" are saved as .mp4, everything else as .jpg
    ext = '.mp4' if 'video' in url or '.mp4' in url else '.jpg'
    filepath = post_dir / f"post{post_num}_item{item_num}{ext}"

    success, error = download_media(url, filepath)
    if success:
        print(f"    Downloaded: {filepath.name}")
    elif error == "duplicate":
        print(f"    Skipped: {filepath.name} (duplicate)")
    else:
        print(f"    Failed: {error}")
    return success


def scrape_current_post(post_dir, post_num):
    """Download every media item of the currently open post.

    Extracts the media URL in view, then keeps clicking the carousel's
    "next" button (while one is displayed) and extracting again.

    Args:
        post_dir: Directory (``pathlib.Path``) the files are written to.
        post_num: Post number used in the file names
            (``post{post_num}_item{k}.jpg`` / ``.mp4``).

    Returns:
        The number of files actually written. Duplicates and failed
        downloads still consume an item number in the file naming but are
        not counted, because callers record this value as the number of
        downloaded items.
    """
    items_seen = 0   # numbering used in file names
    downloaded = 0   # successful writes only

    while True:
        url = extract_media_url()
        if url:
            items_seen += 1
            if _download_post_item(url, post_dir, post_num, items_seen):
                downloaded += 1

        # Advance the carousel; stop when there is no further slide.
        next_btn = has_next_in_carousel()
        if not next_btn:
            break

        next_btn.click()
        time.sleep(1)

    return downloaded

In [None]:
# Navigate to next post in modal view
def go_to_next_post():
    """Click the "next post" arrow of the post modal, if one is visible.

    Several CSS selectors are tried in order; the first displayed match is
    clicked, followed by a 2 second pause.

    Returns:
        True if a button was clicked, False if no selector produced a
        displayed, clickable element.
    """
    selectors = (
        'a[role="button"][aria-label="Next"]',
        'a[aria-label="Next"]',
        'button.coreSpriteRightPaginationArrow',
        'a._aaqg._aaqh',  # Common next post button class
    )

    for selector in selectors:
        try:
            next_post_btn = driver.find_element(By.CSS_SELECTOR, selector)
            if next_post_btn.is_displayed():
                next_post_btn.click()
                time.sleep(2)
                return True
        except Exception:
            # Selector missing or click failed: fall through to the next
            # selector. Exception rather than a bare except, so interrupting
            # the kernel during the sleep above is not silently swallowed.
            continue

    return False

In [None]:
# Scrape all posts from a user
def _post_id_from_url(url, fallback):
    """Return the path segment following ``/p/`` in ``url``, else ``fallback``."""
    from urllib.parse import urlparse

    # Work on the URL path only, so a query string or fragment (for example
    # "?img_index=2") can never end up being reported as the post id.
    segments = [part for part in urlparse(url).path.split('/') if part]
    if 'p' in segments:
        position = segments.index('p')
        if position + 1 < len(segments):
            return segments[position + 1]
    return fallback


def scrape_user(username, max_posts=None):
    """Scrape the posts of one Instagram profile.

    Opens the profile, clicks the first post link, then for every post:
    reads its stats, downloads its media into ``DOWNLOAD_DIR/<username>``,
    appends a record to the module-level ``stats_log`` and moves on to the
    next post until none is left or ``max_posts`` is reached.

    Args:
        username: Profile name; used for the profile URL and the output
            directory name.
        max_posts: Optional limit on the number of posts; None or 0 means
            no limit.

    Returns:
        ``(post_count, total_media)`` on completion, or ``(0, 0)`` if an
        exception interrupted the scrape (the error is printed; records
        already appended to ``stats_log`` are kept).
    """
    print(f"\n{'='*60}")
    print(f"Scraping user: {username}")
    print('='*60)

    # Navigate to user profile
    profile_url = f"https://www.instagram.com/{username}/"
    driver.get(profile_url)
    time.sleep(4)

    # Create user directory
    user_dir = DOWNLOAD_DIR / username
    user_dir.mkdir(exist_ok=True)

    try:
        # Click on first post
        first_post = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, 'article a[href*="/p/"]'))
        )
        first_post.click()
        time.sleep(3)

        post_count = 0
        total_media = 0

        while True:
            if max_posts and post_count >= max_posts:
                print(f"\nReached max posts limit ({max_posts})")
                break

            post_count += 1
            print(f"\nPost {post_count}:")

            # Extract stats
            likes, comments, is_paid = extract_post_stats()
            print(f"  Stats: Likes={likes}, Comments={comments}, Paid={is_paid}")

            # Download all media from this post
            media_count = scrape_current_post(user_dir, post_count)
            total_media += media_count

            # Get current post URL and derive the post id from its path
            current_url = driver.current_url
            post_id = _post_id_from_url(current_url, f"post{post_count}")

            # Log stats
            stats_log.append({
                'timestamp': datetime.now().isoformat(),
                'username': username,
                'post_url': current_url,
                'post_id': post_id,
                'likes': likes,
                'comments': comments,
                'paid_partnership': is_paid,
                'media_downloaded': media_count
            })

            # Go to next post
            if not go_to_next_post():
                print("\nNo more posts found")
                break

        print(f"\n✓ User '{username}' complete: {post_count} posts, {total_media} total media")
        return post_count, total_media

    except Exception as e:
        print(f"Error scraping user: {e}")
        return 0, 0

In [None]:
# Entry point: scrape every configured profile in turn, pausing 3 seconds
# after each one.
if not USERS_TO_SCRAPE:
    print("⚠ No users to scrape. Add usernames to USERS_TO_SCRAPE list.")
else:
    for username in USERS_TO_SCRAPE:
        # Pass a number instead of None to cap the posts scraped per user.
        scrape_user(username, max_posts=None)
        time.sleep(3)

In [None]:
# Write the collected per-post records to JSON and print an overall summary.
if not stats_log:
    print("No stats to save")
else:
    stats_file = DOWNLOAD_DIR / "scrape_stats.json"
    with stats_file.open('w') as f:
        json.dump(stats_log, f, indent=2)

    # Totals across every logged post
    divider = '=' * 60
    items_total = sum(entry['media_downloaded'] for entry in stats_log)
    paid_total = len([entry for entry in stats_log if entry['paid_partnership']])

    print(f"\n{divider}")
    print(f"✓ Stats saved to {stats_file}")
    print(f"\nTotal Summary:")
    print(f"  Posts scraped: {len(stats_log)}")
    print(f"  Items downloaded: {items_total}")
    print(f"  Paid partnerships: {paid_total}")
    print(divider)

In [None]:
# Shut down the Selenium session and report completion.
driver.quit()
print("\n✓ Browser closed", "✓ All done!", sep="\n")