# Instagram Multi-User Scraper (DOM-based, Mobile View)
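Combines Instagram's mobile view (for like/comment stats) with DOM parsing (Selenium element queries, with a BeautifulSoup fallback) for clean media extraction.
Scrolls each user's profile back to the cutoff date (Aug 17, 2023).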

In [None]:
import hashlib
import json
import time
from datetime import datetime
from pathlib import Path

import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    WebDriverException,
)
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [None]:
# Configuration
# Account used for the scraping session. Leave both blank to skip the
# login step.
USERNAME = ""  # Your Instagram username
PASSWORD = ""  # Your Instagram password

# Profiles to walk, e.g. ["user1", "user2", "user3"]
USERS_TO_SCRAPE = []

# Oldest post date of interest (naive datetime, midnight)
CUTOFF_DATE = datetime(2023, 8, 17)

# Root folder for everything this notebook writes; created if missing
DOWNLOAD_DIR = Path("instagram_downloads")
DOWNLOAD_DIR.mkdir(exist_ok=True)

# Notebook-wide state shared by the cells below
downloaded_hashes = set()  # MD5 digests of media already saved
stats_log = []  # one dict per scraped post

In [None]:
# Setup Chrome with mobile emulation
# The device profile is passed through Chrome's "mobileEmulation"
# experimental option; the resulting `driver` is shared by every cell below.
_mobile_profile = {'deviceName': 'iPhone 12 Pro'}

options = webdriver.ChromeOptions()
options.add_experimental_option('mobileEmulation', _mobile_profile)

driver = webdriver.Chrome(options=options)
print("✓ Browser opened with mobile emulation")

In [None]:
# Login to Instagram
def _click_if_present(xpath, timeout=10, settle=4):
    """Click the element matching *xpath* if it becomes clickable in time.

    Used for optional UI (the login link, "Not now" popups) that may or may
    not be shown. A missing element or a failed click is not an error.

    Args:
        xpath: XPath locator of the element to click.
        timeout: Seconds to wait for the element to become clickable.
        settle: Seconds to sleep after a successful click.

    Returns:
        True if the element was clicked, False otherwise.
    """
    try:
        element = WebDriverWait(driver, timeout).until(
            EC.element_to_be_clickable((By.XPATH, xpath))
        )
        element.click()
    except Exception:
        # Best-effort step. `Exception` (not a bare `except`) so that
        # Ctrl-C / SystemExit still interrupt the wait.
        return False
    time.sleep(settle)  # give the page time to react to the click
    return True


def login_instagram(username, password):
    """Submit the Instagram login form and dismiss the follow-up popups.

    Args:
        username: Account name typed into the "username" field.
        password: Password typed into the "password" field.

    Returns:
        True if the form was submitted without an exception, False if any
        required step raised. Note that True does not prove the credentials
        were accepted: the page is not inspected after submitting.
    """
    driver.get('https://www.instagram.com/')
    time.sleep(6)

    try:
        # The landing page may show a "Log in" link instead of the form.
        _click_if_present("//a[contains(@href, '/accounts/login')]")

        # Enter username
        username_input = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.NAME, "username"))
        )
        username_input.send_keys(username)

        # Enter password and submit with RETURN
        password_input = driver.find_element(By.NAME, "password")
        password_input.send_keys(password)
        password_input.send_keys(Keys.RETURN)

        time.sleep(10)

        # "Save Your Login Info" popup
        _click_if_present(
            "//button[contains(text(), 'Not now') or contains(text(), 'Not Now')]"
        )
        # "Turn on Notifications" popup
        _click_if_present("//button[contains(text(), 'Not Now')]")

        print("✓ Logged in successfully")
        return True

    except Exception as e:
        print(f"Login failed: {e}")
        return False

# Perform login (skipped when either credential is blank)
if not (USERNAME and PASSWORD):
    print("⚠ No login credentials provided")
else:
    login_instagram(USERNAME, PASSWORD)

In [None]:
# Extract post date
def extract_post_date():
    """Return the publication date of the post currently open in `driver`.

    Reads the `datetime` attribute of the first `time.x1p4m5qa` element.

    Returns:
        A `datetime` parsed from that attribute, or None when no such
        element exists, the attribute is empty, or anything raises (the
        error is printed).
    """
    try:
        matches = driver.find_elements(By.CSS_SELECTOR, 'time.x1p4m5qa')
        if not matches:
            return None

        raw_value = matches[0].get_attribute('datetime')
        if not raw_value:
            return None

        # Normalise a trailing "Z" to an explicit UTC offset before parsing.
        return datetime.fromisoformat(raw_value.replace('Z', '+00:00'))
    except Exception as e:
        print(f"  Could not extract date: {e}")
        return None

In [None]:
# Extract post stats (works in mobile view)
def extract_post_stats():
    """Read likes, comments and the paid-partnership flag from the open post.

    Each value is looked up independently; a failure in one lookup leaves
    that value at its default and is reported on stdout.

    Returns:
        A tuple `(likes, comments, is_paid)`. `likes` and `comments` are the
        text shown on the page (default "0"); `is_paid` is True when the page
        source contains "Paid partnership with ".
    """
    likes = "0"
    comments = "0"
    is_paid = False

    try:
        # Get likes - mobile view selector
        like_spans = driver.find_elements(
            By.CSS_SELECTOR,
            'span.x1ypdohk.x1s688f.x2fvf9.xe9ewy2[role="button"]',
        )
        if like_spans:
            likes = like_spans[0].text
    except Exception as e:  # not bare: Ctrl-C must still interrupt
        print(f"  Could not extract likes: {e}")

    try:
        # Get comments - mobile view selector. Several spans share these
        # classes, so take the first one whose text is a plain number.
        comment_spans = driver.find_elements(
            By.CSS_SELECTOR,
            'span.xdj266r.x14z9mp.xat24cr.x1lziwak.xexx8yu.xyri2b.x18d9i69.x1c1uobl.x1hl2dhg.x16tdsg8.x1vvkbs'
        )
        for span in comment_spans:
            text = span.text
            if text.replace(',', '').isdigit():
                comments = text
                break
    except Exception as e:
        print(f"  Could not extract comments: {e}")

    try:
        # Check for paid partnership
        is_paid = "Paid partnership with " in driver.page_source
    except Exception as e:
        print(f"  Could not check paid partnership: {e}")

    return likes, comments, is_paid

In [None]:
# Extract media URLs using BeautifulSoup DOM parsing (NO network logs!)
def extract_media_urls_dom():
    """Collect CDN media URLs from the page currently loaded in `driver`.

    Live `<img>` and `<video>` elements are queried first. Only when that
    yields nothing is the page source re-parsed with BeautifulSoup. Image
    URLs carrying a thumbnail size marker are dropped; video URLs are not
    filtered by size.

    Returns:
        A set of URL strings. On error the message is printed and whatever
        was gathered up to that point is returned.
    """
    cdn_hosts = ('cdninstagram.com', 'fbcdn.net')
    thumbnail_markers = ('/s150x150/', '/s320x320/', 's640x640')

    def served_from_cdn(src):
        return bool(src) and any(host in src for host in cdn_hosts)

    def is_full_size(src):
        return not any(marker in src for marker in thumbnail_markers)

    found = set()

    try:
        # Live DOM: images first, then videos
        for img in driver.find_elements(By.TAG_NAME, 'img'):
            src = img.get_attribute('src')
            if served_from_cdn(src) and is_full_size(src):
                found.add(src)

        for video in driver.find_elements(By.TAG_NAME, 'video'):
            src = video.get_attribute('src')
            if served_from_cdn(src):
                found.add(src)

        # Fallback: parse the serialized page when the live DOM gave nothing
        if not found:
            soup = bs(driver.page_source, 'html.parser')

            for video in soup.find_all('video'):
                src = video.get('src')
                if served_from_cdn(src):
                    found.add(src)

            for img in soup.find_all('img'):
                src = img.get('src')
                if served_from_cdn(src) and is_full_size(src):
                    found.add(src)

    except Exception as e:
        print(f"  Error extracting media: {e}")

    return found

In [None]:
# Click through carousel
def click_through_carousel(max_clicks=30):
    """Press the carousel "Next" button until it is no longer available.

    Each round tries several selectors for the button, waiting up to two
    seconds per selector. The loop ends when no selector yields a clickable
    button, when a click fails, or when `max_clicks` is reached.

    Args:
        max_clicks: Upper bound on the number of clicks. Safety valve so a
            button that never disappears cannot loop forever.

    Returns:
        The number of successful clicks.
    """
    selectors = (
        'button[aria-label="Next"]',
        'button[aria-label="next"]',
        'button._afxw._al46._al47',
    )

    click_count = 0
    while click_count < max_clicks:
        next_btn = None
        for selector in selectors:
            try:
                next_btn = WebDriverWait(driver, 2).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
                )
                break
            except WebDriverException:
                # Includes TimeoutException: this selector found nothing.
                continue

        if next_btn is None:
            break

        try:
            next_btn.click()
        except WebDriverException:
            # e.g. the click was intercepted or the element went stale.
            # Stop here instead of letting it abort the caller.
            break

        click_count += 1
        time.sleep(1)  # let the next slide render

    return click_count

In [None]:
# Download media with duplicate detection and 5 second pauses
def download_media(media_urls, post_dir, pause=5):
    """Download each URL into *post_dir*, skipping content already saved.

    Duplicates are detected by the MD5 digest of the response body, tracked
    in the module-level `downloaded_hashes` set. A digest is recorded and the
    counter advanced only after the file has been written, so a failed write
    neither inflates the count nor blocks a later retry of the same content.

    Args:
        media_urls: Iterable of URL strings.
        post_dir: `pathlib.Path` of an existing directory to write into.
        pause: Seconds to sleep after every URL, whether it was saved,
            skipped or failed.

    Returns:
        Number of files written. Files are named `item_<n>.mp4` when the
        response content type mentions "video", otherwise `item_<n>.jpg`.
    """
    media_urls = list(media_urls)  # allow any iterable; need len() below
    total = len(media_urls)
    success_count = 0

    for idx, url in enumerate(media_urls, 1):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()

            # Check for duplicates by content hash
            content_hash = hashlib.md5(response.content).hexdigest()
            if content_hash in downloaded_hashes:
                print(f"  Skipped {idx}/{total}: Duplicate")
                continue

            # Determine file extension (anything that is not video -> .jpg)
            content_type = response.headers.get('content-type', '')
            ext = '.mp4' if 'video' in content_type else '.jpg'

            filename = post_dir / f"item_{success_count + 1}{ext}"
            filename.write_bytes(response.content)

            # Commit bookkeeping only once the bytes are on disk.
            downloaded_hashes.add(content_hash)
            success_count += 1
            print(f"  Downloaded {success_count}: {filename.name}")

        except Exception as e:
            print(f"  Skipped {idx}/{total}: {str(e)[:40]}")
        finally:
            # Pause between requests on every path (also runs on `continue`)
            time.sleep(pause)

    return success_count

In [None]:
# Scrape a single post
def _post_id_from_url(post_url):
    """Return the last path segment of *post_url*, ignoring query/fragment."""
    path = post_url.split('#', 1)[0].split('?', 1)[0]
    return path.rstrip('/').rsplit('/', 1)[-1]


def scrape_post(post_url):
    """Open one post, record its stats and download its media.

    Posts whose date is older than `CUTOFF_DATE` are skipped: nothing is
    downloaded or logged for them. Posts whose date cannot be read are
    scraped anyway.

    Args:
        post_url: Full URL of the post.

    Returns:
        Number of media files written for this post (0 when skipped).
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Scraping: {post_url}")
    print(banner)

    driver.get(post_url)
    time.sleep(6)

    # Extract date
    post_date = extract_post_date()
    if post_date:
        print(f"Date: {post_date.strftime('%Y-%m-%d %H:%M:%S')}")
        # CUTOFF_DATE is naive, so drop tzinfo before comparing (same
        # approach as the scroll check in scrape_user).
        if post_date.replace(tzinfo=None) < CUTOFF_DATE:
            print(f"Skipped: older than cutoff {CUTOFF_DATE.strftime('%Y-%m-%d')}")
            return 0

    # Extract stats
    likes, comments, is_paid = extract_post_stats()
    print(f"Stats: Likes={likes}, Comments={comments}, Paid={is_paid}")

    # Click through carousel to load all media
    click_count = click_through_carousel()
    if click_count > 0:
        print(f"Clicked next {click_count} times")

    # Extract media URLs using DOM parsing
    media_urls = extract_media_urls_dom()
    print(f"Found {len(media_urls)} media items")

    # Create directory for this post. The id is taken from the URL path
    # only, so a query string such as "?img_index=1" cannot become the
    # directory name.
    post_id = _post_id_from_url(post_url)
    post_dir = DOWNLOAD_DIR / post_id
    post_dir.mkdir(parents=True, exist_ok=True)

    # Download media
    success_count = download_media(media_urls, post_dir)

    # Log stats
    stats_log.append({
        'timestamp': datetime.now().isoformat(),
        'post_url': post_url,
        'post_id': post_id,
        'post_date': post_date.isoformat() if post_date else None,
        'likes': likes,
        'comments': comments,
        'paid_partnership': is_paid,
        'media_downloaded': success_count
    })

    print(f"✓ Downloaded {success_count} unique items to '{post_dir}'")
    return success_count

In [None]:
# Scrape user profile - scroll back to Aug 17, 2023
def _harvest_post_links(found):
    """Add every post link currently in the DOM to *found*.

    *found* is a dict used as an insertion-ordered set of hrefs.
    """
    try:
        links = driver.find_elements(By.CSS_SELECTOR, 'a[href*="/p/"]')
    except WebDriverException:
        return
    for link in links:
        try:
            href = link.get_attribute('href')
        except WebDriverException:
            # Element went stale between lookup and read; skip it.
            continue
        if href and '/p/' in href:
            found.setdefault(href, None)


def _visible_date_before_cutoff():
    """Return a date older than CUTOFF_DATE among the last five <time> tags.

    The tags are checked from last to first. Returns a naive `datetime`,
    or None when none of them is older than the cutoff or none can be read.
    """
    try:
        time_elements = driver.find_elements(By.CSS_SELECTOR, 'time')
    except WebDriverException:
        return None

    for time_elem in reversed(time_elements[-5:]):
        try:
            datetime_str = time_elem.get_attribute('datetime')
            if not datetime_str:
                continue
            post_date = datetime.fromisoformat(datetime_str.replace('Z', '+00:00'))
        except (ValueError, WebDriverException):
            continue
        # CUTOFF_DATE is naive, so remove timezone info for comparison
        post_date_naive = post_date.replace(tzinfo=None)
        if post_date_naive < CUTOFF_DATE:
            return post_date_naive
    return None


def scrape_user(username, max_posts=None, max_scrolls=100):
    """Scroll a profile back towards CUTOFF_DATE and scrape every post found.

    Scrolling stops when a `<time>` element older than the cutoff is seen,
    when the page height has stopped growing for three scrolls in a row, or
    after `max_scrolls` scrolls. At least one scroll is always performed.
    Post links are collected after every scroll rather than once at the end,
    so links are not lost if the page unloads rows that scrolled out of view.

    NOTE(review): confirm the profile grid actually exposes `<time>`
    elements; if it does not, the cutoff check never fires and scrolling
    ends on the height or limit checks instead.

    Args:
        username: Profile name (without "@").
        max_posts: If truthy, scrape at most this many posts.
        max_scrolls: Safety limit on the number of scrolls.

    Returns:
        Total number of media files downloaded for this user. A post that
        raises is reported and skipped; it does not abort the remaining posts.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Scraping user: {username}")
    print(f"Cutoff date: {CUTOFF_DATE.strftime('%Y-%m-%d')}")
    print(banner)

    profile_url = f"https://www.instagram.com/{username}/"
    driver.get(profile_url)
    time.sleep(6)

    found_links = {}
    _harvest_post_links(found_links)

    # Scroll to load posts until we reach cutoff date
    scroll_count = 0
    stalled_scrolls = 0
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(4)
        scroll_count += 1
        _harvest_post_links(found_links)

        oldest = _visible_date_before_cutoff()
        if oldest is not None:
            print(f"Reached cutoff date after {scroll_count} scrolls")
            print(f"Oldest post date: {oldest.strftime('%Y-%m-%d')}")
            break

        # Safety limit
        if scroll_count >= max_scrolls:
            print(f"Reached scroll limit ({max_scrolls} scrolls)")
            break

        # End-of-page detection: no growth for three consecutive scrolls
        new_height = driver.execute_script("return document.body.scrollHeight")
        stalled_scrolls = stalled_scrolls + 1 if new_height == last_height else 0
        last_height = new_height
        if stalled_scrolls >= 3:
            print(f"Reached end of profile after {scroll_count} scrolls")
            break

        if scroll_count % 10 == 0:
            print(f"  Scrolled {scroll_count} times...")

    print(f"Total scrolls: {scroll_count}")

    # Dict keys are already unique and in first-seen order
    post_links = list(found_links)
    if max_posts:
        post_links = post_links[:max_posts]

    print(f"Found {len(post_links)} posts")

    # Scrape each post
    total_downloaded = 0
    for i, post_url in enumerate(post_links, 1):
        print(f"\nPost {i}/{len(post_links)}")
        try:
            total_downloaded += scrape_post(post_url)
        except Exception as e:
            # Keep going: one broken post must not discard the others.
            print(f"Error scraping post {post_url}: {e}")
        time.sleep(4)

    print(f"\n✓ User '{username}' complete: {total_downloaded} total items downloaded")
    return total_downloaded

In [None]:
# Main scraping loop - visit every configured profile in turn
if not USERS_TO_SCRAPE:
    print("⚠ No users to scrape. Add usernames to USERS_TO_SCRAPE list.")
else:
    for username in USERS_TO_SCRAPE:
        scrape_user(username, max_posts=None)  # Set max_posts if needed
        time.sleep(6)  # breather between profiles

In [None]:
# Save stats log
# Writes every collected post record to scrape_stats.json inside the
# download folder, then prints a short summary.
if not stats_log:
    print("No stats to save")
else:
    stats_file = DOWNLOAD_DIR / "scrape_stats.json"
    stats_file.write_text(json.dumps(stats_log, indent=2))
    print(f"\n✓ Stats saved to {stats_file}")
    print(f"\nTotal stats:")
    print(f"  Posts scraped: {len(stats_log)}")
    print(f"  Items downloaded: {sum(entry['media_downloaded'] for entry in stats_log)}")
    print(f"  Paid partnerships: {len([entry for entry in stats_log if entry['paid_partnership']])}")

In [None]:
# Close browser
# Ends the WebDriver session held in `driver`; run this cell last, since
# every scraping function above depends on that session.
driver.quit()
print("\n✓ Browser closed")
print("✓ All done!")