# Instagram Media Downloader

This notebook automates a Chrome browser (via Selenium) to download the images and videos of Instagram posts while logged in with your own account.

## Features
- Automated login to Instagram
- Navigate through posts
- Intercept network requests to capture media URLs
- Download images and videos

## Requirements
Install the required packages first:
```bash
pip install selenium requests pillow webdriver-manager
```

In [1]:
# Install required packages
!pip install selenium requests pillow webdriver-manager --quiet

In [2]:
import json
import os
import time
import requests
from datetime import datetime
from pathlib import Path
from urllib.parse import urlparse

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager

## Configuration

In [3]:
# Notebook-wide settings
# Directory (relative to the current working directory) that receives the media files.
DOWNLOAD_FOLDER = "instagram_downloads"
# Handed to the downloader as its `mobile_view` flag when it is constructed.
MOBILE_VIEW = True

# Make sure the target directory exists before anything tries to write into it.
os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)

print("Downloads will be saved to: " + os.path.abspath(DOWNLOAD_FOLDER))

Downloads will be saved to: d:\OneDrive - Emory\Schweidel\PerceptionMap\code\instagram_downloads


## Instagram Media Downloader Class

In [None]:
class InstagramMediaDownloader:
    """Download the media of Instagram posts through a Selenium-driven Chrome.

    Chrome's performance log is enabled so that every
    ``Network.responseReceived`` event can be inspected. Image/video responses
    served from the Instagram CDNs are collected, de-duplicated by URL path and
    then fetched with ``requests``.

    Typical use: ``setup_driver()`` -> ``login()`` -> ``download_post()`` or
    ``download_multiple_posts()`` -> ``close()``.
    """

    # Substrings of the URL path that mark UI assets rather than post media.
    _EXCLUDED_PATH_TOKENS = ('profilepic', 'sprite', 'favicon', 'glyph', 'badge', 'logo', 'emoji')
    # Images whose reported Content-Length is below this are treated as thumbnails/icons.
    _MIN_IMAGE_BYTES = 35000
    # Rejection reason for responses that are not image/video at all (never shown in debug output).
    _NOT_MEDIA = "not image/video"

    def __init__(self, mobile_view=True, headless=False, debug=False, download_folder=None):
        """Store the settings; the browser itself is started by ``setup_driver``.

        Args:
            mobile_view: Start Chrome with mobile emulation (and DevTools open).
            headless: Start Chrome in headless mode.
            debug: Print verbose logging about captured and filtered URLs.
            download_folder: Directory for downloaded files. Defaults to the
                module-level ``DOWNLOAD_FOLDER``.
        """
        self.download_folder = DOWNLOAD_FOLDER if download_folder is None else download_folder
        self.driver = None
        self.mobile_view = mobile_view
        self.headless = headless
        self.debug = debug  # Enable verbose logging to verify network capture

    def setup_driver(self):
        """Create the Chrome driver with performance logging enabled."""
        chrome_options = Options()

        if self.headless:
            chrome_options.add_argument('--headless=new')

        # Enable mobile emulation if requested
        if self.mobile_view:
            mobile_emulation = {
                "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
                "userAgent": "Mozilla/5.0 (Linux; Android 10) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Mobile Safari/537.36"
            }
            chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)

            # Auto-open DevTools for easy inspection
            chrome_options.add_argument("--auto-open-devtools-for-tabs")

        # The performance log is what extract_media_urls_from_logs() reads.
        chrome_options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-blink-features=AutomationControlled')
        chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])

        service = Service(ChromeDriverManager().install())
        self.driver = webdriver.Chrome(service=service, options=chrome_options)
        # NOTE(review): an implicit wait is combined with explicit WebDriverWait
        # calls elsewhere in this class; lookups for absent elements may take
        # longer than the explicit timeouts suggest. Confirm this is intended.
        self.driver.implicitly_wait(10)

        if self.mobile_view:
            print("✓ Driver setup complete (mobile emulation enabled, DevTools open)")
        else:
            print("✓ Driver setup complete")

    def _drain_performance_logs(self):
        """Read and discard the accumulated performance log entries (best effort)."""
        if not self.driver:
            return
        try:
            self.driver.get_log('performance')
        except Exception:
            # Draining is only an optimisation; a failure here must not abort a download.
            pass

    def _click_if_present(self, locator, timeout=5):
        """Click the element at ``locator`` if it becomes clickable within ``timeout`` seconds.

        Returns:
            True if the element was clicked, False otherwise (including any error).
        """
        try:
            element = WebDriverWait(self.driver, timeout).until(
                EC.element_to_be_clickable(locator)
            )
            element.click()
            time.sleep(1)
            return True
        except Exception:
            return False

    def login(self, username, password):
        """Submit the Instagram login form and dismiss the follow-up prompts.

        Returns:
            True if the form was submitted without an exception, False otherwise.
            NOTE: the resulting page is not inspected, so True does not prove
            that Instagram accepted the credentials.
        """
        print("Logging in to Instagram...")
        self.driver.get("https://www.instagram.com/accounts/login/")
        time.sleep(3)

        wait = WebDriverWait(self.driver, 20)
        try:
            # Handle cookie consent if present
            self._click_if_present(
                (By.XPATH, "//button[contains(text(), 'Allow') or contains(text(), 'Accept')]"),
                timeout=5
            )

            # Enter credentials
            username_input = wait.until(EC.presence_of_element_located((By.NAME, "username")))
            password_input = wait.until(EC.presence_of_element_located((By.NAME, "password")))

            username_input.send_keys(username)
            password_input.send_keys(password)
            password_input.send_keys(Keys.RETURN)

            time.sleep(5)

            # Handle post-login prompts (both capitalisations of the button text are tried)
            self._click_if_present((By.XPATH, "//button[contains(text(), 'Not Now')]"), timeout=3)
            self._click_if_present((By.XPATH, "//button[contains(text(), 'Not now')]"), timeout=3)

            print("✓ Login successful")
            return True

        except Exception as exc:
            print(f"✗ Login failed: {exc}")
            return False

    def navigate_to_post(self, post_url):
        """Open ``post_url`` in the browser and pause briefly for it to load."""
        print(f"Navigating to post: {post_url}")
        self.driver.get(post_url)
        time.sleep(3)
        print("✓ Post loaded")

    def _wait_for_media_to_render(self, timeout=10):
        """Wait up to ``timeout`` seconds for an ``<img>``/``<video>`` inside an ``<article>``.

        A timeout is ignored: callers proceed with whatever has loaded.
        """
        try:
            WebDriverWait(self.driver, timeout).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "article img, article video"))
            )
        except TimeoutException:
            pass

    @staticmethod
    def _content_length(response):
        """Return the response's Content-Length as an int, or None if absent/unparseable.

        The header name is matched case-insensitively.
        """
        headers = response.get('headers') or {}
        for name, value in headers.items():
            if str(name).lower() == 'content-length':
                try:
                    return int(value)
                except (TypeError, ValueError):
                    return None
        return None

    def _rejection_reason(self, url, mime_type, response):
        """Explain why a network response is not post media.

        This is the single source of truth for the filter rules; it is used both
        for the accept/reject decision and for the debug report.

        Returns:
            None when the response should be kept, otherwise a short reason string.
        """
        mime_type = mime_type or ''
        if not url or ('image' not in mime_type and 'video' not in mime_type):
            return self._NOT_MEDIA

        # Must be from Instagram CDN
        if 'cdninstagram.com' not in url and 'fbcdn.net' not in url:
            return "not Instagram CDN"

        # Exclude static assets (logos, icons, UI elements)
        if 'static.cdninstagram.com' in url:
            return "static CDN"

        path = urlparse(url).path.lower()
        for token in self._EXCLUDED_PATH_TOKENS:
            if token in path:
                return f"contains '{token}'"

        # Size check for images only (exclude small thumbnails/icons)
        if 'video' not in mime_type:
            content_length = self._content_length(response)
            if content_length is not None and content_length < self._MIN_IMAGE_BYTES:
                return f"too small ({content_length} bytes)"

        return None

    def _is_desired_media(self, url, mime_type, response):
        """Return True if the response looks like post media (not a static asset, avatar or icon)."""
        return self._rejection_reason(url, mime_type, response) is None

    def _normalize_url(self, url):
        """Return ``scheme://host/path`` of ``url`` for de-duplication.

        Query string and fragment are dropped so that URLs differing only in
        their parameters compare equal.
        """
        parsed = urlparse(url)
        return f"{parsed.scheme}://{parsed.netloc}{parsed.path}"

    @staticmethod
    def _short_name(url, limit, ellipsis=True):
        """Return the last path segment of ``url`` truncated to ``limit`` characters (debug output)."""
        name = url.split('/')[-1].split('?')[0]
        if len(name) > limit:
            name = name[:limit] + ("..." if ellipsis else "")
        return name

    def extract_media_urls_from_logs(self, debug_prefix=""):
        """Read the performance log and return the media responses found in it.

        Args:
            debug_prefix: Text prepended to every debug line.

        Returns:
            A list of dicts with keys ``url``, ``type`` (``'image'``/``'video'``)
            and ``normalized_url``. Empty if there is no driver or the log
            cannot be read.
        """
        if not self.driver:
            return []
        try:
            logs = self.driver.get_log('performance')
        except Exception:
            return []

        if self.debug:
            print(f"\n{debug_prefix}[DEBUG] Processing {len(logs)} performance log entries...")

        media_urls = []
        counts = {'image': 0, 'video': 0}
        filtered_out = []

        for log_entry in logs:
            try:
                message = json.loads(log_entry['message'])['message']
            except (KeyError, TypeError, json.JSONDecodeError):
                continue

            if message.get('method') != 'Network.responseReceived':
                continue

            response = message.get('params', {}).get('response', {})
            url = response.get('url', '')
            mime_type = response.get('mimeType', '') or ''

            reason = self._rejection_reason(url, mime_type, response)
            if reason is None:
                media_type = 'video' if 'video' in mime_type else 'image'
                media_urls.append({
                    'url': url,
                    'type': media_type,
                    'normalized_url': self._normalize_url(url)
                })
                counts[media_type] += 1
                if self.debug:
                    print(f"{debug_prefix}  ✓ ACCEPTED {media_type}: {self._short_name(url, 50)}")
            elif self.debug and reason != self._NOT_MEDIA:
                # Only image/video responses are interesting in the filter report.
                filtered_out.append((self._short_name(url, 40), reason))

        if self.debug:
            print(f"{debug_prefix}[DEBUG] Found {counts['image']} images, {counts['video']} videos")
            if filtered_out:
                print(f"{debug_prefix}[DEBUG] Filtered out {len(filtered_out)} items:")
                for filename, reason in filtered_out[:5]:  # Show first 5
                    print(f"{debug_prefix}  ✗ {filename} - {reason}")
                if len(filtered_out) > 5:
                    print(f"{debug_prefix}  ... and {len(filtered_out) - 5} more")

        return media_urls

    def _click_next_slide(self):
        """Click the carousel's Next button.

        Returns:
            True if one of the known selectors matched and was clicked, else False.
        """
        selectors = [
            (By.CSS_SELECTOR, "button[aria-label='Next']"),
            (By.XPATH, "//button[@aria-label='Next']"),
            (By.XPATH, "//button//*[name()='svg' and @aria-label='Next']/.."),
        ]
        for locator in selectors:
            try:
                button = WebDriverWait(self.driver, 2).until(
                    EC.element_to_be_clickable(locator)
                )
                button.click()
                time.sleep(2)
                return True
            except Exception:
                continue
        return False

    def _collect_carousel_media(self, max_clicks=50):
        """Collect media URLs from the current post, stepping through a carousel.

        Nothing is downloaded here. The performance log is read once after the
        initial load and once after every Next click; the combined list is then
        de-duplicated on the normalized URL, keeping the first occurrence.

        Args:
            max_clicks: Safety limit on the number of Next clicks.

        Returns:
            A list of dicts with keys ``url`` and ``type``.
        """
        all_media = []  # All media found (may have duplicates)

        if self.debug:
            print("\n" + "="*60)
            print("STARTING MEDIA COLLECTION")
            print("="*60)
            print("Compare the URLs below with what you see in Network tab > Img/Media")

        # Wait for initial media to load
        self._wait_for_media_to_render()
        time.sleep(2)

        # Collect initial media (after refresh)
        if self.debug:
            print("\n--- INITIAL LOAD (after refresh) ---")
        initial_media = self.extract_media_urls_from_logs(debug_prefix="[INITIAL] ")
        all_media.extend(initial_media)
        print(f"Found {len(initial_media)} media item(s) on initial load")

        # No extra drain here: reading the log above already emptied it, and a
        # second read would throw away any responses that arrived in between.

        click_count = 0
        no_new_media_count = 0

        while click_count < max_clicks:
            # Try to click Next
            if not self._click_next_slide():
                print("No more Next button found - reached end of carousel")
                break

            click_count += 1
            self._wait_for_media_to_render()
            time.sleep(1.5)

            if self.debug:
                print(f"\n--- AFTER NEXT CLICK #{click_count} ---")

            # Collect media from this iteration
            current_media = self.extract_media_urls_from_logs(debug_prefix=f"[NEXT-{click_count}] ")

            if current_media:
                all_media.extend(current_media)
                print(f"After Next click {click_count}: Found {len(current_media)} media item(s) in logs")
                no_new_media_count = 0
            else:
                no_new_media_count += 1
                print(f"After Next click {click_count}: No new media in logs")

                # If we've clicked 3 times with no new media, stop
                if no_new_media_count >= 3:
                    print("No new media for 3 consecutive clicks - stopping")
                    break

        if click_count >= max_clicks:
            print(f"⚠ Reached maximum click limit ({max_clicks})")

        # NOW deduplicate based on normalized URL
        if self.debug:
            print("\n" + "="*60)
            print("DEDUPLICATION PHASE")
            print("="*60)
            print(f"Total media items collected: {len(all_media)}")

        unique_media = []
        seen_urls = set()

        for media in all_media:
            normalized = media['normalized_url']
            is_new = normalized not in seen_urls
            if is_new:
                seen_urls.add(normalized)
                unique_media.append({
                    'url': media['url'],
                    'type': media['type']
                })
            if self.debug:
                filename = self._short_name(media['url'], 50, ellipsis=False)
                if is_new:
                    print(f"  ✓ UNIQUE {media['type']}: {filename}")
                else:
                    print(f"  ✗ DUPLICATE {media['type']}: {filename}")

        if self.debug:
            print(f"\nAfter deduplication: {len(unique_media)} unique media items")
            print("="*60)

        print(f"\n✓ Collection complete: {len(unique_media)} unique media items (removed {len(all_media) - len(unique_media)} duplicates)")

        return unique_media

    @staticmethod
    def _parse_count(text):
        """Return the first count found in ``text`` as an int, or None.

        Thousands separators ("1,234") and K/M suffixes ("12.3K") are understood;
        a plain decimal without suffix is truncated to its integer part.
        """
        import re  # local import: the file's import cell does not import `re`

        match = re.search(
            r'(\d+(?:,\d{3})*(?:\.\d+)?)(?:\s*([KkMm])(?![A-Za-z]))?',
            text or ''
        )
        if match is None:
            return None
        number = match.group(1).replace(',', '')
        suffix = match.group(2)
        if suffix is None:
            return int(number.split('.')[0])
        multiplier = 1000 if suffix.lower() == 'k' else 1000000
        return int(round(float(number) * multiplier))

    def _extract_likes_and_comments(self):
        """Best-effort scrape of the like and comment counts of the current post.

        The CSS selectors are tried in order and the first element yielding a
        number wins. Any scraping error leaves the corresponding value at None.

        Returns:
            A ``(likes, comments)`` tuple; either element may be None.
        """
        likes = None
        comments = None

        like_patterns = [
            "span.x1ypdohk.x1s688f.x2fvf9.xe9ewy2[role='button']",
            "section a[href*='/liked_by/']",
            "a[href*='/liked_by/'] span",
        ]
        try:
            for pattern in like_patterns:
                for elem in self.driver.find_elements(By.CSS_SELECTOR, pattern):
                    likes = self._parse_count(elem.text.strip())
                    if likes is not None:
                        break
                if likes is not None:
                    break
        except Exception as e:
            # Deliberately non-fatal: the stats are optional extras.
            if self.debug:
                print(f"⚠ Could not extract likes/comments: {e}")

        comment_patterns = [
            "span.html-span.xdj266r.x14z9mp.xat24cr.x1lziwak.xexx8yu.xyri2b.x18d9i69.x1c1uobl.x1hl2dhg.x16tdsg8.x1vvkbs",
            "span.xdj266r",
            "ul li div span",
        ]
        try:
            for pattern in comment_patterns:
                for elem in self.driver.find_elements(By.CSS_SELECTOR, pattern):
                    text = elem.text.strip()
                    # Only consider pure numbers (not mixed text), and only positive ones
                    if text.isdigit() and int(text) > 0:
                        comments = int(text)
                        break
                if comments is not None:
                    break
        except Exception as e:
            if self.debug:
                print(f"⚠ Could not extract likes/comments: {e}")

        return likes, comments

    def download_media(self, media_list, post_id=None, likes=None, comments=None):
        """Download every item of ``media_list`` into the download folder.

        Args:
            media_list: Dicts with keys ``url`` and ``type`` (``'image'``/``'video'``).
            post_id: Optional filename prefix; ``media`` is used when falsy.
            likes: Optional like count, only printed.
            comments: Optional comment count, only printed.

        Returns:
            The paths of the files that were written completely. A failed item
            is reported, its partial file is removed, and processing continues.
        """
        if not media_list:
            print("No media found to download")
            return []

        downloaded_files = []
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        os.makedirs(self.download_folder, exist_ok=True)

        # Display post stats
        if likes is not None or comments is not None:
            stats = []
            if likes is not None:
                stats.append(f"{likes} likes")
            if comments is not None:
                stats.append(f"{comments} comments")
            print(f"\nPost stats: {', '.join(stats)}")

        if self.debug:
            print("\n" + "="*60)
            print("STARTING DOWNLOADS")
            print("="*60)

        for number, media in enumerate(media_list, start=1):
            time.sleep(2)  # pause between requests
            filepath = None
            try:
                url = media['url']
                media_type = media['type']
                extension = 'mp4' if media_type == 'video' else 'jpg'

                prefix = post_id if post_id else "media"
                filename = f"{prefix}_{number}_{timestamp}.{extension}"
                filepath = os.path.join(self.download_folder, filename)

                print(f"Downloading {media_type} {number}/{len(media_list)}: {filename}")

                if self.debug:
                    url_preview = url[:80] + "..." if len(url) > 80 else url
                    print(f"  URL: {url_preview}")

                # The context manager releases the streamed connection even on error.
                with requests.get(url, stream=True, timeout=30) as response:
                    response.raise_for_status()
                    with open(filepath, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            f.write(chunk)

                downloaded_files.append(filepath)
                print(f"✓ Downloaded: {filename}")

            except Exception as exc:
                print(f"✗ Failed to download {media.get('type', 'media')}: {exc}")
                # Do not leave a truncated file behind.
                if filepath and os.path.exists(filepath):
                    try:
                        os.remove(filepath)
                    except OSError:
                        pass

        return downloaded_files

    @staticmethod
    def _post_id_from_url(post_url):
        """Return the last non-empty path segment of ``post_url`` ('' if there is none).

        Only the URL path is used, so query strings and trailing slashes
        (``/p/ID/?img_index=1``) do not affect the result.
        """
        segments = [segment for segment in urlparse(post_url).path.split('/') if segment]
        return segments[-1] if segments else ''

    def download_post(self, post_url):
        """Download all media of a single post.

        Returns:
            The list of file paths written by ``download_media``.
        """
        self.navigate_to_post(post_url)

        # Drain old logs and refresh page so only this post's requests are captured
        self._drain_performance_logs()
        print("Refreshing page to clear network logs...")
        self.driver.refresh()
        time.sleep(3)

        self._wait_for_media_to_render()

        if self.debug:
            print("\n[DEBUG] After refresh, check Network tab > Img/Media sub-tabs")
            print("[DEBUG] The filenames shown there should match what we capture below\n")

        # Extract likes and comments
        likes, comments = self._extract_likes_and_comments()

        # Collect media by navigating through carousel (NO downloading yet)
        media_items = self._collect_carousel_media()

        post_id = self._post_id_from_url(post_url)

        print(f"\nReady to download {len(media_items)} unique media file(s)")

        # NOW download the deduplicated media
        return self.download_media(media_items, post_id, likes, comments)

    def download_multiple_posts(self, post_urls):
        """Download the media of several posts, continuing after per-post errors.

        Returns:
            The combined list of downloaded file paths.
        """
        all_downloads = []
        for index, url in enumerate(post_urls, 1):
            print(f"\n{'='*60}")
            print(f"Processing post {index}/{len(post_urls)}")
            print(f"{'='*60}")
            try:
                self._drain_performance_logs()
                all_downloads.extend(self.download_post(url))
                time.sleep(2)
            except Exception as exc:
                print(f"✗ Error processing post: {exc}")
        return all_downloads

    def close(self):
        """Quit the browser if one is running; safe to call more than once."""
        if self.driver:
            try:
                self.driver.quit()
            finally:
                # Drop the reference so later calls do not touch a dead session.
                self.driver = None
            print("\n✓ Browser closed")

## Usage Example 1: Download from a Single Post

**Debug Mode**: Set `debug=True` to see detailed logging that shows:
- The media URLs accepted from the browser's performance logs
- The URLs that were filtered out, with the reason (only the first five are listed)

This lets you verify that the captured URLs match what you see in the DevTools Network tab > Img/Media sub-tabs.

In [None]:
# Initialize the downloader with DEBUG MODE enabled
# Debug mode shows you exactly what URLs are being captured so you can verify
# they match what you see in DevTools Network tab > Img/Media sub-tabs
downloader = InstagramMediaDownloader(mobile_view=MOBILE_VIEW, headless=False, debug=True)

# Credentials are read from the environment when available so that real
# passwords do not have to be saved inside the notebook; otherwise replace
# the placeholder values below.
USERNAME = os.environ.get("INSTAGRAM_USERNAME", "your_username")
PASSWORD = os.environ.get("INSTAGRAM_PASSWORD", "your_password")

# Replace with the URL of the post to download (https://www.instagram.com/p/POST_ID/)
POST_URL = "https://www.instagram.com/p/DOfLxX1j11j/?img_index=1"

try:
    downloader.setup_driver()

    if downloader.login(USERNAME, PASSWORD):
        # Download from a single post
        downloaded_files = downloader.download_post(POST_URL)

        print(f"\n{'='*60}")
        print("Download Complete!")
        print(f"{'='*60}")
        print(f"Total files downloaded: {len(downloaded_files)}")
        for file in downloaded_files:
            print(f"  - {file}")
finally:
    # Close the browser even if login or the download raised
    downloader.close()