# Instagram Comment Scraper (DOM-based, Mobile View)
Downloads comments for existing post folders
Incremental downloading with scroll-based collection

In [None]:
import builtins
import json
import re
import time
from datetime import datetime
from pathlib import Path

from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [None]:
# Configuration
# Credentials passed to login_instagram(); leave either one empty to skip the login step.
USERNAME = ""  # Your Instagram username
PASSWORD = ""  # Your Instagram password

# List of usernames whose posts to scrape comments from
USERS_TO_SCRAPE = ["nike"]  # e.g., ["user1", "user2", "user3"]

# Sleep multiplier - every time.sleep() delay in this notebook is multiplied by it.
# 1 is the fastest setting; raise it (e.g. to 3) to slow execution down.
SLEEP_MULTIPLIER = 2  # currently 2x

# Root download folder: one sub-folder per username, each holding one folder per post.
BASE_DOWNLOAD_DIR = Path("instagram_downloads")

# Logging setup: every print() call made after this cell is mirrored into
# ``log_lines`` so it can later be written to a per-user log file.
log_lines = []
# Bind the real built-in explicitly. Using the bare name ``print`` here would
# capture the wrapper itself when this cell is executed a second time, and the
# next print() call would then recurse until RecursionError.
original_print = builtins.print


def custom_print(*args, **kwargs):
    """Print like the built-in ``print`` and record the message in ``log_lines``.

    Args:
        *args: Objects to print; each is converted with ``str`` for the log.
        **kwargs: Forwarded unchanged to the built-in ``print``. ``sep`` is
            additionally used to build the logged message.
    """
    # Honour a caller-supplied separator so the log matches what is printed
    # (print() treats sep=None as a single space).
    sep = kwargs.get('sep')
    if sep is None:
        sep = ' '
    message = sep.join(str(arg) for arg in args)
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    log_lines.append(f"[{timestamp}] {message}")
    # Call original print
    original_print(*args, **kwargs)


# Shadow the built-in so all later print() calls go through the logger.
print = custom_print

# Current user being scraped
CURRENT_USER = None

In [None]:
# Start Chrome with device emulation switched on (the scraper targets the mobile view)
mobile_emulation = {'deviceName': 'iPhone 12 Pro'}

options = webdriver.ChromeOptions()
options.add_experimental_option('mobileEmulation', mobile_emulation)

# The driver is kept as a module-level global; every later cell uses it.
driver = webdriver.Chrome(options=options)

print("‚úì Browser opened with mobile emulation")
print(f"Sleep multiplier: {SLEEP_MULTIPLIER}x")

In [None]:
# Login to Instagram
def _click_if_present(xpath, timeout=10):
    """Click the element matching ``xpath`` if it becomes clickable in time.

    Best-effort helper for optional links and popups.

    Args:
        xpath: XPath expression locating the element.
        timeout: Seconds to wait for the element to become clickable.

    Returns:
        bool: True if the element was clicked, False otherwise.
    """
    try:
        element = WebDriverWait(driver, timeout).until(
            EC.element_to_be_clickable((By.XPATH, xpath))
        )
        element.click()
    except TimeoutException:
        # The element never appeared - expected whenever the popup is not shown.
        return False
    except Exception as e:
        # Still non-fatal (these elements are optional), but no longer silent,
        # and KeyboardInterrupt/SystemExit are no longer swallowed.
        print(f"  Could not click optional element: {e}")
        return False
    return True


def login_instagram(username, password):
    """Log in to Instagram through the global ``driver``.

    Opens the home page, follows the login link when one is present, fills in
    and submits the login form, then tries to dismiss the "Save Your Login
    Info" and "Turn on Notifications" popups.

    Args:
        username: Instagram username typed into the login form.
        password: Instagram password typed into the login form.

    Returns:
        bool: True if the form was submitted without an exception, False if
        any required step raised. The resulting page is not inspected, so
        True does not prove that Instagram accepted the credentials.
    """
    driver.get('https://www.instagram.com/')
    time.sleep(6 * SLEEP_MULTIPLIER)

    try:
        # Click the login link if we landed on the homepage
        _click_if_present("//a[contains(@href, '/accounts/login')]")

        # Enter username
        username_input = WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.NAME, "username"))
        )
        username_input.send_keys(username)

        # Enter password and submit the form
        password_input = driver.find_element(By.NAME, "password")
        password_input.send_keys(password)
        password_input.send_keys(Keys.RETURN)

        # Handle "Save Your Login Info" popup
        _click_if_present(
            "//button[contains(text(), 'Not now') or contains(text(), 'Not Now')]"
        )

        # Handle "Turn on Notifications" popup
        _click_if_present("//button[contains(text(), 'Not Now')]")

        print("‚úì Logged in successfully")
        return True

    except Exception as e:
        print(f"Login failed: {e}")
        return False

# Log in only when both credentials have been filled in
if not (USERNAME and PASSWORD):
    print("‚ö† No login credentials provided")
else:
    login_instagram(USERNAME, PASSWORD)

In [None]:
# Extract comments from current page

# CSS selectors used to locate the pieces of a comment in the page markup.
_COMMENT_SECTION_SELECTOR = 'div.html-div.xdj266r.x14z9mp.xat24cr.x1lziwak.xyri2b.x1c1uobl.x9f619.xjbqb8w.x78zum5.x15mokao.x1ga7v0g.x16uus16.xbiv7yw.xsag5q8.xz9dl7a.x1uhb9sk.x1plvlek.xryxfnj.x1c4vz4f.x2lah0s.x1q0g3np.xqjyukv.x1qjc9v5.x1oa3qoh.x1nhvcw1'
# The _aacw class marks the username span, _aacu marks regular text spans.
_AUTHOR_SELECTOR = 'span._ap3a._aaco._aacw._aacx._aad7._aade'
_TEXT_SELECTOR = 'span._ap3a._aaco._aacu._aacx._aad7._aade'
_LIKES_SELECTOR = 'span.x1lliihq.x1plvlek.xryxfnj.x1n2onr6.xyejjpt.x15dsfln.x193iq5w.xeuugli.x1fj9vlw.x13faqbe.x1vvkbs.x1s928wv.xhkezso.x1gmr53x.x1cpjm7i.x1fgarty.x1943h6x.x1i0vuye.x1fhwpqd.x1s688f.x1roi4f4.x1s3etm8.x676frb.x10wh9bi.xpm28yp.x8viiok.x1o7cslx'


def _parse_comment_section(section):
    """Turn one comment container element into a dict, or None if unusable.

    The dict always has 'author' and 'text'; 'media_url', 'timestamp',
    'timestamp_display' and 'likes' are added only when found.
    """
    author_spans = section.find_elements(By.CSS_SELECTOR, _AUTHOR_SELECTOR)
    if not author_spans:
        return None  # no author -> not a comment we can use

    author = author_spans[0].text
    comment_data = {'author': author}

    # Join every non-empty text span, leaving out any that just repeats the author.
    stripped_texts = (
        span.text.strip()
        for span in section.find_elements(By.CSS_SELECTOR, _TEXT_SELECTOR)
    )
    comment_data['text'] = " ".join(
        piece for piece in stripped_texts if piece and piece != author
    )

    # First image that is neither a profile picture nor an emoji counts as media.
    for img in section.find_elements(By.TAG_NAME, 'img'):
        src = img.get_attribute('src')
        if src and 'cdninstagram.com' in src and 's150x150' not in src:
            comment_data['media_url'] = src
            if not comment_data['text']:
                comment_data['text'] = "[Image/GIF]"
            break

    # Timestamp comes from the first <time> element, when there is one.
    time_elements = section.find_elements(By.TAG_NAME, 'time')
    if time_elements:
        first_time = time_elements[0]
        datetime_str = first_time.get_attribute('datetime')
        title_str = first_time.get_attribute('title')
        comment_data['timestamp'] = datetime_str
        comment_data['timestamp_display'] = title_str

    # Like count: the first matching span whose text mentions "like".
    for span in section.find_elements(By.CSS_SELECTOR, _LIKES_SELECTOR):
        label = span.text.strip()
        if 'like' in label.lower():
            comment_data['likes'] = label
            break

    # Keep the comment only with a non-empty author and some content.
    has_content = comment_data.get('text') or comment_data.get('media_url')
    if comment_data.get('author') and has_content:
        return comment_data
    return None


def extract_comments():
    """Collect the comments currently present in the page held by ``driver``.

    Returns:
        list[dict]: One dict per usable comment (see ``_parse_comment_section``).
        Sections that raise while being parsed are skipped; if locating the
        sections itself fails, the error is printed and whatever was collected
        so far is returned.
    """
    collected = []

    try:
        sections = driver.find_elements(By.CSS_SELECTOR, _COMMENT_SECTION_SELECTOR)

        for section in sections:
            try:
                parsed = _parse_comment_section(section)
            except Exception:
                # Skip individual comment if extraction fails
                continue
            if parsed is not None:
                collected.append(parsed)

    except Exception as e:
        print(f"  Error extracting comments: {e}")

    return collected

In [None]:
# Scrape comments for a single post
def _read_expected_comment_count(stats_file, post_id):
    """Return the comment count stored for ``post_id`` in ``stats_file``, else None."""
    if not stats_file.exists():
        return None
    try:
        with open(stats_file, 'r', encoding='utf-8') as f:
            stats = json.load(f)
        for stat in stats:
            # .get() so an entry without 'post_id' is skipped instead of
            # aborting the whole lookup.
            if stat.get('post_id') == post_id:
                # The count may be a JSON number or a string such as "1,234".
                raw_count = str(stat.get('comments', '0'))
                expected = int(raw_count.replace(',', ''))
                print(f"Expected comments: {expected}")
                return expected
    except (OSError, ValueError, TypeError, AttributeError) as e:
        print(f"Could not read expected comments: {e}")
    return None


def _comment_key(comment):
    """Return a hashable key used to recognise a comment seen on an earlier scroll."""
    author = comment.get('author', '')
    timestamp = comment.get('timestamp')
    if timestamp:
        return (author, timestamp)
    # Without a timestamp, author alone would collapse all of that author's
    # comments into one, so fall back to the comment text.
    return (author, None, comment.get('text', ''))


def _save_comments(comments, comments_file):
    """Write ``comments`` to ``comments_file`` as indented UTF-8 JSON."""
    with open(comments_file, 'w', encoding='utf-8') as f:
        json.dump(comments, f, indent=2, ensure_ascii=False)


def scrape_post_comments(post_id, post_dir, max_scrolls=200):
    """Scroll through a post's comments page and save every comment found.

    Loads ``https://www.instagram.com/p/<post_id>/comments/`` in the global
    ``driver`` and repeatedly extracts comments and scrolls to the bottom.
    It stops when three consecutive passes yield nothing new, when the
    expected comment count (from ``scrape_stats.json`` in the parent of
    ``post_dir``) is reached, or after ``max_scrolls`` scrolls. Results go to
    ``post_dir / "comments.json"`` every 10 passes and once more at the end.

    Args:
        post_id: Instagram post shortcode; also used to look up the expected count.
        post_dir: ``pathlib.Path`` of the post's folder.
        max_scrolls: Safety limit on the number of scrolls.

    Returns:
        int: Number of distinct comments collected.
    """
    print(f"\n{'='*60}")
    print(f"Scraping comments for post: {post_id}")
    print('='*60)

    # Navigate to comments page
    comments_url = f"https://www.instagram.com/p/{post_id}/comments/"
    driver.get(comments_url)
    time.sleep(6 * SLEEP_MULTIPLIER)

    comments_file = post_dir / "comments.json"
    all_comments = []
    seen_comment_keys = set()

    # Read expected comment count from scrape_stats.json if available
    expected_comments = _read_expected_comment_count(
        post_dir.parent / "scrape_stats.json", post_id
    )

    # Scroll and collect comments incrementally
    scroll_count = 0
    no_new_comments_count = 0

    print("Starting to scroll and collect comments...")

    while scroll_count < max_scrolls:
        # Extract comments from current view and keep the ones not seen before
        new_comments = 0
        for comment in extract_comments():
            key = _comment_key(comment)
            if key not in seen_comment_keys:
                seen_comment_keys.add(key)
                all_comments.append(comment)
                new_comments += 1

        print(f"  Scroll {scroll_count + 1}: Found {new_comments} new comments (Total: {len(all_comments)})")

        # Save comments incrementally every 10 scrolls
        if (scroll_count + 1) % 10 == 0:
            _save_comments(all_comments, comments_file)
            print(f"  üíæ Saved {len(all_comments)} comments to {comments_file}")

        # Stop once three passes in a row brought nothing new
        if new_comments == 0:
            no_new_comments_count += 1
            if no_new_comments_count >= 3:
                print("  No new comments found after 3 scrolls, stopping")
                break
        else:
            no_new_comments_count = 0

        # Report progress from 90% of the expected count; stop only at 100%
        if expected_comments and len(all_comments) >= expected_comments * 0.9:
            print(f"  Reached ~{int((len(all_comments) / expected_comments) * 100)}% of expected comments")
            if len(all_comments) >= expected_comments:
                break

        # Scroll down
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3 * SLEEP_MULTIPLIER)
        scroll_count += 1

    # Final save
    _save_comments(all_comments, comments_file)

    print(f"\n‚úì Scraped {len(all_comments)} comments")
    if expected_comments:
        percentage = int((len(all_comments) / expected_comments) * 100)
        print(f"  Coverage: {percentage}% of expected {expected_comments} comments")
    print(f"‚úì Saved to {comments_file}")

    return len(all_comments)

In [None]:
# Scrape comments for all posts of a user
def scrape_user_comments(username):
    """Scrape comments for every post folder of ``username``.

    Each sub-directory of ``BASE_DOWNLOAD_DIR / username`` is treated as a
    post whose folder name is the post id. Posts that already have a
    ``comments.json`` are skipped and their existing comments are counted;
    the others are scraped with ``scrape_post_comments``. The captured print
    log for this user is written to ``comments_log.txt`` in the user folder.

    Args:
        username: Instagram username; must match a folder under
            ``BASE_DOWNLOAD_DIR``.

    Returns:
        int | None: Total number of comments (existing plus newly scraped),
        or None if the user folder does not exist.
    """
    global CURRENT_USER, log_lines

    CURRENT_USER = username
    user_dir = BASE_DOWNLOAD_DIR / username

    if not user_dir.exists():
        print(f"‚ö† User directory not found: {user_dir}")
        print("  Please run the post scraper first to create post folders")
        return

    # Reset log for this user
    log_lines = []

    print(f"\n{'='*60}")
    print(f"Scraping comments for user: {username}")
    print('='*60)

    # Find all post directories
    post_dirs = [d for d in user_dir.iterdir() if d.is_dir()]
    print(f"Found {len(post_dirs)} post folders")

    total_comments = 0

    for i, post_dir in enumerate(post_dirs, 1):
        post_id = post_dir.name
        print(f"\nPost {i}/{len(post_dirs)}: {post_id}")

        # Skip if comments already exist (optional - comment out to re-scrape)
        comments_file = post_dir / "comments.json"
        if comments_file.exists():
            print("  ‚è© Comments already exist, skipping")
            try:
                # comments.json is written as UTF-8 with ensure_ascii=False,
                # so it must be read back as UTF-8 regardless of the locale.
                with open(comments_file, 'r', encoding='utf-8') as f:
                    existing = json.load(f)
                total_comments += len(existing)
                print(f"  Found {len(existing)} existing comments")
            except (OSError, ValueError, TypeError) as e:
                # An unreadable or malformed file is not fatal, but say so.
                print(f"  Could not read existing comments: {e}")
            continue

        try:
            count = scrape_post_comments(post_id, post_dir)
            total_comments += count
            time.sleep(5 * SLEEP_MULTIPLIER)
        except Exception as e:
            # One failing post must not stop the remaining posts.
            print(f"  Error scraping comments: {e}")
            continue

    print(f"\n{'='*60}")
    print(f"‚úì User '{username}' complete")
    print(f"  Total comments scraped: {total_comments}")
    print('='*60)

    # Save log file
    if log_lines:
        log_file = user_dir / "comments_log.txt"
        with open(log_file, 'w', encoding='utf-8') as f:
            f.write('\n'.join(log_lines))
        print(f"‚úì Log saved to {log_file}")

    return total_comments

In [None]:
# Main scraping loop: process each configured user in turn, pausing after each one
if not USERS_TO_SCRAPE:
    print("‚ö† No users to scrape. Add usernames to USERS_TO_SCRAPE list.")
else:
    for username in USERS_TO_SCRAPE:
        scrape_user_comments(username)
        time.sleep(6 * SLEEP_MULTIPLIER)

In [None]:
# Shut the browser down, then report completion
driver.quit()
for closing_message in ("\n‚úì Browser closed", "‚úì All done!"):
    print(closing_message)