In [2]:
!pip install praw pandas tqdm nltk spacy gensim vaderSentiment matplotlib

Collecting praw
  Downloading praw-7.8.1-py3-none-any.whl.metadata (9.4 kB)
Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Collecting prawcore<3,>=2.4 (from praw)
  Downloading prawcore-2.4.0-py3-none-any.whl.metadata (5.0 kB)
Collecting update_checker>=0.18 (from praw)
  Downloading update_checker-0.18.0-py3-none-any.whl.metadata (2.3 kB)
Downloading praw-7.8.1-py3-none-any.whl (189 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m189.3/189.3 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m69.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━

In [None]:
#!/usr/bin/env python3
"""
Module 1: Data Collection with BAScraper (ArcticShift)
Collects Reddit data evenly distributed across 2019-2025 for EV Sentiment Analysis.
Uses ArcticShiftAsync for comprehensive historical coverage.
"""

import os
import sys
import json
import logging
import pandas as pd
import asyncio
from datetime import datetime, timedelta
# Ensure you have the BAScraper folder in your project directory
from BAScraper.BAScraper_async import ArcticShiftAsync
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Configure logging: each run writes to its own timestamped file under
# logs/ and mirrors the same records to the console stream.
log_dir = "logs"
os.makedirs(log_dir, exist_ok=True)
log_file = os.path.join(log_dir, f"ev_sentiment_collection_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log")
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit UTF-8: this script logs non-ASCII characters (e.g. "✓"),
        # which cannot be encoded by locale encodings such as cp1252.
        logging.FileHandler(log_file, encoding='utf-8'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# ==========================================
# CONFIGURATION: EV SENTIMENT ANALYSIS
# ==========================================

# Subreddits to search. The mix covers general EV and automotive talk,
# brand/model communities, charging infrastructure, purchase advice and a
# general technology audience.
SUBREDDITS = [
    # General EV discussion
    'electricvehicles',
    # General automotive (mix of pro/anti EV)
    'cars',
    # Tesla news/official
    'TeslaMotors',
    # Casual Tesla discussion
    'TeslaLounge',
    # Critical/skeptical view (important for negative sentiment)
    'RealTesla',
    # Competitor brand specific
    'Rivian',
    # Specific popular model (Hyundai)
    'Ioniq5',
    # Infrastructure specific (useful for the "charging" keyword)
    'evcharging',
    # Purchase advice (useful for "range anxiety")
    'whatcarshouldibuy',
    # General tech perspective on EVs
    'technology',
]

# Search terms, one query per (subreddit, keyword, period) combination.
KEYWORDS = [
    'EV',
    'charging',
    'battery',
    'range anxiety',
]

# (after, before) date strings, one pair per collection year (2019-2025).
# 2019-2024 are full calendar years; the 2025 window stops at 2025-12-01.
YEAR_PERIODS = [
    (f'{full_year}-01-01', f'{full_year}-12-31')
    for full_year in range(2019, 2025)
]
YEAR_PERIODS.append(('2025-01-01', '2025-12-01'))

# Targets
POSTS_PER_YEAR_TARGET = 300
COMMENTS_SAMPLE_SIZE = 500


class BAScraperCollector:
    """Reddit data collector using BAScraper with ArcticShift.

    Workflow (see ``run_collection``): search every subreddit/keyword pair
    for each period in ``YEAR_PERIODS``, de-duplicate submissions by id, draw
    a year-balanced sample of submissions, fetch comments for that sample,
    and persist everything with ``save_data``.

    Attributes:
        scraper: ``ArcticShiftAsync`` client used for every fetch.
        all_posts: submission dicts accumulated by ``collect_all_submissions``.
        all_comments: comment dicts stored by ``run_collection``.
    """

    def __init__(self):
        """Initialize BAScraper"""
        logger.info("Initializing BAScraper (ArcticShift) Data Collector for EV Analysis")

        # Initialize ArcticShiftAsync
        self.scraper = ArcticShiftAsync(
            save_dir=os.getcwd(),
            task_num=3,
            log_level='INFO',
            log_stream_level='INFO'
        )

        self.all_posts = []
        self.all_comments = []

    async def collect_submissions_for_period(self, subreddit, after, before, query_term):
        """Collect submissions for a specific period and query.

        Args:
            subreddit: subreddit name, without the ``r/`` prefix.
            after: start of the search window (date string).
            before: end of the search window (date string).
            query_term: keyword passed as the search query.

        Returns:
            A list of submission dicts. The list is empty when the search
            returns nothing or when the request raises; failures are logged
            and deliberately not propagated (best-effort collection).
        """
        try:
            logger.info(f"  Searching r/{subreddit} for '{query_term}' ({after} to {before})")

            result = await self.scraper.fetch(
                mode='submissions_search',
                subreddit=subreddit,
                query=query_term,
                after=after,
                before=before,
                limit=100,  # 100 per request
                sort='asc',
                fields=['id', 'title', 'selftext', 'author', 'created_utc', 'subreddit',
                        'score', 'num_comments', 'url', 'link_flair_text']
            )

            if not result:
                return []

            # The fetch result is consumed as a mapping; only its values
            # (the submission records) are kept.
            posts_list = list(result.values())
            logger.info(f"    Found {len(posts_list)} posts")
            return posts_list

        except Exception as e:
            logger.error(f"Error collecting from r/{subreddit}: {e}")
            return []

    async def collect_all_submissions(self):
        """Collect submissions across all years and subreddits.

        Every (subreddit, keyword) pair is queried once per period in
        ``YEAR_PERIODS``. Results are de-duplicated by submission id within
        each year and then appended to ``self.all_posts``.

        Returns:
            ``self.all_posts`` after the newly collected posts were appended.
        """
        logger.info("="*80)
        logger.info("COLLECTING SUBMISSIONS")
        logger.info("="*80)

        posts_by_year = {after[:4]: [] for after, _ in YEAR_PERIODS}

        for after, before in YEAR_PERIODS:
            year = after[:4]
            logger.info(f"\n{'='*60}")
            logger.info(f"YEAR: {year}")
            logger.info(f"{'='*60}")

            year_posts = []

            for subreddit in SUBREDDITS:
                for keyword in KEYWORDS:
                    posts = await self.collect_submissions_for_period(
                        subreddit, after, before, keyword
                    )
                    year_posts.extend(posts)
                    # Small delay to respect rate limits with increased subreddits
                    await asyncio.sleep(1.5)

            # Remove duplicates within year: the same post can match several
            # keywords, so key by id (the last occurrence wins).
            unique_posts = {p['id']: p for p in year_posts}
            posts_by_year[year] = list(unique_posts.values())

            logger.info(f"Year {year} total: {len(posts_by_year[year])} unique posts")

        # Flatten all posts
        for year_posts in posts_by_year.values():
            self.all_posts.extend(year_posts)

        logger.info(f"\n{'='*80}")
        logger.info(f"TOTAL SUBMISSIONS COLLECTED: {len(self.all_posts)}")
        logger.info(f"{'='*80}")

        return self.all_posts

    async def collect_comments_for_submissions(self, submission_ids, limit=30):
        """Collect comments for sampled submissions.

        Args:
            submission_ids: ids of the submissions whose comments are fetched.
            limit: maximum number of comments requested per submission.

        Returns:
            A flat list of comment dicts. Submissions whose request fails are
            skipped after a warning is logged.
        """
        logger.info(f"\nCollecting comments for {len(submission_ids)} submissions")

        all_comments = []

        for i, sub_id in enumerate(submission_ids):
            if (i + 1) % 50 == 0:
                logger.info(f"  Progress: {i+1}/{len(submission_ids)}")

            try:
                result = await self.scraper.fetch(
                    mode='comments_search',
                    link_id=sub_id,
                    limit=limit,
                    fields=['id', 'body', 'author', 'created_utc', 'subreddit',
                            'score', 'link_id', 'parent_id']
                )

                if result:
                    all_comments.extend(result.values())

            except Exception as e:
                # Logged at WARNING: the logger runs at INFO, so a DEBUG
                # record would hide every failed request.
                logger.warning(f"Error collecting comments for {sub_id}: {e}")

            # Rate limiting. Applied after failures too, so a run of errors
            # does not turn into back-to-back requests.
            await asyncio.sleep(0.5)

        logger.info(f"Collected {len(all_comments)} comments")
        return all_comments

    @staticmethod
    def _select_comment_sample(df, total_cap, per_year_cap=100, seed=42):
        """Pick a year-balanced sample of submission ids.

        Up to ``per_year_cap`` ids are drawn at random (seeded) from each
        year. If the combined draw exceeds ``total_cap`` it is trimmed
        round-robin across years, so no year is dropped wholesale the way a
        plain ``ids[:total_cap]`` slice would drop the last years.

        Args:
            df: DataFrame with at least the columns ``'id'`` and ``'year'``.
            total_cap: maximum number of ids to return.
            per_year_cap: maximum number of ids drawn from a single year.
            seed: ``random_state`` used for the per-year draws.

        Returns:
            A list of submission ids with at most ``total_cap`` entries.
        """
        if total_cap <= 0 or df.empty:
            return []

        per_year_ids = []
        for year in sorted(df['year'].unique()):
            year_posts = df[df['year'] == year]
            n_sample = max(0, min(per_year_cap, len(year_posts)))
            sampled = year_posts.sample(n=n_sample, random_state=seed)
            per_year_ids.append(sampled['id'].tolist())

        flat_ids = [post_id for ids in per_year_ids for post_id in ids]
        if len(flat_ids) <= total_cap:
            return flat_ids

        # Over the cap: take the 1st id of every year, then the 2nd, ...
        # until the cap is reached, keeping year counts within one of each
        # other (as far as each year has enough posts).
        selected = []
        for rank in range(per_year_cap):
            for ids in per_year_ids:
                if rank < len(ids):
                    selected.append(ids[rank])
                    if len(selected) == total_cap:
                        return selected
        return selected

    async def run_collection(self):
        """Main collection orchestration.

        Collects submissions, draws a year-balanced sample of them (at most
        100 per year and ``COMMENTS_SAMPLE_SIZE`` overall) and fetches the
        comments of the sampled submissions.

        Returns:
            ``(all_posts, all_comments)``; ``([], [])`` when no submission
            was collected.
        """
        logger.info("Starting BAScraper collection pipeline")
        logger.info(f"Target Subreddits ({len(SUBREDDITS)}): {SUBREDDITS}")
        logger.info(f"Target Keywords: {KEYWORDS}")

        # Collect submissions
        await self.collect_all_submissions()

        if not self.all_posts:
            logger.warning("No posts collected. Skipping comment collection.")
            return [], []

        # Derive the calendar year of every post from its epoch timestamp.
        df = pd.DataFrame(self.all_posts)
        df['created_datetime'] = pd.to_datetime(df['created_utc'], unit='s')
        df['year'] = df['created_datetime'].dt.year

        # Sample posts for comments, stratified by year.
        sample_ids = self._select_comment_sample(
            df, total_cap=COMMENTS_SAMPLE_SIZE, per_year_cap=100, seed=42
        )

        logger.info(f"\nSampled {len(sample_ids)} posts for comment collection")

        # Collect comments
        self.all_comments = await self.collect_comments_for_submissions(sample_ids)

        return self.all_posts, self.all_comments

    def save_data(self):
        """Save collected data to CSV files and write a JSON report.

        Writes ``data/raw/ev_posts.csv``, ``data/raw/ev_comments.csv`` and
        ``data/metadata/collection_report.json``, then logs a summary.

        Returns:
            ``(posts_df, comments_df)``. Both are empty DataFrames when there
            are no posts, in which case only ``data/raw`` is created and no
            file is written.
        """
        os.makedirs('data/raw', exist_ok=True)

        # Process and save posts
        if not self.all_posts:
            logger.error("No data to save.")
            return pd.DataFrame(), pd.DataFrame()

        posts_df = pd.DataFrame(self.all_posts)

        # Add engagement metrics (comments are weighted twice as much as score)
        posts_df['engagement_score'] = posts_df['score'] + (posts_df['num_comments'] * 2)

        # Add temporal features
        posts_df['created_datetime'] = pd.to_datetime(posts_df['created_utc'], unit='s')
        posts_df['year'] = posts_df['created_datetime'].dt.year
        posts_df['month'] = posts_df['created_datetime'].dt.month
        posts_df['quarter'] = posts_df['created_datetime'].dt.quarter

        posts_file = 'data/raw/ev_posts.csv'
        posts_df.to_csv(posts_file, index=False)
        logger.info(f"\n✓ Saved {len(posts_df)} posts to {posts_file}")

        # Process and save comments
        comments_df = pd.DataFrame(self.all_comments)
        if not comments_df.empty:
            comments_df['created_datetime'] = pd.to_datetime(comments_df['created_utc'], unit='s')

        comments_file = 'data/raw/ev_comments.csv'
        comments_df.to_csv(comments_file, index=False)
        logger.info(f"✓ Saved {len(comments_df)} comments to {comments_file}")

        subreddit_counts = posts_df['subreddit'].value_counts()

        # Generate collection report. Counts are coerced to built-in ints so
        # json.dump never receives numpy scalar types.
        report = {
            'collection_metadata': {
                'collection_date': datetime.now().isoformat(),
                'project': 'EV Sentiment Analysis',
                'scraper': 'BAScraper (ArcticShift)',
            },
            'data_summary': {
                'total_posts': len(self.all_posts),
                'total_comments': len(self.all_comments),
                'unique_authors_posts': int(posts_df['author'].nunique()),
                'unique_subreddits': int(posts_df['subreddit'].nunique()),
            },
            'configuration': {
                'subreddits': SUBREDDITS,
                'keywords': KEYWORDS,
                'year_periods': YEAR_PERIODS,
            },
            'subreddit_breakdown': {
                name: int(count) for name, count in subreddit_counts.items()
            }
        }

        os.makedirs('data/metadata', exist_ok=True)
        report_file = 'data/metadata/collection_report.json'
        with open(report_file, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2)
        logger.info(f"✓ Saved collection report to {report_file}")

        # Print summary
        logger.info(f"\n{'='*80}")
        logger.info("COLLECTION SUMMARY")
        logger.info(f"{'='*80}")

        # --- SUBREDDIT USAGE REPORT ---
        logger.info("\nSubreddits Used:")
        logger.info(f"  {', '.join(SUBREDDITS)}")

        logger.info("\nTotal Posts per Subreddit:")
        logger.info(f"{'-'*30}")
        logger.info(f"{'Subreddit':<20} | {'Count':<8}")
        logger.info(f"{'-'*30}")

        for sub in SUBREDDITS:
            # We use .get(sub, 0) in case a subreddit returned 0 results
            count = subreddit_counts.get(sub, 0)
            logger.info(f"  r/{sub:<18} | {count}")
        logger.info(f"{'-'*30}")
        # ------------------------------

        logger.info("\nTemporal Distribution:")
        for year, count in sorted(posts_df['year'].value_counts().items()):
            pct = (count / len(posts_df)) * 100
            logger.info(f"  {year}: {count:4d} posts ({pct:5.1f}%)")

        logger.info("\nEngagement Metrics:")
        logger.info(f"  Mean Score: {posts_df['score'].mean():.1f}")
        logger.info(f"  Mean Comments: {posts_df['num_comments'].mean():.1f}")

        return posts_df, comments_df


async def main():
    """Run the collection pipeline end to end and log the outcome.

    Returns:
        0 when collection and saving complete without raising; 1 when any
        exception escapes (it is logged together with its traceback).
    """
    banner = "=" * 80

    try:
        logger.info(banner)
        logger.info("EV SENTIMENT ANALYSIS - DATA COLLECTION")
        logger.info(banner)

        collector = BAScraperCollector()
        await collector.run_collection()
        saved_posts, saved_comments = collector.save_data()

        logger.info("\n" + banner)
        logger.info("✓ DATA COLLECTION COMPLETE!")
        logger.info(banner)
        logger.info("\nCollected:")
        logger.info(f"  - {len(saved_posts)} posts")
        logger.info(f"  - {len(saved_comments)} comments")
        logger.info("\nNext step: Run preprocessing for Topic Modelling (LDA) and VADER.")
    except Exception as exc:
        logger.error(f"Collection failed: {exc}", exc_info=True)
        return 1
    else:
        return 0


# Script entry point: drive the async pipeline to completion and use its
# return value as the process exit status.
if __name__ == "__main__":
    exit_status = asyncio.run(main())
    sys.exit(exit_status)