In [2]:
"""
data_collection.py

Usage:
- If you already have a CSV with columns 'text' and (optional) 'label', pass it with: python data_collection.py --mode csv --csv path/to/your.csv
- Otherwise run from a terminal:
    python data_collection.py --mode scrape --query "your keyword" --limit 500 --out data/raw/tweets_scraped.csv

Notes for Jupyter / IPython:
- This file will NOT run its CLI entrypoint automatically when executed inside a notebook.
- In a notebook call fetch_with_snscrape(...) directly, or run the script in a terminal.
"""
import argparse
import os
import sys
import pandas as pd
from tqdm import tqdm

def fetch_with_snscrape(query, limit=500, out_csv="social-media-sentiment-analysis/data/raw/tweets_scraped.csv"):
    """Scrape tweets matching *query* with snscrape and write them to a CSV file.

    Args:
        query: Search string handed to ``TwitterSearchScraper``.
        limit: Maximum number of tweets to collect (int). Values <= 0 collect nothing.
        out_csv: Destination CSV path. Missing parent directories are created;
            a bare filename (no directory part) is written to the current directory.

    Returns:
        The ``out_csv`` path that was written.

    Raises:
        RuntimeError: If snscrape cannot be imported.
    """
    from itertools import islice

    try:
        import snscrape.modules.twitter as sntwitter
    except ImportError as err:
        # Only a missing/unimportable package maps to the "install it" hint;
        # chaining keeps the underlying import failure visible in the traceback.
        raise RuntimeError("snscrape is required for scraping. Install with: pip install snscrape") from err

    columns = ["date", "id", "user", "text", "likeCount"]
    max_items = max(limit, 0)
    # islice stops after exactly max_items tweets, so no extra item is pulled
    # from the scraper just to discover that the limit was reached.
    tweets = islice(sntwitter.TwitterSearchScraper(query).get_items(), max_items)
    rows = [
        {
            "date": tweet.date.isoformat(),
            "id": tweet.id,
            "user": tweet.user.username,
            "text": tweet.content,
            "likeCount": tweet.likeCount,
        }
        for tweet in tqdm(tweets, total=max_items, desc="scraping")
    ]

    # os.path.dirname returns "" for a bare filename, and os.makedirs("") raises.
    out_dir = os.path.dirname(out_csv)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Fixing the columns keeps the header row even when nothing was scraped,
    # so the resulting file can still be read back as a CSV.
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(out_csv, index=False, encoding="utf-8")
    print("Saved:", out_csv)
    return out_csv

def main(argv=None):
    """Command-line entry point: validate an existing CSV or scrape a new one.

    Args:
        argv: Optional list of argument strings. When ``None`` the arguments
            are taken from ``sys.argv``. Unrecognised arguments are ignored so
            that hosts which inject extra flags (e.g. Jupyter) do not abort parsing.

    Raises:
        SystemExit: In ``csv`` mode when ``--csv`` is missing or the file does not exist.
    """
    cli = argparse.ArgumentParser(prog="data_collection.py")
    cli.add_argument("--mode", choices=["csv", "scrape"], default="csv")
    cli.add_argument("--csv", help="Path to CSV with columns text,label (if mode=csv)")
    cli.add_argument("--query", help="Search query for snscrape (if mode=scrape)", default="product review")
    cli.add_argument("--limit", type=int, default=500)
    cli.add_argument("--out", default="social-media-sentiment-analysis/data/raw/tweets_scraped.csv")

    # parse_known_args(None) reads sys.argv itself, so one call covers both cases.
    opts, _extras = cli.parse_known_args(argv)

    # Scrape mode: delegate and finish.
    if opts.mode != "csv":
        fetch_with_snscrape(opts.query, limit=opts.limit, out_csv=opts.out)
        return

    # CSV mode: the path must be supplied and must exist.
    csv_path = opts.csv
    if not csv_path:
        raise SystemExit("Provide --csv path when mode=csv")
    if not os.path.exists(csv_path):
        raise SystemExit(f"CSV not found: {csv_path}")
    print("Using existing CSV:", csv_path)

if __name__ == "__main__":
    # Skip the CLI when running inside IPython/Jupyter: importing IPython fails
    # when it is not installed, and get_ipython() returns None when no
    # interactive shell is active. Either case is treated as "plain script".
    try:
        from IPython import get_ipython  # type: ignore
        inside_notebook = get_ipython() is not None
    except Exception:
        inside_notebook = False

    if not inside_notebook:
        # Normal command-line invocation.
        main()
    else:
        # Explain how to use the module from a notebook instead of parsing argv.
        notebook_help = (
            "Detected IPython/Jupyter environment. The script will not run the CLI entrypoint here.\n"
            "To use this functionality in a notebook, either:\n"
            "  - Call fetch_with_snscrape(query, limit, out_csv) directly from a cell, e.g.:\n"
            "      from data_collection import fetch_with_snscrape\n"
            "      fetch_with_snscrape('product review', limit=100, out_csv='data/raw/tweets.csv')\n"
            "  - Or run the script in a terminal (cmd/PowerShell):\n"
            "      python data_collection.py --mode scrape --query \"your keyword\" --limit 500 --out data/raw/tweets_scraped.csv\n"
        )
        print(notebook_help)

Detected IPython/Jupyter environment. The script will not run the CLI entrypoint here.
To use this functionality in a notebook, either:
  - Call fetch_with_snscrape(query, limit, out_csv) directly from a cell, e.g.:
      from data_collection import fetch_with_snscrape
      fetch_with_snscrape('product review', limit=100, out_csv='data/raw/tweets.csv')
  - Or run the script in a terminal (cmd/PowerShell):
      python data_collection.py --mode scrape --query "your keyword" --limit 500 --out data/raw/tweets_scraped.csv

