In [5]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
%cd /content/drive/MyDrive/
!mkdir -p assignment_folder
%cd assignment_folder


/content/drive/MyDrive
/content/drive/MyDrive/assignment_folder


In [8]:
from google.colab import files
uploaded = files.upload()


Saving reddit.env to reddit (1).env


In [9]:
!pip install praw pandas python-dotenv


Collecting praw
  Downloading praw-7.8.1-py3-none-any.whl.metadata (9.4 kB)
Collecting prawcore<3,>=2.4 (from praw)
  Downloading prawcore-2.4.0-py3-none-any.whl.metadata (5.0 kB)
Collecting update_checker>=0.18 (from praw)
  Downloading update_checker-0.18.0-py3-none-any.whl.metadata (2.3 kB)
Downloading praw-7.8.1-py3-none-any.whl (189 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m189.3/189.3 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading prawcore-2.4.0-py3-none-any.whl (17 kB)
Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Installing collected packages: update_checker, prawcore, praw
Successfully installed praw-7.8.1 prawcore-2.4.0 update_checker-0.18.0


In [10]:
%%writefile reddit_code.py


Writing reddit_code.py


In [14]:
# reddit_code.py
# -------------------------------------------
# A friendly, step-by-step Reddit data collector
# - Loads your Reddit API credentials from reddit.env (kept private)
# - Collects "hot" posts from a few subreddits on the same theme
# - Runs a keyword search across those subreddits
# - Cleans and saves everything to reddit_data.csv
#
# Edit the CONFIG section near the bottom to choose your subreddits + keyword.
# -------------------------------------------

import os
import time
from typing import List, Dict, Optional

import pandas as pd
import praw
from dotenv import load_dotenv
from urllib.parse import urlparse


# Columns your instructor expects in the final CSV
REQUIRED_COLUMNS = [
    "title",
    "score",
    "upvote_ratio",
    "num_comments",
    "author",
    "subreddit",
    "url",
    "permalink",
    "created_utc",
    "is_self",
    "selftext",
    "flair",
    "domain",
    "search_query",
]


def load_reddit_from_env(env_path: str = "reddit.env") -> praw.Reddit:
    """
    Load credentials from reddit.env and return an authenticated PRAW client.

    Your reddit.env should contain:
      REDDIT_CLIENT_ID="..."
      REDDIT_CLIENT_SECRET="..."
      REDDIT_USER_AGENT="..."

    We do NOT hard-code secrets here. Keep reddit.env out of GitHub.
    """
    load_dotenv(env_path)

    client_id = os.getenv("REDDIT_CLIENT_ID")
    client_secret = os.getenv("REDDIT_CLIENT_SECRET")
    user_agent = os.getenv("REDDIT_USER_AGENT")

    missing = [k for k, v in {
        "REDDIT_CLIENT_ID": client_id,
        "REDDIT_CLIENT_SECRET": client_secret,
        "REDDIT_USER_AGENT": user_agent,
    }.items() if not v]

    if missing:
        raise ValueError(
            f"Missing credentials in {env_path}: {', '.join(missing)}. "
            "Open your reddit.env and fill them in."
        )

    # Create the PRAW client (polite rate-limiting)
    reddit = praw.Reddit(
        client_id=client_id,
        client_secret=client_secret,
        user_agent=user_agent,
        ratelimit_seconds=5,
    )
    return reddit


def safe_get_domain(url_value: Optional[str]) -> Optional[str]:
    """Extract domain from a URL safely (or return None)."""
    if not url_value:
        return None
    try:
        return urlparse(url_value).netloc
    except Exception:
        return None


def submission_to_row(submission, search_query: Optional[str] = None) -> Dict:
    """
    Convert a PRAW submission into a clean dictionary row matching REQUIRED_COLUMNS.
    We gracefully handle missing fields and truncate long text.
    """
    author_name = getattr(submission.author, "name", None) if submission.author else None
    body = getattr(submission, "selftext", None)
    if body:
        body = body[:500]  # keep at most 500 characters (per assignment)

    return {
        "title": getattr(submission, "title", None),
        "score": getattr(submission, "score", None),
        "upvote_ratio": getattr(submission, "upvote_ratio", None),
        "num_comments": getattr(submission, "num_comments", None),
        "author": author_name,
        "subreddit": str(getattr(submission, "subreddit", "")),
        "url": getattr(submission, "url", None),
        "permalink": f"https://www.reddit.com{getattr(submission, 'permalink', '')}",
        "created_utc": int(getattr(submission, "created_utc", 0)) if getattr(submission, "created_utc", None) else None,
        "is_self": getattr(submission, "is_self", None),
        "selftext": body,
        "flair": getattr(submission, "link_flair_text", None),
        "domain": safe_get_domain(getattr(submission, "url", None)),
        "search_query": search_query,  # provenance of how we found it
    }


def collect_hot_posts(reddit: praw.Reddit, subreddits: List[str], limit_per_sub: int) -> List[Dict]:
    """
    Pull 'hot' posts from each subreddit (limit_per_sub each).
    Return a list of dictionary rows ready for a DataFrame.
    """
    rows: List[Dict] = []
    total = 0

    for sr in subreddits:
        print(f"üî• Collecting hot posts from r/{sr} ‚Ä¶")
        try:
            for sub in reddit.subreddit(sr).hot(limit=limit_per_sub):
                rows.append(submission_to_row(sub))
                total += 1
            print(f"   ‚úÖ Done r/{sr}: {limit_per_sub} requested")
            time.sleep(1)  # be nice to the API
        except Exception as e:
            print(f"   ‚ö†Ô∏è Skipping r/{sr} due to error: {e}")

    print(f"üî• Summary (hot): collected {total} posts total.\n")
    return rows


def collect_search_posts(reddit: praw.Reddit, subreddits: List[str], query: str, limit_per_sub: int) -> List[Dict]:
    """
    Search for a keyword in each subreddit.
    Adds a 'search_query' value to each row for provenance.
    """
    rows: List[Dict] = []
    total = 0

    for sr in subreddits:
        print(f"üîé Searching '{query}' in r/{sr} ‚Ä¶")
        try:
            for sub in reddit.subreddit(sr).search(query, limit=limit_per_sub):
                rows.append(submission_to_row(sub, search_query=query))
                total += 1
            print(f"   ‚úÖ Done r/{sr}: {limit_per_sub} requested")
            time.sleep(1)
        except Exception as e:
            print(f"   ‚ö†Ô∏è Search failed for r/{sr}: {e}")

    print(f"üîé Summary (search): collected {total} posts total.\n")
    return rows


def save_clean_csv(rows: List[Dict], out_path: str = "reddit_data.csv") -> None:
    """
    Turn collected rows into a DataFrame, drop duplicates (by permalink), and save to CSV.
    """
    # Ensure DataFrame has all expected columns (and in a consistent order)
    df = pd.DataFrame(rows)
    for col in REQUIRED_COLUMNS:
        if col not in df.columns:
            df[col] = None
    df = df[REQUIRED_COLUMNS]

    # Deduplicate
    before = len(df)
    df = df.drop_duplicates(subset=["permalink"]).reset_index(drop=True)
    after = len(df)
    removed = before - after

    print(f"üßπ Removed {removed} duplicates. {after} unique rows remain.")
    df.to_csv(out_path, index=False)
    print(f"üíæ Saved clean data to {out_path}\n")


# --------------- CONFIG: EDIT THESE --------------- #
# Pick a topic and fill in 3 related subreddits
TOPIC_SUBREDDITS = ["Depression", "Anxiety", "Mindfulness"]

# Add 2‚Äì4 search keywords you want to collect posts about
SEARCH_KEYWORDS = ["therapy", "meditation", "mental health"]

POST_LIMIT = 50
ENV_FILE = "reddit.env"                                            # leave unless your file name differs
# -------------------------------------------------- #


def main():
    print("üöÄ Starting Reddit data collection ‚Ä¶")

    # 1Ô∏è‚É£ Load credentials and connect to Reddit
    reddit = load_reddit_from_env(ENV_FILE)

    # 2Ô∏è‚É£ Collect Hot posts
    hot_rows = collect_hot_posts(reddit, TOPIC_SUBREDDITS, POST_LIMIT)

    # 3Ô∏è‚É£ Search posts for multiple keywords
    search_rows = []
    for keyword in SEARCH_KEYWORDS:
        search_rows.extend(collect_search_posts(reddit, TOPIC_SUBREDDITS, keyword, POST_LIMIT))

    # 4Ô∏è‚É£ Combine everything and save to CSV
    all_rows = hot_rows + search_rows
    save_clean_csv(all_rows, out_path="reddit_data.csv")

    print("‚úÖ All done! Check reddit_data.csv in your folder.")



if __name__ == "__main__":
    main()


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



üöÄ Starting Reddit data collection ‚Ä¶
üî• Collecting hot posts from r/Depression ‚Ä¶
   ‚úÖ Done r/Depression: 50 requested


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



üî• Collecting hot posts from r/Anxiety ‚Ä¶
   ‚úÖ Done r/Anxiety: 50 requested


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



üî• Collecting hot posts from r/Mindfulness ‚Ä¶
   ‚úÖ Done r/Mindfulness: 50 requested


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



üî• Summary (hot): collected 150 posts total.

üîé Searching 'therapy' in r/Depression ‚Ä¶
   ‚úÖ Done r/Depression: 50 requested


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



üîé Searching 'therapy' in r/Anxiety ‚Ä¶
   ‚úÖ Done r/Anxiety: 50 requested


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



üîé Searching 'therapy' in r/Mindfulness ‚Ä¶
   ‚úÖ Done r/Mindfulness: 50 requested


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



üîé Summary (search): collected 150 posts total.

üîé Searching 'meditation' in r/Depression ‚Ä¶
   ‚úÖ Done r/Depression: 50 requested


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



üîé Searching 'meditation' in r/Anxiety ‚Ä¶
   ‚úÖ Done r/Anxiety: 50 requested


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



üîé Searching 'meditation' in r/Mindfulness ‚Ä¶
   ‚úÖ Done r/Mindfulness: 50 requested


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



üîé Summary (search): collected 150 posts total.

üîé Searching 'mental health' in r/Depression ‚Ä¶
   ‚úÖ Done r/Depression: 50 requested


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



üîé Searching 'mental health' in r/Anxiety ‚Ä¶
   ‚úÖ Done r/Anxiety: 50 requested


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



üîé Searching 'mental health' in r/Mindfulness ‚Ä¶
   ‚úÖ Done r/Mindfulness: 50 requested
üîé Summary (search): collected 150 posts total.

üßπ Removed 8 duplicates. 592 unique rows remain.
üíæ Saved clean data to reddit_data.csv

‚úÖ All done! Check reddit_data.csv in your folder.


In [15]:
import pandas as pd
df = pd.read_csv("reddit_data.csv")
df.head()


Unnamed: 0,title,score,upvote_ratio,num_comments,author,subreddit,url,permalink,created_utc,is_self,selftext,flair,domain,search_query
0,Our most-broken and least-understood rules is ...,2359,1.0,177,SQLwitch,depression,https://www.reddit.com/r/depression/comments/d...,https://www.reddit.com/r/depression/comments/d...,1572360722,True,We understand that most people who reply immed...,,www.reddit.com,
1,"Regular check-in post, with essential informat...",57,1.0,144,SQLwitch,depression,https://www.reddit.com/r/depression/comments/1...,https://www.reddit.com/r/depression/comments/1...,1744611968,True,Welcome to /r/depression's check-in post - a p...,,www.reddit.com,
2,I am so lonely,71,0.98,41,AcanthocephalaNo4327,depression,https://www.reddit.com/r/depression/comments/1...,https://www.reddit.com/r/depression/comments/1...,1761777921,True,Im 26 female and I cry myself to sleep every n...,,www.reddit.com,
3,Cried in Front of Daughter,234,0.98,55,CalendarUnusual9500,depression,https://www.reddit.com/r/depression/comments/1...,https://www.reddit.com/r/depression/comments/1...,1761750409,True,Yesterday my four-year old was having a tantru...,,www.reddit.com,
4,"As someone who‚Äôs already depressed, I can conf...",66,0.99,5,throwRA124452,depression,https://www.reddit.com/r/depression/comments/1...,https://www.reddit.com/r/depression/comments/1...,1761763401,True,This is real people‚Äôs emotions are real. Bully...,,www.reddit.com,


In [16]:
%%writefile README.md
# Reddit API Data Collection Assignment

This project connects to the Reddit API using **PRAW** to collect, clean, and export posts from several subreddits related to a chosen theme.

## Overview
The program fetches the *Hot* posts and also performs keyword-based searches across three chosen subreddits. The combined data is cleaned, deduplicated, and saved as `reddit_data.csv`.

## How to Run
1. **Requirements:** Python 3.8+
2. **Install dependencies:**


Writing README.md


In [20]:
%%writefile requirements.txt
praw
pandas
python-dotenv

Writing requirements.txt


In [21]:
%%writefile .gitignore
reddit.env
.env
venv/
__pycache__/


Writing .gitignore


In [22]:
!ls


 README.md	   reddit_code.py    reddit_data.gsheet   requirements.txt
'reddit (1).env'   reddit_data.csv   reddit.env


In [23]:
!git init
!git config --global user.name "AnshuMishra1732"
!git config --global user.email "anshu1732@gmail.com"
!git add .
!git commit -m "Initial commit - Reddit API Assignment"


[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /content/drive/MyDrive/assignment_folder/.git/
error: open("reddit_data.gsheet"): Operation not supported
error: unable to index file 'reddit_data.gsheet'
fatal: adding files failed
On branch master

Initial commit

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31m.gitignore[m
	[31mREADME.md[m
	[31mreddit (1).env[m
	[31mreddit_code.py[m
	[31mreddit_data.csv[m
	[31mreddit_data.gsheet[m
	[31mrequirements.txt[m

nothing added 

In [24]:
!rm reddit_data.gsheet

In [25]:
!git add .gitignore README.md reddit_code.py reddit_data.csv requirements.txt

In [26]:
!git commit -m "Initial commit - Reddit API Assignment"

[master (root-commit) 4819799] Initial commit - Reddit API Assignment
 5 files changed, 1882 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 reddit_code.py
 create mode 100644 reddit_data.csv
 create mode 100644 requirements.txt


In [36]:
from google.colab import userdata
github_token = userdata.get('GITHUB_PAT')
print('Token loaded?', bool(github_token))


Token loaded? True


In [39]:
from google.colab import userdata
from urllib.parse import quote  # ‚Üê needed for quote()

github_token = userdata.get('GITHUB_PAT')
username = "AnshuMishra1732"
repo_name = "reddit_api_assignment"

# encode token safely
safe_token = quote(github_token, safe='')

# build remote URL
remote_url = f"https://{username}:{safe_token}@github.com/{username}/{repo_name}.git"

# reset remote if needed and push
!git remote remove origin 2>/dev/null || true
!git remote add origin {remote_url}
!git branch -M main
!git push -u origin main


Branch 'main' set up to track remote branch 'main' from 'origin'.
Everything up-to-date


In [38]:
%%writefile README.md
# Reddit API Data Collection Assignment

This project connects to the Reddit API using **PRAW** to collect, clean, and export posts from several subreddits related to a chosen theme.

## Overview
The program fetches the *Hot* posts and also performs keyword-based searches across three chosen subreddits.
The combined data is cleaned, deduplicated, and saved as `reddit_data.csv`.

## How to Run

1. **Requirements:**
   - Python 3.8+
   - Installed libraries: `praw`, `pandas`, `python-dotenv`

2. **Install dependencies:**
   ```bash
   pip install -r requirements.txt


Overwriting README.md
