<a href="https://colab.research.google.com/github/sunflowerseed17/PythonDSProject/blob/main/Data_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Importing dependencies
!pip install praw

import os
import re
import time
from datetime import datetime, timedelta, timezone

import nltk
import praw
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize



### Scrape Reddit For Depression Posts (ONLY RUN ONCE)

In [None]:
# Configure Reddit API
# Credentials are read from the environment so that secrets are not stored in
# the notebook itself. Set REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET before
# running this cell; a KeyError naming the missing variable is raised otherwise.
# REDDIT_USER_AGENT is optional and falls back to the project's user agent.
# NOTE(review): the client id/secret that used to be hard-coded here were
# committed to version control and should be revoked and regenerated.
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    user_agent=os.environ.get("REDDIT_USER_AGENT", "TextScraper by u/Jammberg"),
)

# List of related subreddits
subreddits = ["depression", "breastcancer", "agoraphobia"]

# Regex for first-person diagnosis statements, i.e. phrasing variations of
# "I have been diagnosed with ...".
#
# The auxiliaries in the first alternative ("was", "am", "have been", ...) must
# be followed by "diagnosed with"; without that requirement the pattern matched
# any sentence starting with "I was ..." or "I am ...". The remaining
# alternatives already carry their own diagnosis wording. The leading \b stops
# the "i" from matching the tail of another word (e.g. "Kali was ...").
#
# Group 1 is the whole statement up to the end of the line; group 2 is the
# diagnosis phrase that matched.
search_pattern = re.compile(
    r"(\bi\s+("
    r"(?:was|am|have\s+been|got|recently\s+got|just\s+got|was\s+just|had\s+been|"
    r"found\s+out\s+i\s+was)\s+diagnosed\s+with|"
    r"was\s+diagnosed\s+as\s+having|(?:was\s+)?diagnosed\s+as\s+suffering\s+from|"
    r"got\s+diagnosed\s+as\s+having|received\s+a\s+diagnosis\s+of|"
    r"was\s+told\s+i\s+have|was\s+informed\s+i\s+have"
    r")\s+.*)",
    re.IGNORECASE,
)

# Function to create output folder
def create_folder(folder_name):
    """Create ``folder_name`` (including missing parents) if it does not exist.

    Calling this for a directory that already exists is a no-op. Using
    ``exist_ok=True`` avoids the race between checking for the directory and
    creating it.

    Args:
        folder_name: Path of the directory to create.

    Raises:
        FileExistsError: If ``folder_name`` exists but is not a directory.
    """
    os.makedirs(folder_name, exist_ok=True)

# Function to save posts to file
def save_post(post, output_folder, subreddit_name):
    """Write one Reddit submission to ``<output_folder>/<subreddit_name>_<post.id>.txt``.

    The file starts with six header lines (subreddit, title, author, score,
    creation time in UTC, URL), followed by a blank line and the post body.
    Errors are reported on stdout and never propagate, so one bad post does
    not interrupt a scraping run.

    Args:
        post: Submission-like object exposing ``id``, ``subreddit.display_name``,
            ``title``, ``author``, ``score``, ``created_utc``, ``url`` and
            ``selftext``.
        output_folder: Existing directory the file is written into.
        subreddit_name: Prefix used for the file name.
    """
    filename = f"{subreddit_name}_{post.id}.txt"
    filepath = os.path.join(output_folder, filename)
    try:
        # datetime.utcfromtimestamp() is deprecated; convert via an aware UTC
        # datetime and drop tzinfo so the rendered text keeps the same format.
        created = datetime.fromtimestamp(post.created_utc, tz=timezone.utc).replace(tzinfo=None)
        # Build the whole file content before opening the file, so a failing
        # attribute lookup does not leave a truncated file behind.
        content = (
            f"Subreddit: {post.subreddit.display_name}\n"
            f"Title: {post.title}\n"
            f"Author: {post.author}\n"
            f"Score: {post.score}\n"
            f"Created UTC: {created}\n"
            f"URL: {post.url}\n"
            "\n"
            f"{post.selftext}"
        )
        with open(filepath, "w", encoding="utf-8") as file:
            file.write(content)
        print(f"Saved post to {filepath}")
    except Exception as e:
        print(f"Error saving post {post.id}: {e}")

# Function to fetch user posts within one month of a specific post
def fetch_user_posts(author_name, reference_date, output_folder, subreddit_name):
    """Save an author's submissions from the 30 days up to ``reference_date``.

    Submissions dated between ``reference_date - 30 days`` and
    ``reference_date`` (both inclusive) are written with ``save_post``.
    Iteration stops at the first submission older than the window, which
    relies on the listing being ordered newest first. Errors are reported on
    stdout and never propagate.

    Args:
        author_name: Reddit user name; a falsy value (e.g. ``None`` for a
            deleted account) makes the function return immediately.
        reference_date: Naive ``datetime`` in UTC marking the end of the window.
        output_folder: Directory passed through to ``save_post``.
        subreddit_name: File-name prefix passed through to ``save_post``.
    """
    if not author_name:
        print("Author not available for this post.")
        return

    try:
        author = reddit.redditor(author_name)
        one_month_ago = reference_date - timedelta(days=30)
        for user_post in author.submissions.new(limit=None):
            # datetime.utcfromtimestamp() is deprecated; build an aware UTC
            # datetime and drop tzinfo so it stays comparable with the naive
            # reference_date.
            post_date = datetime.fromtimestamp(
                user_post.created_utc, tz=timezone.utc
            ).replace(tzinfo=None)
            if post_date < one_month_ago:
                break
            if post_date <= reference_date:
                save_post(user_post, output_folder, subreddit_name)
            time.sleep(2)  # To avoid hitting rate limits
    except Exception as e:
        print(f"Error fetching posts for user {author_name}: {e}")

# Function to fetch and save posts from a subreddit
def fetch_posts_from_subreddit(subreddit_name):
    """Scrape diagnosis posts from one subreddit, plus their authors' recent posts.

    Every new submission whose non-empty body matches ``search_pattern`` is
    saved with ``save_post``; the author's submissions from the preceding 30
    days are then saved via ``fetch_user_posts``. Files go to
    ``reddit_scraped_posts/<subreddit_name>``, which is created if needed.
    Errors raised while iterating the listing are reported on stdout.

    NOTE(review): the preprocessing step reads from
    ``data/reddit_scraped_posts/<name>`` while this function writes to
    ``reddit_scraped_posts/<name>`` relative to the working directory. Confirm
    the files are moved, or align the paths.

    Args:
        subreddit_name: Name of the subreddit without the ``r/`` prefix.
    """
    print(f"\nFetching posts from r/{subreddit_name}...\n")
    subreddit = reddit.subreddit(subreddit_name)
    output_folder = f"reddit_scraped_posts/{subreddit_name}"
    create_folder(output_folder)

    try:
        for post in subreddit.new(limit=None):
            body = post.selftext
            # Ensure selftext is not empty or just whitespace
            if body.strip() and search_pattern.search(body):
                # Naive UTC datetime (datetime.utcfromtimestamp() is deprecated).
                reference_date = datetime.fromtimestamp(
                    post.created_utc, tz=timezone.utc
                ).replace(tzinfo=None)
                save_post(post, output_folder, subreddit_name)
                # A deleted account has no author object. Dereferencing it here
                # would raise and abort the whole subreddit, so pass None and
                # let fetch_user_posts report it and move on.
                author_name = post.author.name if post.author is not None else None
                fetch_user_posts(author_name, reference_date, output_folder, subreddit_name)
            time.sleep(2)  # To avoid hitting rate limits
    except Exception as e:
        print(f"Error fetching posts from r/{subreddit_name}: {e}")

# Main scraping logic
# Scrape each configured subreddit in turn. fetch_posts_from_subreddit reports
# errors raised while iterating posts instead of raising, so the loop moves on
# to the next subreddit after such a failure.
for subreddit_name in subreddits:
    fetch_posts_from_subreddit(subreddit_name)

# Printed once the loop has finished, whether or not individual subreddits
# reported errors.
print("\nScraping complete! Text files saved in the respective folders.")

### Preprocessing
#### The code below does the following:
1. Tokenization: splitting text into individual tokens (words or punctuation marks).
2. Noise removal: cleaning the text by removing URLs, punctuation, and stop words.
3. Stemming: reducing words to their root form (e.g., "running" -> "run").
4. Lowercasing: converting text to lowercase for consistency.

In [2]:
# Download NLTK data files (only the first time)
nltk.download('punkt')
# Recent NLTK releases (3.9+) load the Punkt models used by word_tokenize from
# the separate 'punkt_tab' package. Without it tokenization raises LookupError,
# which preprocess_text would swallow and turn into empty output files.
# On older releases that do not know this package, the download call is
# expected to just report that it was not found.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Initialize tools
stop_words = set(stopwords.words('english'))  # set for O(1) membership tests
stemmer = PorterStemmer()

# Define folders for input and output
# NOTE(review): "agoraphobia" is scraped (see `subreddits`) but has no entry
# here, so those posts are never preprocessed. Confirm this is intentional.
folders = {
    "depression": {
        "input": "data/reddit_scraped_posts/depression",
        "output": "data/preprocessed_posts/depression"
    },
    "breastcancer": {
        "input": "data/reddit_scraped_posts/breastcancer",
        "output": "data/preprocessed_posts/breastcancer"
    }
}

# Ensure output folders exist
for paths in folders.values():
    os.makedirs(paths["output"], exist_ok=True)

# Matches http(s):// and www. links up to the next whitespace character.
_URL_PATTERN = re.compile(r"(?:https?://|www\.)\S+", re.IGNORECASE)


def preprocess_text(text):
    """
    Preprocess the given text:
    - Strip URLs
    - Tokenize into words
    - Lowercase and remove non-alphanumeric tokens
    - Remove stop words
    - Apply stemming

    URLs are removed before tokenization because the tokenizer can split a
    link into pieces, and alphanumeric fragments such as "https" would
    otherwise pass the ``isalnum`` filter and end up in the output.

    Returns the processed tokens joined by single spaces, or an empty string
    if preprocessing fails (the error is printed).
    """
    try:
        # Replace links with a space so neighbouring words stay separated
        without_urls = _URL_PATTERN.sub(" ", text)

        # Tokenize, then drop punctuation and stop words; lowercase the rest
        kept = [
            token.lower() for token in word_tokenize(without_urls)
            if token.isalnum() and token.lower() not in stop_words
        ]

        # Stem each remaining token and join back into a single string
        return " ".join(stemmer.stem(token) for token in kept)
    except Exception as e:
        print(f"Error preprocessing text: {e}")
        return ""

# Walk every configured category and preprocess its scraped posts.
for category, paths in folders.items():
    input_folder, output_folder = paths["input"], paths["output"]

    # Nothing to do for a category whose scraped posts are missing
    if not os.path.exists(input_folder):
        print(f"Input folder does not exist: {input_folder}")
        continue

    print(f"\nProcessing {category} posts...")

    for filename in os.listdir(input_folder):
        # Only the scraper's .txt files are of interest
        if not filename.endswith(".txt"):
            continue

        input_filepath = os.path.join(input_folder, filename)
        output_filepath = os.path.join(output_folder, filename)

        try:
            with open(input_filepath, encoding="utf-8") as infile:
                text = infile.read()

            # Drop the six header lines written by the scraper; the body follows
            post_content = "\n".join(text.splitlines()[6:]).strip()

            # Empty bodies and posts containing '[removed]' are skipped
            if not post_content or '[removed]' in post_content.lower():
                print(f"Skipping {filename} (empty or contains '[removed]')")
                continue

            preprocessed_text = preprocess_text(post_content)

            # The cleaned text is stored under the same file name in the output folder
            with open(output_filepath, "w", encoding="utf-8") as outfile:
                outfile.write(preprocessed_text)

            print(f"Processed {filename} into {output_folder}")
        except Exception as e:
            print(f"Error processing file {filename}: {e}")

print("\nPreprocessing complete! Preprocessed files are saved in the respective output folders.")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nataszasiwinska/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nataszasiwinska/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Processing depression posts...
Processed depression_1hry34i.txt into data/preprocessed_posts/depression
Processed depression_1i0icf3.txt into data/preprocessed_posts/depression
Processed depression_1hw2usc.txt into data/preprocessed_posts/depression
Processed depression_1i03mlo.txt into data/preprocessed_posts/depression
Processed depression_1i1rvag.txt into data/preprocessed_posts/depression
Processed depression_1i02lur.txt into data/preprocessed_posts/depression
Processed depression_1hyr52w.txt into data/preprocessed_posts/depression
Processed depression_1i0mvlm.txt into data/preprocessed_posts/depression
Processed depression_1hh3b1g.txt into data/preprocessed_posts/depression
Skipping depression_1hvus80.txt (empty or contains '[removed]')
Processed depression_1i1nfyx.txt into data/preprocessed_posts/depression
Processed depression_1hpku4n.txt into data/preprocessed_posts/depression
Processed depression_1hgt7ji.txt into data/preprocessed_posts/depression
Processed depression_1i1og6b