In [None]:
# Import the necessary libraries
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd

In [None]:
# Objective: Scrape posts from a Reddit search results page.

# Start a Chrome session driven by Selenium.
driver = webdriver.Chrome()

# The page-load timeout must be set BEFORE navigating: it only applies to
# navigations started after the call, so setting it after get() had no effect
# on this load. If the page takes longer than 10 seconds, get() raises
# selenium.common.exceptions.TimeoutException.
driver.set_page_load_timeout(10)
driver.get('https://www.reddit.com/search/?q=gpt+store&type=link&cId=0319bff9-d7ff-451d-8b06-0fd060649b07&iId=d5d9a0d6-65bf-4715-8765-1353b0982e89')

In [None]:
# Capture the page source from the browser session as a string.
# Scroll the page manually as far as needed BEFORE running this cell, so that
# every post you want to scrape is already present in the captured source.
page_content = driver.page_source

In [None]:
# Define a function that takes page content as input and outputs dataframe with scraped data.
def scrape_subreddit(page_content):
    """Parse Reddit page HTML and return one DataFrame row per post.

    Parameters
    ----------
    page_content : str
        HTML source of the page (e.g. ``driver.page_source``).

    Returns
    -------
    pandas.DataFrame
        One row per ``<shreddit-post>`` element with the columns
        ``Title``, ``Text``, ``Author``, ``Timestamp``, ``No. of Upvotes``,
        ``No. of Comments`` and ``URL``. Any value that cannot be found in
        the markup is filled with the string ``'N/A'``. If the page contains
        no posts, an empty frame with the same columns is returned.
    """
    missing = 'N/A'
    columns = [
        'Title',
        'Text',
        'Author',
        'Timestamp',
        'No. of Upvotes',
        'No. of Comments',
        'URL',
    ]

    # Parsing page content as BeautifulSoup object
    soup = BeautifulSoup(page_content, 'html.parser')

    posts = []

    # Each post on the page is rendered as a <shreddit-post> custom element.
    for post in soup.find_all('shreddit-post'):
        # Title: anchor identified by its full class string.
        title_element = post.find('a', class_='block font-semibold text-neutral-content-strong m-0 visited:text-neutral-content-weak text-16 xs:text-18 mb-2xs xs:mb-xs')

        # Text: first <p> inside the 'mb-xs' container, when both exist.
        text_element = post.find('div', class_='mb-xs')
        text_paragraph = text_element.find('p') if text_element is not None else None

        # Author: the 'whitespace-nowrap' span nested inside the author link.
        author_element = post.find('a', class_='flex items-center text-neutral-content visited:text-neutral-content-weak font-bold a no-visited no-underline hover:no-underline')
        author_span = (
            author_element.find('span', class_='whitespace-nowrap')
            if author_element is not None
            else None
        )

        # Every key is always assigned. Previously the fallback branches were
        # bare 'N/A' expressions that did nothing, so missing fields surfaced
        # as NaN (or a missing column) instead of the intended placeholder.
        posts.append({
            'Title': title_element.text.strip() if title_element is not None else missing,
            'Text': text_paragraph.text.strip() if text_paragraph is not None else missing,
            'Author': author_span.text if author_span is not None else missing,
            # The remaining fields are attributes on the <shreddit-post> tag itself.
            'Timestamp': post.get('created-timestamp', missing),
            'No. of Upvotes': post.get('score', missing),
            'No. of Comments': post.get('comment-count', missing),
            'URL': post.get('content-href', missing),
        })

    # Passing explicit columns keeps the schema stable even when no posts were found.
    return pd.DataFrame(posts, columns=columns)

In [None]:
# Parse the captured HTML into a DataFrame of posts.
df = scrape_subreddit(page_content)

In [None]:
# Shut down the browser session; the HTML was already captured in page_content,
# so the scraping above does not need the live driver any more.
driver.quit()

In [None]:
# Define a function that takes the dataframe and filename as input and writes an excel file to disk
def write_to_disk(df, filename, out_dir='C:\\Users\\BINARY COMPUTERS\\Documents'):
    """Write ``df`` to ``<out_dir>/<filename>.xlsx`` on sheet ``Sheet1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to export.
    filename : str
        File name without the ``.xlsx`` extension.
    out_dir : str, optional
        Directory to write into. Defaults to the original hard-coded
        Documents folder so existing calls keep working; pass another
        directory to run this on a different machine.
    """
    import os

    out_path = os.path.join(out_dir, f'{filename}.xlsx')

    # Use the writer as a context manager: ExcelWriter.save() was removed in
    # pandas 2.0, and the with-block also closes the file if to_excel() raises.
    with pd.ExcelWriter(out_path, engine='xlsxwriter') as writer:
        df.to_excel(writer, sheet_name='Sheet1')

    print(f"Write Complete. You can access the file at {out_path}")

In [None]:
# Export the scraped posts to an Excel workbook.
# NOTE(review): the file name 'r_cars' does not match the 'gpt store' search
# query used when loading the page — confirm this is the intended name.
write_to_disk(df, 'r_cars')