In [None]:
# Import the necessary libraries
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
import time
from datetime import datetime

In [None]:
# Objective: Scrape posts from a Reddit Search.

# Start a Chrome session for the search-results page.
driver = webdriver.Chrome()
# The page-load timeout only applies to navigations started after it is set,
# so it must be configured before driver.get() to cover this initial load.
driver.set_page_load_timeout(10)
driver.get('https://www.reddit.com/search/?q=gpt+store&type=link&cId=0319bff9-d7ff-451d-8b06-0fd060649b07&iId=d5d9a0d6-65bf-4715-8765-1353b0982e89')

In [None]:
# Import page content. Make sure to scroll the page as much as needed before executing this command.
# Only what the browser has rendered when this cell runs is captured; re-run the cell after scrolling further.
page_content = driver.page_source

In [None]:
# Define a function that takes page content as input and outputs dataframe with scraped data.
def scrape_reddit_search(page_content):
    """Extract one row per post from the HTML of a Reddit search-results page.

    Args:
        page_content: HTML source of the search page (e.g. ``driver.page_source``).

    Returns:
        pandas.DataFrame with the columns 'Subreddit', 'Text', 'Timestamp',
        'Days Old', 'No. of Votes', 'No. of Comments' and 'URL', one row per
        post container found. A field whose element is absent is filled with
        'N/A', except 'Days Old', which is ``pd.NaT`` so the column stays
        usable for timedelta sorting and comparison. The columns are present
        even when no post is found.
    """
    missing = 'N/A'
    columns = ['Subreddit', 'Text', 'Timestamp', 'Days Old',
               'No. of Votes', 'No. of Comments', 'URL']

    # Parsing page content as BeautifulSoup object
    soup = BeautifulSoup(page_content, 'html.parser')

    # One container div per post; the class strings used below are tied to the
    # page markup and must be updated if it changes.
    posts_raw = soup.find_all('div', class_='w-full flex flex-col items-start min-w-0')

    today = datetime.now().date()
    posts = []

    for post in posts_raw:
        # Start from a fully populated record so every row has every column.
        post_data = dict.fromkeys(columns, missing)
        post_data['Days Old'] = pd.NaT

        # Getting the Subreddit
        subreddit_element = post.find('a', class_='flex items-center text-neutral-content-weak font-semibold')
        if subreddit_element is not None:
            post_data['Subreddit'] = subreddit_element.text.strip()

        # Getting the Text and the URL (both come from the title link)
        text_element = post.find('a', class_='text-16 xs:text-18 line-clamp-3 text-ellipsis text-neutral-content-strong font-semibold mb-xs no-underline hover:no-underline visited:text-neutral-content-weak')
        if text_element is not None:
            post_data['Text'] = text_element.text.strip()
            href = text_element.get('href')
            if href:
                post_data['URL'] = 'https://www.reddit.com' + href

        # Getting timestamp and No. of Days Old
        time_element = post.find('time')
        if time_element is not None:
            post_data['Timestamp'] = time_element.text
            iso_stamp = time_element.get('datetime')
            if iso_stamp:
                try:
                    date_obj = datetime.strptime(iso_stamp.split('T')[0], '%Y-%m-%d').date()
                except ValueError:
                    # Unparseable date: leave 'Days Old' as NaT.
                    pass
                else:
                    post_data['Days Old'] = today - date_obj

        # Getting No. of Votes and No. of Comments: the first and second
        # faceplate-number inside the counters div. Either may be absent, so
        # guard instead of indexing blindly.
        counters_element = post.find('div', class_='text-neutral-content-weak text-12')
        counters = counters_element.find_all('faceplate-number') if counters_element is not None else []
        if len(counters) > 0:
            post_data['No. of Votes'] = counters[0].text
        if len(counters) > 1:
            post_data['No. of Comments'] = counters[1].text

        # Appending the post_data dict to the list
        posts.append(post_data)

    # Create a DataFrame from the list of dictionaries after the loop
    return pd.DataFrame(posts, columns=columns)

In [None]:
# Parse the captured search-results HTML into a DataFrame of posts.
df = scrape_reddit_search(page_content)

In [None]:
# The search page has been captured; end this browser session.
driver.quit()

In [None]:
# Order the posts from newest to oldest (smallest age first; ascending is the default).
df = df.sort_values('Days Old')

In [None]:
# Convert the scraped comment counts from text to integers so they can be summed and compared.
# NOTE(review): astype(int) raises on missing or non-numeric values — confirm every row holds a plain integer string.
df['No. of Comments'] = df['No. of Comments'].astype(int)

In [None]:
# Total number of comments across posts younger than 60 days.
df.loc[df['Days Old'] < pd.Timedelta(days=60), 'No. of Comments'].sum()

In [None]:
# Keep only the posts younger than 60 days.
df = df.loc[df['Days Old'] < pd.Timedelta(days=60)]

In [None]:
# Renumber the rows 0..n-1 and discard the old row labels.
df = df.reset_index(drop=True)

In [None]:
# Discard posts that have no comments at all.
df = df[df['No. of Comments'].ne(0)]

In [None]:
# Rank posts by comment count, most-discussed first; reset_index() keeps the
# previous row labels in a new 'index' column.
df = (
    df.sort_values('No. of Comments', ascending=False)
      .reset_index()
)

In [None]:
# Remove the 'index' column left behind by the preceding reset_index().
# inplace=True mutates df directly, so re-running only this cell raises KeyError once the column is gone.
df.drop(columns=['index'], inplace=True)

In [None]:
# Shortlist the posts with more than 10 comments.
df = df[df['No. of Comments'].gt(10)]

In [None]:
def auto_scroll(driver, pause=5, max_scrolls=None):
    """Scroll to the bottom of the page repeatedly until it stops growing.

    Args:
        driver: Object exposing ``execute_script(script)`` (a Selenium WebDriver).
        pause: Seconds to wait after each scroll so newly requested content
            can load before the height is measured again. Defaults to 5.
        max_scrolls: Optional upper bound on the number of scrolls. ``None``
            (the default) keeps scrolling until the page height stops changing,
            which may never happen on an endlessly growing page.

    Returns:
        None.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    scrolls_done = 0
    while max_scrolls is None or scrolls_done < max_scrolls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        scrolls_done += 1
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        # An unchanged height after the pause is taken to mean nothing more loaded.
        if new_height == last_height:
            break
        last_height = new_height

In [None]:
# Start a new Chrome session for the individual post pages (the earlier session was quit).
driver = webdriver.Chrome()

In [None]:
# Capture the HTML of whatever page the browser is currently showing.
# NOTE(review): no driver.get() precedes this cell, so on a top-to-bottom run the new session
# has not navigated anywhere — presumably a post was opened by hand first; confirm.
page_content = driver.page_source

In [None]:
# Collect the shortlisted post URLs as a plain list.
urls = df['URL'].tolist()

In [None]:
# Define a function that takes page content as input and outputs dataframe with scraped data.
def scrape_post(page_content):
    """Extract the top-level comments from the HTML of a Reddit post page.

    Args:
        page_content: HTML source of the post page (e.g. ``driver.page_source``).

    Returns:
        pandas.DataFrame with the columns 'Author', 'Text', 'Timestamp',
        'Score' and 'URL', one row per ``shreddit-comment`` element whose
        ``depth`` attribute is '0'. A field that cannot be found is filled
        with 'N/A'. The columns are present even when no comment is found.
    """
    missing = 'N/A'
    columns = ['Author', 'Text', 'Timestamp', 'Score', 'URL']

    # Parsing page content as BeautifulSoup object
    soup = BeautifulSoup(page_content, 'html.parser')

    # Create a BeautifulSoup ResultSet with data for all the comments to be scraped
    comments_raw = soup.find_all('shreddit-comment')

    # Filtering for top-level comments only; .get() skips elements that carry
    # no depth attribute instead of raising KeyError.
    comments_depth_0 = [comment for comment in comments_raw if comment.get('depth') == '0']

    # Create an empty list which would be appended with all comments
    comments = []

    # Iterate on the filtered comments to extract data for each one
    for comment in comments_depth_0:
        # Start from a fully populated record so every row has every column.
        comment_data = dict.fromkeys(columns, missing)

        # Getting the author
        author_element = comment.find('a', class_='truncate font-bold text-neutral-content-strong text-12 hover:underline')
        if author_element is not None:
            comment_data['Author'] = author_element.text.strip()

        # Getting the Text (only when the body holds at least one paragraph)
        text_element = comment.find('div', class_='md text-14 rounded-[8px] pb-2xs')
        if text_element is not None and text_element.find('p') is not None:
            comment_data['Text'] = text_element.text.strip()

        # Getting Timestamp
        time_element = comment.find('time')
        if time_element is not None:
            comment_data['Timestamp'] = time_element.get('datetime', missing)

        # Getting score
        comment_data['Score'] = comment.get('score', missing)

        # Getting URL
        permalink = comment.get('permalink')
        if permalink is not None:
            comment_data['URL'] = 'https://www.reddit.com' + permalink

        # Appending the comment_data dict to the list
        comments.append(comment_data)

    # Create a DataFrame from the list of dictionaries after the loop
    return pd.DataFrame(comments, columns=columns)

In [None]:
# Scrape the top-level comments of the single page captured above (presumably a trial run before the batch loop).
df1 = scrape_post(page_content)

In [None]:
# Define a function that takes the dataframe and filename as input and writes an excel file to disk
def write_to_disk(df, filename, out_dir='C:\\Users\\BINARY COMPUTERS\\Documents'):
    """Write a DataFrame to ``{out_dir}\\{filename}.xlsx`` on sheet 'Sheet1'.

    Args:
        df: DataFrame to export.
        filename: File name without the '.xlsx' extension.
        out_dir: Directory to write into. Defaults to the location this
            notebook originally hard-coded; pass another directory to run it
            on a different machine.

    Returns:
        None. Prints the path of the written file.
    """
    out_path = f'{out_dir}\\{filename}.xlsx'
    # The context manager saves and closes the workbook even if to_excel
    # raises; ExcelWriter.save() no longer exists in pandas 2.x.
    with pd.ExcelWriter(out_path, engine='xlsxwriter') as writer:
        df.to_excel(writer, sheet_name='Sheet1')
    print(f"Write Complete. You can access the file at {out_path}")

In [None]:
# Export the single-post comment table under the file name 'DF URL 18'.
write_to_disk(df1, 'DF URL 18')

In [None]:
# Visit every shortlisted post, scrape its top-level comments and export one Excel file per post.
for url in urls:
    driver.get(url)
    # Scroll until the page stops growing, then snapshot the HTML.
    auto_scroll(driver)
    page_content = driver.page_source
    # NOTE: this rebinds the global `df`; the search-results table is replaced by this post's comments.
    df = scrape_post(page_content)
    # Seventh '/'-separated piece of the URL — presumably the post id for URLs shaped like
    # https://www.reddit.com/r/{sub}/comments/{id}/...; verify. Shorter URLs raise IndexError.
    filename = url.split('/')[6]
    write_to_disk(df, filename)

In [None]:
# All post pages have been processed; end this browser session.
driver.quit()

In [None]:
# NOTE(review): after the loop above, `df` is the comment table of the last post, whose 'URL'
# values scrape_post already prefixed with 'https://www.reddit.com' — this prepends the prefix
# a second time. Looks like leftover scratch work; confirm before relying on it.
df['URL'] = 'https://www.reddit.com' + df['URL']

In [None]:
# Parse the most recently captured HTML (presumably for ad-hoc inspection in the cells below).
soup = BeautifulSoup(page_content, 'html.parser')

In [None]:
# Take the first matching container's time tag and parse the calendar date
# (the part before 'T') from its datetime attribute.
date_obj = datetime.strptime(
    soup.find('div', class_='w-full flex flex-col items-start min-w-0')
        .find('time')['datetime']
        .split('T')[0],
    '%Y-%m-%d',
).date()

In [None]:
# Age of that date as a timedelta: today's local date minus date_obj.
datetime.now().date() - date_obj