In [2]:
# Import the necessary libraries
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
import time
from datetime import datetime

In [3]:
# Objective: Scrape posts from a Reddit Search.

# Running Selenium webdriver with target URL
driver = webdriver.Chrome()
driver.get('https://www.reddit.com/search/?q=gpt+store&type=link&cId=0319bff9-d7ff-451d-8b06-0fd060649b07&iId=d5d9a0d6-65bf-4715-8765-1353b0982e89')
driver.set_page_load_timeout(10)

In [4]:
# Import page content. Make sure to scroll the page as much as needed before executing this command.
page_content = driver.page_source

In [5]:
# Define a function that takes page content as input and outputs dataframe with scraped data.
def scrape_reddit_search(page_content):
    
    # Parsing page content as BeautifulSoup object
    soup = BeautifulSoup(page_content, 'html.parser')
    
    # Create a BeautifulSoup ResultSet with data for all the posts to be scraped
    posts_raw = soup.find_all('div', class_='w-full flex flex-col items-start min-w-0')
    
    # Create an empty list which would be appended with all posts
    posts = []
    
    # Iterate on the ResultSet to extract data for individual posts
    for post in posts_raw:
        post_data = {}
        
        # Using .get() method to safely access attributes and find() to handle None cases
        
        # Getting the Subreddit
        subreddit_element = post.find('a', class_='flex items-center text-neutral-content-weak font-semibold')
        if subreddit_element:
            post_data['Subreddit'] = subreddit_element.text.strip() 
        else:
            'N/A'
            
        # Getting the Text
        text_element = post.find('a', class_='text-16 xs:text-18 line-clamp-3 text-ellipsis text-neutral-content-strong font-semibold mb-xs no-underline hover:no-underline visited:text-neutral-content-weak')
        if text_element:
            post_data['Text'] = text_element.text.strip() 
        else:
            'N/A'
            
        # Getting timestamp
        time_element = post.find('time')
        if time_element:
            post_data['Timestamp'] = time_element.text
        else:
            'N/A'
            
        # No. of Days Old
        if time_element:
            date_obj = datetime.strptime(time_element['datetime'].split('T')[0], '%Y-%m-%d').date()
            post_data['Days Old'] = datetime.now().date() - date_obj
        else:
            'N/A'
             
        # Getting No. of Votes 
        vote_element = post.find('div', class_='text-neutral-content-weak text-12').find_all('faceplate-number')[0]
        if vote_element:
            post_data['No. of Votes'] = vote_element.text
        else:
            'N/A'
            
        # Getting No. of Comments
        comment_element = post.find('div', class_='text-neutral-content-weak text-12').find_all('faceplate-number')[1]
        if comment_element:
            post_data['No. of Comments'] = comment_element.text
        else:
            'N/A'
            
        # Getting the URL
        if text_element:
            post_data['URL'] = 'https://www.reddit.com' + text_element['href']
        else:
            'N/A'
        
        # Appending the post_data dict to the list
        posts.append(post_data)

    # Create a DataFrame from the list of dictionaries after the loop
    df = pd.DataFrame(posts)
    return df

In [6]:
df = scrape_reddit_search(page_content)

In [22]:
driver.quit()

In [7]:
df = df.sort_values(by='Days Old', ascending=True)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119 entries, 54 to 47
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype          
---  ------           --------------  -----          
 0   Subreddit        119 non-null    object         
 1   Text             119 non-null    object         
 2   Timestamp        119 non-null    object         
 3   Days Old         119 non-null    timedelta64[ns]
 4   No. of Votes     119 non-null    object         
 5   No. of Comments  119 non-null    object         
 6   URL              119 non-null    object         
dtypes: object(6), timedelta64[ns](1)
memory usage: 7.4+ KB


In [9]:
df['No. of Comments'] = df['No. of Comments'].astype(int)

In [10]:
df[df['Days Old'] < pd.Timedelta(days=60)]['No. of Comments'].sum()

823

In [11]:
df = df[df['Days Old'] < pd.Timedelta(days=60)]

In [12]:
df = df.reset_index().drop(columns=['index'])

In [13]:
df = df[df['No. of Comments'] != 0]

In [14]:
df

Unnamed: 0,Subreddit,Text,Timestamp,Days Old,No. of Votes,No. of Comments,URL
0,r/GPTStore,Make your GPT stand out in the GPT Store with ...,9h ago,1 days,0,1,https://www.reddit.com/r/GPTStore/comments/1b9...
1,r/ChatGPT,is the chatGPT store dead?,10d ago,11 days,5,2,https://www.reddit.com/r/ChatGPT/comments/1b1k...
2,r/GPTsIdeas,Updates for GPTs and the GPT Store.,14d ago,15 days,3,1,https://www.reddit.com/r/GPTsIdeas/comments/1a...
3,r/ChatGPT,Updates for GPTs and the GPT Store.,14d ago,15 days,1,1,https://www.reddit.com/r/ChatGPT/comments/1aya...
4,r/ChatGPT,Do GPTs from the GPT store also follow the cus...,20d ago,21 days,0,3,https://www.reddit.com/r/ChatGPT/comments/1asw...
5,r/ChatGPTPromptGenius,Early Problems with the GPT Store,24d ago,25 days,4,1,https://www.reddit.com/r/ChatGPTPromptGenius/c...
6,r/ChatGPTPro,False reports by competitors in GPT store,26d ago,26 days,8,3,https://www.reddit.com/r/ChatGPTPro/comments/1...
7,r/ChatGPT,What are your picks for the best GPTs in ChatG...,1mo ago,31 days,1,2,https://www.reddit.com/r/ChatGPT/comments/1al2...
8,r/GPTStore,Gpt store,1mo ago,32 days,6,17,https://www.reddit.com/r/GPTStore/comments/1ak...
10,r/GPTStore,my Gpt's in the Gpt store,1mo ago,33 days,3,4,https://www.reddit.com/r/GPTStore/comments/1aj...


In [15]:
df = df.sort_values(by='No. of Comments', ascending=False).reset_index()

In [16]:
df.drop(columns=['index'], inplace=True)

In [17]:
df = df[df['No. of Comments'] > 10]

In [20]:
df

Unnamed: 0,Subreddit,Text,Timestamp,Days Old,No. of Votes,No. of Comments,URL
0,r/OpenAI,Am I the only one that doesn't understand the ...,2mo ago,57 days,187,160,https://www.reddit.com/r/OpenAI/comments/194uw...
1,r/ChatGPT,GPT store is actually pretty shit,2mo ago,58 days,468,69,https://www.reddit.com/r/ChatGPT/comments/194e...
2,r/OpenAI,In your opinion what is currently missing in t...,2mo ago,58 days,19,67,https://www.reddit.com/r/OpenAI/comments/1947i...
3,r/OpenAI,"I really want the GPT Store to succeed, but at...",2mo ago,50 days,82,59,https://www.reddit.com/r/OpenAI/comments/19aj5...
4,r/OpenAI,First impressions of the GPT store?,2mo ago,58 days,60,58,https://www.reddit.com/r/OpenAI/comments/193wb...
5,r/GPTStore,"The Futility of ""Securing"" Prompts in the GPT ...",2mo ago,58 days,28,39,https://www.reddit.com/r/GPTStore/comments/194...
6,r/singularity,Why the GPT store is on the path to AGI and wh...,2mo ago,59 days,39,36,https://www.reddit.com/r/singularity/comments/...
7,r/GPTStore,My Honest Review of the GPT Store,1mo ago,44 days,18,30,https://www.reddit.com/r/GPTStore/comments/19f...
8,r/singularity,Introducing the GPT Store,2mo ago,59 days,97,27,https://www.reddit.com/r/singularity/comments/...
9,r/GPTStore,How do you guys feel about the GPT store?,2mo ago,58 days,10,24,https://www.reddit.com/r/GPTStore/comments/193...


In [21]:
df['URL'][0]

'https://www.reddit.com/r/OpenAI/comments/194uwn6/am_i_the_only_one_that_doesnt_understand_the/'

In [129]:
df['URL'][18].split('/')[6]

'192cc9k'

In [114]:
def auto_scroll(driver):
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)  # Adjust sleep time as necessary
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

In [86]:
driver = webdriver.Chrome()

In [137]:
driver.get(df['URL'][18])

KeyError: 18

In [133]:
page_content = driver.page_source

In [118]:
urls = [i for i in df['URL']]

In [119]:
len(urls)

19

In [26]:
# Define a function that takes page content as input and outputs dataframe with scraped data.
def scrape_post(page_content):
    
    # Parsing page content as BeautifulSoup object
    soup = BeautifulSoup(page_content, 'html.parser')
    
    # Create a BeautifulSoup ResultSet with data for all the posts to be scraped
    comments_raw = soup.find_all('shreddit-comment')
    
    # Filtering for top-level-comments only
    comments_depth_0 = [comment for comment in comments_raw if comment['depth']=='0']
    
    # Create an empty list which would be appended with all comments
    comments = []
    
    # Iterate on the ResultSet to extract data for individual posts
    for comment in comments_depth_0:
        comment_data = {}
        
        # Getting the author
        author_element = comment.find('a', class_='truncate font-bold text-neutral-content-strong text-12 hover:underline')
        if author_element:
            comment_data['Author'] = author_element.text.strip()
        else:
            'N/A'
        
        # Getting the Text
        text_element = comment.find('div', class_='md text-14 rounded-[8px] pb-2xs')
        if text_element and text_element.find('p'):
            comment_data['Text'] = text_element.text.strip()
        else:
            'N/A'
                    
        # Geting Timestamp
        if comment.find('time'):
            comment_data['Timestamp'] = comment.find('time')['datetime']
        else:
            'N/A'
                    
        # Getting score
        try:
            comment_data['Score'] = comment['score']
        except Exception as e:
            comment_data['Score'] = 'N/A'
            
        # Getting URL
        try:
            comment_data['URL'] = 'https://www.reddit.com' + comment['permalink']
        except Exception as e:
            comment_data['URL'] = 'N/A'
            
        # Appending the comment_data dict to the list
        comments.append(comment_data)
        
    # Create a DataFrame from the list of dictionaries after the loop
    df = pd.DataFrame(comments)
    return df

In [134]:
df1 = scrape_post(page_content)

In [135]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Author     8 non-null      object
 1   Text       8 non-null      object
 2   Timestamp  8 non-null      object
 3   Score      8 non-null      object
 4   URL        8 non-null      object
dtypes: object(5)
memory usage: 448.0+ bytes


In [37]:
# Define a function that takes the dataframe and filename as input and writes an excel file to disk
def write_to_disk(df, filename):
    out_path = f'C:\\Users\\BINARY COMPUTERS\\Documents\\{filename}.xlsx'
    writer = pd.ExcelWriter(out_path , engine='xlsxwriter')
    df.to_excel(writer, sheet_name='Sheet1')
    writer.save()
    print(f"Write Complete. You can access the file at {out_path}")

In [136]:
write_to_disk(df1, 'DF URL 18')

Write Complete. You can access the file at C:\Users\BINARY COMPUTERS\Documents\DF URL 18.xlsx


In [130]:
for url in urls:
    driver.get(url)
    auto_scroll(driver)
    page_content = driver.page_source
    df = scrape_post(page_content)
    filename = url.split('/')[6]
    write_to_disk(df, filename)

Write Complete. You can access the file at C:\Users\BINARY COMPUTERS\Documents\194uwn6.xlsx
Write Complete. You can access the file at C:\Users\BINARY COMPUTERS\Documents\194ewzx.xlsx
Write Complete. You can access the file at C:\Users\BINARY COMPUTERS\Documents\1947i5u.xlsx
Write Complete. You can access the file at C:\Users\BINARY COMPUTERS\Documents\19aj5n7.xlsx
Write Complete. You can access the file at C:\Users\BINARY COMPUTERS\Documents\193wbhv.xlsx
Write Complete. You can access the file at C:\Users\BINARY COMPUTERS\Documents\194ebyy.xlsx
Write Complete. You can access the file at C:\Users\BINARY COMPUTERS\Documents\1938q95.xlsx
Write Complete. You can access the file at C:\Users\BINARY COMPUTERS\Documents\19f0fyd.xlsx
Write Complete. You can access the file at C:\Users\BINARY COMPUTERS\Documents\193diq7.xlsx
Write Complete. You can access the file at C:\Users\BINARY COMPUTERS\Documents\193xh8r.xlsx
Write Complete. You can access the file at C:\Users\BINARY COMPUTERS\Documents\1

In [138]:
driver.quit()

In [64]:
df.iloc[18]

Subreddit                                                 r/GPTStore
Text                               My Honest Review of the GPT Store
Timestamp                                                    1mo ago
Days Old                                            42 days 00:00:00
No. of Votes                                                      18
No. of Comments                                                   30
URL                https://www.reddit.com/r/GPTStore/comments/19f...
Name: 18, dtype: object

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125 entries, 0 to 124
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype          
---  ------           --------------  -----          
 0   Subreddit        125 non-null    object         
 1   Text             125 non-null    object         
 2   Timestamp        125 non-null    object         
 3   Days Old         125 non-null    timedelta64[ns]
 4   No. of Votes     125 non-null    object         
 5   No. of Comments  125 non-null    object         
 6   URL              125 non-null    object         
dtypes: object(6), timedelta64[ns](1)
memory usage: 7.0+ KB


In [11]:
df['URL'] = 'https://www.reddit.com' + df['URL']

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125 entries, 0 to 124
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Subreddit        125 non-null    object
 1   Text             125 non-null    object
 2   Timestamp        125 non-null    object
 3   No. of Votes     125 non-null    object
 4   No. of Comments  125 non-null    object
 5   URL              125 non-null    object
dtypes: object(6)
memory usage: 6.0+ KB


In [15]:
soup = BeautifulSoup(page_content, 'html.parser')

In [29]:
date_obj = datetime.strptime(soup.find('div', class_='w-full flex flex-col items-start min-w-0').find('time')['datetime'].split('T')[0], '%Y-%m-%d').date()

In [31]:
datetime.now().date() - date_obj

datetime.timedelta(days=56)