In [1]:
# Import the necessary libraries
import os
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

In [2]:
def auto_scroll(driver, pause=5, max_scrolls=None):
    """Scroll to the bottom of the page until its height stops growing.

    Args:
        driver: Object exposing ``execute_script(script)`` (a Selenium
            WebDriver in this notebook).
        pause: Seconds to wait after each scroll before re-measuring the
            page height. Defaults to 5, the value previously hard-coded.
        max_scrolls: Optional upper bound on the number of scrolls. ``None``
            (the default) keeps scrolling until the height is stable, which
            never terminates on a page that keeps growing.

    Returns:
        None.
    """
    height_script = "return document.body.scrollHeight"
    last_height = driver.execute_script(height_script)
    scrolls = 0
    while max_scrolls is None or scrolls < max_scrolls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        scrolls += 1
        # Give the page time to append new content before measuring again.
        time.sleep(pause)
        new_height = driver.execute_script(height_script)
        if new_height == last_height:
            # Height unchanged after a scroll: nothing more was loaded.
            break
        last_height = new_height

In [7]:
# Objective: Scrape comments from a given post.

# Start a Chrome session controlled by Selenium.
# NOTE(review): no driver.get(...) call appears in this cell, so the post to
# scrape has to be opened in this browser window before scrolling. Confirm how
# the target URL is meant to be loaded.
driver = webdriver.Chrome()

# Keep scrolling to the bottom until the page height stops changing, so that
# content loaded on scroll (presumably the comment list) is rendered.
auto_scroll(driver)

In [5]:
# Run the scroll loop again on the same driver, e.g. if more content became
# available after the previous pass stopped.
auto_scroll(driver)

In [6]:
# NOTE(review): this ends the browser session, yet the following cell reads
# driver.page_source. On a top-to-bottom run that read would target a closed
# session. The execution counts (In [6] here vs. In [7] where the driver is
# created) suggest this cell belonged to an earlier session. Consider removing it.
driver.quit()

In [8]:
# Capture the HTML of the page as currently held by the browser session.
# Make sure to scroll the page as much as needed before executing this command,
# and that the driver has not been quit yet.
page_content = driver.page_source

In [9]:
# Define a function that takes page content as input and outputs dataframe with scraped data.
def scrape_post(page_content):
    """Parse a post's HTML and return its top-level comments as a DataFrame.

    Args:
        page_content: HTML source of the post page as a string.

    Returns:
        pandas.DataFrame with one row per ``<shreddit-comment>`` element whose
        ``depth`` attribute is ``'0'`` and the columns ``Author``, ``Text``,
        ``Timestamp``, ``Score`` and ``URL``. Any field that cannot be found
        is filled with the string ``'N/A'``. The columns are present even
        when no comment matches.
    """
    columns = ['Author', 'Text', 'Timestamp', 'Score', 'URL']

    # Parsing page content as BeautifulSoup object
    soup = BeautifulSoup(page_content, 'html.parser')

    # All comment elements on the page, at every nesting level
    comments_raw = soup.find_all('shreddit-comment')

    # Filtering for top-level comments only. .get() skips a tag that lacks
    # the depth attribute instead of raising KeyError.
    comments_depth_0 = [comment for comment in comments_raw if comment.get('depth') == '0']

    comments = []
    for comment in comments_depth_0:
        author_element = comment.find('a', class_='truncate font-bold text-neutral-content-strong text-12 hover:underline')
        text_element = comment.find('div', class_='md text-14 rounded-[8px] pb-2xs')
        time_element = comment.find('time')
        permalink = comment.get('permalink')

        # Every key is always assigned, so a missing element yields 'N/A'
        # rather than a silently absent key (NaN in the resulting frame).
        comments.append({
            'Author': author_element.text.strip() if author_element else 'N/A',
            # The text container only counts when it holds at least one <p>
            'Text': (
                text_element.text.strip()
                if text_element and text_element.find('p')
                else 'N/A'
            ),
            'Timestamp': time_element.get('datetime', 'N/A') if time_element else 'N/A',
            'Score': comment.get('score', 'N/A'),
            'URL': 'https://www.reddit.com' + permalink if permalink is not None else 'N/A',
        })

    # Build the frame once from the list of records; explicit columns keep the
    # schema stable for an empty result.
    return pd.DataFrame(comments, columns=columns)

In [10]:
# Parse the captured page source into a DataFrame of top-level comments.
df = scrape_post(page_content)

In [11]:
# Sanity check of the scrape: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Author     13 non-null     object
 1   Text       13 non-null     object
 2   Timestamp  13 non-null     object
 3   Score      13 non-null     object
 4   URL        13 non-null     object
dtypes: object(5)
memory usage: 648.0+ bytes


In [17]:
# Close the browser session; the page source has already been captured and parsed above.
driver.quit()

In [8]:
# Define a function that takes the dataframe and filename as input and writes an excel file to disk
def write_to_disk(df, filename, out_dir='C:\\Users\\BINARY COMPUTERS\\Documents'):
    """Write a DataFrame to ``<out_dir>/<filename>.xlsx`` and print the path.

    Args:
        df: pandas.DataFrame to export.
        filename: Name of the workbook without the ``.xlsx`` extension.
        out_dir: Directory to write into. Defaults to the location that was
            previously hard-coded, so existing calls behave as before.

    Returns:
        None.
    """
    out_path = os.path.join(out_dir, f'{filename}.xlsx')
    # ExcelWriter.save() was removed in pandas 2.0. The context manager saves
    # and closes the workbook on exit, including when to_excel raises.
    with pd.ExcelWriter(out_path, engine='xlsxwriter') as writer:
        df.to_excel(writer, sheet_name='Sheet1')
    print(f"Write Complete. You can access the file at {out_path}")

In [9]:
# Export the scraped comments to an Excel workbook named 'Reddit Post.xlsx'.
write_to_disk(df, 'Reddit Post')

Write Complete. You can access the file at C:\Users\BINARY COMPUTERS\Documents\Reddit Post.xlsx
