## import

In [1]:
!pip install selenium

from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
from time import sleep
import json, os




## instantiate webdriver

In [2]:
## instantiate the Chrome webdriver that the following cells use to load pages
## check the version of Google Chrome and download the matching version of chromedriver
driver = webdriver.Chrome()

In [3]:
## open the "social grep" search page, which lists old posts of a subreddit
## e.g. https://socialgrep.com/search?query=%2Fr%2FLanguageTechnology%2Cafter%3A2010-01-01&order_by=oldest
## original reddit url = 'https://www.reddit.com/r/xxxxxxxxx/'

## settings
subreddit = 'mercedes_benz' # choose by yourself
start_date = '2010-01-01' # choose by yourself
repeat_time, waiting_time = 4, 2 # number of scrolls, seconds to wait after each scroll

url = f'https://socialgrep.com/search?query=%2Fr%2F{subreddit}%2Cafter%3A{start_date}&order_by=oldest'
driver.get(url)

## scroll to the bottom of the page `repeat_time` times, waiting after each scroll
for _step in range(repeat_time):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    sleep(waiting_time)

## example of one post

In [4]:
## function to scrape
def get_content(post, subreddit):
    """Extract vote, title, text and date from one result card.

    Parameters
    ----------
    post : object
        One card element. It must expose `.a` (the first link, or None) and
        `.select_one(css_selector)`; in this notebook it is a BeautifulSoup
        tag selected with `div.card-body`.
    subreddit : str
        Name of the subreddit being scraped. Used only to recognise the card
        whose title is just "/r/<subreddit>" and that has no text.

    Returns
    -------
    dict or None
        `{"vote": int, "title": str, "text": str or None, "date": str}`,
        or None when the card should be skipped: it has no link/title, its
        date cannot be read, or it is the bare "/r/<subreddit>" card.
    """
    ## title: a card without a link is not a post
    link = post.a
    if link is None:
        return None
    title = link.text

    ## vote: default to 0 when the counter is missing or not a number
    try:
        vote = int(post.select_one('span.text-info').text)
    except (AttributeError, TypeError, ValueError):
        vote = 0

    ## text: None when there is no body or the body is empty
    body = post.select_one('div.post_content')
    text = body.get_text(separator='\n').strip() if body is not None else None
    if not text:
        text = None

    ## date: second comma-separated field of the subtitle.
    ## A card without a readable date is skipped instead of raising, because
    ## the scraping loop uses the date of the last stored post to build the
    ## next query.
    try:
        date = post.select_one('h6.card-subtitle').text.split(',')[1].strip()
    except (AttributeError, IndexError):
        return None

    ## skip the card that only names the subreddit itself
    if text is None and title == f"/r/{subreddit.lower()}":
        return None

    return {
        "vote" : vote,
        "title" : title,
        "text" : text,
        "date" : date
    }

In [5]:
## parse the rendered page; the parser is named explicitly so the result does
## not depend on which optional parsers happen to be installed
soup = BeautifulSoup(driver.page_source, 'html.parser')
posts = soup.select('div.card-body') # content is under here

get_content(posts[1], subreddit) # show one example

{'vote': 1,
 'title': 'Anybody good at CSS/Stylesheet?',
 'text': '[deleted]',
 'date': '2012-01-29'}

# scraping loop: load one page per iteration, advancing the start date

In [6]:
## load previously scraped posts, if any, so scraping can resume
if os.path.exists(f'{subreddit}.json'):
    with open(f'{subreddit}.json', 'r', encoding = 'utf8') as f:
        scraped_data = json.load(f)
else:
    ## if the file does not exist, start with a new list
    scraped_data = []

## resume from the date of the last stored post; when nothing is stored yet
## (no file, or a file holding an empty list) `url` is left unchanged
if scraped_data:
    new_date = scraped_data[-1]['date']
    url = f'https://socialgrep.com/search?query=%2Fr%2F{subreddit}%2Cafter%3A{new_date}&order_by=oldest'

In [11]:
## scrape page by page and append to `scraped_data`
## RUN THIS CELL AGAIN if it was interrupted or reached the page cap below;
## it stops by itself once a page brings nothing new

previous_page = None # posts parsed from the previous page load

for _ in tqdm(range(3788)): # upper bound on page loads per run

    ## load the page, then scroll to the bottom and wait
    driver.get(url)
    for _scroll in range(4):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep(2)

    ## get HTML (explicit parser: result does not depend on installed parsers)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    posts = soup.select('div.card-body')

    ## parse each post, skipping cards that `get_content` rejects
    page_posts = []
    for post in posts:
        one_post_dict = get_content(post, subreddit)
        if one_post_dict is not None:
            page_posts.append(one_post_dict)

    ## stop when the page has no usable posts, or is identical to the previous
    ## one: the `after:` date did not advance, so every further iteration
    ## would only re-append the same posts
    if not page_posts or page_posts == previous_page:
        break
    previous_page = page_posts
    scraped_data.extend(page_posts)

    ## save to json (indent=0: one item per line, no leading spaces)
    with open(f'{subreddit}.json', 'w', encoding ='utf8') as f:
        json.dump(scraped_data, f, indent=0, ensure_ascii=False)

    ## set new date: the next query starts from the last stored post
    new_date = scraped_data[-1]['date']
    url = f'https://socialgrep.com/search?query=%2Fr%2F{subreddit}%2Cafter%3A{new_date}&order_by=oldest'


  2%|█▍                                                                           | 69/3788 [11:22<10:13:17,  9.89s/it]


KeyboardInterrupt: 

## load into a DataFrame and drop duplicates

In [12]:
## read the scraped posts back from disk, then remove exact duplicate rows
## (the original row index is kept)
df = pd.read_json(f'{subreddit}.json')
df = df.drop_duplicates()
df

Unnamed: 0,vote,title,text,date
0,0,Mighty V12 G-Class Closer To Showrooms!,,2012-01-29
1,1,Anybody good at CSS/Stylesheet?,[deleted],2012-01-29
2,11,Has anyone been to the Mercedes Benz Museum in...,,2012-01-30
3,2,I want this: Gold plated Mercedes-Benz,,2012-01-30
4,5,"190e 16V, any owners out there?",,2012-01-31
...,...,...,...,...
24229,5,found something cool in a lot it’s for sale to...,,2023-11-04
24230,1,Does anyone know what this part is?,Found 3 of these in my 2023 GLE450,2023-11-04
24231,1,Need Help with Value of 2017 C300 4MATIC,"Someone is asking $29,000 for this 2017 C300 w...",2023-11-04
24232,1,guys i need your opinion on this,[deleted],2023-11-04


In [13]:
## count missing values per column
df.isna().sum()

vote         0
title        0
text     12463
date         0
dtype: int64

In [14]:
## rows whose text is exactly the placeholder [removed] or [deleted]
placeholder_mask = df['text'].isin(['[removed]', '[deleted]'])
df[placeholder_mask]

Unnamed: 0,vote,title,text,date
1,1,Anybody good at CSS/Stylesheet?,[deleted],2012-01-29
14,4,"Now you see it, now you don’t: Mercedes intere...",[deleted],2012-03-05
24,1,Mercedes 711D. Anyone else in Canada own one o...,[removed],2012-03-25
28,11,I see this (AA) badge on old Mercedes cars a l...,[deleted],2012-04-05
38,4,Buying a 2001 Mercedes,[deleted],2012-05-01
...,...,...,...,...
24202,2,Glc63s unlimited lunch control?,[deleted],2023-07-05
24211,1,Whats that part,[deleted],2023-07-24
24214,1,2023 CLA 250 blowby,[deleted],2023-10-23
24217,1,How to remove Speed Limiter,[removed],2023-11-03
