In [12]:
!pip install selenium

from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
from time import sleep
import json, os
import numpy as np



In [2]:
## Instantiate the Selenium Chrome driver; the cells below use it to load and scroll pages.
## Check your installed Google Chrome version and download the matching chromedriver first.
driver = webdriver.Chrome()

In [3]:
## Open the "social grep" search page, which gives the old posts of a subreddit.
## e.g. https://socialgrep.com/search?query=%2Fr%2FLanguageTechnology%2Cafter%3A2010-01-01&order_by=oldest
## (the original reddit url would be 'https://www.reddit.com/r/xxxxxxxxx/')

subreddit = 'BMW'          # subreddit to scrape -- choose by yourself
start_date = '2010-01-01'  # only posts after this date are requested -- choose by yourself

## number of scrolls per page, and seconds to pause after each scroll
repeat_time = 4
waiting_time = 2

url = f'https://socialgrep.com/search?query=%2Fr%2F{subreddit}%2Cafter%3A{start_date}&order_by=oldest'
driver.get(url)

## jump to the bottom of the page, pause, and repeat
## (presumably each scroll lets the page load more results -- confirm against the site)
for scroll_no in range(repeat_time):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    sleep(waiting_time)

In [4]:
## function to scrape
def get_content(post, subreddit):
    """Extract vote, title, text and date from one search-result card.

    Parameters
    ----------
    post : bs4.element.Tag (or any object with the same ``select_one`` / ``.a`` API)
        One ``div.card-body`` element of the result page.
    subreddit : str
        Subreddit name; used to recognise cards whose link text is only
        ``/r/<subreddit>`` (lower-cased).

    Returns
    -------
    dict or None
        ``{"vote": int, "title": str, "text": str or None, "date": str}``.
        ``None`` is returned when the card has no link, when no date can be
        read from its subtitle, or when it has no text and its title is just
        ``/r/<subreddit>``.
    """
    # A missing or non-numeric score is recorded as 0.
    try:
        vote = int(post.select_one('span.text-info').text)
    except (AttributeError, TypeError, ValueError):
        vote = 0

    # A card without a link is not a post.
    try:
        title = post.a.text
    except AttributeError:
        return None

    # The body is optional; an absent or whitespace-only body becomes None.
    try:
        text = post.select_one('div.post_content').get_text(separator='\n').strip()
    except AttributeError:
        text = None
    else:
        text = text or None

    # The date is the part after the first comma of the subtitle. A card
    # without it is skipped instead of raising, so one malformed card cannot
    # abort a long scraping run (callers already ignore None results).
    try:
        date = post.select_one('h6.card-subtitle').text.split(',')[1].strip()
    except (AttributeError, IndexError):
        return None

    # Body-less cards titled "/r/<subreddit>" are not posts.
    if text is None and title == f"/r/{subreddit.lower()}":
        return None

    return {
        "vote" : vote,
        "title" : title,
        "text" : text,
        "date" : date
    }

In [5]:
## Parse the page currently loaded in the browser.
## The parser is named explicitly: without it bs4 picks whichever parser happens to be
## installed (and warns), so results could differ between machines.
soup = BeautifulSoup(driver.page_source, 'html.parser')
posts = soup.select('div.card-body') # content is under here

get_content(posts[1], subreddit) # show one example

{'vote': 9,
 'title': 'V10 E30 smokes a V8 M3',
 'text': None,
 'date': '2010-01-08'}

In [6]:
if os.path.exists(f'{subreddit}.json'):
    ## resume scraping from the last date in the json file
    with open(f'{subreddit}.json', 'r', encoding = 'utf8') as f:
        scraped_data = json.load(f)
    if scraped_data:
        new_date = scraped_data[-1]['date']
        url = f'https://socialgrep.com/search?query=%2Fr%2F{subreddit}%2Cafter%3A{new_date}&order_by=oldest'
    ## an empty file has no last date to resume from, so `url` keeps its current value
else:
    ## if the file not exists, create a new list
    scraped_data = []

In [7]:
## scrape and append to `scraped_data`
## RUN THIS CELL AGAIN AND AGAIN until getting the latest post

max_rounds = 3864        # upper bound on page loads for one run of this cell
max_stalled_rounds = 3   # stop after this many consecutive page loads without a new post

## Every query restarts from the date of the last stored post, so a page may contain posts
## that are already stored (the later `drop_duplicates()` suggests it does). Remember what
## is stored so each post is appended once and a round without progress can be detected.
seen_posts = {(p['vote'], p['title'], p['text'], p['date']) for p in scraped_data}
stalled_rounds = 0

for _ in tqdm(range(max_rounds)):

    ## load the page, then scroll to the bottom and wait (same settings as the first load)
    driver.get(url)
    for scroll_no in range(repeat_time):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep(waiting_time)

    ## get HTML (explicit parser: no bs4 warning, same parser on every machine)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    posts = soup.select('div.card-body')

    ## iterate each post, keeping only the ones not stored yet
    new_posts = 0
    for post in posts:
        one_post_dict = get_content(post, subreddit)
        if one_post_dict is None:
            continue
        post_key = (one_post_dict['vote'], one_post_dict['title'],
                    one_post_dict['text'], one_post_dict['date'])
        if post_key in seen_posts:
            continue
        seen_posts.add(post_key)
        scraped_data.append(one_post_dict)
        new_posts += 1

    ## nothing new: retry the same url a few times (the page may simply have failed to
    ## load), then stop instead of requesting the same page for the remaining rounds
    if new_posts == 0:
        stalled_rounds += 1
        if stalled_rounds >= max_stalled_rounds:
            print(f'No new posts after {max_stalled_rounds} consecutive page loads; stopping.')
            break
        continue
    stalled_rounds = 0

    ## save to json after every productive round so an interrupted run loses little
    ## (indent=0 writes the same layout as the previous indent=False)
    with open(f'{subreddit}.json', 'w', encoding ='utf8') as f:
        json.dump(scraped_data, f, indent=0, ensure_ascii=False)

    ## set new date (scraped_data is non-empty here because new_posts > 0)
    new_date = scraped_data[-1]['date']
    url = f'https://socialgrep.com/search?query=%2Fr%2F{subreddit}%2Cafter%3A{new_date}&order_by=oldest'


  1%|▌                                                                            | 27/3864 [04:20<10:17:38,  9.66s/it]


KeyboardInterrupt: 

In [8]:
## load everything scraped so far from the json file, then discard exact duplicate rows
df = pd.read_json(f'{subreddit}.json')
df = df.drop_duplicates()
df

Unnamed: 0,vote,title,text,date
0,6,The only word for this is epic - V10 swapped i...,[deleted],2010-01-07
1,9,V10 E30 smokes a V8 M3,,2010-01-08
2,8,BMW three-turbo diesel engine,,2010-01-18
3,3,BMW M Sauber,Drop that F1-inspired 500 horsepower V10 into ...,2010-01-31
4,3,Anyone here got a 135i?,I'm thinking of purchasing one in the next few...,2010-02-11
...,...,...,...,...
31472,5,"Since Front End Friday is a thing, I guess it'...",,2018-11-30
31473,14,One week after delivery very impressed with th...,,2018-11-30
31474,31,Frontend Friday,,2018-11-30
31475,8,"Garaged the E36, 335xi gets Winter Beater status",[deleted],2018-11-30


In [9]:
## number of missing values in each column
df.isnull().sum()

vote         0
title        0
text     17026
date         0
dtype: int64

In [13]:
## rows whose text is missing (None/NaN), empty, or a placeholder: [removed] [deleted]
## `np.NaN` no longer exists in NumPy 2.0, so missing values are matched with `isna()`;
## this selects the same rows on every NumPy version.
df[df['text'].isna() | df['text'].isin(['[removed]', '[deleted]', ''])]

Unnamed: 0,vote,title,text,date
0,6,The only word for this is epic - V10 swapped i...,[deleted],2010-01-07
1,9,V10 E30 smokes a V8 M3,,2010-01-08
2,8,BMW three-turbo diesel engine,,2010-01-18
5,0,BMW.co.uk Blog - The new X5,,2010-02-15
6,2,Fellow member launches Euro Car site with Give...,[deleted],2010-02-18
...,...,...,...,...
31472,5,"Since Front End Friday is a thing, I guess it'...",,2018-11-30
31473,14,One week after delivery very impressed with th...,,2018-11-30
31474,31,Frontend Friday,,2018-11-30
31475,8,"Garaged the E36, 335xi gets Winter Beater status",[deleted],2018-11-30
