In [1]:
import requests
import praw
import os
import pandas as pd
from sqlalchemy import create_engine
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

CLIENT_ID=os.environ.get('REDDIT_CLIENT_ID')
SECRET_KEY=os.environ.get('REDDIT_SECRET_KEY')
pw = os.environ.get('REDDIT_PW')
username = os.environ.get('REDDIT_USERNAME')

## Reddit Scraping last (x) posts

### If SSL Connection works

In [2]:
#Setting up access to the Reddit API
auth=requests.auth.HTTPBasicAuth(CLIENT_ID, SECRET_KEY)

data={
    'grant_type': 'password',
    'username': username,
    'password': pw
}

headers = {'User-Agent':'MyAPI/0.0.1'}


res=requests.post('https://www.reddit.com/api/v1/access_token',
                  auth=auth, data=data, headers=headers)

TOKEN=res.json()['access_token']

headers['Authorization']=f'bearer {TOKEN}'

reddit=praw.Reddit(user_agent="Get Comments",client_id=CLIENT_ID,client_secret=SECRET_KEY)

# hotpost=requests.get('https://oauth.reddit.com/r/CryptoCurrency/search/?q=Bitcoin&sort=new',
#                  headers=headers, params={'restrict_sr':'1','limit':'100'}).json()

hotpost=requests.get('https://oauth.reddit.com/r/CryptoCurrency?sort=new',
                 headers=headers, params={'restrict_sr':'1','limit':'10'}).json()
# for post in hotpost['data']['children']:
#     print(post['data']['title'])

df_posts_2 = pd.DataFrame()

for post in hotpost['data']['children']:
    submission = reddit.submission(id=post['data']['id'])
    submission.comments.replace_more(limit=None)
    df_posts_2=df_posts_2._append({
        'subreddit':post['data']['subreddit'],
        'title':post['data']['title'],
        'timestamp':post['data']['created_utc'],
        'selftext':post['data']['selftext'],
        'upvote_ratio':post['data']['upvote_ratio'],
        'ups':post['data']['ups'],
        'downs':post['data']['downs'],
        'score':post['data']['score'],
        'id':post['data']['id'],
        'name':post['data']['name']
        }, ignore_index=True)


df_comments = pd.DataFrame()

for post in hotpost['data']['children']:
    submission = reddit.submission(id=post['data']['id'])
    submission.comments.replace_more(limit=None)
    for comment in submission.comments.list():
        df_comments=df_comments._append({
            'subreddit':post['data']['subreddit'],
            'title':post['data']['title'],
            'timestamp':post['data']['created_utc'],
            'selftext':post['data']['selftext'],
            'upvote_ratio':post['data']['upvote_ratio'],
            'ups':post['data']['ups'],
            'downs':post['data']['downs'],
            'score':post['data']['score'],
            'id':post['data']['id'],
            'name':post['data']['name'],
            'comment':comment.body,
            'comment_ups':comment.ups,
            'comment_downs':comment.downs,
            'comment_timestamp':comment.created_utc
            }, ignore_index=True)

### If SSL Error

In [28]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

url = 'https://www.reddit.com/r/CryptoCurrency/new/'

# Create a session with retry mechanism
session = requests.Session()
retry = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[502, 503, 504]
)
adapter = HTTPAdapter(max_retries=retry)
session.mount('https://', adapter)

#Setting up access to the Reddit API
auth=requests.auth.HTTPBasicAuth(CLIENT_ID, SECRET_KEY)

data={
    'grant_type': 'password',
    'username': username,
    'password': pw
}

headers = {'User-Agent':'MyAPI/0.0.1'}

res=session.post('https://www.reddit.com/api/v1/access_token',
                  auth=auth, data=data, headers=headers)

TOKEN=res.json()['access_token']

headers['Authorization']=f'bearer {TOKEN}'

reddit=praw.Reddit(user_agent="Get Comments",client_id=CLIENT_ID,client_secret=SECRET_KEY)

hotpost=session.get('https://oauth.reddit.com/r/CryptoCurrency?sort=new',
                 headers=headers, params={'restrict_sr':'1','limit':'10'}).json()

df_posts_2 = pd.DataFrame()

for post in hotpost['data']['children']:
    submission = reddit.submission(id=post['data']['id'])
    try:
        submission.comments.replace_more(limit=None)
    except:
        pass
    df_posts_2=df_posts_2._append({
        'subreddit':post['data']['subreddit'],
        'title':post['data']['title'],
        'timestamp':post['data']['created_utc'],
        'selftext':post['data']['selftext'],
        'upvote_ratio':post['data']['upvote_ratio'],
        'ups':post['data']['ups'],
        'downs':post['data']['downs'],
        'score':post['data']['score'],
        'id':post['data']['id'],
        'name':post['data']['name']
        }, ignore_index=True)


df_comments = pd.DataFrame()

for post in hotpost['data']['children']:
    submission = reddit.submission(id=post['data']['id'])
    submission.comments.replace_more(limit=None)
    for comment in submission.comments.list():
        df_comments=df_comments._append({
            'subreddit':post['data']['subreddit'],
            'title':post['data']['title'],
            'timestamp':post['data']['created_utc'],
            'selftext':post['data']['selftext'],
            'upvote_ratio':post['data']['upvote_ratio'],
            'ups':post['data']['ups'],
            'downs':post['data']['downs'],
            'score':post['data']['score'],
            'id':post['data']['id'],
            'name':post['data']['name'],
            'comment':comment.body,
            'comment_ups':comment.ups,
            'comment_downs':comment.downs,
            'comment_timestamp':comment.created_utc
            }, ignore_index=True)

### Cleaning up

In [55]:
# Convert timestamp to datetime

df_posts_2['timestamp'] = pd.to_datetime(df_posts_2['timestamp'], unit='s')
df_comments['timestamp'] = pd.to_datetime(df_comments['timestamp'], unit='s')
df_comments['comment_timestamp'] = pd.to_datetime(df_comments['comment_timestamp'], unit='s')

## SQL interaction

In [32]:
engine=create_engine('sqlite:///../data/reddit_comments.db',echo=True)
# sqlite_connection=engine.connect()
sqlite_table='reddit_comments'
df.to_sql(sqlite_table,engine, if_exists='fail')
# sqlite_connection.close()

2024-08-15 18:59:22,054 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-08-15 18:59:22,055 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("reddit_comments")
2024-08-15 18:59:22,056 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-08-15 18:59:22,060 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("reddit_comments")
2024-08-15 18:59:22,061 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-08-15 18:59:22,063 INFO sqlalchemy.engine.Engine ROLLBACK
2024-08-15 18:59:22,066 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-08-15 18:59:22,069 INFO sqlalchemy.engine.Engine 
CREATE TABLE reddit_comments (
	"index" BIGINT, 
	subreddit TEXT, 
	title TEXT, 
	timestamp DATETIME, 
	selftext TEXT, 
	upvote_ratio FLOAT, 
	ups BIGINT, 
	downs BIGINT, 
	score BIGINT, 
	id TEXT, 
	name TEXT, 
	comment TEXT, 
	comment_ups BIGINT, 
	comment_downs BIGINT, 
	comment_timestamp DATETIME
)


2024-08-15 18:59:22,072 INFO sqlalchemy.engine.Engine [no key 0.00318s] ()
2024-08-15 18:59:22,085 INFO sqla

304

In [25]:
engine=create_engine('sqlite:///../data/reddit_comments.db',echo=True)
sqlite_connection=engine.connect()
sql = "select * from reddit_comments"
df = pd.read_sql(sql,engine)

2024-08-15 18:50:41,394 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-08-15 18:50:41,396 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("select * from reddit_comments")
2024-08-15 18:50:41,397 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-08-15 18:50:41,402 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("select * from reddit_comments")
2024-08-15 18:50:41,404 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-08-15 18:50:41,405 INFO sqlalchemy.engine.Engine ROLLBACK


AttributeError: 'OptionEngine' object has no attribute 'execute'

In [22]:
sqlite_connection.close()

2024-08-15 18:46:49,136 INFO sqlalchemy.engine.Engine ROLLBACK


## Trying Pushshifts

https://www.reddit.com/r/pushshift/comments/194k9y4/reddit_dump_files_through_the_end_of_2023/

In [43]:
import requests
import praw
import time

# Authentication: http://praw.readthedocs.io/en/latest/getting_started/authentication.html
reddit = praw.Reddit(client_id=CLIENT_ID, client_secret=SECRET_KEY,
                     password=pw, user_agent='MyAPI/0.0.1',
                     username=username)

def submissions_pushshift_praw(subreddit, start=None, end=None, limit=100, extra_query=""):
    """
    A simple function that returns a list of PRAW submission objects during a particular period from a defined sub.
    This function serves as a replacement for the now deprecated PRAW `submissions()` method.

    :param subreddit: A subreddit name to fetch submissions from.
    :param start: A Unix time integer. Posts fetched will be AFTER this time. (default: None)
    :param end: A Unix time integer. Posts fetched will be BEFORE this time. (default: None)
    :param limit: There needs to be a defined limit of results (default: 100), or Pushshift will return only 25.
    :param extra_query: A query string is optional. If an extra_query string is not supplied,
                        the function will just grab everything from the defined time period. (default: empty string)

    Submissions are yielded newest first.

    For more information on PRAW, see: https://github.com/praw-dev/praw
    For more information on Pushshift, see: https://github.com/pushshift/api
    """
    matching_praw_submissions = []

    # Default time values if none are defined (credit to u/bboe's PRAW `submissions()` for this section)
    utc_offset = 28800
    now = int(time.time())
    start = max(int(start) + utc_offset if start else 0, 0)
    end = min(int(end) if end else now, now) + utc_offset

    # Format our search link properly.
    search_link = ('https://api.pushshift.io/reddit/submission/search/'
                   '?subreddit={}&after={}&before={}&sort_type=score&sort=asc&limit={}&q={}')
    search_link = search_link.format(subreddit, start, end, limit, extra_query)

    # Get the data from Pushshift as JSON.
    retrieved_data = requests.get(search_link)
    returned_submissions = retrieved_data.json()['data']

    # Iterate over the returned submissions to convert them to PRAW submission objects.
    for submission in returned_submissions:

        # Take the ID, fetch the PRAW submission object, and append to our list
        praw_submission = reddit.submission(id=submission['id'])
        matching_praw_submissions.append(praw_submission)

    # Return all PRAW submissions that were obtained.
    return matching_praw_submissions

In [46]:
url ='https://api.pushshift.io/reddit/submission/search/?subreddit=Cryptocurrency'
requests.get(url)

<Response [403]>

In [49]:
requests.get(url,headers=headers).json()

{'detail': 'Not authenticated'}

In [44]:
submissions_pushshift_praw('CryptoCurrency')

KeyError: 'data'

## Using BeautifulSoup

### If No SSL Error

In [7]:
import requests
import time
from bs4 import BeautifulSoup


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

url = 'https://www.reddit.com/r/CryptoCurrency/new/'
url_test = 'https://www.bbc.co.uk/'

# Get url content with beautiful soup

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text)

### If SSL Error

In [10]:
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

url = 'https://www.reddit.com/r/CryptoCurrency/new/'

# Create a session with retry mechanism
session = requests.Session()
retry = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[502, 503, 504]
)
adapter = HTTPAdapter(max_retries=retry)
session.mount('https://', adapter)

try:
    response = session.get(url, headers=headers)
    response.raise_for_status()  # Raise an error for bad status codes
    soup = BeautifulSoup(response.content, 'html.parser')
    # Your existing code to process soup
    data_ks_ids = [a['data-ks-id'] for a in soup.find_all('a', {'data-ks-id': True})]
    print(data_ks_ids)
except requests.exceptions.RequestException as e:
    print(f"An error occurred: {e}")

['t3_1ewvczr', 't3_1ewuz5b', 't3_1ewunua']
