In [1]:
%run user.py # Get credentials from user.py (.gitignored)
USERNAME, PASSWORD, SECRET_TOKEN, CLIENT_ID = my_credentials()

# These can be created at https://www.reddit.com/prefs/apps

In [35]:
import requests

# Authentication
client_auth = requests.auth.HTTPBasicAuth(CLIENT_ID, SECRET_TOKEN)

post_data = {"grant_type": "password", "username": USERNAME, "password": PASSWORD}

# Headers
user_agent = "MyStats/0.0.1"
headers = {"User-Agent": user_agent}

# Send POST request to the Reddit API to get an access token
url = "https://www.reddit.com/api/v1/access_token"
response = requests.post(url, auth=client_auth, data=post_data, headers=headers)

# Parse the response as JSON and return it
response_json = response.json()
response_json

{'access_token': '26130214-ePXLa_Kqn1tn_mRg14tSRgQLUPpUig',
 'token_type': 'bearer',
 'expires_in': 86400,
 'scope': '*'}

Access token is valid for 24 hours (86400 seconds).

In [3]:
from datetime import datetime, timedelta
print(f'Access token expires at {datetime.now() + timedelta(hours=24)}')

Access token expires at 2023-04-10 13:05:24.508939


When it expires we simply generate a new one. Posting the access token like this is not recommended. But it's my personal account so I say it's ok for now.

In [36]:
access_token = response.json()['access_token']

In [37]:
headers = {**headers, **{'Authorization': f'bearer {access_token}'}} # ** for unpacking dictionary
headers

{'User-Agent': 'MyStats/0.0.1',
 'Authorization': 'bearer 26130214-ePXLa_Kqn1tn_mRg14tSRgQLUPpUig'}

### Let's check out the Python subreddit.

In [38]:
res = requests.get("https://oauth.reddit.com/r/python/hot",
                   headers=headers)

res.json()

{'kind': 'Listing',
 'data': {'after': 't3_12efni5',
  'dist': 27,
  'modhash': None,
  'geo_filter': None,
  'children': [{'kind': 't3',
    'data': {'approved_at_utc': None,
     'subreddit': 'Python',
     'selftext': "Tell /r/python what you're working on this week! You can be bragging, grousing, sharing your passion, or explaining your pain. Talk about your current project or your pet project; whatever you want to share.",
     'author_fullname': 't2_145f96',
     'saved': False,
     'mod_reason_title': None,
     'gilded': 0,
     'clicked': False,
     'title': "Sunday Daily Thread: What's everyone working on this week?",
     'link_flair_richtext': [{'e': 'text', 't': 'Daily Thread'}],
     'subreddit_name_prefixed': 'r/Python',
     'hidden': False,
     'pwls': 6,
     'link_flair_css_class': 'daily-thread',
     'downs': 0,
     'thumbnail_height': None,
     'top_awarded_type': None,
     'hide_score': False,
     'name': 't3_12g2rk1',
     'quarantine': False,
     'link_

Yooo! Look at all this juicy dayta! Let's bring in the pandas.

In [40]:
import pandas as pd 

#res = requests.get("https://oauth.reddit.com/r/Python/hot",
#                   headers=headers)

res = requests.get("https://oauth.reddit.com/r/Python/hot",
                   headers=headers)

df = pd.DataFrame(columns=['subreddit', 'title', 'selftext', 'upvote_ratio', 'ups', 'downs', 'score'])

# loop through each post retrieved from GET request
for post in res.json()['data']['children']:
    
    # append to dataframe
    df.loc[len(df)] = {
        'subreddit': post['data']['subreddit'],
        'title': post['data']['title'],
        'selftext': post['data']['selftext'],
        'upvote_ratio': post['data']['upvote_ratio'],
        'ups': post['data']['ups'],
        'downs': post['data']['downs'],
        'score': post['data']['score']
    }

In [41]:
df.sort_values(by='score', ascending=False)

Unnamed: 0,subreddit,title,selftext,upvote_ratio,ups,downs,score
23,Python,"I trained a RoastBot on &gt;120,000 faces and ...",It uses facial recognition to fetch roasts for...,0.93,463,0,463
3,Python,EP 684: A Per-Interpreter GIL Accepted,,0.97,372,0,372
12,Python,PEP 695: Type Parameter Syntax has been accept...,,0.97,352,0,352
2,Python,Comprehensive Reddit Saved Posts Downloader - ...,"Hi all, I made a post about this a couple of d...",0.96,248,0,248
26,Python,PEP 711 – PyBI: a standard format for distribu...,,0.96,239,0,239
14,Python,Pandas 2.0 (with pyarrow) vs Pandas 1.3 - Perf...,,0.76,46,0,46
5,Python,git-limiter: 🧭 Tool to stop you from pushing h...,,0.85,31,0,31
11,Python,I got annoyed by building and maintaining cust...,I got frustrated with the time and effort requ...,0.7,25,0,25
15,Python,AutoControl is an automation package for Python.,&amp;#x200B;\n\n## AutoControl allows you to a...,0.78,16,0,16
16,Python,aiob2 - A Modern and Pythonic API Wrapper for ...,aiob2 is an API wrapper for the Backblaze buck...,0.94,12,0,12


In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

def word_count(df):
    # regular english stop words filter + custom filter
    stop_words = 'english'
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    
    vectorizer.fit(df['title'])
    
    title_tfidf = vectorizer.transform(df['title'])     # Sparse matrix of TF-IDF features
    feature_names = vectorizer.get_feature_names_out()  # Get the names of the features/words
    sum_scores = title_tfidf.sum(axis=0)                # Sum the TF-IDF scores for each feature for all titles
    
    # Convert scores to 1D array, sort descending order
    top_scores = np.squeeze(np.asarray(sum_scores))
    sorted_scores_indices = np.argsort(top_scores)[::-1]
    
    print("Top 10 features by TF-IDF score:")
    for i in range(10):
        feature_index = sorted_scores_indices[i]
        feature_score = top_scores[feature_index]
        feature_name = feature_names[feature_index]
        print(f"{feature_name}: {feature_score:.3f}")

In [43]:
word_count(df)

Top 10 features by TF-IDF score:
python: 2.522
pop: 1.000
daily: 0.969
thread: 0.969
object: 0.892
development: 0.844
learn: 0.821
vs: 0.775
web: 0.757
accepted: 0.729


In [48]:
# Subreddit, time filter and limit for the API request
def get_sub(subreddit='Python', time_filter='month', limit=100):

    # Current timestamp and one month ago
    now = datetime.now()
    one_month_ago = now - timedelta(days=30)

    # Convert to Unix time
    now_unix = int(now.timestamp())
    one_month_ago_unix = int(one_month_ago.timestamp())

    # Make the API request
    url = f'https://oauth.reddit.com/r/{subreddit}/top?t={time_filter}&limit={limit}&after={one_month_ago_unix}&before={now_unix}'
    res = requests.get(url, headers=headers)

    df = pd.DataFrame(columns=['subreddit', 'title', 'selftext', 'upvote_ratio', 'ups', 'downs', 'score'])

    # Loop through each post retrieved from the API request and append to df
    for post in res.json()['data']['children']:
        df.loc[len(df)] = {
            'subreddit': post['data']['subreddit'],
            'title': post['data']['title'],
            'selftext': post['data']['selftext'],
            'upvote_ratio': post['data']['upvote_ratio'],
            'ups': post['data']['ups'],
            'downs': post['data']['downs'],
            'score': post['data']['score']
        }
    return df

In [50]:
df = get_sub(subreddit='Minecraft')
word_count(df)

Top 10 features by TF-IDF score:
minecraft: 5.905
update: 2.025
rare: 2.010
think: 1.781
build: 1.728
new: 1.710
make: 1.643
launcher: 1.631
game: 1.437
nether: 1.430


In [51]:
df = get_sub(subreddit='Hearthstone')
word_count(df)

Top 10 features by TF-IDF score:
new: 10.402
card: 9.996
revealed: 9.189
warrior: 4.217
legendary: 3.165
priest: 2.570
cards: 2.424
neutral: 2.310
hunter: 2.271
druid: 2.148
