In [None]:
# Some bug fixes and feature tweaks to John Gee's Subreddit word cloud generator
# Amy Alexander, March 2024

In [None]:
## Imports
from PIL import Image
import matplotlib.pyplot as plt
import praw
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from nltk.corpus import stopwords, reuters
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
import os 
from dotenv import load_dotenv
load_dotenv()

import re
import pandas as pd
from collections import Counter

# Code to download corpora
import nltk
# Fetch each NLTK resource the notebook relies on exactly once.
# 'punkt_tab' is the tokenizer data that word_tokenize looks up on recent NLTK
# releases; 'punkt' is kept for older releases. quiet=True keeps the cell output
# short (a failed download simply returns False).
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource, quiet=True)

# Shared lemmatizer instance used by the text-cleaning function below.
lemmatizer = WordNetLemmatizer()

In [None]:
## Define function used to clean text data scraped from a subreddit

def process_text(doc):
    """Turn raw text into a space-separated string of lemmatized content words.

    Pipeline: drop every character that is not an ASCII letter or whitespace,
    tokenize, lowercase, remove English stopwords, lemmatize, then remove any
    lemma that is itself a stopword.

    Parameters
    ----------
    doc : str
        Raw text to clean. Empty or ``None`` input yields an empty string.

    Returns
    -------
    str
        Lowercase, lemmatized tokens joined by single spaces.
    """
    if not doc:
        return ''

    sw = set(stopwords.words('english'))

    # Keep every whitespace character (not only the plain space) so that words
    # separated just by a newline or tab are not glued together.
    regex = re.compile(r"[^a-zA-Z\s]")
    re_clean = regex.sub('', doc)

    # Lowercase first: the lemmatizer is applied to the token as given, and
    # capitalized forms would otherwise not be reduced to their lemma.
    tokens = [word.lower() for word in word_tokenize(re_clean)]

    # Remove stopwords BEFORE lemmatizing. With the default (noun) part of
    # speech the lemmatizer strips a trailing "s" from words such as "was" and
    # "has", producing "wa"/"ha", which would then slip past the stopword check.
    content_words = [token for token in tokens if token not in sw]
    lemmas = [lemmatizer.lemmatize(token) for token in content_words]

    # A lemma can itself be a stopword, so filter once more.
    return ' '.join(lemma for lemma in lemmas if lemma not in sw)

In [None]:
## Load the Reddit API credentials from environment variables.
## For this notebook to work you must first create an agent with Reddit:
## go to https://www.reddit.com/prefs/apps (or search for how to create a Reddit agent).
## You'll need a client_id, a client_secret and a user agent name. Load them up as env variables.

# Each value is None when the corresponding variable is not set.
c_id, c_secret = (os.environ.get(key) for key in ("C_ID", "C_SECRET"))

In [None]:
## Connect to Reddit, ask for a subreddit name, and collect the title and all
## top-level comments of that subreddit's 10 current "hot" posts.

# Flat list of strings: one entry per post title and per top-level comment body.
data = []

# Take the user agent from the environment when available (the variable name
# USER_AGENT is this notebook's convention - adjust it to match your .env);
# otherwise fall back to the placeholder string.
user_agent = os.getenv("USER_AGENT", "ENTER USER AGENT NAME")
reddit = praw.Reddit(client_id=c_id, client_secret=c_secret, user_agent=user_agent)

# Tolerate stray whitespace and a leading "r/" or "/r/" in what the user types.
subreddit = re.sub(r'^/?r/', '', input('Enter subreddit name:').strip(), flags=re.IGNORECASE)
if not subreddit:
    raise ValueError("No subreddit name was entered.")

for submission in reddit.subreddit(subreddit).hot(limit=10):
    # Expand every "load more comments" stub so the comment forest is complete.
    # This can issue many API requests on busy threads.
    submission.comments.replace_more(limit=None)
    data.append(submission.title)
    # Iterating the forest directly yields only the top-level comments.
    data.extend(top_level_comment.body for top_level_comment in submission.comments)

In [None]:
## Combine the scraped strings into one document and clean it

# Join the strings themselves instead of calling str(data): the list repr
# writes escape sequences out literally (a newline becomes the two characters
# backslash + "n"), and once the cleaner strips the backslash the leftover
# letters end up glued onto neighbouring words.
# split()/join additionally collapses newlines and tabs into single spaces.
text = ' '.join(' '.join(data).split())
text_cleaned = process_text(text)

In [None]:
## Generate the word cloud, preview it inline, and save it as 'wordcloud.png'
## in the current working directory. Thank you!! and have fun :)

# width/height are the pixel dimensions of the saved image. You can raise them
# to create larger images and still preview them smaller inline; consider
# revising max_font_size proportionally.
wc = WordCloud(width=600, height=400, background_color="white", max_words=3500, max_font_size=150, random_state=42)
wc.generate(text_cleaned)

# Create the figure BEFORE drawing, and size it in inches (not pixels), keeping
# the cloud's 3:2 aspect ratio.
fig, ax = plt.subplots(figsize=(6, 4))
# Show the cloud as generated. Recoloring without a random_state would pick new
# random colors in place, so neither the preview nor the saved file would be
# reproducible despite random_state=42 above.
ax.imshow(wc, interpolation="bilinear")
ax.axis("off")
plt.show()

wc.to_file('wordcloud.png')