This notebook queries the Reddit API for the top 5 comments of each user listed in the files in the `cleaned` folder.

In [13]:
import json
import requests
import praw
import os
from dotenv import load_dotenv
from dataclasses import dataclass

In [2]:
# Read the Reddit API credentials from the environment (load_dotenv() runs first,
# presumably to populate it from a local .env file) and build the PRAW client
# that the rest of the notebook uses as `reddit`.
load_dotenv()
client_id = os.environ.get("CLIENT_ID")
client_secret = os.environ.get("CLIENT_SECRET")

reddit = praw.Reddit(
    user_agent="CS492 Bot Analysis",
    client_secret=client_secret,
    client_id=client_id,
)


In [50]:
# to get the "ai score" of some text
# from https://contentatscale.ai/ai-content-detector/
def get_ai_score(text, timeout=30):
	"""POST `text` to the contentatscale.ai detector and return its JSON reply.

	Args:
		text: The text to score. It is interpolated unchanged into a
			``<p>...</p>`` wrapper before being sent as form data.
		timeout: Seconds to wait for the server before giving up; forwarded
			to ``requests.post``. Without it a stalled connection would
			block the notebook indefinitely.

	Returns:
		The decoded JSON body of the response (whatever ``response.json()``
		yields).

	Raises:
		Exception: If the response status code is anything other than 200.
		Errors raised by ``requests`` itself (connection failures, timeouts,
		an undecodable body) propagate unchanged.
	"""
	url = 'https://contentatscale.ai/ai-content-detector/'
	# Headers mimic a browser XHR request; the values look like they were
	# copied from a Chrome session -- NOTE(review): confirm which are required.
	headers = {
		'authority': 'contentatscale.ai',
		'accept': '*/*',
		'accept-language': 'en-US,en;q=0.9',
		'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
		'cookie': '_cas_stats=eyJVc2VyQWdlbnQiOiJNb3ppbGxhLzUuMCAoTWFjaW50b3NoOyBJbnRlbCBNYWMgT1MgWCAxMF8xNV83KSBBcHBsZVdlYktpdC81MzcuMzYgKEtIVE1MLCBsaWtlIEdlY2tvKSBDaHJvbWUvMTIyLjAuMC4wIFNhZmFyaS81MzcuMzYiLCJMYW5ndWFnZSI6ImVuLVVTIiwiU2NyZWVuV2lkdGgiOjE2OTIsIlNjcmVlbkhlaWdodCI6MzAwOCwiQ29sb3JEZXB0aCI6MjQsIlRpbWVab25lT2Zmc2V0IjoyNDAsIlBsdWdpbnMiOiJQREYgVmlld2VyLCBDaHJvbWUgUERGIFZpZXdlciwgQ2hyb21pdW0gUERGIFZpZXdlciwgTWljcm9zb2Z0IEVkZ2UgUERGIFZpZXdlciwgV2ViS2l0IGJ1aWx0LWluIFBERiJ9',
		'dnt': '1',
		'origin': 'https://contentatscale.ai',
		'referer': 'https://contentatscale.ai/ai-content-detector/',
		'sec-ch-ua': '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
		'sec-ch-ua-mobile': '?0',
		'sec-ch-ua-platform': '"macOS"',
		'sec-fetch-dest': 'empty',
		'sec-fetch-mode': 'cors',
		'sec-fetch-site': 'same-origin',
		'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
		'x-requested-with': 'XMLHttpRequest'
	}

	data = {
		'content': f"<p>{text}</p>",
		'action': 'checkaiscore'
	}

	response = requests.post(url, headers=headers, data=data, timeout=timeout)

	if response.status_code != 200:
		raise Exception(f"Request was not successful. Status code: {response.status_code}")

	# Get JSON data from the response
	return response.json()

In [59]:
@dataclass
class Comment:
	"""A single Reddit comment, reduced to the two fields this notebook keeps."""
	# Reddit's identifier for the comment (filled from `comment.id` in get_comments).
	id: str
	# The comment's body text (filled from `comment.body` in get_comments).
	text: str

@dataclass
class Account:
	"""A Reddit account with its bot/human label and the comments scraped for it."""
	# Account name, one per line of the input file (whitespace-stripped).
	username: str
	# Label taken from the input file: True for '.bots' files, False otherwise.
	is_bot: bool
	# The comments collected for this user by get_comments (may be empty).
	comments: list[Comment]


In [42]:
# to get the top comments of a user (5 unless `limit` says otherwise)
def get_comments(username, limit=5):
    """Fetch a user's top comments through the module-level `reddit` client.

    Args:
        username: Name of the Reddit account to look up.
        limit: Maximum number of comments to request; forwarded to
            ``comments.top(limit=...)``. Defaults to 5.

    Returns:
        A list of ``Comment`` objects built from each returned comment's
        ``id`` and ``body``, in the order the API yields them.

    Any exception raised by the client while fetching (the notebook output
    shows 403/404 HTTP errors for some accounts) propagates to the caller.
    """
    top_comments = reddit.redditor(username).comments.top(limit=limit)
    return [Comment(comment.id, comment.body) for comment in top_comments]

# to get the top 5 comments of all users in a file
def get_accounts_data(f_name) -> list[Account]:
    """Scrape the top comments of every username listed in `f_name`.

    Args:
        f_name: Path (string) to a text file with one username per line.
            Accounts are labelled as bots when the file's own name contains
            '.bots'; otherwise they are labelled as humans.

    Returns:
        One ``Account`` per username whose comments could be fetched, in
        file order. Users for which fetching raised are reported on stdout
        and left out.
    """
    # Look only at the file's own name so that a directory component such as
    # "data.bots/" cannot mislabel a humans file.
    is_bot = '.bots' in os.path.basename(f_name)
    account_list = []
    with open(f_name, 'r') as f:
        for line in f:
            username = line.strip()
            if not username:
                # Blank or whitespace-only lines carry no username; skipping
                # them avoids a pointless API request.
                continue
            print(f"Processing {username}")
            try:
                comments = get_comments(username)
            except Exception as e:
                # Deliberate best effort: deleted or suspended accounts show
                # up as 404/403 errors and must not abort the whole run.
                print(f"Error: {e}")
                continue
            account_list.append(Account(username, is_bot, comments))
    return account_list


In [43]:
# Scrape every username listed in all_bots.bots; these accounts get is_bot=True.
all_bots = get_accounts_data('./cleaned/all_bots.bots')

Processing wikipediaGPT2Bot
Processing hiphopheadsGPT2Bot
Processing letstalkmusicGPT2Bot
Processing shortscarystoGPT2Bot
Processing neoliberalGPT2Bot
Processing wordavalancheGPT2Bot
Processing lifeprotipsGPT2Bot
Processing ukpoliticsGPT2Bot
Processing glitch_matrixGPT2Bot
Processing 4chan_GPT2Bot
Processing teenagersGPT2Bot
Processing nflGPT2Bot
Processing muricaGPT2Bot
Processing singularityGPT2Bot
Processing wldyouratherGPT2Bot
Processing televisionGPT2Bot
Processing whowouldwinGPT2Bot
Processing conlangsGPT2Bot
Processing offmychestGPT2Bot
Processing markmywordsGPT2Bot
Processing roastmeGPT2Bot
Processing ledootgenGPT2Bot
Processing scenesfromhatGPT2Bot
Processing recipesGPT2Bot
Processing socialismGPT2Bot
Processing fifthwrldprobGPT2Bot
Processing unpopularopinGPT2Bot
Processing drugsGPT2Bot
Processing moviesGPT2Bot
Processing subredd_dramaGPT2Bot
Processing jokes_GPT2Bot
Processing daystrominstGPT2Bot
Processing nocontextGPT2Bot
Processing treesGPT2Bot
Processing awakenedGPT2Bot


In [44]:
# Scrape every username listed in all_humans.humans; these accounts get is_bot=False.
all_humans = get_accounts_data('./cleaned/all_humans.humans')

Processing trat_la
Processing NorthXCX
Processing reddit_irl
Processing traceroo
Processing thrivekindly
Processing infinitebroth
Processing Acidtwist
Processing JabroniRevanchism
Processing BrineOfTheTimes
Processing platinumpixieset
Processing anon-axolotl
Processing snoo-tuh
Processing werksquan
Processing whizlogic
Processing venkman01
Processing spez
Processing reddit
Processing jeffy-bezos
Processing Go_JasonWaterfalls
Processing KeyserSosa
Processing marzipanmarsbar
Processing gregthegeth
Processing kethryvis
Processing outersunset
Processing joyventure
Processing kriketjunkie
Processing unavailable4coffee
Processing lift_ticket83
Processing singmethesong
Processing cozy__sheets
Processing caffeinatedoptimist
Processing dontsweatthetechniQ
Processing enthusiastic-potato
Processing sodypop
Processing crowd__pleaser
Processing LastBluejay
Processing appa4ever
Processing such084
Processing advocado20
Processing BurritoJusticeLeague
Processing toastedfig
Processing UndrgrndCartograp

In [46]:
# Scrape every username listed in creme_training.bots (is_bot=True); accounts
# whose fetch fails are reported in the output and skipped.
creme_bots = get_accounts_data('./cleaned/creme_training.bots')

Processing Nazeem_Bot
Processing PhoenixBot
Processing AtheismModBot
Processing IsItDownBot
Processing malo_the_bot
Error: received 404 HTTP response
Processing RFootballBot
Processing KSPortBot
Processing Makes_Small_Text_Bot
Processing CompileBot
Processing SakuraiBot
Processing asmrspambot
Processing SurveyOfRedditBot
Processing RfreebandzBOT
Processing rule_bot
Processing xkcdcomic_bot
Processing PloungeMafiaVoteBot
Processing PoliticBot
Error: received 403 HTTP response
Processing Dickish_Bot_Bot
Processing SuchModBot
Processing MultiFunctionBot
Processing CasualMetricBot
Processing xkcd_bot
Processing VerseBot
Processing BeetusBot
Processing GameDealsBot
Processing BadLinguisticsBot
Processing rhiever-bot
Processing gfycat-bot-sucksdick
Processing chromabot
Processing Readdit_Bot
Processing wooshbot
Processing disapprovalbot
Processing request_bot
Processing define_bot
Processing dogetipbot
Processing techobot
Processing CaptionBot
Processing rightsbot
Processing colorcodebot
Pro

In [47]:
# Scrape every username listed in creme_training.humans (is_bot=False); accounts
# whose fetch fails are reported in the output and skipped.
creme_humans = get_accounts_data('./cleaned/creme_training.humans')

Processing suzukigun4life
Processing ActuelRoiDeFrance
Processing Kureiton84
Processing sarahdeanarts
Processing gingeronimooo
Error: received 403 HTTP response
Processing c08306834
Processing gheldean
Processing authentic010
Processing LastBluejay
Processing cantgetthistowork
Processing reddit_irl
Processing Pedrica1
Processing JesseD320
Processing singleboomer
Error: received 404 HTTP response
Processing JF_112
Processing slim_p_
Processing Philo1927
Processing Darren-B80
Processing creesch
Processing clampie
Error: received 403 HTTP response
Processing narcolepsyinc
Processing pdmcmahon
Processing Orange_fury
Processing Ian_a_wilson
Processing MadGo
Processing Olya_roo
Processing viciousdv
Processing Craftastrophe
Processing sweedishfishoreo
Processing dickfromaccounting
Processing DJdrummer
Processing headee
Processing axiosempra
Processing areallyshitusername
Processing Maxillustration
Processing redtaboo
Processing etymologynerd
Processing KymmaLabeija
Processing bushdiid911
Proc

In [72]:
def to_json(accounts, f_name):
	"""Write `accounts` to `f_name` as an indented JSON array.

	Each account becomes an object with the keys 'username', 'is_bot' and
	'comments', where 'comments' is the list of comment texts (comment ids
	are not written).

	Args:
		accounts: Iterable of objects exposing `username`, `is_bot` and
			`comments` (each comment exposing `text`).
		f_name: Destination path. Missing parent directories are created,
			so the notebook also runs on a checkout without the output
			folder.
	"""
	# Build the payload first so a malformed account fails before the
	# destination file is created or truncated.
	records = [
		{
			'username': account.username,
			'is_bot': account.is_bot,
			'comments': [c.text for c in account.comments]
		}
		for account in accounts
	]

	out_dir = os.path.dirname(f_name)
	if out_dir:  # empty for a bare file name, which needs no directory
		os.makedirs(out_dir, exist_ok=True)

	with open(f_name, 'w') as f:
		json.dump(records, f, indent=4)

# Persist each scraped group to its own JSON file under ./cleaned/comments/.
to_json(all_bots, './cleaned/comments/all_bots.json')
to_json(all_humans, './cleaned/comments/all_humans.json')
to_json(creme_bots, './cleaned/comments/creme_bots.json')
to_json(creme_humans, './cleaned/comments/creme_humans.json')