final project items
- describe the data and give a summary of the stats
- state the research question, and what exactly we're trying to express with the data
- what is the method? why this visual? why is this one chosen?
- what's next? what conclusion can be drawn? what comes next for analyzing the data?

selenium documentation (for reader reference): 
* https://selenium-python.readthedocs.io/
* https://www.selenium.dev/documentation/

In [71]:
#importing our libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import time
import csv
import datetime

In [72]:
#booting up selenium to automate the scraping of reddit

#defining proxy ip, written as "host:port"
proxy = "20.235.159.154:80"
proxy_host, proxy_port = proxy.rsplit(":", 1)

#defining headless mode (if we want to run this operation without gui)
options= webdriver.FirefoxOptions()
options.timeouts= { 'script': 5000 } #script timeout, in milliseconds
options.add_argument("-headless")

#"--proxy-server=..." is a Chromium command-line switch that Firefox does not act on,
#so the proxy is configured through Firefox's own network.proxy.* preferences instead
options.set_preference("network.proxy.type", 1) #1 = manual proxy configuration
options.set_preference("network.proxy.http", proxy_host)
options.set_preference("network.proxy.http_port", int(proxy_port))
options.set_preference("network.proxy.ssl", proxy_host) #send https pages through the same proxy
options.set_preference("network.proxy.ssl_port", int(proxy_port))
browser = webdriver.Firefox(options=options)

#defining non-headless mode
#browser= webdriver.Firefox()

In [73]:
#helper function to help us get the number of instances of words in each post
def countwords(textblock, keyword):
    """Count case-insensitive occurrences of ``keyword`` inside ``textblock``.

    Matching is by substring, not by whole word, so 'kill' is also counted
    inside 'skill' or 'killed'. Occurrences are non-overlapping.

    Args:
        textblock: the text to search (a post title or body).
        keyword: the term to look for; its letter case is ignored.

    Returns:
        The number of hits as an int. An empty keyword yields 0.
    """
    #str.count("") would report len(text)+1, which is meaningless as a word count
    if not keyword:
        return 0

    #normalise both sides so a capitalised keyword still matches
    return textblock.lower().count(keyword.lower())

In [74]:
#helper function to open a link to a page
def openlink(url):
    """Point the module-level ``browser`` (Firefox) at ``url``."""
    destination= url
    browser.get(destination)

In [75]:
#helper function to get title and body text of post
#also get info like time, link, etc which may be helpful

def getpostinfo(url):
    """Open a results page and build one record for every post found on it.

    Args:
        url: address of the results page to load in the module-level browser.

    Returns:
        A list of dicts, one per matched post, each with the keys 'time',
        'unalivecount', 'killcount', 'suicidecount', 'title' and 'text'.
        Every count is the number of hits in the title plus the hits in the
        body. The list is empty when the page has no matching posts.

    Raises:
        selenium NoSuchElementException: if a matched post has no <h1> title
            or no body container.
    """
    #opening the link
    openlink(url)

    #select all the posts on the page; a post is matched by its exact class attribute
    posts= browser.find_elements(By.CSS_SELECTOR, "div[class='rounded-3xl bg-surface-100-800-token max-w-5xl w-full p-4 variant-ghost-surface my-3']")

    keywords= ('unalive', 'kill', 'suicide') #terms tallied for every post
    data= [] #post records to return

    #scraping results in a daily query
    for post in posts:
        title= post.find_element(By.TAG_NAME, 'h1').text #get title

        #selecting the body text of the post; a newline precedes every paragraph
        fullbody= post.find_element(By.CSS_SELECTOR, "div[class='mt-2 overflow-hidden']")
        lines= fullbody.find_elements(By.CSS_SELECTOR, "p")
        bodytext= "".join("\n"+line.text for line in lines)

        footnote= post.find_elements(By.CSS_SELECTOR, "p[class='text-xs font-semibold']")

        postinfo= {} #information on this post
        #the 4th footnote paragraph is stored as the post time; fall back to an
        #empty string if the post shows fewer footnotes than expected
        postinfo['time']= footnote[3].text if len(footnote)>3 else "" #TODO: WRITE HELPER FUNCTION TO CONVERT DATE TO USEFUL FORMAT

        #each keyword is counted in the title and in the body, then summed
        for keyword in keywords:
            postinfo[keyword+'count']= countwords(title, keyword)+countwords(bodytext, keyword)

        postinfo['title']= title
        postinfo['text']= bodytext
        #postinfo['link']= footnote[4].text

        data.append(postinfo)

    return data

In [76]:
#reddit caps search results at 100, so instead we will separate our query by day and scrape the results

#three reference moments: the bounds of the first query window (it starts at
#23:59 on 12/31/2018 and ends at 00:00 on 1/1/2019) and the end of sampling (1/1/2025)
#NOTE(review): the first window is therefore one minute long - confirm this is intended
firstdaystart= datetime.datetime(year=2018, month=12, day=31, hour=23, minute=59)
firstdayend= datetime.datetime(year=2019, month=1, day=1, hour=0, minute=0)
lastdayend= datetime.datetime(year=2025, month=1, day=1, hour=0, minute=0)

#the same moments as epoch seconds (floats): 'after'/'before' bound the current window, 'end' stops the sampling
#NOTE(review): time.mktime reads the values as local time - confirm the search API expects that rather than UTC
after, before, end= (
    time.mktime(moment.timetuple())
    for moment in (firstdaystart, firstdayend, lastdayend)
)


In [77]:
def resetfirstend():
    """Return the end of the first query window (``firstdayend``) as epoch seconds."""
    window_end_fields= firstdayend.timetuple()
    return time.mktime(window_end_fields)

def resetfirststart():
    """Return the start of the first query window (``firstdaystart``) as epoch seconds."""
    window_start_fields= firstdaystart.timetuple()
    return time.mktime(window_start_fields)

In [78]:
#scraping for 'unalive'
#every pass queries one (after, before) window, then slides the window forward by a day
postdata_unalive= [] #one summary record per queried window
while before<=end: #inclusive, so the window that ends exactly at 'end' is scraped as well
    #the bounds are sent as whole seconds; the float values would otherwise render like 1546300800.0
    url= f"https://search-new.pullpush.io/?subreddit=offmychest&type=submission&q=unalive&sort_type=created_utc&sort=asc&before={int(before)}&after={int(after)}"
    scrapedata= getpostinfo(url)
    #'time' labels the record with the end of its window
    postdata_unalive.append({'keyword' : "unalive",
                     'time' : datetime.datetime.fromtimestamp(int(before)).strftime('%Y-%m-%d %H:%M:%S'),
                     'numposts' : len(scrapedata),
                     'data' : scrapedata})
    after=before
    before=before+86400 #each day is incremented by 86400 seconds

#reset the start parameters so the next keyword starts from the first window again
before= resetfirstend()
after= resetfirststart()

In [79]:
#scraping for 'kill'
#every pass queries one (after, before) window, then slides the window forward by a day
postdata_kill= [] #one summary record per queried window
while before<=end: #inclusive, so the window that ends exactly at 'end' is scraped as well
    #the bounds are sent as whole seconds; the float values would otherwise render like 1546300800.0
    url= f"https://search-new.pullpush.io/?subreddit=offmychest&type=submission&q=kill&sort_type=created_utc&sort=asc&before={int(before)}&after={int(after)}"
    scrapedata= getpostinfo(url)
    #'time' labels the record with the end of its window
    postdata_kill.append({'keyword' : "kill",
                     'time' : datetime.datetime.fromtimestamp(int(before)).strftime('%Y-%m-%d %H:%M:%S'),
                     'numposts' : len(scrapedata),
                     'data' : scrapedata})
    after=before
    before=before+86400 #each day is incremented by 86400 seconds

#reset the start parameters so the next keyword starts from the first window again
before= resetfirstend()
after= resetfirststart()

In [80]:
#scraping for 'suicide'
#every pass queries one (after, before) window, then slides the window forward by a day
postdata_suicide= [] #one summary record per queried window
while before<=end: #inclusive, so the window that ends exactly at 'end' is scraped as well
    #the bounds are sent as whole seconds; the float values would otherwise render like 1546300800.0
    url= f"https://search-new.pullpush.io/?subreddit=offmychest&type=submission&q=suicide&sort_type=created_utc&sort=asc&before={int(before)}&after={int(after)}"
    scrapedata= getpostinfo(url)
    #'time' labels the record with the end of its window
    postdata_suicide.append({'keyword' : "suicide",
                     'time' : datetime.datetime.fromtimestamp(int(before)).strftime('%Y-%m-%d %H:%M:%S'),
                     'numposts' : len(scrapedata),
                     'data' : scrapedata})
    after=before
    before=before+86400 #each day is incremented by 86400 seconds

#reset the start parameters back to the first window
before= resetfirstend()
after= resetfirststart()

In [82]:
#quit the selenium browser session
browser.quit()

In [81]:
#combine our data into a singular list
postdata= postdata_unalive+postdata_kill+postdata_suicide

#after collecting data, export it into csv format
#one row per queried window; the 'data' cell holds the str() form of that window's list of post records
header = ["keyword", "time", "numposts", "data"]
#utf-8 is set explicitly: post text can contain characters (emoji, accents) that
#a platform-default encoding may be unable to write
with open('data.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=header)
    writer.writeheader()
    writer.writerows(postdata)