In [None]:
import ast
import csv
import datetime as dt

import numpy as np
import pandas as pd

In [15]:
# Read the raw per-word CSV exports, one DataFrame per search word.
# The ./data/ paths are relative to the working directory; change them if the
# files live somewhere else on your device.
unaliveraw, killraw, murderraw, suicideraw = (
    pd.read_csv(csvpath)
    for csvpath in (
        "./data/unalive_data.csv",
        "./data/kill_data.csv",
        "./data/murder_data.csv",
        "./data/suicide_data.csv",
    )
)

In [16]:
# Preview the first rows of the raw "unalive" frame to see which columns were loaded.
unaliveraw.head()

Unnamed: 0,word,date,data,metadata,error
0,unalive,,[],"{'op_a': 5298, 'op_b': 0, 'total': 5304}",
1,unalive,,[],"{'op_a': 241, 'op_b': 0, 'total': 244}",
2,unalive,,[],"{'op_a': 283, 'op_b': 0, 'total': 288}",
3,unalive,,[],"{'op_a': 198, 'op_b': 0, 'total': 201}",
4,unalive,,[],"{'op_a': 190, 'op_b': 0, 'total': 195}",


In [17]:
#isolating the data we gathered

def _isolate_data(raw):
    """Return a frame holding only the rows of ``raw`` with a non-empty ``data`` cell.

    Steps, applied to a copy (``raw`` itself is not modified):
      1. drop the 'word', 'date', 'metadata' and 'error' columns; any other
         column (such as the saved index column) is left in place,
      2. turn the literal string 'nan' into a real NaN and drop rows with NaN,
      3. drop rows whose ``data`` cell is the string '[]' (an empty result),
      4. renumber the index from 0.

    ``errors='ignore'`` makes the column drop a no-op when the columns are
    already gone, so re-running this cell on an already-cleaned frame does not
    raise a KeyError.
    """
    trimmed = raw.drop(columns=['word', 'date', 'metadata', 'error'], errors='ignore')
    trimmed = trimmed.replace('nan', np.nan).dropna()
    trimmed = trimmed[trimmed['data'] != '[]']
    return trimmed.reset_index(drop=True)

#same cleaning for each of our words
unaliveraw = _isolate_data(unaliveraw)
killraw = _isolate_data(killraw)
murderraw = _isolate_data(murderraw)
suicideraw = _isolate_data(suicideraw)


In [18]:
#building one flat list of entries per word
#after this point we can access the data to analyze it

def _flatten_entries(frame):
    """Parse every ``data`` cell of ``frame`` and concatenate the results.

    Each ``data`` cell is a string holding the text form of a Python list;
    the parsed lists are joined, in row order, into a single list.

    ast.literal_eval is used rather than eval because the cell text comes from
    a CSV file: eval would execute any code embedded in that file, while
    literal_eval only accepts plain literals (lists, dicts, strings, numbers,
    True/False/None) and raises ValueError/SyntaxError on anything else.
    """
    entries = []
    for cell in frame['data']:
        entries += ast.literal_eval(cell)
    return entries

#combining all the dataframe data into one list for each word
unalivedata = _flatten_entries(unaliveraw)
killdata = _flatten_entries(killraw)
murderdata = _flatten_entries(murderraw)
suicidedata = _flatten_entries(suicideraw)


In [19]:
#mapping each post's timestamp to its text
def collectposts(data):
    """Build a ``{str(created_utc): title + selftext}`` dict from post records.

    Parameters
    ----------
    data : iterable of dict
        Each record must have a 'created_utc' key. 'title' and 'selftext' are
        optional; a missing or None value is treated as an empty string
        instead of raising.

    Returns
    -------
    dict
        Keys are the 'created_utc' values converted to str; values are the
        title immediately followed by the selftext (no separator).

    NOTE(review): records that share the same 'created_utc' value overwrite
    one another, so only the last such record is kept. The key format is left
    as is because the code that consumes this dict iterates over these keys.
    """
    posts = {}
    for item in data:
        title = item.get('title') or ''
        selftext = item.get('selftext') or ''
        posts[str(item['created_utc'])] = title + selftext
    return posts

In [20]:
# Build the timestamp -> post text mapping for each of our words.
unaliveposts, killposts, murderposts, suicideposts = (
    collectposts(worddata)
    for worddata in (unalivedata, killdata, murderdata, suicidedata)
)

In [25]:
#helper function to determine if two datetimes are on the same day
def sameday(time1, time2):
    """Return True when the post timestamp ``time2`` falls on the day ``time1``.

    Parameters
    ----------
    time1 : a day within our vis range
        Only the part of ``str(time1)`` before the first space (the date part
        of a pandas Timestamp) is compared.
    time2 : int, float or str
        The timestamp from a post, interpreted as seconds since the Unix
        epoch. A numeric string is accepted and converted to a number first.

    Returns
    -------
    bool
    """
    day = str(time1).split(" ")[0]

    if isinstance(time2, str):
        # pandas deprecated parsing numeric strings together with ``unit=``
        # (it emits a FutureWarning and will later treat the string as a
        # date string), so hand it a real number instead.
        time2 = float(time2)
    postday = str(pd.to_datetime(time2, unit='s')).split(" ")[0]

    return day == postday

In [26]:
#helper function to count the number of posts
# occurring on a specific day
def postcount(posts, date):
    """Count the entries of ``posts`` whose timestamp key is on ``date``.

    ``posts`` is iterated directly, so for a dict the keys are the values
    handed to ``sameday`` as the post timestamp.
    """
    return sum(1 for stamp in posts if sameday(date, stamp))

In [27]:
#helper function to gather the text of every post made on a
# specific day into a list
def textcollect(posts, date):
    """Return ``posts[key]`` for every key of ``posts`` that is on ``date``.

    Keys are tested with ``sameday(date, key)``; the texts come back in the
    iteration order of ``posts``.
    """
    return [posts[stamp] for stamp in posts if sameday(date, stamp)]

In [None]:
#generate timestamps for every day we are surveying
#this will give us the keys for our definitive dataset
daterange= pd.date_range(start ='12-31-2018', 
         end ='1-1-2025', freq ='24h')

def _texts_by_day(posts):
    """Group the texts of ``posts`` by calendar day.

    ``posts`` maps an epoch-seconds timestamp (number or numeric string) to a
    post text. The result maps 'YYYY-MM-DD' to the list of texts posted on
    that day, in the iteration order of ``posts``.

    All timestamps are converted in one vectorised call, so each word's posts
    are scanned once instead of once per surveyed day.
    """
    buckets = {}
    if not posts:
        return buckets
    # cast to numbers first: pandas deprecated numeric strings with ``unit=``
    stamps = pd.to_numeric(pd.Series(list(posts.keys()), dtype=object))
    days = pd.to_datetime(stamps, unit='s').dt.strftime('%Y-%m-%d')
    for day, text in zip(days, posts.values()):
        buckets.setdefault(day, []).append(text)
    return buckets

#group every word's posts by day once, up front
#(insertion order here fixes the column order of each record)
dailytexts = {
    'unalive': _texts_by_day(unaliveposts),
    'kill': _texts_by_day(killposts),
    'murder': _texts_by_day(murderposts),
    'suicide': _texts_by_day(suicideposts),
}

#initializing our dataset for counting number of posts
data_definitive= []

#create a list of dictionaries for the data
for date in daterange:
    daykey = date.strftime('%Y-%m-%d')

    #compile all the information for this day into a dictionary:
    # for each word, the number of posts and the list of their texts
    dictentry = {'date' : date}
    for word, byday in dailytexts.items():
        texts = byday.get(daykey, [])
        dictentry[word] = len(texts)
        dictentry[word + '_text'] = texts

    #add this dictionary to data_definitive
    data_definitive.append(dictentry)

  time2= pd.to_datetime(time2, unit='s')


In [33]:
# Save the per-day records as a CSV file so the visualisation can read them.

# column names, in the order they are written to the file
header = [
    "date",
    "unalive", "unalive_text",
    "kill", "kill_text",
    "murder", "murder_text",
    "suicide", "suicide_text",
]

# newline='' leaves line-ending handling to the csv module
with open('cleaned_data.csv', mode='w', newline='', encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, header)
    writer.writeheader()
    for record in data_definitive:
        writer.writerow(record)