### **Data collection**

In [4]:
# imports
import requests
import pandas as pd
import numpy as np
import os
pd.options.display.max_colwidth = 350

In [2]:
# I will define dictionaries with time stamps, with a gap of several days.
# I will collect 100 submissions from each time stamp.
# I took a several days gap so that we don't risk collecting the same submissions - in any case, we'll 
# check this again later on to make sure each time stamp in our data is unique.
# For each date, the information pulled is from midnight (12:00:00 AM, GMT).
# An Epoch Converter was used to get the UTC from the date (https://www.epochconverter.com/)

# For some reason, when I tried 1 dictionary containing all the time stamps and 1 loop 
# that pulled down all the data together, it crashed each time after a different number of iterations. So
# I had to split it into several pieces and run it several times separately for each subreddit.

In [2]:
# dictionaries with time stamps
time_dict1 = {
    1577836800 : '01/01/2020',
    1578182400 : '01/05/2020',
    1578614400 : '01/10/2020',
    1579046400 : '01/15/2020',
    1579478400 : '01/20/2020',
    1579910400 : '01/25/2020',
    1580428800 : '01/31/2020',
    1580860800 : '02/05/2020',
    1581292800 : '02/10/2020',
    1581724800 : '02/15/2020',
    1582156800 : '02/20/2020',
    1582588800 : '02/25/2020' }

time_dict2 = {
    1582934400 : '02/29/2020',
    1583366400 : '03/05/2020',
    1583798400 : '03/10/2020',
    1584230400 : '03/15/2020',
    1584662400 : '03/20/2020',
    1585094400 : '03/25/2020',
    1585612800 : '03/31/2020',
    1586044800 : '04/05/2020',
    1586476800 : '04/10/2020',
    1586908800 : '04/15/2020' }

time_dict3 = {
    1587340800 : '04/20/2020',
    1587772800 : '04/25/2020',
    1588204800 : '04/30/2020',
    1588636800 : '05/05/2020',
    1589068800 : '05/10/2020',
    1589500800 : '05/15/2020',
    1589932800 : '05/20/2020',
    1590364800 : '05/25/2020',
    1590883200 : '05/31/2020',
    1591315200 : '06/05/2020' }

time_dict4 = {
    1591747200 : '06/10/2020',
    1592179200 : '06/15/2020',
    1592611200 : '06/20/2020',
    1593043200 : '06/25/2020',
    1593475200 : '06/30/2020',
    1593907200 : '07/05/2020',
    1594339200 : '07/10/2020',
    1594771200 : '07/15/2020',
    1595203200 : '07/20/2020',
    1595635200 : '07/25/2020' }

time_dict5 = {
    1596153600 : '07/31/2020',
    1596585600 : '08/05/2020',
    1597017600 : '08/10/2020',
    1597449600 : '08/15/2020',
    1597881600 : '08/20/2020',
    1598313600 : '08/25/2020',
    1598832000 : '08/31/2020',
    1599264000 : '09/05/2020',
    1599696000 : '09/10/2020',
    1600128000 : '09/15/2020' }

time_dict6 = {
    1600560000 : '09/20/2020',
    1600992000 : '09/25/2020',
    1601424000 : '09/30/2020',
    1601856000 : '10/05/2020',
    1602288000 : '10/10/2020',
    1602720000 : '10/15/2020',
    1603152000 : '10/20/2020',
    1603584000 : '10/25/2020',
    1604016000 : '10/30/2020',
    1604534400 : '11/05/2020' }

time_dict7 = {
    1604966400 : '11/10/2020',
    1605398400 : '11/15/2020',
    1605830400 : '11/20/2020',
    1606262400 : '11/25/2020',
    1606694400 : '11/30/2020',
    1607126400 : '12/05/2020',
    1607558400 : '12/10/2020',
    1607990400 : '12/15/2020',
    1608422400 : '12/20/2020',
    1608854400 : '12/25/2020' }

time_dict8 = {
    1609286400 : '12/30/2020',
    1609804800 : '01/05/2021',
    1610236800 : '01/10/2021',
    1610668800 : '01/15/2021',
    1611100800 : '01/20/2021',
    1611532800 : '01/25/2021',
    1611964800 : '01/30/2021',
    1612483200 : '02/05/2021',
    1612915200 : '02/10/2021',
    1613347200 : '02/15/2021' }

time_dict9 = {
    1613779200 : '02/20/2021',
    1614470400 : '02/28/2021',
    1614556800 : '03/01/2021',
    1614902400 : '03/05/2021',
    1615334400 : '03/10/2021',
    1615766400 : '03/15/2021',
    1616198400 : '03/20/2021',
    1616630400 : '03/25/2021',
    1617062400 : '03/30/2021',
    1617580800 : '04/05/2021' }

time_dict10 = {
    1618012800 : '04/10/2021',
    1618444800 : '04/15/2021',
    1618876800 : '04/20/2021',
    1619308800 : '04/25/2021',
    1619740800 : '04/30/2021',
    1620172800 : '05/05/2021',
    1620604800 : '05/10/2021',
    1621036800 : '05/15/2021',
    1621468800 : '05/20/2021',
    1621900800 : '05/25/2021',
    1622332800 : '05/30/2021' }

In [14]:
## collecting data for 'datascience' subreddit ##

# URL
url = 'https://api.pushshift.io//reddit/search/submission'

# The for loop below will request the submissions.
# The time period is determined by the time dictionaries defined in the previous cell.
 
# The loop should be run separately for each time dictionary (10 times) for the subreddit 'datascience'; 
# in the next cell the same is done for the subreddit 'books' (using the same 10 time dictionaries).

# Define a count variable to count the number of data pulling operations we make - we'll use it for the 
# file name. For each time we run the loop, we need to start the count variable from a different value, 
# so that each file name will be unique.

count = 1 # for time_dict1 
# count = 13 # for time_dict2
# count = 23 # for time_dict3
# count = 33 # for time_dict4
# count = 43 # for time_dict5
# count = 53 # for time_dict6
# count = 63 # for time_dict7
# count = 73 # for time_dict8
# count = 83 # for time_dict9
# count = 93 # for time_dict10

# for loop to request the data
for time in list(time_dict1.keys()): # change here once for time_dict1, once for time_dict2, etc. until 
                                     # time_dict10
    
    # print loop progress
    print(f'Submission {count} with time stamp: {time}...')
    
    # set parameters to specify which submissions to take
    params = {
        'subreddit' : 'datascience',   # subreddit     
        'size' :  100,                 # how many submissions (max allowed 100 at once)
        'before' :  time               # date from which to get the subreddits
    }
    
    # requesting the posts
    response = requests.get(url, params)
    
    # transforming the information into a data frame
    df = pd.DataFrame(response.json()['data'])
    
    # extracting the relevant columns
    df = df[['subreddit', 'selftext', 'title', 'created_utc']]
        
    # save df as a csv file
    df.to_csv('../datafiles/datascience' + str(count) +'.csv', index=False)
    
    # update the count variable
    count += 1
        
    # print info on loop progress
    print('... done.')

Submission 93 with time stamp: 1618012800...
... done.
Submission 94 with time stamp: 1618444800...
... done.
Submission 95 with time stamp: 1618876800...
... done.
Submission 96 with time stamp: 1619308800...
... done.
Submission 97 with time stamp: 1619740800...
... done.
Submission 98 with time stamp: 1620172800...
... done.
Submission 99 with time stamp: 1620604800...
... done.
Submission 100 with time stamp: 1621036800...
... done.
Submission 101 with time stamp: 1621468800...
... done.
Submission 102 with time stamp: 1621900800...
... done.
Submission 103 with time stamp: 1622332800...
... done.


In [25]:
## Same operation as above for the 'books' subreddit ##

# URL
url = 'https://api.pushshift.io//reddit/search/submission'

# count variable

count = 1 # for time_dict1 
# count = 13 # for time_dict2
# count = 23 # for time_dict3
# count = 33 # for time_dict4
# count = 43 # for time_dict5
# count = 53 # for time_dict6
# count = 63 # for time_dict7
# count = 73 # for time_dict8
# count = 83 # for time_dict9
# count = 93 # for time_dict10

# for loop to collect the data
for time in list(time_dict1.keys()):
    
    # print loop progress
    print(f'Submission {count} with time stamp: {time}...')
    
    # set parameters to specify which submissions to pull in
    params = {
        'subreddit' : 'books',        # subreddit     
        'size' :  100,                 # how many submissions (max allowed 100 at once)
        'before' :  time               # date from which to get the subreddits
    }
    
    # pulling down the submissions
    response = requests.get(url, params)
    
    # transforming the information into a data frame
    df = pd.DataFrame(response.json()['data'])
    
    # extracting the columns for the NLP model
    df = df[['subreddit', 'selftext', 'title', 'created_utc']]
        
    # save df as a csv file
    df.to_csv('../datafiles/books' + str(count) +'.csv', index=False)
    
    # update the count variable
    count += 1
        
    # print info on loop progress
    print('... done.')

Submission 93 with time stamp: 1618012800...
... done.
Submission 94 with time stamp: 1618444800...
... done.
Submission 95 with time stamp: 1618876800...
... done.
Submission 96 with time stamp: 1619308800...
... done.
Submission 97 with time stamp: 1619740800...
... done.
Submission 98 with time stamp: 1620172800...
... done.
Submission 99 with time stamp: 1620604800...
... done.
Submission 100 with time stamp: 1621036800...
... done.
Submission 101 with time stamp: 1621468800...
... done.
Submission 102 with time stamp: 1621900800...
... done.
Submission 103 with time stamp: 1622332800...
... done.


---

In [26]:
## make a list of the data files
files = os.listdir('../datafiles')

In [27]:
# check the first file(s) - they might be hidden files that we don't need
files[0]

'.ipynb_checkpoints'

In [28]:
# make sure we don't include the ipnyb file
files = files[1:]

In [29]:
## putting all the data files together ##

# load the first file
data = pd.read_csv('../datafiles/'+files[0])

# for loop to concatenate all data frames together
for file in range(1, len(files)):
    # loading the next file in the line    
    df = pd.read_csv('../datafiles/'+files[file])
    # pasting the data frames together
    data = pd.concat([data, df], axis=0)

In [30]:
# check uniquness of UTC time stamps in the data frame
# the resulting list should be empty
dupes = [x for n, x in enumerate(data['created_utc']) if x in data['created_utc'][:n]] # code from https://stackoverflow.com/questions/9835762/how-do-i-find-the-duplicates-in-a-list-and-create-another-list-with-them
dupes

[]

In [31]:
# look at the big data frame
data.shape

(20600, 4)

In [32]:
# print first 5 rows
data.head(5)

Unnamed: 0,subreddit,selftext,title,created_utc
0,books,"And it's not because I'm a slow reader - quite the contrary. But this was a book worth taking my time over.\n\nI think it's mostly to do with the fact that I knew beforehand how the story of this remarkable ship ends, and I was putting off reading about the details.\n\nCouple this with the wonderful and charming way in which Palin (of Monty Pyt...","It's taken me all year, but I finally finished ""Erebus: The Story of a Ship"" by Michael Palin",1577836753
1,books,"Those that scroll through reddit AND read so many books, some of you read sooooo many, how do you find the time?",How do you find the time?,1577835261
2,books,What’s everyone’s reading goals for next year? I’d like to read 50 books and read more fiction.,2020 reading goals,1577834790
3,books,Has anyone read Cage of Souls and what did you think of it? Never read anything by Tchaikovsky before but if I like cage or Souls I'll try 'Children of Time' series. I haven't read cage of Souls yet so no spoilers please,Cage of Souls - Adrian Tchaikovsky,1577834642
4,books,"For the past few years, one of my New Year's resolutions was to read 100 books I had not previously read before. I had the same resolution for 2019, but this time I bothered to keep track of the books I had read (and finished). In the end, I managed to finish 58. However, I am happy with the result, since I can say I kept track of what I was re...",Tracking the Books I Read In 2019,1577834517


In [33]:
# save the data frame
data.to_csv('../data/data_reddit.csv', index=False)

---