In [1]:
import pandas as pd
import datetime as dt
import time
import praw
import matplotlib.pyplot as plt
import seaborn as sns
import config


Bad key "text.kerning_factor" on line 4 in
/opt/anaconda3/envs/reddit/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution


In [2]:
reddit = praw.Reddit(client_id=config.CLIENT_ID, \
                     client_secret=config.CLIENT_SECRET, \
                     user_agent=config.USER_AGENT, \
                     username=config.USERNAME, \
                     password=config.PASSWORD)

#### Utility functions: dataframes, time converter and timestamper

In [3]:
# dictionary to dataframe for better readability

def dict_to_frame(red_dict):
    """takes a dictionary with a 'created' key and
    outpurs a datagframe with human readable time"""
    frame = pd.DataFrame(red_dict)
    if 'created' in red_dict:
        frame['created'] = frame['created'].apply(get_date)
    return frame

In [4]:
# a function that adds a timestamp to a string
def timestamper():
    """returns a string of a timestamp: year, month, day"""
    timestamp = dt.datetime.now()
    return str(dt.date(timestamp.year, timestamp.month, timestamp.day))

In [72]:
def get_date(created):
    """takes a unix timestamp and converts it to date time"""
    return dt.datetime.fromtimestamp(created)

#### Author analysis: all subreddit authors, most popular authors, author details

In [6]:
# author comment extraction function 

def author_comments(author):
    """takes a reddit user name and 
    returns a dataframe of the 'new' the author's comments with score and creation date"""
    author_dict = {"body":[], "score":[], "created": []}
    for comment in reddit.redditor(author).comments.new(limit=None):
        author_dict['body'].append(comment.body),
        author_dict['score'].append(comment.score),
        author_dict['created'].append(comment.created)
    author_frame = pd.DataFrame(author_dict)
    author_frame['created'] = author_frame['created'].apply(get_date)
    return author_frame

In [7]:
# extracting all commentors of a 100 submission subset of a subreddit and their scores 

def subreddit_authors(data_frame):
    """takes a dataframe of submissions (needs 'id') in a subreddit, iterates over all comments in all submission,
    outputs a dictionary of authors and their number of comment & their accumulated score"""
    author_dict = {}
    # iterating over all submission in a subreddit dataframe
    for sub in data_frame['id'][:1]:
        submission = reddit.submission(sub)
        # iterating over all comments within a submission 
        submission.comments.replace_more(limit=None)
        for comment in submission.comments.list():
            if str(comment.author) in author_dict.keys():
                author_dict[str(comment.author)][0] += 1
                author_dict[str(comment.author)][1] += comment.score
            else:
                # creating a first entry if the author is not yet in the dict
                author_dict[str(comment.author)] = [1, comment.score]
    return author_dict

In [8]:
# a function to scrape the detailed data of the top 25 weekly redditors 

def top_redditor_scraper(redditor_list):
    """Takes a list of usernames and outputs
    a dictionary with author key and cake day, link and comment karma"""
    detail_dict = {}
    for author in top_reds.index[:25]:
        try:
            created = get_date(reddit.redditor(author).created)
            link_karma = reddit.redditor(author).link_karma 
            comment_karma = reddit.redditor(author).comment_karma
            detail_dict[author] = [created, link_karma, comment_karma]
        except:
            detail_dict[author] = [None, None, None]
    return detail_dict

#### Subreddit & submission analysis: most popular submissions, all comments from an author 

In [9]:
def subreddit_submissions(subreddit):
    """takes a subreddit and outputs a
    dataframe with the 100 hottest posts"""
    topics_dict = {"title":[], "author": [], "score":[], "id":[], "url":[], "comms_num": [], \
                "created": [], "body":[]}
    for submission in subreddit.hot(limit=100):
        topics_dict["title"].append(submission.title)
        topics_dict["author"].append(submission.author)
        topics_dict["score"].append(submission.score)
        topics_dict["id"].append(submission.id)
        topics_dict["url"].append(submission.url)
        topics_dict["comms_num"].append(submission.num_comments)
        topics_dict["created"].append(submission.created)
        topics_dict["body"].append(submission.selftext)
    topic_frame = pd.DataFrame(topics_dict)
    return topic_frame

In [71]:
# creating a function that collects all comments from a submission

def submission_comments(submission):
    """takes a submission (reddit.submission) and outputs
    a dataframe of all comments in that submission"""
    comment_dict = {"body":[], "score":[], "author":[], "created": []}
    submission.comments.replace_more(limit=None)
    for comment in submission.comments.list():
        comment_dict['body'].append(comment.body),
        comment_dict['score'].append(comment.score),
        comment_dict['author'].append(comment.author),
        comment_dict['created'].append(comment.created)
    comment_frame = pd.DataFrame(comment_dict)
    comment_frame['created'] = comment_frame['created'].apply(get_date)
    comment_frame['author'] = comment_frame['author'].apply(lambda x: x.name)
    return comment_frame

In [11]:
de_sub = reddit.subreddit('de')

In [13]:
de_sub.accounts_active, de_sub.subscribers

(4277, 264878)

### r/dataisbeautiful analysis

In [56]:
test_redditor = reddit.redditor('DukeMactavish')

In [34]:
# get individual submission 

data_discussion_post = reddit.submission('gdbaiz')

In [70]:
# get all comments in a submission

discussion_comments = submission_comments(data_discussion_post)

In [65]:
discussion_comments['author'] = discussion_comments['author'].apply(lambda x: x.name)

In [68]:
discussion_comments.groupby('author')['author'].count().sort_values()

author
2134123412341234      1
pierre_x10            1
jtg123g               1
ilikemusicandgame     1
gints                 1
flurbius              1
fast_edo              1
dummy_thiqq           1
alsocomfy             1
Zarricaron            1
TradingToni           1
vikram201112018       1
TheNajeeb             1
Prudent-Gain101       1
Mildly_Upset_Toast    1
Halstrop              1
Electric_sheeples     1
DukeMactavish         1
Brittle_Panda         1
Azzozs                1
AVLien                1
yeuxwbbw              1
DinosaurAssassin      2
Barnst                2
ranginpanda           2
Puppies4Lovies        2
Name: author, dtype: int64

In [14]:
# get the subreddit

beautidata = reddit.subreddit('dataisbeautiful')

In [15]:
# step one: get all hot & all new submissions

hot_submissions = subreddit_submissions(beautidata)

In [17]:
# wrinting a dataframe to disk
#hot_submissions.to_csv(f'top100_beautifuldata_{timestamper()}.csv', header=True)

### Comparing dataframes of 2 consecutive days

In [18]:
# reading in the dataframes 

hot_10may = pd.read_csv('top100_beautifuldata_2020-05-09.csv', index_col=0)
hot_9may = pd.read_csv('top100_beautifuldata_2020-05-10.csv', index_col=0)

In [19]:
hot_10may.head()

Unnamed: 0,title,author,score,id,url,comms_num,created,body
0,[Topic][Open] Open Discussion Monday — Anybody...,AutoModerator,25,gdbaiz,https://www.reddit.com/r/dataisbeautiful/comme...,30,2020-05-04 23:08:17,Anybody can post a Dataviz-related question or...
1,[Battle] Data Viz Battle Winners of April 2020!,Brittle_Panda,27,ge4n8r,https://www.reddit.com/r/dataisbeautiful/comme...,4,2020-05-06 05:47:06,[Hello there](https://i.imgur.com/3QeRlCQ.jpg)...
2,[OC] The remarkable decline in child mortality...,toddrjones,30147,gg5m68,https://i.redd.it/nulc5rai0nx41.gif,1048,2020-05-09 10:54:29,
3,[OC] I made a lyrical analysis & statistics da...,mochizuki,812,ggdvsg,https://i.redd.it/jao1aoru5qx41.png,67,2020-05-09 21:29:10,
4,[OC] Visualizing of the most used words in hea...,dudumm,521,ggcxx2,https://i.redd.it/6mygbuesrpx41.png,46,2020-05-09 20:10:04,


In [24]:
hot_9may.head()

Unnamed: 0,title,author,score,id,url,comms_num,created,body
0,[Topic][Open] Open Discussion Monday — Anybody...,AutoModerator,30,gdbaiz,https://www.reddit.com/r/dataisbeautiful/comme...,33,1588626000.0,Anybody can post a Dataviz-related question or...
1,[Battle] Data Viz Battle Winners of April 2020!,Brittle_Panda,35,ge4n8r,https://www.reddit.com/r/dataisbeautiful/comme...,4,1588737000.0,[Hello there](https://i.imgur.com/3QeRlCQ.jpg)...
2,[OC] life expectancy over last 65 years,karthikvcp,5433,gguaz1,https://v.redd.it/zin787no0vx41,276,1589111000.0,
3,[OC] Every Subgenre of Music,reteps144,4758,ggmull,https://i.redd.it/b9uekzkjssx41.jpg,524,1589084000.0,
4,[OC] I made a lyrical analysis & statistics da...,mochizuki,16362,ggdvsg,https://i.redd.it/jao1aoru5qx41.png,648,1589053000.0,


In [25]:
duplicates = [i for i in hot_10may['title'].values if i in hot_9may['title'].values]

In [26]:
duplicates

['[Topic][Open] Open Discussion Monday — Anybody can post a general visualization question or start a fresh discussion!',
 '[Battle] Data Viz Battle Winners of April 2020!',
 '[OC] The remarkable decline in child mortality, where each dot is a country.',
 '[OC] I made a lyrical analysis & statistics database for 50 hiphop artists as a text mining exercise. Here is Mac Miller',
 '[OC] Visualizing of the most used words in headlines of r/floridaman except florida man',
 '[OC] Minnesota Topography',
 '[OC] My experience with cheating in the upper echelons of Counter-Strike: Global Offensive',
 'Colours in Bible (in order) [OC]',
 '[OC] Contiguous US and the continental shelf, elevation along latitudes between 20°N and 50°N',
 '[OC] Every color in Moby-Dick (in order)',
 '[OC] It took 5 years to build the number of jobs lost in only 4 months from COVID-19 in Canada',
 '[OC] Countries with Lowest Birth Rates (1900-2020+) - Shows Worldwide Birth Rates on the Decline',
 'Quarantine project: t

In [27]:
len(duplicates)

71