In [5]:
import pandas as pd
import datetime as dt
import time
import praw
import matplotlib.pyplot as plt
import seaborn as sns
import config


Bad key "text.kerning_factor" on line 4 in
/opt/anaconda3/envs/reddit/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution


In [6]:
reddit = praw.Reddit(client_id=config.CLIENT_ID, \
                     client_secret=config.CLIENT_SECRET, \
                     user_agent=config.USER_AGENT, \
                     username=config.USERNAME, \
                     password=config.PASSWORD)

#### Utility functions: dataframes, time converter and timestamper

In [7]:
# dictionary to dataframe for better readability

def dict_to_frame(red_dict):
    """takes a dictionary with a 'created' key and
    outpurs a datagframe with human readable time"""
    frame = pd.DataFrame(red_dict)
    if 'created' in red_dict:
        frame['created'] = frame['created'].apply(get_date)
    return frame

In [8]:
# a function that adds a timestamp to a string
def timestamper():
    """returns a string of a timestamp: year, month, day"""
    timestamp = dt.datetime.now()
    return str(dt.date(timestamp.year, timestamp.month, timestamp.day))

In [9]:
def get_date(created):
    """takes a unix timestamp and converts it to date time"""
    return dt.datetime.fromtimestamp(created)

#### Author analysis: all subreddit authors, most popular authors, author details

In [10]:
# author comment extraction function 

def author_comments(author):
    """takes a reddit user name and 
    returns a dataframe of the 'new' the author's comments with score and creation date"""
    author_dict = {"body":[], "score":[], "created": []}
    for comment in reddit.redditor(author).comments.new(limit=None):
        author_dict['body'].append(comment.body),
        author_dict['score'].append(comment.score),
        author_dict['created'].append(comment.created)
    author_frame = pd.DataFrame(author_dict)
    author_frame['created'] = author_frame['created'].apply(get_date)
    return author_frame

In [35]:
# extracting all commentors of a 100 submission subset of a subreddit and their scores 

def subreddit_authors(data_frame):
    """takes a dataframe of submissions (needs 'id') in a subreddit, iterates over all comments in all submission,
    outputs a dataframe of authors and their number of comment & their accumulated score"""
    author_dict = {}
    # iterating over all submission in a subreddit dataframe
    for sub in data_frame['id']:
        submission = reddit.submission(sub)
        # iterating over all comments within a submission 
        submission.comments.replace_more(limit=None)
        for comment in submission.comments.list():
            if str(comment.author) in author_dict.keys():
                author_dict[str(comment.author)][0] += 1
                author_dict[str(comment.author)][1] += comment.score
            else:
                # creating a first entry if the author is not yet in the dict
                author_dict[str(comment.author)] = [1, comment.score]
    author_frame = pd.DataFrame(author_dict, index=['num_comments', 'sum_karma']).transpose()
    return author_frame

In [50]:
# a function to scrape the detailed data of the top 25 weekly redditors 

def top_redditor_scraper(redditor_list):
    """Takes a list of usernames and outputs
    a dictionary with author key and cake day, link and comment karma"""
    detail_dict = {}
    for author in redditor_list.index[:25]:
        try:
            created = get_date(reddit.redditor(author).created)
            link_karma = reddit.redditor(author).link_karma 
            comment_karma = reddit.redditor(author).comment_karma
            detail_dict[author] = [created, link_karma, comment_karma]
        except:
            detail_dict[author] = [None, None, None]
    detail_frame = pd.DataFrame(detail_dict, index=['created', 'link_karma', 'comment_karma'])
    return detail_frame

#### Subreddit & submission analysis: most popular submissions, all comments from an author 

In [13]:
def subreddit_submissions(subreddit):
    """takes a subreddit and outputs a
    dataframe with the 100 hottest posts"""
    topics_dict = {"title":[], "author": [], "score":[], "id":[], "url":[], "comms_num": [], \
                "created": [], "body":[]}
    for submission in subreddit.hot(limit=100):
        topics_dict["title"].append(submission.title)
        topics_dict["author"].append(submission.author)
        topics_dict["score"].append(submission.score)
        topics_dict["id"].append(submission.id)
        topics_dict["url"].append(submission.url)
        topics_dict["comms_num"].append(submission.num_comments)
        topics_dict["created"].append(submission.created)
        topics_dict["body"].append(submission.selftext)
    topic_frame = pd.DataFrame(topics_dict)
    return topic_frame

In [14]:
# creating a function that collects all comments from a submission

def submission_comments(submission):
    """takes a submission (reddit.submission) and outputs
    a dataframe of all comments in that submission"""
    comment_dict = {"body":[], "score":[], "author":[], "created": []}
    submission.comments.replace_more(limit=None)
    for comment in submission.comments.list():
        comment_dict['body'].append(comment.body),
        comment_dict['score'].append(comment.score),
        comment_dict['author'].append(comment.author),
        comment_dict['created'].append(comment.created)
    comment_frame = pd.DataFrame(comment_dict)
    comment_frame['created'] = comment_frame['created'].apply(get_date)
    comment_frame['author'] = comment_frame['author'].apply(lambda x: x.name)
    return comment_frame

In [15]:
de_sub = reddit.subreddit('de')

In [13]:
de_sub.accounts_active, de_sub.subscribers

(4277, 264878)

### r/dataisbeautiful analysis

In [56]:
test_redditor = reddit.redditor('DukeMactavish')

In [34]:
# get individual submission 

data_discussion_post = reddit.submission('gdbaiz')

In [70]:
# get all comments in a submission

discussion_comments = submission_comments(data_discussion_post)

In [65]:
discussion_comments['author'] = discussion_comments['author'].apply(lambda x: x.name)

In [68]:
discussion_comments.groupby('author')['author'].count().sort_values()

author
2134123412341234      1
pierre_x10            1
jtg123g               1
ilikemusicandgame     1
gints                 1
flurbius              1
fast_edo              1
dummy_thiqq           1
alsocomfy             1
Zarricaron            1
TradingToni           1
vikram201112018       1
TheNajeeb             1
Prudent-Gain101       1
Mildly_Upset_Toast    1
Halstrop              1
Electric_sheeples     1
DukeMactavish         1
Brittle_Panda         1
Azzozs                1
AVLien                1
yeuxwbbw              1
DinosaurAssassin      2
Barnst                2
ranginpanda           2
Puppies4Lovies        2
Name: author, dtype: int64

In [14]:
# get the subreddit

beautidata = reddit.subreddit('dataisbeautiful')

In [15]:
# step one: get all hot & all new submissions

hot_submissions = subreddit_submissions(beautidata)

In [17]:
# wrinting a dataframe to disk
#hot_submissions.to_csv(f'top100_beautifuldata_{timestamper()}.csv', header=True)

### Comparing dataframes of 2 consecutive days

In [17]:
# reading in the dataframes 

hot_10may = pd.read_csv('top100_beautifuldata_2020-05-09.csv', index_col=0)
hot_9may = pd.read_csv('top100_beautifuldata_2020-05-10.csv', index_col=0)

In [36]:
# getting the most popular redditors in r/dataisbeautiful hot posts 

top_data_authors = subreddit_authors(hot_10may)

In [42]:
most_negative = top_data_authors.sort_values(by='sum_karma')
most_positive = top_data_authors.sort_values(by='sum_karma', ascending=False)

In [54]:
most_negative

Unnamed: 0,num_comments,sum_karma
Sv_gravlty,2,-82
bitch6,2,-28
bscottlove,3,-27
GetPucked14,2,-25
ElectricShock,2,-22
...,...,...
JPAnalyst,27,1964
zipflop,2,2152
steevie265,1,2992
strumthebuilding,1,3446


In [41]:
# top_data_authors.to_csv(f'top100_top_redditors_{timestamper()}.csv', header=True)

In [46]:
negative_frame = top_redditor_scraper(most_negative)
positive_frame = top_redditor_scraper(most_positive)

In [49]:
negative_redditors = pd.DataFrame(negative_frame, index=['created', 'link_karma', 'comment_karma']).transpose()
positive_redditors = pd.DataFrame(positive_frame, index=['created', 'link_karma', 'comment_karma']).transpose()

In [57]:
positive_redditors.head()
#positive_redditors.to_csv(f'positive_redditors_{timestamper()}.csv', header=True)

Unnamed: 0,created,link_karma,comment_karma
zkgkilla,2016-01-18 06:50:34,4377,9819
strumthebuilding,2012-10-13 17:40:00,218,6675
steevie265,2018-07-11 05:40:43,38,3145
zipflop,2013-02-12 11:51:37,12676,25675
JPAnalyst,2018-02-15 04:56:39,27007,35139


In [58]:
negative_redditors.head()
#negative_redditors.to_csv(f'negative_redditors_{timestamper()}.csv', header=True)

Unnamed: 0,created,link_karma,comment_karma
Sv_gravlty,2019-04-19 09:27:39,1,1009
bitch6,2015-02-16 10:01:33,7,8108
bscottlove,2015-03-09 21:29:00,1,40
GetPucked14,2017-01-30 10:23:24,33,125
ElectricShock,2011-01-08 13:03:59,81,4799
