In [1]:
# Widen the notebook's main container to 95% of the browser window.
# `display` and `HTML` come from the public IPython.display module; importing
# them from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

# Scrape Reddit Comments for a Sentiment Analysis - Walkthrough
### This tutorial was adapted from a number of sources including: http://www.storybench.org/how-to-scrape-reddit-with-python/ and https://towardsdatascience.com/scraping-reddit-data-1c0af3040768

In [None]:
# Import all the necessary libraries
import praw  # Import the Praw library: https://praw.readthedocs.io/en/latest/code_overview/reddit_instance.html
import pandas as pd  # Import Pandas library: https://pandas.pydata.org/
import datetime as dt  # Import datetime library
import matplotlib.pyplot as plt  # Import Matplot lib for plotting

In [None]:
# PRAW (the Python Reddit API Wrapper) is the client used to talk to Reddit.
# The three empty strings are placeholders for the client settings.
praw_settings = {
    'client_id': '',
    'client_secret': '',
    'user_agent': '',
}
reddit = praw.Reddit(**praw_settings)

### We will begin by viewing the top 100 posts from the 'front page' of '/r/all' within the last month.

In [None]:
# Define the listing of interest: the top 100 posts under 'all' from the past month.
# The time period is passed as `time_filter=` because newer PRAW releases
# deprecate positional arguments to `top`.
subreddit = reddit.subreddit('all').top(time_filter='month', limit=100)

In [None]:
# Print every post's title followed by its score, one post per line.
for top_post in subreddit:
    print(top_post.title, top_post.score)

### Kind of hard to read.  Let's add some more information and clean it up a bit.

In [None]:
# Define the listing of interest again: the top 100 posts under 'all' from the
# past month (presumably re-created because the printing loop above has already
# iterated over the earlier listing).
# The time period is passed as `time_filter=` because newer PRAW releases
# deprecate positional arguments to `top`.
subreddit = reddit.subreddit('all').top(time_filter='month', limit=100)

In [None]:
# Column-oriented container for the post data: every key becomes a column of
# the pandas table and every list collects one value per post.
topics_dict = {
    "title": [],
    "score": [],
    "id": [],
    "url": [],
    "comms_num": [],
    "created": [],
    "body": [],
}

In [None]:
# Copy the fields of interest from every post into the matching column list.
# Each pair is (column name in topics_dict, attribute read from the post).
column_sources = (
    ("title", "title"),
    ("score", "score"),
    ("id", "id"),
    ("url", "url"),
    ("comms_num", "num_comments"),
    ("created", "created"),
    ("body", "selftext"),
)
for post in list(subreddit):
    for column, attribute in column_sources:
        topics_dict[column].append(getattr(post, attribute))

In [None]:
# Build a pandas DataFrame from the column lists (one column per key).
topics_data = pd.DataFrame(data=topics_dict)
topics_data  # Display the resulting table

In [None]:
# The 'created' column holds Unix timestamps. Cast it to integers, then build a
# list of datetime objects, one per row, in row order.
topics_data['created'] = topics_data['created'].astype(int)
ts = [
    # fromtimestamp interprets the value in the machine's local time zone
    dt.datetime.fromtimestamp(topics_data['created'][row_label])
    for row_label in topics_data.index
]

In [None]:
# DataFrame.assign returns a new frame and leaves topics_data untouched, so the
# converted copy is kept under its own name instead of being discarded after
# display. topics_data keeps its integer timestamps.
topics_readable = topics_data.assign(created=ts)
topics_readable  # Show the table with human-readable creation times

### That's better to look at.  We can clearly see the top 100 posts within the last month from the '/r/all' front page. Now we want to pull the top-level comments (first comments) found in the top 100 posts within the last month from the '/r/all' front page.  These top-level comments will be used as our baseline sentiment.

In [None]:
# Collect the top-level comments of the top 100 posts under 'all' from the past
# month. `time_filter` is passed by keyword because newer PRAW releases
# deprecate positional arguments to `top`.
baseline_subreddit = reddit.subreddit('all').top(time_filter='month', limit=100)
comments_all_dict = {"id": [],  # comment IDs
    "comments": []}  # comment texts, in the same order as the IDs
for post in baseline_subreddit:
    # Look the submission up by its ID string rather than passing the
    # Submission object itself as the `id` argument.
    submission = reddit.submission(id=post.id)
    # limit=0 removes every "load more comments" / "continue this thread"
    # placeholder without fetching it, so only already-loaded comments remain.
    submission.comments.replace_more(limit=0)
    for top_level_comment in submission.comments:
        comments_all_dict["id"].append(top_level_comment.id)  # Save the comment ID
        comments_all_dict["comments"].append(top_level_comment.body)  # Save the comment text

In [None]:
# View the collected comments. At this point they are still a plain dictionary
# holding two lists ("id" and "comments"), not a pandas table.
comments_all_dict

In [None]:
# Turn the comment dictionary into a DataFrame: build it with one row per key,
# then transpose so that 'id' and 'comments' become the columns.
comments_by_key = pd.DataFrame.from_dict(comments_all_dict, orient='index')
comments_base_data = comments_by_key.transpose()
comments_base_data

In [None]:
# Prepare the comments for a sentiment analysis: split every comment on
# whitespace and print each resulting word after trimming punctuation.
for comment_text in comments_base_data.comments:  # one comment at a time
    for token in comment_text.split():  # individual words of that comment
        # trim extraneous leading/trailing characters, then drop newlines
        token = token.strip('?:!.,;"!@()#-').replace("\n", "")
        print(token)

### Now we will use the sentiment file called AFINN-en-165.txt.  This file contains a sentiment score for 3382 words.  More information can be found here: https://github.com/fnielsen/afinn.  With the sentiment file we will assign scores to the words within the top comments that are found in the AFINN file.

In [None]:
# Create `sentiments`, a tally with one zeroed bucket per score from -5 to 5,
# and load the AFINN lexicon into `scores` (word -> integer sentiment score).
sentiments = {str(value): 0 for value in range(-5, 6)}
scores = {}
# The `with` block closes the file once the loop is done. The encoding is given
# explicitly so decoding does not depend on the platform default.
# NOTE(review): assumes the lexicon file is UTF-8 encoded - confirm.
with open("AFINN-en-165.txt", "r", encoding="utf-8") as sentimentfile:
    for line in sentimentfile:  # one "word<TAB>score" entry per line
        if not line.strip():
            continue  # a blank line cannot be unpacked into word and score
        word, score = line.split("\t")  # file is tab-delimited
        scores[word] = int(score)  # int() ignores the trailing newline

In [None]:
# Tally how often each AFINN score occurs among the words of the baseline comments.
# The buckets are zeroed first so that re-running this cell does not add to an
# earlier tally.
sentiments = {str(value): 0 for value in range(-5, 6)}
for comment_text in comments_base_data.comments:  # one comment at a time
    # split() breaks on any whitespace, so the words contain no newlines
    for raw_word in comment_text.split():
        word = raw_word.strip('?:!.,;"!@()#-')  # trim extraneous characters
        if word in scores:  # only words present in the sentiment lexicon count
            score = scores[word]  # look up the word's sentiment score
            sentiments[str(score)] += 1  # add one to that score's bucket

In [None]:
# Print the tally, one line per sentiment value from -5 up to 5.
for value in range(-5, 6):
    key = str(value)
    # right-align the value in two characters so the labels line up
    print(f"{key.rjust(2)} sentiments ", sentiments[key])

In [None]:
# Put the sentiment tally into a two-column DataFrame: the sentiment value and
# the number of matched words that carried it.
score_values = list(range(-5, 6))
senti_base = pd.DataFrame({
    'Value': score_values,
    'Count': [sentiments[str(value)] for value in score_values],
})
senti_base

### We will plot the data so it is easier to visualize.  

In [None]:
# Bar chart of the baseline tally: sentiment value on x, word count on y.
fig, ax = plt.subplots()
ax.bar(senti_base['Value'], senti_base['Count'], color='grey')
ax.set_xlabel('Sentiment Value')
ax.set_ylabel('Sentiment Count')
ax.set_title('Baseline Reddit Sentiment Analysis')
plt.show()

### Now that we have baseline comment sentiment data we will build a sentiment file containing sentiment analysis from a specific subreddit.  Below, I picked the /r/aww subreddit, a subreddit for cute and cuddly pictures :-). 

In [None]:
# Repeat the analysis for one specific subreddit and plot its sentiment tally.
# Pick the subreddit, the listing (top or hot posts), the time period and the
# number of posts here. `time_filter` is passed by keyword because newer PRAW
# releases deprecate positional arguments to `top`.
search_subreddit = reddit.subreddit('aww').top(time_filter='month', limit=100)
comments_subreddit = {"id": [], "comments": []}  # Create a dictionary for subreddit comments
for post in search_subreddit:
    # Look the submission up by its ID string rather than passing the
    # Submission object itself as the `id` argument.
    submission = reddit.submission(id=post.id)
    # limit=0 removes every "load more comments" / "continue this thread"
    # placeholder without fetching it, so only already-loaded comments remain.
    submission.comments.replace_more(limit=0)
    for top_level_comment in submission.comments:
        comments_subreddit["id"].append(top_level_comment.id)  # Save the comment ID
        comments_subreddit["comments"].append(top_level_comment.body)  # Save the comment text

comments_subreddit_data = pd.DataFrame.from_dict(comments_subreddit, orient='index').T  # Create a dataframe for the subreddit comments

# Load the AFINN lexicon (word -> integer score). The `with` block closes the
# file once the loop is done, and the explicit encoding keeps decoding
# independent of the platform default.
# NOTE(review): assumes the lexicon file is UTF-8 encoded - confirm.
scores = {}
with open("AFINN-en-165.txt", "r", encoding="utf-8") as sentimentfile:
    for line in sentimentfile:  # one "word<TAB>score" entry per line
        if not line.strip():
            continue  # a blank line cannot be unpacked into word and score
        word, score = line.split("\t")  # file is tab-delimited
        scores[word] = int(score)  # int() ignores the trailing newline

# Start from zeroed buckets: without this reset the subreddit tally would be
# added on top of whatever `sentiments` already holds from the baseline run.
sentiments = {str(value): 0 for value in range(-5, 6)}
for comment_text in comments_subreddit_data.comments:  # one comment at a time
    # split() breaks on any whitespace, so the words contain no newlines
    for raw_word in comment_text.split():
        word = raw_word.strip('?:!.,;"!@()#-')  # trim extraneous characters
        if word in scores:  # only words present in the sentiment lexicon count
            score = scores[word]  # look up the word's sentiment score
            sentiments[str(score)] += 1  # add one to that score's bucket

subreddit_senti = pd.DataFrame(sentiments, index=['Count']).T.reset_index()  # Convert the sentiment dictionary to a data frame, transpose the data, and reset the index
subreddit_senti['Value'] = [-5,-4,-3,-2,-1,0,1,2,3,4,5]  # add a score column
subreddit_senti = subreddit_senti[['Value', 'Count']]  # Reorder the columns

plt.bar(subreddit_senti['Value'], subreddit_senti['Count'], color='blue')  # plot the data x-values, y-values, color
plt.xlabel('Sentiment Value')  # add x-label
plt.ylabel('Sentiment Count')  # add y-label
plt.title('SubReddit Sentiment Analysis')  # add title
plt.show()

### Now we will overlay the baseline comment sentiment and the subreddit comment sentiment to help compare.

In [None]:
# Draw both tallies on one set of axes. The subreddit bars go first, shifted
# 0.2 to the right, and the baseline bars are drawn afterwards at the
# unshifted positions.
fig, ax = plt.subplots()
ax.bar(subreddit_senti['Value'] + 0.2, subreddit_senti['Count'],
       color='blue', label='Sub Reddit')
ax.bar(senti_base['Value'], senti_base['Count'],
       color='grey', label='Base Reddit')
ax.legend()

ax.set_xlabel('Sentiment Value')
ax.set_ylabel('Sentiment Count')
ax.set_title('Reddit Sentiment Analysis')
fig.tight_layout()  # tidy the spacing around the axes
plt.show()

## Is this an accurate representation of the data?  What are we missing?  

## Let us normalize the data and replot.

In [None]:
# Add a 'Normalized' column to senti_base: each value's count divided by the
# total of all counts.
baseline_total = senti_base['Count'].sum()
senti_base['Normalized'] = senti_base['Count'] / baseline_total
senti_base

In [None]:
# Add a 'Normalized' column to subreddit_senti: each value's count divided by
# the total of all counts.
subreddit_total = subreddit_senti['Count'].sum()
subreddit_senti['Normalized'] = subreddit_senti['Count'] / subreddit_total
subreddit_senti

In [None]:
# Draw the normalized tallies on one set of axes. The subreddit bars go first,
# shifted 0.2 to the right, and the baseline bars are drawn afterwards at the
# unshifted positions.
fig, ax = plt.subplots()
ax.bar(subreddit_senti['Value'] + 0.2, subreddit_senti['Normalized'],
       color='b', label='Sub Reddit')
ax.bar(senti_base['Value'], senti_base['Normalized'],
       color='grey', label='Base Reddit')
ax.legend()

ax.set_xlabel('Sentiment Value')
ax.set_ylabel('Normalized Count')
ax.set_title('Reddit Sentiment Analysis')
fig.tight_layout()  # tidy the spacing around the axes
plt.show()