## Import Libraries

In [1]:
import praw
import pandas as pd
import datetime as dt
import sys
import json
from tqdm import tqdm
import os.path

## Functions

In [2]:
def get_date(created):
    """Convert a Unix epoch timestamp (in seconds) to a naive local-time ``datetime``."""
    as_datetime = dt.datetime.fromtimestamp(created)
    return as_datetime

## Credentials

In [3]:
# Enter your keys/secrets as strings in the following fields.
# Never commit real values to the notebook; keep them only in reddit_credentials.json.
# credentials = {}
# credentials['client_id'] = '<your-client-id>'
# credentials['client_secret'] = '<your-client-secret>'
# credentials['user_agent'] = 'Scraping_data'
# credentials['username'] = '<your-reddit-username>'
# credentials['password'] = '<your-reddit-password>'

In [4]:
# Save the credentials object to file
# with open("reddit_credentials.json", "w") as file:
#           json.dump(credentials, file)

In [5]:
# Read the Reddit API credentials back from the JSON file on disk
with open("reddit_credentials.json", "r") as file:
    creds = json.loads(file.read())

In [6]:
# Build the Reddit API client from the loaded credentials
reddit = praw.Reddit(**{
    field: creds[field]
    for field in ('client_id', 'client_secret', 'user_agent', 'username', 'password')
})

## Scrape Reddit Subreddits

In [7]:
# Group label -> list of subreddit names scraped for that group
subreddits = dict(
    republicans=['Republican'],
    democrats=['democrats'],
)

### Retrieve Posts and Comments

In [8]:
def pull_posts(reddit_instance, subreddits, limit_posts=100):
    """Collect the newest submissions of every subreddit in every group.

    Parameters
    ----------
    reddit_instance : praw.Reddit
        Client used for all requests (the function no longer falls back to a
        global ``reddit`` object).
    subreddits : dict
        Maps a group label to a list of subreddit names.
    limit_posts : int, optional
        Maximum number of newest submissions requested per subreddit.

    Returns
    -------
    pandas.DataFrame
        One row per kept submission with the columns ``created``, ``title``,
        ``score``, ``post_id``, ``subreddit_id``, ``subreddit``, ``author``,
        ``upvote_ratio``, ``body``, ``url``, ``num_comments`` and ``group``,
        plus ``timestamp`` (``created`` converted with ``get_date``).

    Notes
    -----
    Submissions that are banned, have no (deleted) author, or whose text is
    ``[deleted]`` / ``[removed]`` are skipped. The previous condition chained
    negated tests with ``or`` and therefore never skipped anything.
    """
    columns = ("created", "title", "score", "post_id", "subreddit_id",
               "subreddit", "author", "upvote_ratio", "body", "url",
               "num_comments", "group")
    subreddit_submissions_dict = {column: [] for column in columns}

    for group, subreddit_names in subreddits.items():
        for subreddit_name in subreddit_names:
            subreddit = reddit_instance.subreddit(subreddit_name)
            newest = subreddit.new(limit=limit_posts)
            for submission in tqdm(newest, total=limit_posts, file=sys.stdout):
                author = submission.author
                is_banned = submission.banned_by is not None
                # PRAW reports a deleted account as ``author is None``; the
                # string check is kept as a belt-and-braces guard.
                author_deleted = author is None or str(author).lower() == '[deleted]'
                text_gone = submission.selftext in ('[deleted]', '[removed]')
                if is_banned or author_deleted or text_gone:
                    continue

                # NOTE(review): fetch_comments stores ``created_utc`` while this
                # stores ``created`` -- confirm which timestamp is intended.
                subreddit_submissions_dict['created'].append(submission.created)
                subreddit_submissions_dict['title'].append(submission.title)
                subreddit_submissions_dict['score'].append(submission.score)
                subreddit_submissions_dict['post_id'].append(submission.id)
                subreddit_submissions_dict['subreddit_id'].append(submission.subreddit_id)
                subreddit_submissions_dict['subreddit'].append(submission.subreddit)
                subreddit_submissions_dict['author'].append(author)
                subreddit_submissions_dict['num_comments'].append(submission.num_comments)
                subreddit_submissions_dict['upvote_ratio'].append(submission.upvote_ratio)
                subreddit_submissions_dict['body'].append(submission.selftext)
                subreddit_submissions_dict['url'].append(submission.url)
                subreddit_submissions_dict['group'].append(group)

    subreddit_data = pd.DataFrame(subreddit_submissions_dict)
    # Add a datetime column derived from the raw epoch value
    _timestamp = subreddit_data["created"].apply(get_date)
    subreddit_data = subreddit_data.assign(timestamp=_timestamp)

    return subreddit_data

In [9]:
def fetch_comments(reddit_instance, postids):
    """Download every comment of the given submissions into one DataFrame.

    Parameters
    ----------
    reddit_instance : praw.Reddit
        Client used to look up each submission.
    postids : sized iterable of str
        Submission ids to fetch; ``len(postids)`` drives the progress bar.

    Returns
    -------
    pandas.DataFrame
        One row per kept comment with the columns ``created``, ``comment_id``,
        ``author``, ``body``, ``parent_id``, ``submission_id``, ``score``,
        ``subreddit`` and ``subreddit_id``. Comments whose body is
        ``[removed]`` or that have no author are skipped.
    """
    comments_dict = {
        "created": [],
        "comment_id": [],
        "author": [],
        "body": [],
        "parent_id": [],
        "submission_id": [],
        "score": [],
        "subreddit": [],
        "subreddit_id": []
    }

    # The total comes from the argument itself; the previous code read an
    # unrelated global named ``post_ids`` here.
    for postid in tqdm(postids, total=len(postids), file=sys.stdout):
        submission = reddit_instance.submission(postid)
        # Resolve all "load more comments" placeholders before flattening the tree
        submission.comments.replace_more(limit=None)
        for comment in submission.comments.list():
            if comment.body == "[removed]" or comment.author is None:
                continue
            comments_dict['created'].append(comment.created_utc)
            comments_dict['comment_id'].append(comment.id)
            comments_dict['author'].append(comment.author)
            comments_dict['body'].append(comment.body)
            comments_dict['parent_id'].append(comment.parent_id)
            comments_dict['submission_id'].append(postid)
            comments_dict['score'].append(comment.score)
            comments_dict['subreddit'].append(comment.subreddit)
            comments_dict['subreddit_id'].append(comment.subreddit_id)

    return pd.DataFrame(comments_dict)

In [10]:
# Reuse the cached posts when the CSV is present; otherwise scrape and cache them
if not os.path.exists('posts_group.csv'):
    # pull posts from the group of subreddits
    print("Pulling Subreddits!")
    limit_posts = 1000
    subreddit_data = pull_posts(reddit, subreddits=subreddits, limit_posts=limit_posts)
    subreddit_data.to_csv('posts_group.csv', index=False)
    print("Pulled Posts from Subreddits!")
else:
    print("Posts File Exists!")
    subreddit_data = pd.read_csv('posts_group.csv')
    print("Read File!")

Posts File Exists!
Read File!


In [11]:
# Load the cached comments when available; otherwise pull them per group and cache them
if os.path.exists("comments_group.csv"):
    print("Comments File Exists!")
    # Load the comments themselves: later cells read `users_df`, which the
    # previous version never defined on this branch (it re-read the posts CSV).
    users_df = pd.read_csv('comments_group.csv')
    print("Read File!")

else:
    print("Pulling Comments from each Post!")
    # make dictionary of dataframes for each group
    groups_posts = {}
    for i in subreddits:
        post_ids = subreddit_data.loc[subreddit_data.group == i].post_id.values
        groups_posts_df = fetch_comments(reddit, postids=post_ids)
        groups_posts[i] = groups_posts_df
    # make a dataframe of users for each post thread; the concat key becomes the `group` column
    users_df = (
        pd.concat(groups_posts, keys=groups_posts.keys())
        .reset_index()
        .rename({'level_0': 'group'}, axis="columns")
        .drop("level_1", axis=1)
    )
    users_df.to_csv('comments_group.csv', index=False)
    print("Pulled Comments from each Post!")

Pulling Comments from each Post!
100%|██████████| 985/985 [19:12<00:00,  1.32it/s]
100%|██████████| 976/976 [17:56<00:00,  1.03s/it]
Pulled Comments from each Post!


In [21]:
# make username dictionary for each group:
# distinct comment authors followed by distinct post authors
usernames = {
    group: (
        list(set(users_df.loc[users_df['group'] == group]['author'].values))
        + list(set(subreddit_data.loc[subreddit_data['group'] == group]['author'].values))
    )
    for group in subreddits
}

In [13]:
# Print a completion marker once this cell is reached
print("Done!")

Done!


## Rough

In [14]:
###Get Comments and count of comments in different subreddits of a Particular User 
# df_republican_posts=pd.read_csv("RepublicanPosts.csv.zip/RepublicanPosts.csv")
# df_republican_comments=pd.read_csv("RepublicanComments.csv.zip")
# df_democrats_posts=pd.read_csv("democratsPosts.csv.zip")
# df_democrats_comments=pd.read_csv("democratsComments.csv.zip")

In [15]:
# all_users=[]
# all_users.append(df_republican_posts['author'])
# all_users.append(df_republican_comments['author'])
# all_users.append(df_democrats_posts['author'])
# all_users.append(df_democrats_comments['author'])
# len(all_users)

In [None]:
# Column-oriented accumulator for user comments: one independent list per column
user_comments_dict = {
    column: [] for column in ("user_id", "user", "comment", "subreddit")
}

In [None]:
# Per-group user lists and comment counters, all starting empty / at zero
republican_users, democrat_users = [], []
count_democrats = count_republican = 0

In [None]:
# without user inclination
# NOTE(review): `all_users` is only built in a commented-out cell of this
# notebook, so it has to be defined before this cell can run.
for source_idx in range(0, 2):
    # entries 0-1 feed the republican list
    republican_users.extend(all_users[source_idx])

for source_idx in range(2, 4):
    # entries 2-3 feed the democrat list
    democrat_users.extend(all_users[source_idx])
        

In [None]:
# Number of entries collected in the democrat list
len(democrat_users)

In [None]:
# Number of entries collected in the republican list
len(republican_users)

In [None]:
# with user inclination
# Assign each author to the group whose subreddit they comment in more often.
# NOTE(review): `all_users` is only built in a commented-out cell of this
# notebook, so it has to be defined before this cell can run.
for i in range(0, 4):
    for author in all_users[i]:
        user = reddit.redditor(author)

        # Gather only THIS user's comments. The previous version kept appending
        # to the shared accumulator, so every user's frame (and every CSV
        # append) also contained all earlier users' comments.
        user_rows = {"user_id": [], "user": [], "comment": [], "subreddit": []}
        for c in user.comments.new(limit=None):
            user_rows['user_id'].append(user.id)
            user_rows['user'].append(user.name)
            user_rows['subreddit'].append(c.subreddit)
            user_rows['comment'].append(c.body)
            #user_rows['replies'].append(c.replies)

        # Keep the notebook-wide accumulator complete for later inspection
        for column, values in user_rows.items():
            user_comments_dict[column].extend(values)

        user_comments = pd.DataFrame(user_rows)
        user_comments.to_csv("UserComments.csv", mode='a', encoding='utf-8', header=False)

        # Recompute both tallies from scratch for every user so a count from
        # the previous user can never leak into this decision.
        subreddit_names = [str(name).lower() for name in user_rows['subreddit']]
        count_republican = subreddit_names.count('republican')
        count_democrats = subreddit_names.count('democrats')

        # Ties (including users with no comments in either subreddit) go to
        # the republican list, as before.
        if count_republican >= count_democrats:
            republican_users.append(user)
        else:
            democrat_users.append(user)
        
       

In [None]:
# count_df=user_comments.groupby('subreddit')['comment'].count().reset_index(name='counts')
# newdf=count_df[count_df['subreddit']=='Republican']
# print newdf
# n=newdf['counts'].to_frame()
# int(newdf['counts'])

In [None]:
# user = reddit.redditor('tall_bacon')

In [None]:
# print user
# print user.id
# print user.fullname

In [None]:
# for c in user.comments.new(limit=None):
#     user_comments_dict['user_id'].append(user.id)
#     user_comments_dict['user'].append(user.name)
#     user_comments_dict['subreddit'].append(c.subreddit)
#     user_comments_dict['comment'].append(c.body)
#     #user_comments_dict['replies'].append(c.replies)

In [None]:
# user_comments = pd.DataFrame(user_comments_dict)
# user_comments.head(10)

In [None]:
# len(user_comments)

In [None]:
# count_df=user_comments.groupby('subreddit')['comment'].count()
# count_df=count_df.reset_index(name='counts')

In [None]:
# type(count_df)#.loc[1:3,1:2]

In [None]:
# newdf=count_df[count_df['subreddit']=='Republican']
# len(newdf)

In [None]:
# count=newdf['counts'][1]
# count

In [None]:
# count_df.loc[count_df['subreddit'] == 'The_Donald']

In [None]:
# using Redittor.Stream
#for c in user.stream.comments():
#    print c.body
#    print "~"

### List of subscribers to a particular subreddit

In [None]:
# Handle for the 'Republican' subreddit (despite the variable name, this is the
# object returned by reddit.subreddit, not a string)
subreddit_name = reddit.subreddit('Republican')

In [None]:
subreddit_name.subscribers  # the names of the subscribers are private