## Libraries

In [1]:
import praw
import pandas as pd
import datetime as dt
import json
from bs4 import BeautifulSoup
from reddit_helpers.text_processor import reddit_text_preprocessing
import re
import nltk
from tqdm import tqdm
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
import os.path

## Get Cred

In [2]:
# Load credentials from json file\n
with open("reddit_credentials.json", "r") as file:
    creds = json.load(file)

In [3]:
creds

{'client_id': 'Tt3sc9zHX1U4Pg',
 'client_secret': 'Tl_rWZZtVo0k46FFkM2i0BBCWQM',
 'user_agent': 'Scraping_data',
 'username': '311Sheetal',
 'password': 'Reddit'}

In [4]:
reddit = praw.Reddit(client_id = creds['client_id'],
                     client_secret = creds['client_secret'],
                     user_agent = creds['user_agent'],
                     username = creds['username'],
                     password = creds['password'])

## Helper Functions

In [5]:
def get_date(created):
    return dt.datetime.fromtimestamp(created)

In [24]:
def fetch_comments(reddit_instance, pd_posts, users):
    
    comments_dict = {
        "created": [],
        "comment_id": [],
        "author": [],
        "body": [],
        "parent_id":[],
        "submission_id":[],
        "score":[],
        "subreddit":[],
        "subreddit_id":[],
        "submission_group":[],
        "comment_group":[]
    }

#     submission = reddit_instance.submission(list(pd_posts['post_id'].values))
    for postid in tqdm(list(pd_posts['post_id'].values)):
        submission = reddit_instance.submission(postid)
        submission.comments.replace_more(limit=None)
        for comment in submission.comments.list():
            if comment is not None:
                if comment.author is not None:
                    if comment.author.name != "AutoModerator":
                        if comment.author.name in users['user'].values:
                            comments_dict['created'].append(comment.created_utc)
                            comments_dict['comment_id'].append(comment.id)
                            comments_dict['author'].append(comment.author)
                            comments_dict['body'].append(comment.body)
                            comments_dict['parent_id'].append(comment.parent_id)
                            comments_dict['submission_id'].append(postid)
                            comments_dict['score'].append(comment.score)
                            comments_dict['subreddit'].append(comment.subreddit)
                            comments_dict['subreddit_id'].append(comment.subreddit_id)
                            comments_dict['submission_group'].append(pd_posts.loc[pd_posts['post_id']==postid]['group'].values)
                            comments_dict['comment_group'].append(users.loc[users['user']==comment.author.name]['subreddit'].values)
                        
    comments_info = pd.DataFrame(comments_dict)
    comments_info.comment_group = comments_info.comment_group.apply(lambda x: x[0])
    comments_info.submission_group = comments_info.submission_group.apply(lambda x: x[0])
    
    return comments_info

## Get user ids for each group

In [7]:
users = pd.read_csv('./user_groups.csv')

In [8]:
conflict_subreddits = ['politics', 
                       'politicaldiscussion',
                       'politicalfactchecking',
                       'neutralpolitics',
                       'moderatepolitics',
                       'centrist',
                       'ask_Politics']

In [9]:
if os.path.exists("submissions_data.csv"):
    print("Submissions File Exists!")
    subreddit_data = pd.read_csv('submissions_data.csv')
    print("Read File!")

else:
    a = 0 
    subreddit_submissions_dict = {"created":[],
                             "title":[],
                             "score":[],
                             "post_id": [],
                             "subreddit_id": [],
                             "subreddit" : [],
                             "author" : [],
                             "title":[],
                             "upvote_ratio": [],
                             "body": [],
                             "url": [],
                             "num_comments":[],
                             "group": []}

    for i in tqdm(users.iterrows()):
        user = reddit.redditor(i[1]['user'])
        try:
            id = user.id
        except:
            continue

        for submission in user.submissions.new(limit=None):
            if (not submission.banned_by is None) or (not submission.author is '[Deleted]') or (not submission.selftext == '[deleted]') or (not submission.selftext == '[removed]'):

                if ' '.join([ word.strip().lower() for word in submission.subreddit.display_name.split()]) in conflict_subreddits:
                    subreddit_submissions_dict['created'].append(submission.created)
                    subreddit_submissions_dict['title'].append(submission.title)
                    subreddit_submissions_dict['score'].append(submission.score)
                    subreddit_submissions_dict['post_id'].append(submission.id)
                    subreddit_submissions_dict['subreddit_id'].append(submission.subreddit_id)
                    subreddit_submissions_dict['subreddit'].append(submission.subreddit)
                    subreddit_submissions_dict['author'].append(submission.author)
                    subreddit_submissions_dict['num_comments'].append(submission.num_comments)
                    subreddit_submissions_dict['upvote_ratio'].append(submission.upvote_ratio)
                    subreddit_submissions_dict['body'].append(submission.selftext)
                    subreddit_submissions_dict['url'].append(submission.url)
                    subreddit_submissions_dict['group'].append(i[1]['subreddit'])

    #     if a == 50:
    #         break
    #     a+=1

    subreddit_data = pd.DataFrame(subreddit_submissions_dict)
    _timestamp = subreddit_data["created"].apply(get_date)
    subreddit_data = subreddit_data.assign(timestamp = _timestamp)

3379it [6:55:38, 11.53s/it]


In [10]:
print("Done!")

Done!


## Load Comments of a Post

In [30]:
#subreddit_data.to_csv("./submissions_data.csv", index = False)

In [11]:
pd_posts = subreddit_data[['post_id', 'group']]

In [45]:
comments_info = fetch_comments(reddit, pd_posts, users)

100%|██████████| 9575/9575 [10:59:25<00:00,  6.29s/it]


In [47]:
comments_info.to_csv("./final_comments.csv", index = False)

In [46]:
print("Done!")

Done!


In [48]:
_timestamp = comments_info["created"].apply(get_date)
comments_info = comments_info.assign(timestamp = _timestamp)

In [49]:
# check linked users
comments_info['linked_users'] = comments_info['body'].apply(lambda x: re.findall('/u/[A-Za-z0-9_-]+',x))
# check linked subreddits
comments_info['linked_subreddits'] = comments_info['body'].apply(lambda x: re.findall('r/[A-Za-z0-9_-]+',x))

# remove numbers etc
comments_info['processed_body'] = comments_info['body'].str.replace("[^a-zA-Z#]", " ")

In [50]:
# process text
comments_info['processed_body'] = comments_info['processed_body'].apply(lambda x: reddit_text_preprocessing(x).replace_abbreviations().remove_short_words().lower_case().process_html().remove_urls().decode_text().stopwords_remove().stopwords_remove().lemmatize().text)

In [51]:
## Use google language api

# def gc_sentiment(text, credentials):  
#     from google.cloud import language
    
#     client = language.LanguageServiceClient(credentials = credentials)
#     document = language.types.Document(
#             content=text,
#             type=language.enums.Document.Type.PLAIN_TEXT)
#     annotations = client.analyze_sentiment(document=document)
#     score = annotations.document_sentiment.score
#     magnitude = annotations.document_sentiment.magnitude
#     return score, magnitude

# import os
# from google.oauth2 import service_account
# credentials = service_account.Credentials.from_service_account_file('./ecbm4040-up2138-b37eacd8e36c.json')
# print('Credendtials from environ: {}'.format(os.environ.get('GOOGLE_APPLICATION_CREDENTIALS')))

# gc_results = [gc_sentiment(row, credentials) for row in tqdm(comments_info['processed_body'])]
# gc_score, gc_magnitude = zip(*gc_results) # Unpacking the result into 2 lists
# gc = list(zip(comments_info['processed_body'], gc_score, gc_magnitude))
# columns = ['text', 'score', 'magnitude']
# gc_df = pd.DataFrame(gc, columns = columns)

# gc_df

In [62]:
conflicting_comments = comments_info
conflicting_comments = conflicting_comments[conflicting_comments.apply(lambda x: x['author'].name != "AutoModerator", axis=1)]
conflicting_comments = conflicting_comments.reset_index()

In [64]:
sia = SIA()
conflicting_comments['compound_sentiment'] = conflicting_comments.apply(lambda x: sia.polarity_scores(x['body'])['compound'], axis = 1)
conflicting_comments['negative_sentiment'] = conflicting_comments.apply(lambda x: sia.polarity_scores(x['body'])['neg'], axis = 1)
conflicting_comments['positive_sentiment'] = conflicting_comments.apply(lambda x: sia.polarity_scores(x['body'])['pos'], axis = 1)
conflicting_comments['neutral_sentiment'] = conflicting_comments.apply(lambda x: sia.polarity_scores(x['body'])['neu'], axis = 1)

In [65]:
conflicting_comments.head()

Unnamed: 0,index,created,comment_id,author,body,parent_id,submission_id,score,subreddit,subreddit_id,submission_group,comment_group,timestamp,linked_users,linked_subreddits,processed_body,compound_sentiment,negative_sentiment,positive_sentiment,neutral_sentiment
0,0,1432870000.0,crofq47,ljrdxyh,Should win Iowa and probably wins New Hampshir...,t1_crofp5c,37o59g,3,politics,t5_2cneq,Republican,Republican,2015-05-28 23:21:19,[],[],iowa probably win hampshire real test signific...,0.8176,0.0,0.429,0.571
1,1,1432868000.0,croeoyz,ljrdxyh,"Well - the people have been there all along, R...",t1_croe8j8,37o59g,5,politics,t5_2cneq,Republican,Republican,2015-05-28 22:48:07,[],[],well peoriginal posterle along rand correctly ...,-0.7096,0.249,0.065,0.685
2,2,1432868000.0,croen90,ljrdxyh,His record? How so?,t1_croeh7y,37o59g,7,politics,t5_2cneq,Republican,Republican,2015-05-28 22:46:36,[],[],record significant,0.0,0.0,0.0,1.0
3,3,1432870000.0,crofoyq,ljrdxyh,Agree that it is their responsibility.....whic...,t1_crofneg,37o59g,2,politics,t5_2cneq,Republican,Republican,2015-05-28 23:20:15,[],[],agree reddit enhancement suiteponsibilithank g...,-0.1531,0.108,0.087,0.804
4,4,1432869000.0,crofl4c,ljrdxyh,"From one of the comments on the link: ""left p...",t1_crof082,37o59g,23,politics,t5_2cneq,Republican,Republican,2015-05-28 23:16:48,[],[],comment linokay leave plenthank looriginal pos...,-0.2263,0.132,0.133,0.735


In [66]:
conflicting_comments.to_csv("./conflicting_comments.csv", index = False)