## Libraries

In [1]:
import praw
import pandas as pd
import datetime as dt
import json
from bs4 import BeautifulSoup
from reddit_helpers.text_processor import reddit_text_preprocessing
import re
import nltk
from tqdm import tqdm
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

## Load Reddit API credentials

In [2]:
# Read the Reddit API credentials from the local JSON file.
with open("reddit_credentials.json", "r") as credentials_file:
    creds = json.load(credentials_file)

In [3]:
# Show only which credential fields were loaded. Echoing `creds` itself writes the
# client secret and the account password into the saved notebook output.
# NOTE: any credential values that were ever displayed here should be rotated.
sorted(creds)

['client_id', 'client_secret', 'password', 'user_agent', 'username']

In [4]:
# Build the authenticated PRAW client from the fields loaded into `creds`.
reddit = praw.Reddit(
    **{
        field: creds[field]
        for field in ("client_id", "client_secret", "user_agent", "username", "password")
    }
)

## Helper Functions

In [5]:
def get_date(created, tz=None):
    """Convert a Unix epoch timestamp into a ``datetime``.

    Parameters
    ----------
    created : int or float
        Seconds since the Unix epoch.
    tz : datetime.tzinfo, optional
        Time zone of the result. ``None`` (the default) keeps the original
        behaviour: a naive datetime expressed in the local time zone of the
        machine running the notebook, so the value differs between machines.
        Pass ``dt.timezone.utc`` for a reproducible, timezone-aware result.

    Returns
    -------
    datetime.datetime
        Naive when ``tz`` is ``None``, timezone-aware otherwise.
    """
    return dt.datetime.fromtimestamp(created, tz=tz)

In [79]:
def fetch_comments(reddit_instance, pd_posts, users):
    """Collect the comments that tracked users left on the given submissions.

    Every submission listed in ``pd_posts`` is fetched, its comment tree is
    fully expanded, and each comment written by a user listed in ``users``
    (excluding AutoModerator and comments without an author) becomes one row.

    Parameters
    ----------
    reddit_instance
        Object exposing ``submission(post_id)``; the returned submission must
        provide ``comments.replace_more(limit=None)`` and ``comments.list()``.
    pd_posts : pandas.DataFrame
        Needs the columns ``post_id`` and ``group``.
    users : pandas.DataFrame
        Needs the columns ``user`` and ``subreddit`` (the user's group).

    Returns
    -------
    pandas.DataFrame
        Columns: created, comment_id, author, body, parent_id, submission_id,
        score, subreddit, subreddit_id, submission_group, commment_group.
        ``author`` and ``subreddit`` hold the objects taken from the comment,
        not plain strings. The misspelled ``commment_group`` name is kept on
        purpose because the later filtering cell reads that exact column.
        An empty frame with the same columns is returned when nothing matches.
    """
    columns = [
        "created",
        "comment_id",
        "author",
        "body",
        "parent_id",
        "submission_id",
        "score",
        "subreddit",
        "subreddit_id",
        "submission_group",
        "commment_group",
    ]

    # Build both lookups once instead of rescanning the frames for every
    # comment. `setdefault` keeps the first occurrence of a duplicated key,
    # which is the entry the previous per-comment `[0]` selection picked.
    user_group = {}
    for name, group in zip(users['user'], users['subreddit']):
        user_group.setdefault(name, group)

    post_group = {}
    for post_id, group in zip(pd_posts['post_id'], pd_posts['group']):
        post_group.setdefault(post_id, group)

    records = []
    for postid in pd_posts['post_id'].tolist():
        submission = reddit_instance.submission(postid)
        # Resolve every "load more comments" placeholder so the whole tree is listed.
        submission.comments.replace_more(limit=None)

        for comment in submission.comments.list():
            author = getattr(comment, 'author', None)
            if author is None or author.name == "AutoModerator":
                continue
            if author.name not in user_group:
                continue

            records.append({
                "created": comment.created_utc,
                "comment_id": comment.id,
                "author": author,
                "body": comment.body,
                "parent_id": comment.parent_id,
                "submission_id": postid,
                "score": comment.score,
                "subreddit": comment.subreddit,
                "subreddit_id": comment.subreddit_id,
                "submission_group": post_group[postid],
                "commment_group": user_group[author.name],
            })

    # Passing `columns` guarantees the schema even when no comment matched.
    return pd.DataFrame(records, columns=columns)

## Get user IDs for each group and scrape their submissions

In [18]:
# One row per tracked Redditor; later cells read the `user` and `subreddit` columns.
users = pd.read_csv(filepath_or_buffer='./user_groups.csv')

In [19]:
# Subreddits in which submissions are collected. Entries must be lower-case:
# the scraping cell lower-cases each subreddit display name before testing
# membership, so a mixed-case entry such as 'ask_Politics' could never match.
conflict_subreddits = [
    'politics',
    'politicaldiscussion',
    'politicalfactchecking',
    'neutralpolitics',
    'moderatepolitics',
    'centrist',
    'ask_politics',
]

In [9]:
# Scrape up to 100 of the newest submissions of every tracked user and keep the
# ones posted in a conflict subreddit. Column order of the resulting frame
# follows the key order of this dict.
subreddit_submissions_dict = {
    "created": [],
    "title": [],
    "score": [],
    "post_id": [],
    "subreddit_id": [],
    "subreddit": [],
    "author": [],
    "upvote_ratio": [],
    "body": [],
    "url": [],
    "num_comments": [],
    "group": [],
}

# Compare case-insensitively so a mixed-case entry in `conflict_subreddits`
# still matches the lower-cased subreddit name.
conflict_lookup = {name.lower() for name in conflict_subreddits}

for _, user_row in tqdm(users.iterrows(), total=len(users)):
    user = reddit.redditor(user_row['user'])

    for submission in user.submissions.new(limit=100):
        # Skip moderated or deleted content. The previous test chained negated
        # checks with `or` (and compared the author by identity against a
        # string literal), so it was always true and filtered nothing.
        if submission.banned_by is not None:
            continue
        if submission.author is None:  # PRAW reports a deleted author as None
            continue
        if submission.selftext in ('[deleted]', '[removed]'):
            continue

        subreddit_name = ' '.join(submission.subreddit.display_name.split()).lower()
        if subreddit_name not in conflict_lookup:
            continue

        row = {
            "created": submission.created,
            "title": submission.title,
            "score": submission.score,
            "post_id": submission.id,
            "subreddit_id": submission.subreddit_id,
            "subreddit": submission.subreddit,
            "author": submission.author,
            "upvote_ratio": submission.upvote_ratio,
            "body": submission.selftext,
            "url": submission.url,
            "num_comments": submission.num_comments,
            "group": user_row['subreddit'],
        }
        for column, value in row.items():
            subreddit_submissions_dict[column].append(value)

subreddit_data = pd.DataFrame(subreddit_submissions_dict)
_timestamp = subreddit_data["created"].apply(get_date)
subreddit_data = subreddit_data.assign(timestamp = _timestamp)

198it [08:46,  6.88s/it]


In [13]:
# (rows, columns) of the scraped submissions frame.
subreddit_data.shape

(218, 13)

## Load comments for each post

In [80]:
# Keep only what fetch_comments reads: the submission id and the group of its author.
pd_posts = subreddit_data.loc[:, ['post_id', 'group']]

In [81]:
# Download the tracked users' comments on every scraped submission.
comments_info = fetch_comments(
    reddit_instance=reddit,
    pd_posts=pd_posts,
    users=users,
)

In [85]:
# Derive a `timestamp` column from the epoch seconds stored in `created`.
_timestamp = comments_info["created"].apply(lambda epoch_seconds: get_date(epoch_seconds))
comments_info = comments_info.assign(**{"timestamp": _timestamp})

In [86]:
# User mentions written as /u/<name>.
comments_info['linked_users'] = comments_info['body'].str.findall(r'/u/[A-Za-z0-9_-]+')
# Subreddit mentions written as r/<name> or /r/<name>. The look-behind stops the
# pattern from firing inside ordinary words, e.g. "or/and" used to yield "r/and".
comments_info['linked_subreddits'] = comments_info['body'].str.findall(r'(?<![A-Za-z0-9_])r/[A-Za-z0-9_-]+')

# Replace every character that is not an ASCII letter or '#' with a space.
# regex=True is required: since pandas 2.0 `str.replace` treats the pattern as a
# literal string by default, which would leave the text unchanged.
comments_info['processed_body'] = comments_info['body'].str.replace(r"[^a-zA-Z#]", " ", regex=True)

In [87]:
# Run the project text-cleaning pipeline over the letter-only comment bodies.
# NOTE(review): stopwords_remove() appears twice in this chain; it is kept as-is
# because the helper is not visible here — confirm whether the second pass is needed.
comments_info['processed_body'] = comments_info['processed_body'].apply(
    lambda raw_text: (
        reddit_text_preprocessing(raw_text)
        .replace_abbreviations()
        .remove_short_words()
        .lower_case()
        .process_html()
        .remove_urls()
        .decode_text()
        .stopwords_remove()
        .stopwords_remove()
        .lemmatize()
        .text
    )
)

In [91]:
## Disabled alternative: sentiment via the Google Cloud Natural Language API

# def gc_sentiment(text, credentials):  
#     from google.cloud import language
    
#     client = language.LanguageServiceClient(credentials = credentials)
#     document = language.types.Document(
#             content=text,
#             type=language.enums.Document.Type.PLAIN_TEXT)
#     annotations = client.analyze_sentiment(document=document)
#     score = annotations.document_sentiment.score
#     magnitude = annotations.document_sentiment.magnitude
#     return score, magnitude

# import os
# from google.oauth2 import service_account
# credentials = service_account.Credentials.from_service_account_file('./ecbm4040-up2138-b37eacd8e36c.json')
# print('Credendtials from environ: {}'.format(os.environ.get('GOOGLE_APPLICATION_CREDENTIALS')))

# gc_results = [gc_sentiment(row, credentials) for row in tqdm(comments_info['processed_body'])]
# gc_score, gc_magnitude = zip(*gc_results) # Unpacking the result into 2 lists
# gc = list(zip(comments_info['processed_body'], gc_score, gc_magnitude))
# columns = ['text', 'score', 'magnitude']
# gc_df = pd.DataFrame(gc, columns = columns)

# gc_df

In [182]:
# Keep comments whose author belongs to a different group than the submission,
# drop anything authored by AutoModerator, then renumber the rows (the previous
# index is retained as an `index` column).
conflicting_comments = (
    comments_info
    .loc[comments_info['submission_group'] != comments_info['commment_group']]
    .pipe(
        lambda frame: frame[
            frame.apply(lambda row: row['author'].name != "AutoModerator", axis=1)
        ]
    )
    .reset_index()
)

In [225]:
# VADER needs its lexicon on disk; without it SIA() raises LookupError on a
# fresh environment. The download is skipped when the resource is already present.
nltk.download('vader_lexicon', quiet=True)

sia = SIA()
# Compound VADER score of the raw comment body. Applying over the `body` column
# (rather than row-wise over the frame) always yields a Series, including when
# there are no conflicting comments.
conflicting_comments['sentiment'] = conflicting_comments['body'].apply(
    lambda text: sia.polarity_scores(text)['compound']
)

In [227]:
# Preview the first rows of the scored cross-group comments.
conflicting_comments.head()

Unnamed: 0,index,created,comment_id,author,body,parent_id,submission_id,score,subreddit,subreddit_id,submission_group,commment_group,timestamp,linked_users,linked_subreddits,processed_body,sentiment
0,21,1542735000.0,ea41qcc,The_Best_Taker,New York Times already had her as the winner w...,t3_9ytcwe,9ytcwe,6,politics,t5_2cneq,democrats,Republican,2018-11-20 12:24:58,[],[],yorokay tinstant message already winner weeokays,0.5859
1,22,1542739000.0,ea47lrb,urbanlife78,With Democrats more in charge for the 2020 cen...,t1_ea3urnl,9ytcwe,1,politics,t5_2cneq,democrats,Republican,2018-11-20 13:33:44,[],[],de-mailoriginal contentrats charge census chan...,0.7579
2,27,1542741000.0,ea4ard0,urbanlife78,"That is pretty funny, in a number of states, n...",t1_ea499ws,9ytcwe,2,politics,t5_2cneq,democrats,Republican,2018-11-20 14:13:30,[],[],pretthank fuck younny number state partisan co...,0.7717
3,28,1542749000.0,ea4lgxv,urbanlife78,I would really love to see the country move in...,t1_ea4l2sw,9ytcwe,2,politics,t5_2cneq,democrats,Republican,2018-11-20 16:29:02,[],[],would really love country move direction vote ...,0.9168
4,30,1542739000.0,ea47qh2,urbanlife78,And now we can all get 24 hour news feed strai...,t1_ea407z8,9ytcwe,0,politics,t5_2cneq,democrats,Republican,2018-11-20 13:35:22,[],[],hour news fee strainstagramht echo chamber fil...,0.2263


In [228]:
# Persist the scored comments; the DataFrame index is not written to the file.
conflicting_comments.to_csv(path_or_buf="./conflicting_comments.csv", index=False)