In [118]:
import praw
from dotenv import load_dotenv
load_dotenv()
import os
import pandas as pd
import numpy as np
from prawcore.exceptions import Forbidden, NotFound

#### Get configs from .env file
Documentation for dotenv [here](https://github.com/theskumar/python-dotenv)

#### Setup Reddit API to get data 
Documentation [here](https://praw.readthedocs.io/en/latest/)

In [3]:
# Read the three API settings from the environment (populated by load_dotenv())
# and pass them to PRAW under their lower-cased keyword names.
reddit_credentials = {
    variable.lower(): os.getenv(variable)
    for variable in ("CLIENT_ID", "CLIENT_SECRET", "USER_AGENT")
}
reddit = praw.Reddit(**reddit_credentials)

#### Get subreddit r/india

In [4]:
# Handle for r/india; the listings in the following cells are requested through it.
SUBREDDIT_NAME = 'india'
subreddit_india = reddit.subreddit(SUBREDDIT_NAME)

#### Attempt to get all flairs in r/india
Motivation
- Get all classes for classification

Blocker
- 403 Forbidden error: only moderators of a subreddit can get details of all flairs

In [5]:
# Listing a subreddit's flairs is restricted to its moderators, so for a
# regular account this request is answered with HTTP 403. The error is the
# point of this cell: catch it and show it, so the notebook still runs from
# top to bottom instead of stopping here.
try:
    for flair in subreddit_india.flair(limit=10):
        print(flair)
except Forbidden as error:
    print('Forbidden:', error)

Forbidden: received 403 HTTP response

#### Explore attributes of submission
Documentation [here](https://praw.readthedocs.io/en/latest/code_overview/models/submission.html?highlight=submission)

Observation - Submission attributes of interest to use case
- author
- comments
- id
- link_flair_text
- selftext
- title

In [11]:
# Submission attributes to inspect, listed in the order they are printed.
submission_attributes = (
    'author',
    'clicked',
    'comments',
    'created_utc',
    'distinguished',
    'edited',
    'id',
    'is_original_content',
    'is_self',
    'link_flair_template_id',
    'link_flair_text',
    'locked',
    'name',
    'num_comments',
    'over_18',
    'permalink',
    'score',
    'selftext',
    'spoiler',
    'stickied',
    'subreddit',
    'title',
    'upvote_ratio',
    'url',
)

# Print every attribute of the first submission in the "hot" listing.
for post in subreddit_india.hot(limit=1):
    for attribute_name in submission_attributes:
        print(getattr(post, attribute_name))

IAmMohit
False
<praw.models.comment_forest.CommentForest object at 0x000001DD001255C8>
1586980815.0
moderator
1587315403.0
g1zi21
False
True
8041227c-6517-11ea-b83e-0e7048fc0c5b
Coronavirus
False
t3_g1zi21
2258
False
/r/india/comments/g1zi21/coronavirus_covid19_megathread_news_and_updates_4/
153
###[Covid-19 Fundraisers & Donation Links](https://amnesty.org.in/support-indias-most-vulnerable-fight-covid-19-a-list-of-fundraisers-you-can-donate-to/) via Amnesty International
* [This link covers](https://amnesty.org.in/support-indias-most-vulnerable-fight-covid-19-a-list-of-fundraisers-you-can-donate-to/) Migrant Workers Day-Labourers, Other Vulnerable Groups, Urban Poor, Transgender Community, Waste-pickers and Sanitation Workers, Healthcare Workers and Doctors, Older Persons & Children and Animal Care 

------------------------------------------------------------------------------------------------------

#####Indian Goverment
* [Official Twitter Collection of Indian Govt. Communications

#### Dataframe of collected useful data
- Collected from hot and top submissions for subreddit
- Attributes of interest
- Use each submission's comment forest to gather the bodies of its first 50 top-level comments (assuming that the most popular comments are popular because of their relevance to the subject being discussed)
- Remove duplicates in collected data based on id

In [102]:
SUBMISSION_COLUMNS = ['author', 'comments', 'id', 'flair', 'score', 'selftext', 'title', 'upvote_ratio']
# Number of leading comments per submission whose bodies are collected.
MAX_COMMENTS_PER_SUBMISSION = 50


def listing_to_frame(listing):
    """Build a DataFrame with one row per submission yielded by ``listing``.

    Parameters
    ----------
    listing : iterable of praw submissions
        For example ``subreddit.hot(limit=...)`` or ``subreddit.top(limit=...)``.

    Returns
    -------
    pandas.DataFrame
        Columns as in ``SUBMISSION_COLUMNS``; ``flair`` holds ``link_flair_text``.
    """
    rows = [
        [
            submission.author,
            submission.comments,
            submission.id,
            submission.link_flair_text,
            submission.score,
            submission.selftext,
            submission.title,
            submission.upvote_ratio,
        ]
        for submission in listing
    ]
    return pd.DataFrame(rows, columns=SUBMISSION_COLUMNS)


# The same extraction is applied to both listings instead of a copy-pasted loop.
submissions_df_hot = listing_to_frame(subreddit_india.hot(limit=400))
submissions_df_top = listing_to_frame(subreddit_india.top(limit=400))

# A submission can appear in both listings; keep its first occurrence only.
submissions_unique = (
    pd.concat([submissions_df_hot, submissions_df_top], ignore_index=True)
    .drop_duplicates(subset='id', keep='first')
)

comments = list()
comments_authors = list()
for comments_forrest in submissions_unique['comments']:
    # A comment forest can contain MoreComments placeholders, which have no
    # `body`; limit=0 removes them without requesting further comments.
    comments_forrest.replace_more(limit=0)
    leading_comments = comments_forrest[:MAX_COMMENTS_PER_SUBMISSION]
    # All bodies of one submission are stored as a single ';'-separated string.
    comments.append(';'.join(str(comment.body) for comment in leading_comments))
    comments_authors.append([comment.author for comment in leading_comments])

# assign() returns a new frame, so re-running this cell gives the same result.
submissions_df = submissions_unique.assign(comments_text=comments)
submissions_df

Unnamed: 0,author,comments,id,flair,score,selftext,title,upvote_ratio,comments_text
0,IAmMohit,"(fnij87h, fo2ir77, fnv730i, fnyk7va, fnkd0tr, ...",g1zi21,Coronavirus,160,###[Covid-19 Fundraisers & Donation Links](htt...,Coronavirus (COVID-19) Megathread - News and U...,0.96,###[Covid-19 Fundraisers & Donation Links](htt...
1,cool_boyy,"(fnwq7oi, fnzuvuc, fnypgrf, fnzqzlf, fnz3luo, ...",g4d2ix,Scheduled,78,<3 \n \nLinks: ...,"[Monthly Happiness Thread] Randians, please sh...",0.89,Working from home since past one month. For so...
2,drunk_sithlord,"(fo68j6q, fo6mtbf, fo630va, fo66ivs, fo6ilbo, ...",g5xgab,Coronavirus,764,,"Lockdown scenes in Kurnool, Andhra Pradesh whi...",0.98,What do you expect if you allow the market to ...
3,poleco1,"(fo5ko22, fo5ix1d, fo5ech7, fo5go3i, fo5o0z9, ...",g5swem,Business/Finance,1384,,Facebook buys 9.99% stake in Reliance Jio for ...,0.98,Kiss goodbye to your privacy!;So our privacy g...
4,DenseSpirit5,"(fo5uoea, fo5y5qw, fo5tbik, fo64l8u, fo5tl3s, ...",g5uuhi,Coronavirus,468,,Covidiots Arrested and Paraded for Making TikT...,0.96,"Play stupid games, win stupid prizes.;Ye kya n..."
...,...,...,...,...,...,...,...,...,...
795,pk1515,"(dxcboyy, dxc9g3x, dxcblnb, dxcftev, dxch2ml, ...",8c5f9x,Non-Political,2357,,[NP] Wikipedia needs our help.,0.95,My college made us do this for two semesters a...
796,artanurag,"(eu40v73, eu3xkhi, eu3weko, eu4128p, eu47jnc, ...",cenxm2,Non-Political,2344,,"Sacred Games season 1 fanart poster, by me",0.97,Put a watermark on it bro or it may soon be pl...
797,sumedh0123,"(edyklm5, edyjjuq, edylk2r, edyo8uo, edyk8qo, ...",afgxto,Non-Political,2344,,Mumbai police's recent tweet on scams.,0.96,"while I love the effort, I think the meme will..."
798,loulan,"(e8pwxiu, e8q730g, e8pz32h, e8q6o8g, e8pzdy5, ...",9sn0ug,Non-Political,2345,,Shimla,0.98,r/accidentalWesAnderson;[deleted];Saturation o...


In [103]:
# Persist the collected submissions; the row index is not written to the file.
SUBMISSIONS_CSV = 'reddit_india_submissions.csv'
submissions_df.to_csv(SUBMISSIONS_CSV, index=False)

#### Attempt to get all submissions and their flairs by a particular author
Motivation
- Explore a possible correlation between a Redditor (author) and flairs; an author might post more submissions on a few particular topics than on others

Documentation on Redditor [here](https://praw.readthedocs.io/en/latest/code_overview/models/redditor.html?highlight=Redditor)

Observation
- The author 'IAmMohit' has the highest submission count, 38, under the flair 'Politics' - an area of interest

In [119]:
# Submissions whose author is missing cannot be looked up; drop them first.
submissions_with_author_df = submissions_df.dropna(subset=['author'])

# One entry per author: an author with several collected submissions would
# otherwise be fetched once per submission and have every flair counted
# repeatedly, inflating the crosstab below.
author_names = list(dict.fromkeys(
    author.name for author in submissions_with_author_df['author']
))

authors = list()
# Authors whose profile could not be read, kept for inspection instead of
# being silently discarded.
skipped_authors = list()
for author_name in author_names:
    try:
        # NOTE(review): this listing is not restricted to r/india, so the
        # flairs come from whatever subreddits the author posted in.
        for submission in reddit.redditor(author_name).submissions.top(limit=50):
            authors.append([author_name, submission.link_flair_text])
    except (Forbidden, NotFound):
        skipped_authors.append(author_name)

# Rows without a flair are dropped; the crosstab counts submissions per
# (author, flair) pair, with authors as the index and flairs as columns.
authors_df = pd.DataFrame(authors, columns=['author','flair']).dropna()
authors_flairs_df = pd.crosstab(authors_df['author'], authors_df['flair'])
authors_flairs_df

flair,Unnamed: 1_level_0,Unnamed: 2_level_0,Star,YOUTUBE CIRCLEJERK,!,! Good boy !,#Ask-Indiaspeaks,#Corona-Virus,#General,#Geopolitics,...,예술품 | Artwork,🅱️oNo tHiS tYrEs ArE dEaD,🎁 🎄 🎅,🔥 ROAST 🔥,🔸 Misc.,😢 bye Moise,🚨UNCREDITED REPOST🚨,🤡Humour,🥚 Easter Egg,🧠Big IQ meme🧠
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--echoes--,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-DrugsAndHugs-,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
007ninjaprincess,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1hakr,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4everaBau5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yasir_unlighted,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
yothisisyo,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zakiiboy,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zannyxena,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [120]:
# Save the author-by-flair counts. The original call wrote submissions_df
# (already saved to 'reddit_india_submissions.csv') under the authors file name.
# The crosstab's index holds the author names, so the index is written as well.
authors_flairs_df.to_csv('reddit_india_authors.csv')

In [1]:
# The author names are the index of the crosstab; writing with index=False
# would drop them and leave rows of counts that cannot be attributed.
# This cell needs authors_flairs_df from the crosstab cell to exist in the kernel.
authors_flairs_df.to_csv('reddit_india_authors.csv')

NameError: name 'authors_flairs_df' is not defined