In [0]:
import json
import requests
import pandas as pd
from tqdm import tqdm
import re

### Collecting latest reddit posts
To collect reddit flair data [pushshift api](https://pushshift.io/) is used as it has higher usage limit than other api's like [praw](https://praw.readthedocs.io/en/latest/#).

Most of the posts are empty, so a large number of posts are extracted using pushshift.
Also the posts are taken from 4 different times over the last 5 years to have a good mix of flairs.

As pushshift returns at max 1000 posts in a single request, posts time stamps are used to avoid repeated posts.


In [0]:
size = 1000
url = "https://api.pushshift.io/reddit/submission/search/?subreddit=india&size={}&before={}"
returned_submissions = []
def downloadFromUrl(start_time):
	count = 0
	previous_epoch = start_time
	num_posts = 200000
	with tqdm(total=num_posts) as pbar:
		while True:
			new_url = url.format(size, previous_epoch)
			request = requests.get(new_url)
			json_data = request.json()
			if 'data' not in json_data:
				break
			objects = json_data['data']
			if len(objects) == 0:
				break
			for object in objects:
				previous_epoch = object['created_utc'] - 1
				count += 1
				pbar.update(1)
				returned_submissions.append(object)
			if count > num_posts:
				break
	return previous_epoch

ret_time = downloadFromUrl(1587286440)
ret_time = downloadFromUrl(min(1577750400, ret_time-1))
ret_time = downloadFromUrl(min(1555977600, ret_time-1))
ret_time = downloadFromUrl(min(1529853415, ret_time-1))
print(ret_time)	

201000it [03:11, 1047.81it/s]
201000it [03:16, 1021.23it/s]
201000it [02:42, 1235.64it/s]
201000it [02:41, 1243.45it/s]

1437484530





#### Saving the raw pushshift data

In [0]:
import pickle
pickle.dump(returned_submissions, open("pushshift_raw_data_midas.pkl", "wb"))
# returned_submissions = pickle.load(open("pushshift_raw_data_midas.pkl", "rb"))

In [0]:
!cp pushshift_raw_data_midas.pkl /content/drive/My\ Drive/midas_reddit_flair

#### check if all the posts are unique

In [0]:
s = set()
for post in returned_submissions:
  if post['id'] in s:
    print("repeat")
  else:
    s.add(post['id'])

#### get number of posts for each flair

In [5]:
st = set()
flairs_count = {}
for i,d in enumerate(returned_submissions):
    if(d.get('link_flair_text')) not in st:
        st.add(d.get('link_flair_text'))
        flairs_count[d.get('link_flair_text')] = 0
    flairs_count[d.get('link_flair_text')] = flairs_count[d.get('link_flair_text')] + 1
print(flairs_count)



#### Selecting important flairs
Only flairs with more than 2000 posts are selected are taken as important.

In [6]:
important_flairs = list(filter(lambda x : x and flairs_count[x] > 2000, flairs_count))
for flair in important_flairs:
  print(flair, " : ", flairs_count[flair])

Science/Technology  :  16074
Coronavirus  :  8011
AskIndia  :  63133
Non-Political  :  156791
Policy/Economy  :  20809
Politics  :  132398
Business/Finance  :  19508
Photography  :  4675
Scheduled  :  3025
Food  :  4618
Sports  :  8768
Demonetization  :  7319
Science &amp; Technology  :  3051
[R]eddiquette  :  12330
Not in English.  :  5979
Not Original Title.  :  2228
Not about India.  :  2715
Repost.  :  2990
Policy  :  6328
Entertainment  :  4614
Policy &amp; Economy  :  2012
Technology  :  3280


#### Removing ambiguous flairs

In [7]:
amb_flairs = ['Non-Political', 'Not in English.', '[R]eddiquette', 'Not Original Title.', 'Not about India.', 'Repost.', 'Scheduled', 'Non-Political', 'AskIndia']
important_flairs = list(set(important_flairs) - set(amb_flairs))
for flair in important_flairs:
  print(flair, " : ", flairs_count[flair])

Sports  :  8768
Demonetization  :  7319
Business/Finance  :  19508
Entertainment  :  4614
Politics  :  132398
Policy/Economy  :  20809
Food  :  4618
Photography  :  4675
Policy  :  6328
Policy &amp; Economy  :  2012
Science &amp; Technology  :  3051
Science/Technology  :  16074
Technology  :  3280
Coronavirus  :  8011


#### Merge same flairs

In [0]:
for submission in returned_submissions:
  if 'link_flair_text' in submission:
    if submission['link_flair_text'] == 'Science &amp; Technology' or submission['link_flair_text'] == 'Technology':
      submission['link_flair_text'] = 'Science/Technology'
    if submission['link_flair_text'] == 'Policy &amp; Economy':
      submission['link_flair_text']  = 'Policy/Economy'

In [9]:
st = set()
flairs_count = {}
for i,d in enumerate(returned_submissions):
    if(d.get('link_flair_text')) not in st:
        st.add(d.get('link_flair_text'))
        flairs_count[d.get('link_flair_text')] = 0
    flairs_count[d.get('link_flair_text')] = flairs_count[d.get('link_flair_text')] + 1

important_flairs = list(filter(lambda x : x and flairs_count[x] > 2000, flairs_count))
important_flairs = list(set(important_flairs) - set(amb_flairs))
for flair in important_flairs:
  print(flair, " : ", flairs_count[flair])

Business/Finance  :  19508
Politics  :  132398
Sports  :  8768
Demonetization  :  7319
Entertainment  :  4614
Policy/Economy  :  22821
Food  :  4618
Photography  :  4675
Policy  :  6328
Science/Technology  :  22405
Coronavirus  :  8011


some flairs have high number of posts as compared to other flairs. To avoid problems due to disproportion of data, each flair is set to have no more than 10000 posts.

In [0]:
max_post_per_flair = 20000
current_num = {flair:0 for flair in important_flairs}

#### Extracting title and body from posts
extra whitespace characters are removed from the post body, the title and the body are combined under the ```combined``` column. All the text in the posts are converted to lower case, punctuation marks are removed and put under the ```combined_clean``` column.

In [11]:
flair = []
body = []
title = []
combined = []

with tqdm(total = len(returned_submissions)) as pbar:
  for submission in returned_submissions:
    try:
      if submission['link_flair_text'] in important_flairs and current_num[submission['link_flair_text']] < max_post_per_flair:
        current_num[submission['link_flair_text']] += 1
        flair.append(submission['link_flair_text'])
        title.append(submission['title'])
        body.append(re.sub("\s\s+" , " ", submission["selftext"]))
        combined.append(submission['title'] + ". " + re.sub("\s\s+" , " ", submission["selftext"]))
    except KeyError:
      pass
    pbar.update(1)
reddit_flair_dataframe = pd.DataFrame(zip(title, body, combined, flair), columns=['title', 'body', 'combined', 'flair'])

def  clean_text(df, text_field, new_text_field_name):
    df[new_text_field_name] = df[text_field].str.lower()
    df[new_text_field_name] = df[new_text_field_name].apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem)) 
    df[new_text_field_name] = df[new_text_field_name].apply(lambda elem: (re.sub(r"\d+", "", elem)).strip())    
    return df

reddit_flair_dataframe = clean_text(reddit_flair_dataframe, 'combined', 'combined_clean')
reddit_flair_dataframe = reddit_flair_dataframe.drop(reddit_flair_dataframe[reddit_flair_dataframe['combined_clean'] == ''].index)
reddit_flair_dataframe.to_csv('reddit_flair_data_midas.csv', index = False)

100%|██████████| 804000/804000 [00:01<00:00, 477618.13it/s]


In [12]:
reddit_flair_dataframe.head()

Unnamed: 0,title,body,combined,flair,combined_clean
0,The Most Beautiful White Flowers in the Nature,,The Most Beautiful White Flowers in the Nature.,Science/Technology,the most beautiful white flowers in the nature
1,"Trains, Flights Unlikely To Resume After May 3...",,"Trains, Flights Unlikely To Resume After May 3...",Coronavirus,trains flights unlikely to resume after may s...
2,Coronavirus Lockdown: Change In E-Commerce Rul...,,Coronavirus Lockdown: Change In E-Commerce Rul...,Coronavirus,coronavirus lockdown change in ecommerce rules...
3,Help researchers understand how the COVID-19 P...,Help researchers understand how the COVID-19 P...,Help researchers understand how the COVID-19 P...,Coronavirus,help researchers understand how the covid pand...
4,China - A Challenge to India and East Asia | P...,,China - A Challenge to India and East Asia | P...,Policy/Economy,china a challenge to india and east asia pro...


In [13]:
reddit_flair_dataframe['flair'].value_counts()

Policy/Economy        19974
Science/Technology    19971
Politics              19925
Business/Finance      19461
Sports                 8248
Coronavirus            7985
Demonetization         7308
Policy                 5792
Photography            4663
Entertainment          4609
Food                   4318
Name: flair, dtype: int64

In [0]:
!cp reddit_flair_data_midas.csv /content/drive/My\ Drive/midas_reddit_flair