In [9]:
import praw
import re
import random
import bs4, markdown
from tqdm import tqdm
import copy
import pandas as pd
import time

In [5]:
credential_file = "credentials.key"

try:
    with open(credential_file, 'r') as f:
        creds = f.read().split('\n')
    personal = creds[0]
    secret = creds[1]
    username = creds[2]
    password = creds[3]
except IOError as e:
    print("You didn't create a credential file! Please see sample_credentials.key")
    print("Then go to http://www.storybench.org/how-to-scrape-reddit-with-python/")
    print("And register a new app named fastai_reddit in your reddit account.")
    print("And insert the values into sample_credentials.key and save it as {}.".format(credential_file))
    raise(e)

def noquotes(text):
    """
This function first stated out as a way to remove markdown quotes from raw reddit markdown text but now it's more of a
general purpose text parser, but the name hasn't changed.
    """
    #https://stackoverflow.com/questions/761824/python-how-to-convert-markdown-formatted-text-to-text
    t1 = re.sub(">.+?(\n|$)","",text).replace("\\n","").replace("\\","")
    html = markdown.markdown(t1)
    t2 = ''.join(bs4.BeautifulSoup(html, 'lxml').findAll(text=True))
    
    return t2    

reddit = praw.Reddit(client_id=personal, client_secret=secret, user_agent='fastai_reddit', username=username, \
                     password=password)

In [6]:
all_mbti = set(['INTP','INTJ','INFJ','INFP',\
               'ENTP','ENTJ','ENFJ','ENFP',\
               'ESTP','ESTJ','ESFJ','ESFP',
               'ISTP','ISTJ','ISFJ','ISFP'])
len(all_mbti)

16

In [16]:
def run_mbti(target_mbti = "INTP", limit = 1000):
    subreddit = reddit.subreddit(target_mbti)
    bad_mbti = copy.copy(all_mbti)
    bad_mbti.remove(target_mbti)
    bad_mbti = list(bad_mbti)
    top = list(subreddit.top(limit=limit))
    new = list(subreddit.new(limit=limit))
    rising = list(subreddit.rising(limit=limit))
    controversial = list(subreddit.controversial(limit=limit))
    gilded = list(subreddit.gilded(limit=limit))
    posts = list(set(top + new + rising + controversial + gilded))
    
    def query_flair(c):
        result = True
        flair = c.author_flair_text
        if not flair:
            return True
        if len(flair) > 0:
            flairU = flair.upper()
            for mbti in bad_mbti:
                if mbti in flairU:
                    result = False
                    break
        return result

    def get_comment_text(comment):
        comments = []
        if isinstance(comment, praw.models.MoreComments):
            while True:
                try:
                    newcomments = comment.comments()
                    break
                except praw.exceptions.APIException as e:
                    if e.response.status_code == 500:
                        print('Received 500 error. Sleeping for 1 minute...')
                        time.sleep(60)
                    else:
                        raise e
            for n in newcomments:
                comments += get_comment_text(n)
        else:
            while True:
                try:
                    if query_flair(comment):
                        comments.append(noquotes(comment.body))
                    break
                except praw.exceptions.APIException as e:
                    if e.response.status_code == 500:
                        print('Received 500 error. Sleeping for 1 minute...')
                        time.sleep(60)
                    else:
                        raise e
        return comments

    alltext = []
    for p in tqdm(posts):
        try:
            tmp = noquotes(p.selftext)
        except AttributeError:
            continue
        if len(tmp) > 0:
            if query_flair(p):
                alltext.append(tmp)
        for c in p.comments.list():
            alltext += get_comment_text(c)
    df = pd.DataFrame()
    df['type'] = [target_mbti]*len(alltext)
    df['posts'] = alltext
    df.to_csv("{}.csv".format(target_mbti))

In [19]:
for mbti in all_mbti:
    print(mbti)
    run_mbti(mbti)

ESTP


100%|██████████████████████████████████████████████████████████████████████████████| 2377/2377 [40:29<00:00,  1.02s/it]


ESFP


100%|██████████████████████████████████████████████████████████████████████████████| 2089/2089 [35:14<00:00,  1.01s/it]


ESFJ


100%|██████████████████████████████████████████████████████████████████████████████| 1568/1568 [26:27<00:00,  1.01s/it]


ISTJ


100%|██████████████████████████████████████████████████████████████████████████████| 2378/2378 [40:02<00:00,  1.01s/it]


ESTJ


100%|██████████████████████████████████████████████████████████████████████████████| 1281/1281 [21:46<00:00,  1.02s/it]


ISFJ


100%|██████████████████████████████████████████████████████████████████████████████| 2489/2489 [41:41<00:00,  1.00s/it]


INFP


100%|██████████████████████████████████████████████████████████████████████████████| 2960/2960 [54:40<00:00,  1.11s/it]


INFJ


100%|██████████████████████████████████████████████████████████████████████████████| 2956/2956 [56:25<00:00,  1.15s/it]


ENTP


100%|████████████████████████████████████████████████████████████████████████████| 2949/2949 [1:02:14<00:00,  1.27s/it]


ENFP


100%|██████████████████████████████████████████████████████████████████████████████| 2968/2968 [53:37<00:00,  1.08s/it]


INTJ


100%|████████████████████████████████████████████████████████████████████████████| 2968/2968 [1:04:47<00:00,  1.31s/it]


INTP


100%|████████████████████████████████████████████████████████████████████████████| 3020/3020 [1:04:11<00:00,  1.28s/it]


ISFP


100%|██████████████████████████████████████████████████████████████████████████████| 2685/2685 [45:21<00:00,  1.01s/it]


ENTJ


100%|██████████████████████████████████████████████████████████████████████████████| 2871/2871 [53:44<00:00,  1.12s/it]


ISTP


100%|██████████████████████████████████████████████████████████████████████████████| 2775/2775 [48:46<00:00,  1.05s/it]


ENFJ


100%|██████████████████████████████████████████████████████████████████████████████| 2704/2704 [47:19<00:00,  1.05s/it]


In [20]:
dfs = [pd.read_csv("{}.csv".format(m), index_col = 0) for m in all_mbti]

In [21]:
df = pd.concat(dfs, axis=0)

In [22]:
df.head()

Unnamed: 0,type,posts
0,ESTP,"Hey, y’all. Cheers to us reaching 8,000 member..."
1,ESTP,You boys and girls keep being awesome!
2,ESTP,I'm unsure when he lies and when he doesnt.. I...
3,ESTP,I think you’re in the wrong sub.
4,ESTP,Don't care


In [24]:
df.to_csv("mbti2.csv", index = None)

In [25]:
len(df)

1002166