# Create Dataset
1. Scrape Reddit data using PRAW (the Python Reddit API Wrapper).
2. Prepare the data in the correct format for labelling.

Note: A `praw.ini` file containing a `UserTesting` section must be present in this directory for authentication.

In [1]:
import praw
import pandas as pd
from datetime import datetime

In [2]:
def get_date(created_at):
    """Convert a Unix timestamp into a ``datetime``.

    Args:
        created_at: Seconds since the epoch (int or float).

    Returns:
        A naive ``datetime`` expressed in the machine's local timezone, as
        produced by ``datetime.fromtimestamp``.
    """
    # The body previously read an undefined name (`created`) instead of the
    # parameter, so every call raised NameError.
    return datetime.fromtimestamp(created_at)

In [3]:
class Post(object):
    """Flat snapshot of the submission fields that end up in the dataset."""

    def __init__(self, session):
        """Copy the relevant attributes off ``session``.

        ``session`` must expose ``title``, ``score``, ``url``,
        ``num_comments``, ``created`` and ``selftext`` (presumably a PRAW
        submission -- confirm against the caller).
        """
        # These four are stored under the same name they have on the source.
        for attr in ('title', 'score', 'url', 'num_comments'):
            setattr(self, attr, getattr(session, attr))
        # The remaining two are renamed / converted on the way in.
        self.created_at = get_date(session.created)
        self.body = session.selftext

    # NOTE(review): this method shadows the built-in ``__dict__`` attribute,
    # so ``vars(post)`` returns a bound method rather than a mapping. The name
    # is kept because callers invoke ``post.__dict__()``.
    def __dict__(self):
        """Return the post as a plain dict, one key per stored field."""
        fields = ('title', 'score', 'num_comments', 'created_at', 'body', 'url')
        return {field: getattr(self, field) for field in fields}

In [4]:
class RedditScraper(object):
    """Pull the newest posts of one subreddit into a pandas DataFrame."""

    # Columns of the frame returned by get_data() before 'text' is appended.
    # Mirrors the keys of the record each scraped post produces; used only to
    # give an empty result the same layout as a populated one.
    _COLUMNS = ('title', 'score', 'num_comments', 'created_at', 'body', 'url')

    def __init__(self, subreddit, site_name='UserTesting'):
        """Open a Reddit session and bind it to ``subreddit``.

        Args:
            subreddit: Name of the subreddit to read from.
            site_name: Name handed to ``praw.Reddit`` to select the
                credentials to use. Defaults to the previously hard-coded
                ``'UserTesting'``.
        """
        self.reddit = praw.Reddit(site_name)
        self.subreddit = self.reddit.subreddit(subreddit)

    def scrape(self, limit):
        """Lazily yield a ``Post`` for each of the ``limit`` newest items."""
        for session in self.subreddit.new(limit=limit):
            yield Post(session)

    def get_data(self, limit):
        """Collect up to ``limit`` posts into a DataFrame.

        Returns:
            A DataFrame with one row per post and the columns ``title``,
            ``score``, ``num_comments``, ``created_at``, ``body``, ``url``
            plus ``text`` (title and body joined by a single space, with
            missing values treated as empty strings). When nothing was
            scraped the frame is empty but still has all of these columns.
        """
        data = [post.__dict__() for post in self.scrape(limit)]
        if data:
            df = pd.DataFrame.from_records(data)
        else:
            # from_records([]) yields a frame with no columns at all, which
            # made the 'title'/'body' lookups below raise KeyError.
            df = pd.DataFrame(columns=list(self._COLUMNS))
        df['text'] = df['title'].fillna('') + ' ' + df['body'].fillna('')
        return df

In [None]:
# Build a scraper for the target subreddit, then request its newest posts
# as a DataFrame.
scraper = RedditScraper(subreddit='usertesting')
df = scraper.get_data(limit=1000)

In [None]:
# Persist the scraped posts as CSV; the row index is left out of the file.
from pathlib import Path

output_path = Path('dataset/reddit_scrape.csv')
# to_csv does not create missing directories, so make sure dataset/ exists
# before writing instead of failing on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)