# Retrieving Data 

1. The data will be retrieved with PRAW (the Python Reddit API Wrapper) by going through 25 of Reddit's most popular storytelling subreddits.
2. For each post, the retrieved data will include:
    - id: to keep track
    - title: first taste of the story
    - the story within the post: main point of analysis
    - the time it was created: time of story post might be related to success of story
    - the subreddit it was made within: the followers of each subreddit could cause more success
    - number of comments: more conversation means better story
    - upvote amount: more upvotes means better story as well
    - upvote ratio: important for understanding downvotes, which are a marker of poor storytelling
3. For each subreddit, we aim to retrieve 1,000 posts. Reddit sometimes does not allow that many, so the total will be about 25,000 posts from 25 unique subreddits.

In [12]:
import praw
import time
import pandas as pd

from textblob import TextBlob
# Build the shared PRAW client used by get_data(). The credentials are
# prompted for interactively instead of being hard-coded in the notebook.
# NOTE(review): input() echoes what is typed, so the client secret ends up
# visible in the cell output; getpass.getpass() would avoid that.
reddit = praw.Reddit(
    client_id= input("Enter client_id: "),
    client_secret= input("Enter client_secret: "),
    user_agent="stories",
)


In [2]:
# Names of the 25 subreddits to scrape; get_data() passes each one to
# reddit.subreddit() in this order.
subreddits = [
    "TIFU", "IAmA", "relationships", "nosleep", "prorevenge",
    "casualconversation", "personalfinance", "confession", "MaliciousCompliance",
    "AmItheAsshole", "JustNoMIL", "creepypasta",
    "shortscarystories", "ScaryStories", "Paranormal",
    "UnresolvedMysteries", "TalesFromRetail", "TalesFromTechSupport",
    "TalesFromYourServer", "TalesFromTheFrontDesk", "TalesFromTheCustomer",
    "TalesFromThePharmacy", "TalesFromThePizzaGuy", "TalesFromCallCenters",
    "TalesFromTheSquadCar"
]


# Post attributes that get_data() copies from every submission via getattr();
# an attribute a post does not have is recorded as None.
attributes = [
    'id', 'title', 'selftext', 'created_utc', 'subreddit',
    'num_comments', 'score', 'upvote_ratio'
]

In [3]:
def get_data(subreddit_names=None, limit=1000, pause_seconds=60):
    """Collect the top posts of each subreddit as a list of attribute dicts.

    Uses the module-level ``reddit`` client and ``attributes`` list.

    Args:
        subreddit_names: Iterable of subreddit names to query. Defaults to
            the module-level ``subreddits`` list.
        limit: Maximum number of posts requested from each subreddit's
            ``top`` listing.
        pause_seconds: Seconds to sleep between two consecutive subreddits;
            values <= 0 disable the pause.

    Returns:
        A list with one dict per post, keyed by the names in ``attributes``.
        An attribute a post does not have is stored as ``None``.
    """
    names = list(subreddits if subreddit_names is None else subreddit_names)
    data = []

    for position, name in enumerate(names):
        # Pause only *between* subreddits: sleeping after the last one would
        # delay the caller without spacing out any further request.
        if position and pause_seconds > 0:
            time.sleep(pause_seconds)

        listing = reddit.subreddit(name).top(limit=limit)
        data.extend(
            {attr: getattr(post, attr, None) for attr in attributes}
            for post in listing
        )

    return data

# Cleaning the Data

In [5]:
import os

# Load the cached scrape when it exists; otherwise scrape Reddit now and index
# the fresh frame by post id.
# NOTE(review): only the freshly scraped frame is indexed by 'id'; the frame
# read from the CSV keeps whatever index/columns the file provides. Confirm
# that downstream cells cope with both shapes.
if not os.path.exists('stories.csv'):
    df = pd.DataFrame(get_data()).set_index('id')
else:
    df = pd.read_csv('stories.csv')

In [14]:
from datetime import datetime
from textblob import TextBlob
import re 
import textstat

def calculate_downvotes(upvotes, upvote_ratio):
    """Estimate the number of downvotes from an upvote count and upvote ratio.

    The total vote count is taken as ``upvotes / upvote_ratio``; the downvotes
    are that total minus the upvotes, rounded to the nearest integer.

    Raises:
        ZeroDivisionError: If ``upvote_ratio`` is 0.
    """
    total_votes = upvotes / upvote_ratio
    return round(total_votes - upvotes)

def convert_to_time(str):
    """Parse an ``"HH:MM"`` clock string into a ``datetime``.

    Raises:
        ValueError: If the text does not match the ``%H:%M`` format.
    """
    clock_format = "%H:%M"
    parsed = datetime.strptime(str, clock_format)
    return parsed

def handle_date(created_time):
    """Split a POSIX timestamp into a calendar date and a time-of-day label.

    The periods are contiguous and boundary-inclusive, so every minute of the
    day maps to the period it starts (e.g. 12:00 is "Noon", 05:00 is "Dawn"):

        00:00-04:59 Midnight, 05:00-05:59 Dawn, 06:00-11:59 Morning,
        12:00-12:59 Noon, 13:00-16:59 Afternoon, 17:00-18:59 Evening,
        19:00-23:59 Night.

    Args:
        created_time: Seconds since the epoch.

    Returns:
        ``{"date": "MM-DD-YYYY", "time_of_day": <label>}``.
    """
    # NOTE(review): fromtimestamp() without a tz converts to the machine's
    # local time zone, so the label depends on where the notebook runs.
    dt = datetime.fromtimestamp(created_time)
    date = f"{dt.month:02d}-{dt.day:02d}-{dt.year}"

    # (first hour *after* the period, label), in chronological order. Only
    # the hour matters because every period starts and ends on a whole hour.
    periods = (
        (5, "Midnight"),
        (6, "Dawn"),
        (12, "Morning"),
        (13, "Noon"),
        (17, "Afternoon"),
        (19, "Evening"),
    )
    time_of_day = next(
        (label for end_hour, label in periods if dt.hour < end_hour),
        "Night",
    )

    return {"date": date, "time_of_day": time_of_day}

def get_word_list(str):
    """Return every word found in the text, in order of appearance.

    A word starts with a word character and may continue with word
    characters, apostrophes or hyphens (so "it's" and "well-known" each count
    as one word).
    """
    word_pattern = re.compile(r"\b\w[\w'-]*\b")
    return word_pattern.findall(str)

def get_avg_word_length(arr):
    """Return the mean length of the words in ``arr``, rounded to 3 decimals.

    Args:
        arr: Sequence of words (strings).

    Returns:
        The average word length as a float, or ``0.0`` when ``arr`` is empty
        (e.g. a post with no text) instead of raising ZeroDivisionError.
    """
    if not arr:
        return 0.0
    return round(sum(len(word) for word in arr) / len(arr), 3)

def get_sentences(str):
    """Return the ``sentences`` attribute of a TextBlob built from the text."""
    blob = TextBlob(str)
    return blob.sentences

def get_sentiment(str):
    """Return the ``sentiment`` attribute of a TextBlob built from the text.

    Callers in this file read ``.polarity`` and ``.subjectivity`` from the
    result.
    """
    blob = TextBlob(str)
    return blob.sentiment

def get_syllable_count(str):
    """Return the syllable count that textstat reports for the text."""
    syllables = textstat.syllable_count(str)
    return syllables

def get_reading_score(str):
    """Return textstat's Flesch reading-ease score for the text."""
    score = textstat.flesch_reading_ease(str)
    return score

def get_reading_grade(str):
    """Return the result of ``textstat.text_standard`` for the text."""
    grade = textstat.text_standard(str)
    return grade

def get_keyword_counts(text):
    """Return TextBlob's ``word_counts`` for the text as a plain dict."""
    return dict(TextBlob(text).word_counts)

def info(str, isTitle):
    """Compute text statistics for a post title or a post story.

    Args:
        str: The text to analyse.
        isTitle: Truthy to prefix every result key with ``title``, falsy to
            prefix it with ``story``.

    Returns:
        A dict with the keys ``<prefix>_word_count``, ``_avg_word_length``,
        ``_length``, ``_polarity``, ``_subjectivity``, ``_syllables``,
        ``_reading_score``, ``_reading_grade`` and ``_keywords``.
    """
    words = get_word_list(str)
    name = 'title' if isTitle else 'story'
    # Analyse the sentiment once and reuse it for both derived metrics.
    sentiment = get_sentiment(str)

    return {
        f'{name}_word_count': len(words),
        f'{name}_avg_word_length': get_avg_word_length(words),
        f'{name}_length': len(str),
        f'{name}_polarity': round(sentiment.polarity, 3),
        f'{name}_subjectivity': round(sentiment.subjectivity, 3),
        f'{name}_syllables': get_syllable_count(str),
        # The score comes from get_reading_score (Flesch reading ease); it
        # previously duplicated the reading grade by mistake.
        f'{name}_reading_score': get_reading_score(str),
        f'{name}_reading_grade': get_reading_grade(str),
        f'{name}_keywords': get_keyword_counts(str),
    }


{'im': 1,
 'depressed': 1,
 'got': 1,
 'any': 1,
 'slice': 1,
 'of': 1,
 'live/romance': 1,
 'animes': 1,
 'to': 1,
 'recommend': 1}