# Retrieving Data 

1. The data is retrieved with PRAW (the Python Reddit API Wrapper) by going through Reddit's 25 most popular storytelling subreddits.
2. The retrieved data will include for each instance of a post: 
    - id: to keep track
    - title: first taste of the story
    - the story within the post: main point of analysis
    - the time it was created: time of story post might be related to success of story
    - the subreddit it was made within: the followers of each subreddit could cause more success
    - number of comments: more conversation means better story
    - upvote amount: more upvotes means better story as well
    - upvote ratio: important for understanding downvotes, which are a marker of poor storytelling
3. For each subreddit, we aim to get 1,000 posts. Reddit does not always allow that many, so the total will be about 25,000 posts from 25 unique subreddits.

In [100]:
import time
from getpass import getpass

import pandas as pd
import praw
from textblob import TextBlob
# Read-only Reddit client used by the scraping code below.
# The secret is read with getpass so it is not echoed into the cell output
# (and therefore not saved inside the notebook file); the client id is still
# read with a normal prompt.
reddit = praw.Reddit(
    client_id=input("Enter client_id: "),
    client_secret=getpass("Enter client_secret: "),
    user_agent="stories",
)


In [101]:
# Names of the subreddits whose posts are collected.
subreddits = [
    "TIFU",
    "IAmA",
    "relationships",
    "nosleep",
    "prorevenge",
    "casualconversation",
    "personalfinance",
    "confession",
    "MaliciousCompliance",
    "AmItheAsshole",
    "JustNoMIL",
    "creepypasta",
    "shortscarystories",
    "ScaryStories",
    "Paranormal",
    "UnresolvedMysteries",
    "TalesFromRetail",
    "TalesFromTechSupport",
    "TalesFromYourServer",
    "TalesFromTheFrontDesk",
    "TalesFromTheCustomer",
    "TalesFromThePharmacy",
    "TalesFromThePizzaGuy",
    "TalesFromCallCenters",
    "TalesFromTheSquadCar",
]

# Post attributes copied into each row of the dataset.
attributes = [
    "id",
    "title",
    "selftext",
    "created_utc",
    "subreddit",
    "num_comments",
    "score",
    "upvote_ratio",
]

In [102]:
def get_data(subreddit_names=None, fields=None, limit=1000, pause_seconds=60, client=None):
    """Collect the top posts of several subreddits as a list of dicts.

    Called with no arguments this behaves like the original scrape: it walks
    the module-level ``subreddits`` list with the module-level ``reddit``
    client and copies the module-level ``attributes`` from every post.

    Args:
        subreddit_names: Iterable of subreddit names; defaults to ``subreddits``.
        fields: Attribute names to read from each post; defaults to
            ``attributes``. An attribute a post does not have is stored as None.
        limit: Value passed as ``limit`` to each subreddit's ``top()`` listing.
        pause_seconds: Seconds to wait between two subreddits. No pause is
            made before the first or after the last one.
        client: Object exposing ``subreddit(name)``; defaults to ``reddit``.

    Returns:
        list[dict]: One dict per post, keyed by the names in ``fields``, in
        the order the posts were yielded.
    """
    if subreddit_names is None:
        subreddit_names = subreddits
    if fields is None:
        fields = attributes
    if client is None:
        client = reddit

    data = []

    for position, name in enumerate(subreddit_names):
        # Pause only *between* subreddits; sleeping after the final one just
        # delays the return without spacing out any request.
        if position > 0 and pause_seconds > 0:
            time.sleep(pause_seconds)

        current = client.subreddit(name)
        for post in current.top(limit=limit):
            data.append({attr: getattr(post, attr, None) for attr in fields})

    return data

# Cleaning the Data

In [103]:
import os

# Reuse the cached scrape when it exists; otherwise scrape once and cache it.
if os.path.exists('stories.csv'):
    df = pd.read_csv('stories.csv')
else:
    # Keep 'id' as an ordinary column (not the index): later cells select
    # df['id'], and the cached branch above also has 'id' as a column.
    df = pd.DataFrame(get_data())
    # Persist the scrape so the existence check above can succeed next run.
    df.to_csv('stories.csv', index=False)
    # Read the file back so both branches hand identical dtypes to later cells.
    df = pd.read_csv('stories.csv')

print(df.shape)
df.head(5)


(24353, 8)


Unnamed: 0,id,title,selftext,created_utc,subreddit,num_comments,score,upvote_ratio
0,x35iu6,TIFU / My (20F) girlfriend of two years told m...,\nA little back story; when I first started ha...,1662033000.0,tifu,4997,183765,0.95
1,a99fw9,TIFU by buying everyone an AncestryDNA kit and...,"Earlier this year, AncestryDNA had a sale on t...",1545691000.0,tifu,8828,173987,0.95
2,ak2k64,TIFU by living in a dark bedroom for 6 years.,The overhead fan in our bedroom uses one of th...,1548522000.0,tifu,2566,148075,0.96
3,bbgmzp,TIFU by spending the last year on reddit talki...,Today was the day I realised I messed up by no...,1554862000.0,tifu,3120,143727,0.95
4,i3xnlq,TIFU by owning a Golden Retriever while being ...,Sigh. \n\nWork was rough today and all I wante...,1596597000.0,tifu,7084,139831,0.9


### Functions necessary to clean data

In [104]:
from datetime import datetime
from textblob import TextBlob
import re 
import textstat

def calculate_downvotes(upvotes, upvote_ratio):
    """Estimate downvotes from an upvote count and an upvote ratio.

    Divides ``upvotes`` by ``upvote_ratio`` to get an implied vote total and
    subtracts ``upvotes`` from it. Works on anything that supports ``/`` and
    ``-`` (it is called with pandas Series in this notebook).
    """
    implied_total = upvotes / upvote_ratio
    return implied_total - upvotes

def convert_to_time(string):
    """Parse an "HH:MM" string into a datetime.

    Raises:
        ValueError: if ``string`` does not match the "%H:%M" format.
    """
    clock_format = "%H:%M"
    return datetime.strptime(string, clock_format)

def handle_date(created_time, tz=None):
    """Split a POSIX timestamp into a date string and a time-of-day label.

    Args:
        created_time: Seconds since the epoch (int or float).
        tz: Optional ``datetime.tzinfo`` used for the conversion. With the
            default ``None`` the timestamp is converted to the local time of
            the machine running the code, as before; pass ``timezone.utc``
            for results that do not depend on the machine.

    Returns:
        dict: ``{"date": "MM-DD-YYYY", "time_of_day": label}`` where label is:
            Midnight  00:00-04:59
            Dawn      05:00-05:59
            Morning   06:00-11:59
            Noon      12:00-12:59
            Afternoon 13:00-16:59
            Evening   17:00-18:59
            Night     19:00-23:59
    """
    dt = datetime.fromtimestamp(created_time, tz)

    date = f"{dt.month:02d}-{dt.day:02d}-{dt.year}"

    # Every bucket starts and ends on a whole hour, so the hour alone decides
    # the label. Comparing hours also keeps the boundary minutes (05:00,
    # 05:59, 06:00, ...) in their own bucket instead of falling to "Night".
    hour = dt.hour
    if hour < 5:
        time_of_day = "Midnight"
    elif hour < 6:
        time_of_day = "Dawn"
    elif hour < 12:
        time_of_day = "Morning"
    elif hour < 13:
        time_of_day = "Noon"
    elif hour < 17:
        time_of_day = "Afternoon"
    elif hour < 19:
        time_of_day = "Evening"
    else:
        time_of_day = "Night"

    return {"date": date, "time_of_day": time_of_day}

def get_word_list(string):
    """Return the words found in ``string`` as a list.

    The input is converted with ``str()`` first. A word starts with a word
    character and may continue with word characters, apostrophes or hyphens.
    """
    text = str(string)
    word_pattern = r"\b\w[\w'-]*\b"
    return re.findall(word_pattern, text)

def divider(num1, num2):
    """Return ``num1 / num2``, or 0 when ``num2`` is not greater than zero."""
    if num2 > 0:
        return num1 / num2
    return 0

def get_avg_word_length(arr):
    """Return the mean length of the words in ``arr``, rounded to 3 decimals.

    An empty ``arr`` yields 0.
    """
    word_count = len(arr)
    total_chars = sum(map(len, arr))
    if word_count > 0:
        mean_length = total_chars / word_count
    else:
        mean_length = 0
    return round(mean_length, 3)

def get_sentences(string):
    """Return the ``sentences`` attribute of a TextBlob built from ``string``."""
    blob = TextBlob(string)
    return blob.sentences

def get_sentiment(string):
    """Return the ``sentiment`` attribute of a TextBlob built from ``string``.

    Callers in this notebook read ``.polarity`` and ``.subjectivity`` from
    the result.
    """
    blob = TextBlob(string)
    return blob.sentiment

def get_syllable_count(string):
    """Return ``textstat.syllable_count`` for ``string``."""
    syllables = textstat.syllable_count(string)
    return syllables

def get_reading_score(string):
    """Return ``textstat.flesch_reading_ease`` for ``string``."""
    score = textstat.flesch_reading_ease(string)
    return score

def get_reading_grade(string):
    """Return ``textstat.text_standard`` for ``string``."""
    grade = textstat.text_standard(string)
    return grade

def get_story_sent_info(sentences):
    """Summarise per-sentence statistics for a story.

    Args:
        sentences: Sized iterable of sentence objects; each one is converted
            with ``str()`` before it is measured.

    Returns:
        dict: The sentence count plus the per-sentence averages (rounded to
        3 decimals) of word count, syllable count, polarity, subjectivity
        and reading score. With no sentences every average is 0.
    """
    sentence_count = len(sentences)
    totals = {
        "words": 0,
        "syllables": 0,
        "polarity": 0,
        "subjectivity": 0,
        "readscore": 0,
    }

    # Plain running totals, added in sentence order.
    for raw_sentence in sentences:
        text = str(raw_sentence)
        mood = get_sentiment(text)
        totals["words"] += len(get_word_list(text))
        totals["syllables"] += get_syllable_count(text)
        totals["polarity"] += mood.polarity
        totals["subjectivity"] += mood.subjectivity
        totals["readscore"] += get_reading_score(text)

    def per_sentence(total):
        # Average over the sentences, rounded the same way for every metric.
        return round(divider(total, sentence_count), 3)

    return {
        "amount_sentences": sentence_count,
        "avg_words_per_sentence": per_sentence(totals["words"]),
        "avg_syllables_per_sentence": per_sentence(totals["syllables"]),
        "avg_polarity_per_sentence": per_sentence(totals["polarity"]),
        "avg_subjectivity_per_sentence": per_sentence(totals["subjectivity"]),
        "avg_readscore_per_sentence": per_sentence(totals["readscore"]),
    }

def info(string, isTitle):
    """Compute the text features for a title or a story body.

    Args:
        string: The text to measure.
        isTitle: True to prefix the keys with ``title_``; False to prefix
            them with ``story_`` and also add the per-sentence statistics
            from ``get_story_sent_info``.

    Returns:
        dict: Length, word count, average word length, syllable count,
        reading score, reading grade, polarity and subjectivity of
        ``string`` (plus the sentence statistics for stories), in that order.
    """
    words = get_word_list(string)
    name = 'title' if isTitle else 'story'

    # Analyse the sentiment once and read both scores from the same result;
    # the text would otherwise be analysed twice for every title and story.
    sentiment = get_sentiment(string)

    result = {
        f'{name}_length': len(string),
        f'{name}_word_count': len(words),
        f'{name}_avg_word_length': get_avg_word_length(words),
        f'{name}_syllables': get_syllable_count(string),
        f'{name}_reading_score': get_reading_score(string),
        f'{name}_reading_grade': get_reading_grade(string),
        f'{name}_polarity': round(sentiment.polarity, 3),
        f'{name}_subjectivity': round(sentiment.subjectivity, 3),
    }

    if not isTitle:
        result.update(get_story_sent_info(get_sentences(string)))

    return result


In [105]:
# Keep only posts that have both a non-empty title and a non-empty body.
# Missing values compare as False here, so they are dropped too.
has_title = df['title'].str.len() > 0
has_story = df['selftext'].str.len() > 0

# .copy() makes the filtered frame independent of the one it was sliced
# from, so the column assignments in later cells do not trigger
# SettingWithCopyWarning.
df = df[has_title & has_story].copy()

print(df.shape)
df.head(5)

(22722, 8)


Unnamed: 0,id,title,selftext,created_utc,subreddit,num_comments,score,upvote_ratio
0,x35iu6,TIFU / My (20F) girlfriend of two years told m...,\nA little back story; when I first started ha...,1662033000.0,tifu,4997,183765,0.95
1,a99fw9,TIFU by buying everyone an AncestryDNA kit and...,"Earlier this year, AncestryDNA had a sale on t...",1545691000.0,tifu,8828,173987,0.95
2,ak2k64,TIFU by living in a dark bedroom for 6 years.,The overhead fan in our bedroom uses one of th...,1548522000.0,tifu,2566,148075,0.96
3,bbgmzp,TIFU by spending the last year on reddit talki...,Today was the day I realised I messed up by no...,1554862000.0,tifu,3120,143727,0.95
4,i3xnlq,TIFU by owning a Golden Retriever while being ...,Sigh. \n\nWork was rough today and all I wante...,1596597000.0,tifu,7084,139831,0.9


### Applying functions to clean data

In [106]:
# Downvote estimate, truncated to a whole number by the int cast.
downvote_estimate = calculate_downvotes(df['score'], df['upvote_ratio'])
df['downvotes'] = downvote_estimate.astype(int)

# Date string and time-of-day label derived from the creation timestamp.
date_columns = ['date', 'time_of_day']
df[date_columns] = df['created_utc'].apply(lambda x: pd.Series(handle_date(x)))

# Text features of the title.
title_columns = [
    'title_length',
    'title_word_count',
    'title_avg_word_length',
    'title_syllables',
    'title_reading_score',
    'title_reading_grade',
    'title_polarity',
    'title_subjectivity',
]
df[title_columns] = df['title'].apply(lambda x: pd.Series(info(x, True)))

# Text features of the story body, including the per-sentence averages.
story_columns = [
    'story_length',
    'story_word_count',
    'story_avg_word_length',
    'story_syllables',
    'story_reading_score',
    'story_reading_grade',
    'story_polarity',
    'story_subjectivity',
    'amount_sentences',
    'avg_words_per_sentence',
    'avg_syllables_per_sentence',
    'avg_polarity_per_sentence',
    'avg_subjectivity_per_sentence',
    'avg_readscore_per_sentence',
]
df[story_columns] = df['selftext'].apply(lambda x: pd.Series(info(x, False)))


print(df.shape)
df.head(5)

(22722, 33)


Unnamed: 0,id,title,selftext,created_utc,subreddit,num_comments,score,upvote_ratio,downvotes,date,...,story_reading_score,story_reading_grade,story_polarity,story_subjectivity,amount_sentences,avg_words_per_sentence,avg_syllables_per_sentence,avg_polarity_per_sentence,avg_subjectivity_per_sentence,avg_readscore_per_sentence
0,x35iu6,TIFU / My (20F) girlfriend of two years told m...,\nA little back story; when I first started ha...,1662033000.0,tifu,4997,183765,0.95,9671,09-01-2022,...,75.44,7th and 8th grade,0.052,0.535,16,22.812,29.0,0.04,0.504,77.457
1,a99fw9,TIFU by buying everyone an AncestryDNA kit and...,"Earlier this year, AncestryDNA had a sale on t...",1545691000.0,tifu,8828,173987,0.95,9157,12-24-2018,...,81.73,5th and 6th grade,0.143,0.447,26,14.423,18.808,0.063,0.264,76.44
2,ak2k64,TIFU by living in a dark bedroom for 6 years.,The overhead fan in our bedroom uses one of th...,1548522000.0,tifu,2566,148075,0.96,6169,01-26-2019,...,70.87,9th and 10th grade,0.067,0.51,14,23.857,29.857,0.017,0.516,76.945
3,bbgmzp,TIFU by spending the last year on reddit talki...,Today was the day I realised I messed up by no...,1554862000.0,tifu,3120,143727,0.95,7564,04-09-2019,...,70.13,8th and 9th grade,0.122,0.569,24,22.5,29.458,0.083,0.477,65.953
4,i3xnlq,TIFU by owning a Golden Retriever while being ...,Sigh. \n\nWork was rough today and all I wante...,1596597000.0,tifu,7084,139831,0.9,15536,08-04-2020,...,77.98,6th and 7th grade,0.045,0.438,70,18.657,23.571,0.048,0.289,78.577


In [108]:
# Final column order: identifiers, posting time, title features,
# story features, then the engagement numbers.
columns = [
    'id', 'subreddit',
    'date', 'time_of_day',
    'title_length', 'title_word_count', 'title_avg_word_length',
    'title_syllables', 'title_reading_score', 'title_reading_grade',
    'title_polarity', 'title_subjectivity',
    'story_length', 'story_word_count', 'story_avg_word_length',
    'story_syllables', 'story_reading_score', 'story_reading_grade',
    'story_polarity', 'story_subjectivity',
    'amount_sentences', 'avg_words_per_sentence',
    'avg_syllables_per_sentence', 'avg_polarity_per_sentence',
    'avg_subjectivity_per_sentence', 'avg_readscore_per_sentence',
    'num_comments', 'upvotes', 'upvote_ratio', 'downvotes',
]

# Rename 'score' to 'upvotes', then keep only the columns listed above.
df = df.rename(columns={"score": "upvotes"})[columns]

df.head(5)


Unnamed: 0,id,subreddit,date,time_of_day,title_length,title_word_count,title_avg_word_length,title_syllables,title_reading_score,title_reading_grade,...,amount_sentences,avg_words_per_sentence,avg_syllables_per_sentence,avg_polarity_per_sentence,avg_subjectivity_per_sentence,avg_readscore_per_sentence,num_comments,upvotes,upvote_ratio,downvotes
0,x35iu6,tifu,09-01-2022,Morning,116,24,3.625,29,80.96,8th and 9th grade,...,16,22.812,29.0,0.04,0.504,77.457,4997,183765,0.95,9671
1,a99fw9,tifu,12-24-2018,Evening,64,10,5.5,20,27.49,11th and 12th grade,...,26,14.423,18.808,0.063,0.264,76.44,8828,173987,0.95,9157
2,ak2k64,tifu,01-26-2019,Morning,45,10,3.5,13,86.71,3rd and 4th grade,...,14,23.857,29.857,0.017,0.516,76.945,2566,148075,0.96,6169
3,bbgmzp,tifu,04-09-2019,Night,77,14,4.5,22,57.27,5th and 6th grade,...,24,22.5,29.458,0.083,0.477,65.953,3120,143727,0.95,7564
4,i3xnlq,tifu,08-04-2020,Night,52,9,4.778,15,53.88,8th and 9th grade,...,70,18.657,23.571,0.048,0.289,78.577,7084,139831,0.9,15536
