# Retrieving Data 

1. The data will be retrieved with PRAW (the Python Reddit API Wrapper) by going through 25 of Reddit's most popular storytelling subreddits.
2. The retrieved data will include for each instance of a post: 
    - id: to keep track
    - title: first taste of the story
    - the story within the post: main point of analysis
    - the time it was created: time of story post might be related to success of story
    - the subreddit it was made within: the followers of each subreddit could cause more success
    - number of comments: more conversation means better story
    - upvote amount: more upvotes means better story as well
    - upvote ratio: important for understanding downvotes, which are a marker of poor storytelling
3. For each subreddit, we aim to retrieve 1,000 posts, although Reddit sometimes returns fewer; in total this gives roughly 25,000 posts from 25 unique subreddits.

In [35]:
import praw
import time
import pandas as pd
from textblob import TextBlob

# Imported here so this cell stays self-contained.
from getpass import getpass

# Build the PRAW client used by every later request to Reddit. The credentials
# are prompted for at run time so they never have to be saved in the notebook.
reddit = praw.Reddit(
    client_id=input("Enter client_id: "),
    # getpass reads the secret without echoing it, so it does not end up in
    # the saved cell output the way a value typed into input() would.
    client_secret=getpass("Enter client_secret: "),
    user_agent="stories",
)


In [36]:
# The 25 story-telling subreddits to scrape; get_data() visits them in this order.
subreddits = [
    "TIFU", "IAmA", "relationships", "nosleep", "prorevenge",
    "casualconversation", "personalfinance", "confession", "MaliciousCompliance",
    "AmItheAsshole", "JustNoMIL", "creepypasta",
    "shortscarystories", "ScaryStories", "Paranormal",
    "UnresolvedMysteries", "TalesFromRetail", "TalesFromTechSupport",
    "TalesFromYourServer", "TalesFromTheFrontDesk", "TalesFromTheCustomer",
    "TalesFromThePharmacy", "TalesFromThePizzaGuy", "TalesFromCallCenters",
    "TalesFromTheSquadCar"
]


# Post attributes copied into each row of the raw dataset. get_data() reads
# them with getattr(..., None), so an attribute a post lacks is stored as None.
attributes = [
    'id', 'title', 'selftext', 'created_utc', 'subreddit',
    'num_comments', 'score', 'upvote_ratio'
]

In [37]:
def get_data(limit=1000, pause_seconds=60):
    """Collect the top posts of every subreddit listed in ``subreddits``.

    Args:
        limit: Maximum number of top posts requested from each subreddit.
        pause_seconds: Seconds to wait between two consecutive subreddits.

    Returns:
        A list with one dict per post, keyed by the names in ``attributes``.
        An attribute that a post does not have is stored as ``None``.
    """
    data = []
    last_index = len(subreddits) - 1

    for index, subreddit in enumerate(subreddits):
        current = reddit.subreddit(subreddit)
        for post in current.top(limit=limit):
            post_data = {attr: getattr(post, attr, None) for attr in attributes}
            # PRAW hands back the subreddit as a model object rather than text.
            # Storing its string form (the display name) keeps the rows made of
            # plain values, matching what is read back from the cached CSV.
            if post_data.get('subreddit') is not None:
                post_data['subreddit'] = str(post_data['subreddit'])
            data.append(post_data)

        # Pause only between subreddits; sleeping after the last one would
        # just delay the return without sparing any request.
        if index < last_index:
            time.sleep(pause_seconds)

    return data

# Cleaning the Data

In [38]:
import os

# Scraping takes a long time, so the raw posts are cached in stories.csv:
# load the cache when it exists, otherwise scrape once and write the cache so
# that later runs can skip the download.
if os.path.exists('stories.csv'):
    df = pd.read_csv('stories.csv')
else:
    df = pd.DataFrame(get_data())
    # Without this write the cache file is never created and every run
    # scrapes Reddit again. index=False keeps 'id' as an ordinary column,
    # which is the layout the load branch above expects.
    df.to_csv('stories.csv', index=False)

# Posts are identified by their Reddit id from here on.
df.set_index('id', inplace=True)
print(df.shape)
df.sample(n=5)


(24353, 7)


Unnamed: 0_level_0,title,selftext,created_utc,subreddit,num_comments,score,upvote_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
crfff4,"What part of ""I do not have that type of room ...","About ten minutes ago I had a phone call, it w...",1566004000.0,TalesFromTheFrontDesk,87,1725,0.99
dweyph,To the couple who left a $0.76 tip,I despise you. Not just for the poor excuse of...,1573762000.0,TalesFromYourServer,475,4613,0.91
o2h88m,"Scam with Bank of America, Zelle and Chase",So I wanted to write about a scam I \*almost\*...,1623993000.0,personalfinance,711,6514,0.97
p0mwc9,Factory 4,I work the line at Factory 4. The work is hard...,1628456000.0,shortscarystories,69,1775,0.99
byiwjh,AITA for exposing the real reason my sister an...,"throwaway, and I will be keeping this is Anony...",1560076000.0,AmItheAsshole,2215,25765,0.9


### Functions necessary to clean data

In [39]:
from datetime import datetime
from textblob import TextBlob
import re 
import textstat

def calculate_downvotes(upvotes, upvote_ratio):
    """Estimate how many downvotes a post received.

    Dividing the upvotes by the upvote ratio gives the implied total number
    of votes; whatever remains after removing the upvotes is returned as the
    downvotes. Works element-wise when both arguments are pandas Series.
    """
    total_votes = upvotes / upvote_ratio
    return total_votes - upvotes

def convert_to_time(string):
    """Parse an "HH:MM" clock string into a datetime.

    Only the hour and the minute are taken from the input, so the returned
    values are meant to be compared with each other as times of day.
    """
    clock_format = "%H:%M"
    return datetime.strptime(string, clock_format)

def handle_date(created_time, tz=None):
    """Split a Unix timestamp into a calendar date and a named part of the day.

    Args:
        created_time: Seconds since the Unix epoch (e.g. a post's created_utc).
        tz: Optional tzinfo used to interpret the timestamp. The default of
            None converts to the local time zone of the machine running the
            code, so the result depends on where it is executed.

    Returns:
        A dict with:
          - "date": the day formatted as MM-DD-YYYY;
          - "time_of_day": one of "Midnight" (00:00-04:59), "Dawn" (05:00-05:59),
            "Morning" (06:00-11:59), "Noon" (12:00-12:59),
            "Afternoon" (13:00-16:59), "Evening" (17:00-18:59) or
            "Night" (19:00-23:59).
    """
    dt = datetime.fromtimestamp(created_time, tz)
    date = f"{dt.month:02d}-{dt.day:02d}-{dt.year}"

    # Every bucket starts and ends on a whole hour, so the hour alone selects
    # it. This also places the first and last minute of each range (05:00,
    # 11:59, 12:00, ...) in its own bucket rather than in "Night".
    hour = dt.hour
    if hour < 5:
        time_of_day = "Midnight"
    elif hour < 6:
        time_of_day = "Dawn"
    elif hour < 12:
        time_of_day = "Morning"
    elif hour < 13:
        time_of_day = "Noon"
    elif hour < 17:
        time_of_day = "Afternoon"
    elif hour < 19:
        time_of_day = "Evening"
    else:
        time_of_day = "Night"

    return {"date": date, "time_of_day": time_of_day}

def get_fre_score_level(score):
    """Translate a reading-ease score into a difficulty label.

    The label belongs to the first band whose upper bound lies strictly above
    the score; a score of 90 or more is labelled 'Very Easy'.
    """
    # (exclusive upper bound, label), ordered from hardest to easiest.
    bands = (
        (30, 'Very Confusing'),
        (50, 'Difficult'),
        (60, 'Fairly Difficult'),
        (70, 'Standard'),
        (80, 'Fairly Easy'),
        (90, 'Easy'),
    )
    return next((label for limit, label in bands if score < limit), 'Very Easy')

def get_word_list(string):
    """Return the words found in the given value, in order of appearance.

    The value is converted to text first. A word begins with a word character
    and may continue with word characters, apostrophes or hyphens, so
    contractions and hyphenated compounds stay in one piece.
    """
    text = str(string)
    word_pattern = r"\b\w[\w'-]*\b"
    return re.findall(word_pattern, text)

def divider(num1, num2):
    """Divide num1 by num2, returning 0 whenever num2 is not positive."""
    if num2 > 0:
        return num1 / num2
    return 0

def get_avg_word_length(arr):
    """Return the mean length of the words in arr, rounded to 3 decimals.

    An empty collection gives 0, because divider() returns 0 for a zero count.
    """
    total_characters = sum(map(len, arr))
    average = divider(total_characters, len(arr))
    return round(average, 3)

def get_sentences(string):
    """Split the text into sentences using TextBlob."""
    blob = TextBlob(string)
    return blob.sentences

def get_sentiment(string):
    """Return TextBlob's sentiment for the text.

    Callers in this file read ``.polarity`` and ``.subjectivity`` from the result.
    """
    blob = TextBlob(string)
    return blob.sentiment

def get_syllable_count(string):
    """Return the number of syllables textstat counts in the text."""
    syllables = textstat.syllable_count(string)
    return syllables

def get_reading_score(string):
    """Return textstat's Flesch reading-ease score for the text."""
    reading_ease = textstat.flesch_reading_ease(string)
    return reading_ease

def get_reading_grade(string):
    """Return textstat's overall grade-level estimate for the text.

    The value is a label such as "7th and 8th grade".
    """
    grade_label = textstat.text_standard(string)
    return grade_label

def get_story_sent_info(sentences):
    """Summarise a story sentence by sentence.

    Args:
        sentences: Sequence of sentences; each one is converted to text with
            str() before it is measured.

    Returns:
        A dict holding the sentence count and the per-sentence averages of
        word count, syllable count, polarity, subjectivity and reading score.
        Every average is rounded to 3 decimals and is 0 when there are no
        sentences.
    """
    sentence_count = len(sentences)
    total_words = total_syllables = 0
    total_polarity = total_subjectivity = total_readscore = 0

    for item in sentences:
        text = str(item)
        mood = get_sentiment(text)
        total_words += len(get_word_list(text))
        total_syllables += get_syllable_count(text)
        total_polarity += mood.polarity
        total_subjectivity += mood.subjectivity
        total_readscore += get_reading_score(text)

    def per_sentence(total):
        # Average over all sentences, guarded against an empty story.
        return round(divider(total, sentence_count), 3)

    return {
        "amount_sentences": sentence_count,
        "avg_words_per_sentence": per_sentence(total_words),
        "avg_syllables_per_sentence": per_sentence(total_syllables),
        "avg_polarity_per_sentence": per_sentence(total_polarity),
        "avg_subjectivity_per_sentence": per_sentence(total_subjectivity),
        "avg_readscore_per_sentence": per_sentence(total_readscore),
    }

def info(string, isTitle):
    """Compute the text features stored for a post title or story body.

    Args:
        string: The text to analyse.
        isTitle: True to prefix every key with 'title_'; False to prefix with
            'story_' and to append the per-sentence statistics produced by
            get_story_sent_info().

    Returns:
        A dict with the length, word count, average word length, syllable
        count, reading score, reading grade, reading difficulty, polarity and
        subjectivity of the text, plus the per-sentence statistics for stories.
    """
    words = get_word_list(string)
    name = 'title' if isTitle else 'story'

    # Scoring and sentiment analysis are the costly steps, so each runs once
    # and its result is reused for the two fields that depend on it.
    reading_score = get_reading_score(string)
    sentiment = get_sentiment(string)

    result = {
        f'{name}_length': len(string),
        f'{name}_word_count': len(words),
        f'{name}_avg_word_length': get_avg_word_length(words),
        f'{name}_syllables': get_syllable_count(string),
        f'{name}_reading_score': reading_score,
        f'{name}_reading_grade': get_reading_grade(string),
        f'{name}_reading_difficulty': get_fre_score_level(reading_score),
        f'{name}_polarity': round(sentiment.polarity, 3),
        f'{name}_subjectivity': round(sentiment.subjectivity, 3),
    }

    if not isTitle:
        result.update(get_story_sent_info(get_sentences(string)))

    return result


In [40]:
# Keep only the posts that have both a non-empty title and a non-empty story;
# rows where either text is missing fail the length comparison and are dropped.
has_title = df['title'].str.len() > 0
has_story = df['selftext'].str.len() > 0
df = df[has_title & has_story]

print(df.shape)
df.sample(n=5)

(22722, 7)


Unnamed: 0_level_0,title,selftext,created_utc,subreddit,num_comments,score,upvote_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
a8lkxb,“did that hurt?”,"Short, but bewildering:\n\nI m standing agains...",1545492000.0,TalesFromRetail,117,2416,0.99
b1nw0k,Look after my kids... again.,"Back when I was a teenager, my Aunt would ask ...",1552706000.0,ProRevenge,262,8428,0.97
he4yn5,Heard a deep voice speaking a strange language...,I m tagging this sleep paralysis because I thi...,1592875000.0,Paranormal,139,700,0.97
gd3teh,Customers whining about call experiences vs. w...,"**Complaint:** ""I was on hold for twenty minut...",1588562000.0,talesfromcallcenters,125,951,0.99
dxo1a1,The customer's always right about the price......,A recent post here reminded me of this. So I h...,1574005000.0,TalesFromRetail,140,3519,0.99


### Applying functions to clean data

In [41]:
# Downvotes are derived from the score and the upvote ratio, then cast to
# whole numbers.
estimated_downvotes = calculate_downvotes(df['score'], df['upvote_ratio'])
df['downvotes'] = estimated_downvotes.astype(int)

# Calendar date and named part of the day of each post.
date_columns = ['date', 'time_of_day']
df[date_columns] = df['created_utc'].apply(
    lambda created: pd.Series(handle_date(created))
)

# Text features of the title; the names follow the keys returned by info().
title_feature_columns = [
    'title_length', 'title_word_count', 'title_avg_word_length',
    'title_syllables', 'title_reading_score', 'title_reading_grade',
    'title_reading_difficulty', 'title_polarity', 'title_subjectivity',
]
df[title_feature_columns] = df['title'].apply(
    lambda title: pd.Series(info(title, True))
)

# Text features of the story body, including the per-sentence averages.
story_feature_columns = [
    'story_length', 'story_word_count', 'story_avg_word_length',
    'story_syllables', 'story_reading_score', 'story_reading_grade',
    'story_reading_difficulty', 'story_polarity', 'story_subjectivity',
    'amount_sentences', 'avg_words_per_sentence', 'avg_syllables_per_sentence',
    'avg_polarity_per_sentence', 'avg_subjectivity_per_sentence',
    'avg_readscore_per_sentence',
]
df[story_feature_columns] = df['selftext'].apply(
    lambda story: pd.Series(info(story, False))
)


print(df.shape)
df.sample(n=5)

(22722, 34)


Unnamed: 0_level_0,title,selftext,created_utc,subreddit,num_comments,score,upvote_ratio,downvotes,date,time_of_day,...,story_reading_grade,story_reading_difficulty,story_polarity,story_subjectivity,amount_sentences,avg_words_per_sentence,avg_syllables_per_sentence,avg_polarity_per_sentence,avg_subjectivity_per_sentence,avg_readscore_per_sentence
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
hdf7g0,i still can't believe it that every upvote is ...,&#x200B;\n\ni got bullied a lot and i never re...,1592776000.0,CasualConversation,276,28039,0.9,3115,06-21-2020,Evening,...,7th and 8th grade,Easy,0.166,0.367,18,15.444,19.278,0.091,0.289,87.291
ifgm3q,I kissed a girl today,"Edit: Woman, not girl.\n\nI separated from ex-...",1598236000.0,CasualConversation,278,14600,0.93,1098,08-23-2020,Night,...,3rd and 4th grade,Easy,0.143,0.542,19,9.211,12.105,0.11,0.386,85.282
zzpfgs,"Adult man tells his mommy on me, then hits me ...",I had a lady come down asking for my assistanc...,1672475000.0,TalesFromTheFrontDesk,37,1720,0.98,35,12-31-2022,Midnight,...,7th and 8th grade,Easy,0.052,0.452,23,21.87,26.913,0.067,0.338,80.176
gk7wiu,TIFU by hiding in my girlfriend's room when he...,"Long time Reddit reader, first time making an ...",1589544000.0,tifu,2311,49617,0.83,10162,05-15-2020,Morning,...,4th and 5th grade,Very Easy,0.057,0.474,163,13.736,16.945,0.031,0.319,88.429
cqvfk2,Does anyone remember picking their parents?,"Odd post, I know, but I think it falls under t...",1565901000.0,Paranormal,537,738,0.93,55,08-15-2019,Afternoon,...,6th and 7th grade,Fairly Easy,0.109,0.556,7,12.571,17.286,0.045,0.364,73.026


In [43]:
# Final column layout: post context first, then title features, story
# features, per-sentence averages, and the engagement figures last.
post_context = ['subreddit', 'date', 'time_of_day']
title_features = [
    'title_length', 'title_word_count', 'title_avg_word_length',
    'title_syllables', 'title_reading_score', 'title_reading_grade',
    'title_reading_difficulty', 'title_polarity', 'title_subjectivity',
]
story_features = [
    'story_length', 'story_word_count', 'story_avg_word_length',
    'story_syllables', 'story_reading_score', 'story_reading_grade',
    'story_reading_difficulty', 'story_polarity', 'story_subjectivity',
]
sentence_features = [
    'amount_sentences', 'avg_words_per_sentence', 'avg_syllables_per_sentence',
    'avg_polarity_per_sentence', 'avg_subjectivity_per_sentence',
    'avg_readscore_per_sentence',
]
engagement = ['num_comments', 'upvotes', 'upvote_ratio', 'downvotes']

columns = (
    post_context + title_features + story_features
    + sentence_features + engagement
)

# 'score' is exposed as 'upvotes'; columns not listed above (the raw title,
# story text and timestamp) are left out of the final table.
df = df.rename(columns={"score": "upvotes"})[columns]

df.sample(n=5)


Unnamed: 0_level_0,subreddit,date,time_of_day,title_length,title_word_count,title_avg_word_length,title_syllables,title_reading_score,title_reading_grade,title_reading_difficulty,...,amount_sentences,avg_words_per_sentence,avg_syllables_per_sentence,avg_polarity_per_sentence,avg_subjectivity_per_sentence,avg_readscore_per_sentence,num_comments,upvotes,upvote_ratio,downvotes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
asxr9t,TalesFromTheCustomer,02-20-2019,Night,46,7,5.714,12,55.91,-1th and 0th grade,Fairly Difficult,...,11,22.364,30.091,0.048,0.353,68.775,85,869,0.98,17
6uqkc1,TalesFromRetail,08-19-2017,Afternoon,43,9,3.778,10,104.64,3rd and 4th grade,Very Easy,...,6,38.667,50.333,-0.043,0.555,69.312,190,4407,0.96,183
di8aq9,TalesFromTheCustomer,10-15-2019,Morning,36,6,5.167,11,48.47,7th and 8th grade,Difficult,...,19,19.579,25.053,0.079,0.264,76.985,39,1410,0.99,14
7pfbpj,talesfromtechsupport,01-10-2018,Morning,62,13,3.385,15,98.72,-1th and 0th grade,Very Easy,...,28,15.321,20.214,0.104,0.227,87.101,200,4383,0.98,89
1c89bii,TalesFromThePizzaGuy,04-19-2024,Evening,51,12,3.167,12,95.17,2nd and 3rd grade,Very Easy,...,24,11.292,13.75,0.003,0.123,91.741,32,376,0.98,7


In [44]:
# Save the cleaned, feature-enriched table to an Excel workbook.
df.to_excel('storiesDB.xlsx')