In [9]:
import json
from datetime import datetime as dt
from pathlib import Path

import pandas as pd


In [7]:
# Borrowed from here: https://stackoverflow.com/questions/5891555/display-the-date-like-may-5th-using-pythons-strftime

def suffix(d):
    """Return the English ordinal suffix for the integer ``d``.

    Args:
        d: A non-negative integer, e.g. a day of the month.

    Returns:
        One of ``'st'``, ``'nd'``, ``'rd'`` or ``'th'``.
    """
    # 11, 12 and 13 take 'th' in every hundred (11th, 112th, 213th), so the
    # exception is tested on the last two digits rather than on the raw value.
    if 11 <= d % 100 <= 13:
        return 'th'
    return {1: 'st', 2: 'nd', 3: 'rd'}.get(d % 10, 'th')

def custom_strftime(format, t):
    """Format ``t`` with ``strftime`` and expand the ``{S}`` placeholder.

    Args:
        format: A ``strftime`` format string that may contain the literal
            token ``{S}``.
        t: An object providing ``strftime`` and a ``day`` attribute.

    Returns:
        The formatted string, with every ``{S}`` replaced by the day number
        followed by its ordinal suffix.
    """
    rendered = t.strftime(format)
    day_label = str(t.day) + suffix(t.day)
    return rendered.replace('{S}', day_label)


### Load the New Joke of the Day

In [3]:
# Build the path of today's raw joke file; the file name embeds the current
# date rendered like "August 4th, 2022".
todayLabel = custom_strftime('%B {S}, %Y', dt.now())
fileName = '../data/raw/Daily Chuck Norris Joke for {}'.format(todayLabel)
print('Loading a Chuck Norris joke to "{}"'.format(fileName))


Loading a Chuck Norris joke to "../data/raw/Daily Chuck Norris Joke for August 4th, 2022"


In [4]:
# Parse today's joke file as JSON. The context manager closes the handle even
# when json.load raises on malformed content.
with open(fileName) as f:
    data = json.load(f)


In [5]:
# Display the parsed joke record so it can be inspected before it is added to the table.
data

{'categories': [],
 'created_at': '2020-01-05 13:42:28.420821',
 'icon_url': 'https://assets.chucknorris.host/img/avatar/chuck-norris.png',
 'id': 'HT825IiPR9qy6vvxlA_hQQ',
 'updated_at': '2020-01-05 13:42:28.420821',
 'url': 'https://api.chucknorris.io/jokes/HT825IiPR9qy6vvxlA_hQQ',
 'value': "After seeing ''The Blair Witch Project'', Chuck Norris went into the woods, found the Blair Witch, and roundhouse-kicked it repeatedly until it died. When Chuck Norris pays 6 dollars to see a witch movie, you'd better show him a fuckin' witch."}

### Load the Jokes Database

In [20]:
# Load the existing jokes table when it is already on disk. Only a missing
# file counts as a first run: any other read failure (corrupt file, missing
# parquet engine, ...) is left to surface, because the Save step would
# otherwise overwrite the database with nothing but today's joke.
if Path('../data/jokes.parquet').exists():
    oldDf = pd.read_parquet('../data/jokes.parquet')
else:
    print("Maybe the file doesn't exist yet, that's okay, it will get created shortly.")

    oldDf = pd.DataFrame()

Maybe the file doesn't exist yet, that's okay, it will get created shortly.


### Add the new joke

In [37]:
# Wrap the freshly loaded joke record in a one-row frame.
newDf = pd.DataFrame.from_records([data])

# Stack it under the existing table; ignore_index renumbers the rows 0..n-1.
df = pd.concat(
    [oldDf, newDf],
    ignore_index=True,
)

print('The DB now contains {} jokes, but there may be duplicates.'.format(df.shape[0]))

The DB now contains 1 jokes, but there may be duplicates.


### Deduplicate

In [38]:
# Keep only the first row seen for each joke id. The result is assigned back
# to df instead of mutating the frame with inplace=True.
df = df.drop_duplicates(subset='id', keep="first")

print('The DB now contains {} jokes, all duplicates have been removed.'.format(len(df)))

The DB now contains 1 jokes, all duplicates have been removed.


### Save

In [39]:
# Write the de-duplicated table back to the jokes database file;
# index=False keeps the row index out of the stored file.
df.to_parquet(
    '../data/jokes.parquet',
    index=False,
)