In [1]:
import requests, re, json, datetime
import os, sys
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin
import uuid
from hashlib import md5
import dateparser

import sqlite3

# Set to True only when the SQLite tables must be (re)built: the flag is handed
# to sql_create_schema, which drops and recreates the tables when it is true.
first_time=False


def recombine_link_list(link_list):
    """Number the scraped links and convert their dates to epoch seconds.

    Args:
        link_list: iterable of ``(url, text, subtitle, desc, user, date)``
            tuples, where ``date`` is a date string passed to
            ``dateparser.parse``.

    Returns:
        A list of ``(seq, url, text, subtitle, desc, user, timestamp)``
        tuples in input order, ``seq`` counting from 0.  ``timestamp`` is
        the float returned by ``datetime.timestamp()``, or ``None`` when
        the date string could not be parsed.
    """
    rlist = []
    for seq, (url, text, subtitle, desc, user, date) in enumerate(link_list):
        # Parse exactly once; dateparser signals "could not interpret this"
        # by returning None rather than raising.
        parsed = dateparser.parse(date)
        if parsed is None:
            stamp = None
            # Keep the diagnostic print so unparseable link dates stay visible.
            print((seq, url, text, subtitle, desc, user, stamp))
        else:
            stamp = parsed.timestamp()
        rlist.append((seq, url, text, subtitle, desc, user, stamp))
    return rlist

def recombine_anno_list(anno_list):
    """Merge annotation fragments and convert their dates to epoch seconds.

    Entries whose user and date are both empty strings are treated as
    fragments: their text is accumulated and prepended to the next entry
    that carries a user or a date.

    Args:
        anno_list: iterable of ``(text, user, date)`` tuples.

    Returns:
        A list of ``(seq, text, user, timestamp)`` tuples, ``seq`` counting
        from 0.  Every text fragment is prefixed with a single space (this
        keeps the content hash computed by callers stable).  ``timestamp``
        is ``None`` when the date string could not be parsed.  Fragments
        left over after the last attributed entry are not emitted.
    """
    rlist = []
    pending = ""
    seq = -1
    for text, user, date in anno_list:
        pending = pending + " " + text
        if user == "" and date == "":
            # Unattributed fragment: keep accumulating.
            continue
        seq += 1
        # dateparser returns None for text it cannot interpret; store None
        # instead of failing on ``None.timestamp()``.
        parsed = dateparser.parse(date)
        stamp = parsed.timestamp() if parsed is not None else None
        rlist.append((seq, pending, user, stamp))
        pending = ""
    return rlist

def scrape_link_values(link_list_soup_element):
    """Extract the fields of one link entry from a parsed idea page.

    Args:
        link_list_soup_element: BeautifulSoup element heading a link entry.
            Its first ``<a>`` supplies the URL and the subtitle; its next
            sibling element supplies the remaining fields.

    Returns:
        ``(link_url, link_text, link_subtitle, link_desc, link_user,
        link_date)``, all strings.  ``link_text`` is ``""`` when the sibling
        has no ``<nobr>``.  ``link_date`` is an ISO-8601 string when a
        ``"Mon DD YYYY"`` date is found in the sliced text, otherwise the
        raw slice.

    Missing anchor, sibling or ``<br>`` elements are not handled here and
    propagate as errors.
    """
    anchor = link_list_soup_element.find('a')
    link_url = anchor['href']
    link_subtitle = "".join(anchor.strings)

    detail = link_list_soup_element.find_next_sibling()
    try:
        link_text = "".join(detail.find('nobr').strings)
    except AttributeError:
        # No sibling or no <nobr> inside it.
        link_text = ""
    link_desc = "".join(detail.find('br').next_element)
    link_user = "".join(detail.find('a').strings)

    # The date is sliced from the text that follows the user name, up to the
    # next ']'.  The "+ 2" skips two characters after the name.
    # NOTE(review): str.find locates the FIRST occurrence of the user name, so
    # a description that mentions the poster mis-slices the date - confirm
    # against the page layout before changing.
    lstring = "".join(detail.strings)
    after_user = lstring.find(link_user) + len(link_user)
    link_date = lstring[after_user + 2:lstring.find(']', after_user)]

    # Normalise "Mon DD YYYY" to ISO-8601.  The original called
    # ``datetime.strptime`` on the module (always an AttributeError, silently
    # swallowed) and used a character class instead of an alternation.
    match = re.search(
        r"((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) \d{1,2} \d{4})",
        link_date)
    if match is not None:
        try:
            link_date = datetime.datetime.strptime(
                match.group(1), "%b %d %Y").isoformat()
        except ValueError:
            # Looks like a date but is not a valid one; keep the raw text.
            pass
    return link_url, link_text, link_subtitle, link_desc, link_user, link_date

def scrape_annotations(anno_element):
    """Extract text, author and date from one annotation element.

    Args:
        anno_element: BeautifulSoup element holding the annotation.  The
            text is read from its ``<font class="fcs">``; author and date
            are read from its ``<td class="fcs">``.

    Returns:
        ``(anno_content, anno_user, anno_date)``.  ``anno_user`` and
        ``anno_date`` are both ``""`` when the ``<td class="fcs">`` or the
        ``<a>`` inside it is missing.  ``anno_date`` is the last 11
        characters of the td's text.

    A missing ``<font class="fcs">`` is not handled and raises
    ``AttributeError``.
    """
    anno_content = "".join(
        anno_element.find('font', attrs={'class': 'fcs'}).strings)

    # Explicit None checks replace the bare ``except`` so that unrelated
    # errors (and KeyboardInterrupt) are no longer swallowed.
    byline = anno_element.find('td', attrs={'class': 'fcs'})
    author = byline.find('a') if byline is not None else None
    if author is None:
        return anno_content, "", ""

    anno_user = "".join(author.strings)
    anno_date = "".join(byline.strings)[-11:]
    return anno_content, anno_user, anno_date



def get_links(s, url):
    """Fetch a listing page and return the idea URLs it links to.

    Args:
        s: object with a ``get(url)`` method returning a response that has
            ``.text`` and ``.raise_for_status()`` (e.g. ``requests.Session``).
        url: address of the listing page; relative links are resolved
            against it.

    Returns:
        Absolute idea URLs with any ``#fragment`` removed, in page order and
        without duplicates.

    Raises:
        Whatever ``raise_for_status()`` raises for an HTTP error response.
    """
    r = s.get(url)
    # Fail loudly on an error page instead of quietly harvesting nothing.
    r.raise_for_status()
    page_links_regex = re.compile(
        r'<a class="(?:newidea|oldidea)" href="(/idea/.*?)"')
    harvested = (urljoin(url, href).split("#")[0]
                 for href in page_links_regex.findall(r.text))
    # Stripping the fragment can make several links collapse to the same idea
    # URL; dict.fromkeys drops the repeats while keeping first-seen order.
    return list(dict.fromkeys(harvested))

def idea_components(hb_link, session=None):
    """Fetch one idea page and break it into a storable record.

    Args:
        hb_link: URL of the idea page.
        session: optional object with a ``get(url)`` method (e.g. a
            ``requests.Session``).  When omitted, the notebook-level session
            ``s`` is used, so existing one-argument calls keep working.

    Returns:
        A dict with the keys ``fetch_id`` (fresh UUID4 string), ``url``,
        ``hash`` (MD5 hex digest over title, description, copy, user, idea
        date, links and annos), ``title``, ``description``, ``copy``,
        ``user``, ``idea_date`` (epoch seconds), ``links``
        (from ``recombine_link_list``), ``annos`` (from
        ``recombine_anno_list``) and ``fetch_date`` (epoch seconds taken
        right after the download).

    Raises:
        Whatever ``raise_for_status()`` raises for an HTTP error response.
        Pages that do not have the expected layout are not handled and
        surface as ``AttributeError`` / ``IndexError`` / ``ValueError``.
    """
    http = s if session is None else session
    r = http.get(hb_link)
    # Fail loudly on an error page instead of crashing later inside the parser.
    r.raise_for_status()
    fetch_time = datetime.datetime.now().timestamp()

    # NOTE(review): the "html" feature string appears to let BeautifulSoup
    # pick whichever HTML parser is installed - confirm before pinning one,
    # since the table index below depends on the resulting tree.
    soup = bs(r.text, "html")
    mainpanel = soup.find('td', attrs={'class': 'mainpanel'})
    # The idea header is taken to be the third table inside the main panel.
    idea_header = mainpanel.find_all('table')[2]

    title = str("".join(idea_header.find('a', attrs={'name': 'idea'}).strings))
    fetch_id = str(uuid.uuid4())
    description = "".join(mainpanel.find('font', attrs={'class': 'fcl'}).strings)
    copy = str("".join(idea_header.find('div', attrs={'class': 'copy'}).strings))

    # The byline must split into exactly two comma-separated parts
    # ("user, date"); anything else raises ValueError on unpacking.
    byline = "".join(idea_header.find('td', attrs={'class': 'fcm'}).strings)
    user, text_date = (part.strip() for part in byline.split(","))
    idate = dateparser.parse(text_date).timestamp()

    links = recombine_link_list(
        [scrape_link_values(n)
         for n in idea_header.find_all('font', attrs={'class': 'fcm'})])
    # Annotations are the <table> siblings that follow the idea header.
    annos = recombine_anno_list(
        [scrape_annotations(n)
         for n in idea_header.next_siblings if n.name == 'table'])

    # The hash deliberately leaves out fetch_id and fetch time, so two fetches
    # of unchanged content produce the same digest.
    ihash = md5("".join(
        [str(j) for j in [title, description, copy, user, idate, links, annos]]
    ).encode("utf-8")).hexdigest()

    return {
        "fetch_id": fetch_id,
        "url": hb_link,
        "hash": ihash,
        "title": title,
        "description": description,
        "copy": copy,
        "user": user,
        "idea_date": idate,
        "links": links,
        "annos": annos,
        "fetch_date": fetch_time,
    }


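# SQLite requires dates to be converted according to some convention - here we
# store seconds since the epoch as returned by datetime.timestamp() (so the
# values are fractional, even though the columns are declared integer).
# Also, we have a multi-table structure, since annos and links are themselves
# collections of records; their rows point back to the idea via fetch_id.
# So the structure looks like:

#   +--------------------+    +--------------------+    +--------------------+
#   |  idea_fetch        |    |  anno_fetch        |    |  link_fetch        |
#   +--------------------+    +--------------------+    +--------------------+
#   |  fetch_id (pk*)    |    |  fetch_id          |    |  fetch_id          |
#   |  url               |    |  anno_seq          |    |  link_seq          |
#   |  hash              |    |  anno_text         |    |  link_url          |
#   |  title             |    |  anno_user         |    |  link_rickroll     |
#   |  description       |    |  anno_date         |    |  link_text         |
#   |  copy              |    +--------------------+    |  link_anno         |
#   |  user              |                              |  link_user         |
#   |  idea_date         |                              |  link_date         |
#   |  fetch_date        |                              +--------------------+
#   +--------------------+
#
#   * fetch_id is the key by convention only; the CREATE TABLE statements
#     declare no PRIMARY KEY or foreign-key constraints.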


def sql_create_schema(conn, first_time=False):
    """(Re)create the three fetch tables when ``first_time`` is true.

    Existing ``idea_fetch``, ``anno_fetch`` and ``link_fetch`` tables are
    dropped together with their rows and then recreated empty.

    Args:
        conn: open ``sqlite3`` connection.
        first_time: when false (the default) nothing is touched.

    Returns:
        ``True`` when the schema was (re)built, ``False`` otherwise.
    """
    if not first_time:
        return False

    # (table name, DDL) pairs; the names are fixed constants, never user input.
    schema = (
        ("idea_fetch",
         """CREATE TABLE idea_fetch
                    (   fetch_id text,
                        url text,
                        hash text,
                        title text,
                        description text,
                        copy text,
                        user text,
                        idea_date integer,
                        fetch_date integer)"""),
        ("anno_fetch",
         """CREATE TABLE anno_fetch
                    (   fetch_id text,
                        anno_seq integer,
                        anno_text text,
                        anno_user text,
                        anno_date integer
                        )"""),
        ("link_fetch",
         """CREATE TABLE link_fetch
                    (   fetch_id text,
                        link_seq integer,
                        link_url text,
                        link_rickroll text,
                        link_text text,
                        link_anno text,
                        link_user text,
                        link_date integer
                        )"""),
    )

    c = conn.cursor()
    try:
        for table_name, create_sql in schema:
            # IF EXISTS makes this work on a brand-new database too; a plain
            # DROP TABLE raises OperationalError when the table is absent.
            c.execute("DROP TABLE IF EXISTS " + table_name)
            c.execute(create_sql)
    finally:
        c.close()
    conn.commit()
    return True
    

def store_fetch_record(c, record):
    """Insert one fetched idea with its annotations and links.

    Values are bound positionally, so they must line up with the column
    order of the ``idea_fetch``, ``anno_fetch`` and ``link_fetch`` tables.
    Nothing is committed here; the caller owns the transaction.

    Args:
        c: object offering ``execute`` and ``executemany`` - a ``sqlite3``
            connection or cursor.
        record: dict with the keys ``fetch_id``, ``url``, ``hash``,
            ``title``, ``description``, ``copy``, ``user``, ``idea_date``,
            ``fetch_date``, plus ``annos`` (sequences whose first 4 items
            are stored) and ``links`` (sequences whose first 7 items are
            stored).
    """
    idea_insert_sql = """INSERT INTO idea_fetch VALUES
                        ( ?, ?, ?, ?, ?, ?, ?, ?, ? )"""
    anno_insert_sql = """INSERT INTO anno_fetch VALUES
                        ( ?, ?, ?, ?, ? )"""
    link_insert_sql = """INSERT INTO link_fetch VALUES
                    ( ?, ?, ?, ?, ?, ?, ?, ? )"""

    fetch_id = record["fetch_id"]
    idea_columns = ("url", "hash", "title", "description",
                    "copy", "user", "idea_date", "fetch_date")
    c.execute(idea_insert_sql,
              [fetch_id] + [record[key] for key in idea_columns])

    # Child rows carry the parent's fetch_id as their first column.
    c.executemany(anno_insert_sql,
                  [(fetch_id, *anno[:4]) for anno in record['annos']])
    c.executemany(link_insert_sql,
                  [(fetch_id, *link[:7]) for link in record['links']])
    

In [3]:
# Show the current schema-rebuild flag.
print(first_time)

False


In [4]:
# An idea component counts as "novel" when its hash is not yet on file; a
# novel record can be saved for posterity, and it also qualifies for review
# against the search criteria.  Details of successful searches are then logged
# independently, in a form that allows repeat matches to be filtered out.
#
# Logging mechanism: probably SQLite - it makes sense to create a database to
# host and persist the content.
conn = c = None
conn = sqlite3.connect('hb_records.db')

In [24]:
# This makes use of a search url, returning all the ideas posted today
# (dn={m} is filled with 100 below).
url = "https://www.halfbakery.com/view/ftm=r86400:s=Qr:d=irq:dn={m}:ds=0:n=Today_27s_20Notions:i=A_20list_20of_20todays_20ideas_20and_20annotations:t=Today_27s_20Notions".format(m=100)
s = requests.Session()
link_harvest = get_links(s, url)
# One record per harvested idea page.
contents = [idea_components(link) for link in link_harvest]
# URLs in the same order as `contents`.
lindex = [entry['url'] for entry in contents]
#conn.row_factory = sqlite3.Row
# If this is the first time running, then we need to create the schema.
# The flag must be passed while it is still true: clearing it before the call
# (as the earlier version did) turns the schema creation into a no-op.
if first_time:
    sql_create_schema(conn, first_time)
    first_time = False
print(lindex)

(3, 'https://www.halfbakery.com/idea/_22Duck_22_20and_20_22Ducking_22_20Word_20Processor_20Verification', '_22Duck_22_20and_20...ssor_20Verification', 'Former Nazi Chancellor of Germany mention #1', 'Last line of the idea post by doctorremulac3 on 23-NOV-2020 [', 'doctorremulac3', None)
(0, 'http://www.halfbakery.com/idea/Telepresence_20Airship', 'http://www.halfbake...epresence_20Airship', 'Telepresence Airship', 'Thank you [spacemoggy] for the inspiration. [', 'Worldgineer', None)
(1, 'https://www.halfbakery.com/view/ftm=r86400:fs=hitler:s=Q:d=iq:dn=100:ds=1', 'https://www.halfbak...=Q:d=iq:dn=100:ds=1', 'Start here...', 'Run this at same time each day and parse/compare results last seen / previous search hit? [', 'kdf', None)
['https://www.halfbakery.com/idea/Plaid_20conductor_20(Redundant_20Array_20of_20Independent_20Conductors)', 'https://www.halfbakery.com/idea/Days_20Since_20Hitler_20Was_20Mentioned_20Here', 'https://www.halfbakery.com/idea/Milligram', 'https://www.halfbakery.co

In [26]:

# Look up the most recently stored version of every URL fetched above and
# save only the records that are new or whose hash has changed.
c = conn.cursor()
retrieve_latest_sql = """
                select i.url, i.hash, i.fetch_date 
                from (
                    select url, max(fetch_date) max_fetch_date
                    from (
                        select url, fetch_date
                         from idea_fetch
                         where url in ({in_list}))
                    group by url) as latest_v
                    join idea_fetch i on 
                    i.url = latest_v.url and
                    i.fetch_date = latest_v.max_fetch_date 
                """.format(in_list=",".join("?" for _ in lindex))

# URL -> freshly fetched record; the first occurrence wins, which matches the
# earlier list.index() lookup but costs O(1) per stored row.
fetched_by_url = {}
for content in contents:
    fetched_by_url.setdefault(content['url'], content)

# URLs whose stored hash equals the fresh one need no new row.
content_filter = set()
try:
    for stored_url, stored_hash, _stored_fetch_date in c.execute(retrieve_latest_sql, lindex):
        if fetched_by_url[stored_url]['hash'] == stored_hash:
            print("Hashmatch - no update")
            content_filter.add(stored_url)
        else:
            print("Hashfail - got update")
finally:
    c.close()

save_list = set(lindex) - content_filter

# Using the connection as a context manager commits the inserts on success and
# rolls them back if one fails; without a commit the rows are lost when the
# connection is closed.
with conn:
    for content in contents:
        if content['url'] in save_list:
            store_fetch_record(conn, content)
            print("Saving", content['url'])
            # Guard against the same URL appearing twice in `contents`.
            save_list.discard(content['url'])

Hashmatch - no update
Hashmatch - no update
Hashmatch - no update
Hashmatch - no update
Hashmatch - no update
Hashmatch - no update
Hashmatch - no update
Hashmatch - no update
Hashmatch - no update
Hashmatch - no update
Hashmatch - no update
Hashmatch - no update
Hashmatch - no update
Hashmatch - no update
Hashmatch - no update
Hashmatch - no update
Hashmatch - no update


set()

In [15]:
# Dump every stored fetch as [url, hash, fetch_date], ordered by url and then
# by fetch time.
c = conn.cursor()
rs = c.execute("""select url, hash, fetch_date
                from idea_fetch 
                order by url, fetch_date """)
# Column metadata, kept at hand for labelling the values if needed.
r_cols = rs.description

for r in rs:
    print(list(r))
rs.close()
c.close()

['https://www.halfbakery.com/idea/30-300_25_20better_20foam_20earplugs', 'ebf93ada436691c17f2c903eea261df2', 1608226307.98454]
['https://www.halfbakery.com/idea/Cell_20phone_20tourism', 'a6f8a9eb2ba940e270a623217f6a13af', 1608226305.998213]
['https://www.halfbakery.com/idea/Completely_20Realistic_20Fake_20Candle', '526ae5b7cd8eaff0e38bd3d920f423c2', 1608226308.268491]
['https://www.halfbakery.com/idea/Days_20Since_20Hitler_20Was_20Mentioned_20Here', 'e983ab3e8492c98e23f6ac875f6175d7', 1608226303.061566]
['https://www.halfbakery.com/idea/Dinnerware_20tearoffs', 'e4702c677d8e609ac21d5402361fc230', 1608226307.093517]
['https://www.halfbakery.com/idea/Eugenics_20SEO', 'fdc2033e15427364dfa95f461bd122cf', 1608226308.881248]
['https://www.halfbakery.com/idea/F_fcrst_20annual_20HalfBakery_20_93Wo_20ist_20der_20F_fchrer_20_3f_94_20programming_20competition_2e', 'b92c80a6b32afe35f5e8d46edd0238c5', 1608226309.480555]
['https://www.halfbakery.com/idea/Gourmet_20dog', 'd91480008ee76740cef1058dec950

In [8]:
# Current time as float seconds since the epoch - the same representation
# idea_components uses for fetch_date.
datetime.datetime.now().timestamp()

1608226309.781665