In [1]:
%load_ext nb_black

import csv
import datetime
import io
import json
import string
from collections import Counter

from pipeline import Pipeline
from pipeline import build_csv

<IPython.core.display.Javascript object>

In [2]:
# Create the pipeline object; the task functions in the following cells attach
# themselves to it through the @pipeline.task(...) decorator.
pipeline = Pipeline()

<IPython.core.display.Javascript object>

In [3]:
@pipeline.task()
def file_to_json():
    """Load the Hacker News stories from ``hn_stories_2014.json``.

    The file is looked up relative to the current working directory and must
    contain a JSON object with a top-level ``"stories"`` key.

    Returns:
        The value stored under ``"stories"`` in the parsed document.

    Raises:
        FileNotFoundError: If the file does not exist.
        KeyError: If the parsed document has no ``"stories"`` key.
    """
    # JSON text is UTF-8 by specification. Naming the encoding keeps the read
    # independent of the platform's locale default (e.g. cp1252 on Windows),
    # which would otherwise garble or reject non-ASCII titles.
    with open("hn_stories_2014.json", encoding="utf-8") as f:
        return json.load(f)["stories"]

<IPython.core.display.Javascript object>

In [4]:
@pipeline.task(depends_on=file_to_json)
def filter_stories(stories):
    """Lazily select the popular stories that are not "Ask HN" posts.

    A story is kept when it has more than 50 points, more than one comment,
    and a title that does not start with "Ask HN".

    Args:
        stories: Iterable of story dicts providing the "points",
            "num_comments" and "title" keys.

    Returns:
        A generator yielding the matching story dicts in input order.
    """

    def keep(item):
        # The checks run in the order listed in the docstring and stop at
        # the first one that fails.
        if not (item["points"] > 50):
            return False
        if not (item["num_comments"] > 1):
            return False
        return not item["title"].startswith("Ask HN")

    return (item for item in stories if keep(item))

<IPython.core.display.Javascript object>

In [5]:
@pipeline.task(depends_on=filter_stories)
def json_to_csv(stories):
    """Convert story dicts to CSV by handing rows to ``build_csv``.

    Every story contributes one row holding its objectID, its created_at
    timestamp parsed into a ``datetime``, its url, its points and its title,
    in that order.

    Args:
        stories: Iterable of story dicts providing the five keys above.

    Returns:
        Whatever ``build_csv`` returns when given the rows, the header and a
        fresh in-memory ``io.StringIO`` as its target file.

    Raises:
        KeyError: If a story lacks one of the expected keys.
        ValueError: If a created_at value does not match the timestamp format.
    """
    columns = ["objectID", "created_at", "url", "points", "title"]
    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"  # Sample: 2014-05-29T04:27:42Z

    # All rows are materialised before build_csv is called.
    rows = []
    for story in stories:
        rows.append(
            [
                story["objectID"],
                datetime.datetime.strptime(story["created_at"], timestamp_format),
                story["url"],
                story["points"],
                story["title"],
            ]
        )

    return build_csv(rows, header=columns, file=io.StringIO())

<IPython.core.display.Javascript object>

In [6]:
@pipeline.task(depends_on=json_to_csv)
def extract_titles(csv_file):
    """Lazily yield the "title" column of a CSV source.

    Args:
        csv_file: Iterable of CSV lines (for example an open text file) whose
            first row is a header containing a "title" column.

    Returns:
        A generator over the title field of every row after the header.

    Raises:
        StopIteration: If ``csv_file`` yields no rows at all.
        ValueError: If the header row has no "title" column.
    """
    rows = csv.reader(csv_file)
    # The header row is consumed immediately, so it never appears among the
    # yielded titles; only the data rows are read lazily.
    title_col = next(rows).index("title")
    return (row[title_col] for row in rows)

<IPython.core.display.Javascript object>

In [7]:
@pipeline.task(depends_on=extract_titles)
def clean_titles(titles):
    """Lazily strip punctuation from each title and lower-case it.

    The removed characters are everything in ``string.punctuation`` plus the
    typographic characters ‘, ’ and –. Whitespace is left untouched, so
    removing a character that stood between two spaces leaves a double space.

    Args:
        titles: Iterable of title strings.

    Returns:
        A generator yielding the cleaned, lower-cased titles in input order.
    """
    # Build the deletion table once per call rather than once per title; it
    # does not depend on the title being processed.
    strip_table = str.maketrans("", "", string.punctuation + "‘’–")
    return (title.translate(strip_table).lower() for title in titles)

<IPython.core.display.Javascript object>

In [8]:
@pipeline.task(depends_on=clean_titles)
def word_freq_tbl_sorted(titles):
    """Build a word-frequency table over all titles, most frequent first.

    Each title is split on whitespace and every resulting word is counted.

    Args:
        titles: Iterable of title strings.

    Returns:
        A list of ``(word, count)`` tuples sorted by descending count. Words
        with equal counts keep the order in which they were first seen.
    """
    counts = Counter(word for title in titles for word in title.split())
    # most_common() with no argument returns every entry, sorted by count in
    # descending order with ties left in first-encountered order.
    return counts.most_common()

<IPython.core.display.Javascript object>

In [9]:
@pipeline.task(depends_on=word_freq_tbl_sorted)
def top_100_title_words(word_counts):
    """Keep only the first 100 entries of the word-count table.

    Args:
        word_counts: Sliceable sequence of entries; the caller is expected to
            pass it already ordered most-frequent first.

    Returns:
        A slice holding at most the 100 leading entries of ``word_counts``.
    """
    limit = 100
    return word_counts[:limit]

<IPython.core.display.Javascript object>

In [10]:
# Run the pipeline and keep whatever it returns.
results = pipeline.run()
# Display the value of the 7th (index 6) item of results. Presumably this is
# the output of top_100_title_words, the 7th task defined above -- TODO confirm
# that pipeline.run() returns its items in task-definition order.
list(results.items())[6][1]

[('the', 1361),
 ('to', 936),
 ('a', 843),
 ('of', 783),
 ('for', 625),
 ('in', 567),
 ('and', 542),
 ('is', 409),
 ('on', 328),
 ('how', 268),
 ('with', 264),
 ('why', 212),
 ('hn', 210),
 ('show', 192),
 ('an', 192),
 ('your', 190),
 ('from', 187),
 ('new', 185),
 ('i', 175),
 ('google', 167),
 ('you', 157),
 ('by', 127),
 ('at', 125),
 ('not', 121),
 ('what', 121),
 ('are', 121),
 ('my', 120),
 ('as', 112),
 ('it', 111),
 ('that', 108),
 ('we', 106),
 ('bitcoin', 101),
 ('its', 93),
 ('open', 92),
 ('programming', 90),
 ('web', 89),
 ('data', 85),
 ('us', 85),
 ('be', 81),
 ('video', 79),
 ('python', 76),
 ('about', 75),
 ('code', 72),
 ('using', 71),
 ('facebook', 71),
 ('released', 71),
 ('now', 69),
 ('has', 67),
 ('2013', 65),
 ('javascript', 65),
 ('free', 64),
 ('source', 64),
 ('down', 63),
 ('internet', 63),
 ('game', 63),
 ('first', 62),
 ('go', 60),
 ('will', 59),
 ('microsoft', 59),
 ('one', 59),
 ('c', 59),
 ('linux', 58),
 ('when', 58),
 ('app', 57),
 ('all', 57),
 ('up

<IPython.core.display.Javascript object>

In [11]:
# next(generator)

<IPython.core.display.Javascript object>

In [12]:
# Display the Pipeline instance; it renders with the default object repr.
pipeline

<pipeline.Pipeline at 0x20c0a9d0c70>

<IPython.core.display.Javascript object>

In [13]:
# Inspect the task graph: each task function maps to the list of tasks that
# were declared with depends_on pointing at it.
pipeline.tasks.graph

{<function __main__.file_to_json()>: [<function __main__.filter_stories(stories)>],
 <function __main__.filter_stories(stories)>: [<function __main__.json_to_csv(stories)>],
 <function __main__.json_to_csv(stories)>: [<function __main__.extract_titles(csv_file)>],
 <function __main__.extract_titles(csv_file)>: [<function __main__.clean_titles(titles)>],
 <function __main__.clean_titles(titles)>: [<function __main__.word_freq_tbl_sorted(titles)>],
 <function __main__.word_freq_tbl_sorted(titles)>: [<function __main__.top_100_title_words(word_counts)>],
 <function __main__.top_100_title_words(word_counts)>: []}

<IPython.core.display.Javascript object>

In [14]:
# Show the ASCII punctuation characters that clean_titles removes from titles.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

<IPython.core.display.Javascript object>

In [15]:
# Sample lower-cased title containing a double space (between
# "developeragents" and "a"), used to check how splitting treats it.
x = "show hn developeragents  a newsletter to promote remote developers"

<IPython.core.display.Javascript object>

In [16]:
# split() with no arguments treats runs of whitespace as one separator, so the
# double space in x does not produce an empty-string word.
x.split()

['show',
 'hn',
 'developeragents',
 'a',
 'newsletter',
 'to',
 'promote',
 'remote',
 'developers']

<IPython.core.display.Javascript object>