In [7]:
%load_ext nb_black

import csv
import datetime
import io
import json
import string

from pipeline import Pipeline
from pipeline import build_csv

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [2]:
# Single pipeline object; each stage below registers itself on it through
# the `@pipeline.task(...)` decorator.
pipeline = Pipeline()

<IPython.core.display.Javascript object>

In [3]:
@pipeline.task()
def file_to_json():
    """Read ``hn_stories_2014.json`` and return the value stored under "stories".

    The file is opened explicitly as UTF-8 so that decoding does not depend
    on the platform's locale encoding.

    Raises:
        FileNotFoundError: if the JSON file is not in the working directory.
        KeyError: if the document has no top-level "stories" key.
    """
    with open("hn_stories_2014.json", encoding="utf-8") as source:
        payload = json.load(source)
    return payload["stories"]

<IPython.core.display.Javascript object>

In [4]:
@pipeline.task(depends_on=file_to_json)
def filter_stories(stories):
    """Lazily keep the stories worth analysing.

    A story is kept when it has more than 50 points, more than 1 comment,
    and a title that does not start with "Ask HN". The result is a generator,
    so it can be consumed only once.
    """

    def is_wanted(item):
        # Checks run in the same order as the conditions are listed above and
        # stop at the first one that fails.
        if not item["points"] > 50:
            return False
        if not item["comments"] > 1:
            return False
        return not item["title"].startswith("Ask HN")

    return (item for item in stories if is_wanted(item))

<IPython.core.display.Javascript object>

In [5]:
@pipeline.task(depends_on=filter_stories)
def json_to_csv(stories):
    """Convert story dicts into rows and pass them to ``build_csv``.

    Each row holds, in order: objectID, created_at (parsed into a
    ``datetime``), url, points and title. The rows, a matching header and a
    fresh in-memory ``io.StringIO`` are handed to ``build_csv``; whatever
    ``build_csv`` returns is returned unchanged.

    Args:
        stories: iterable of dicts with the keys listed above.

    Raises:
        KeyError: if a story lacks one of the expected keys.
        ValueError: if a ``created_at`` value is not a valid ISO-8601 string.
    """
    columns = ["objectID", "created_at", "url", "points", "title"]

    def parse_timestamp(raw):
        # NOTE(review): assumes created_at is an ISO-8601 string - confirm
        # against the data file.
        # `datetime.fromiso` does not exist; `fromisoformat` is the real API.
        # Before Python 3.11 it rejects a trailing "Z", so spell the UTC
        # designator as an explicit offset first.
        if raw.endswith("Z"):
            raw = raw[:-1] + "+00:00"
        return datetime.datetime.fromisoformat(raw)

    rows = [
        [
            story["objectID"],
            parse_timestamp(story["created_at"]),
            story["url"],
            story["points"],
            story["title"],
        ]
        for story in stories
    ]
    return build_csv(rows, header=columns, file=io.StringIO())

<IPython.core.display.Javascript object>

In [6]:
@pipeline.task(depends_on=json_to_csv)
def extract_titles(csv_file):
    """Lazily yield the "title" column of a CSV file object.

    This stage consumes CSV text, so it must depend on ``json_to_csv`` (which
    produces the CSV) rather than on ``filter_stories`` (which yields dicts).

    Args:
        csv_file: iterable of CSV lines whose first row is the header.
            NOTE(review): assumes the upstream stage hands the file back
            positioned at its start - confirm in ``build_csv``.

    Raises:
        StopIteration: if ``csv_file`` is empty (no header row).
        ValueError: if the header has no "title" column.
    """
    reader = csv.reader(csv_file)
    header = next(reader)
    title_col = header.index("title")
    return (row[title_col] for row in reader)

<IPython.core.display.Javascript object>

In [25]:
@pipeline.task(depends_on=extract_titles)
def clean_titles(titles):
    """Lazily normalise titles: remove ASCII punctuation, then lowercase.

    Args:
        titles: iterable of title strings.

    Returns:
        A generator of cleaned titles, in input order.
    """
    # The deletion table is the same for every title, so build it once
    # instead of once per item.
    strip_punctuation = str.maketrans("", "", string.punctuation)
    return (title.translate(strip_punctuation).lower() for title in titles)

# Alternate code
# @pipeline.task(depends_on=extract_titles)
# def clean_title(titles):
#     for title in titles:
#         title = title.lower()
#         title = ''.join(c for c in title if c not in string.punctuation)
#         yield title

<IPython.core.display.Javascript object>

In [22]:
# Scratch sample: letters followed by punctuation, used to compare ways of
# removing punctuation from a string.
x = "ABC#.%$"

<IPython.core.display.Javascript object>

In [23]:
# str.strip only trims the listed characters from the two ends. The string
# starts with "A" and ends with "$", neither of which is listed, so it comes
# back unchanged.
x.strip(".%")

'ABC#.%$'

<IPython.core.display.Javascript object>

In [14]:
# The ASCII punctuation characters that clean_titles removes.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

<IPython.core.display.Javascript object>

In [24]:
# A translation table with a delete-set removes every punctuation character,
# wherever it sits in the string. clean_titles uses the same approach.
x.translate(x.maketrans("", "", string.punctuation))

'ABC'

<IPython.core.display.Javascript object>