<h1>HackerNews Data pipeline</h1> 

In [133]:
from pipeline import Pipeline
from utils import build_csv
from datetime import datetime
from stop_words import stop_words
import io, csv, json, string

pipeline = Pipeline()

# load data from json
@pipeline.task()
def file_to_json():
    """Loads a json file into dictionary and return list of stories"""
    with open('hn_stories_2014.json', 'r') as f: 
        data = json.load(f)   # accept file object for deserialisation
        stories = data['stories']
        return stories
    
# filter popular stories based on points
@pipeline.task(depends_on=file_to_json)
def filter_stories(stories):
    """Filter and return a generator of popular stories"""
    return (story for story in stories if (story['points'] > 50 and story['num_comments'] > 1 and not story['title'].startswith('Ask HN')))

# write the filtered stories to csv file 
@pipeline.task(depends_on=filter_stories)
def json_to_csv(filtered_stories):
    """Write json to csv file"""
    csv_header = ['objectID', 'created_at', 'url', 'points', 'title']
    # get all the filtered stories 
    data = []
    for story in filtered_stories: 
        data.append((story['objectID'], datetime.strptime(story['created_at'],"%Y-%m-%dT%H:%M:%SZ"), 
                    story['url'], story['points'], story['title']))
    return build_csv(data, file=io.StringIO(), header=csv_header)

# extract tiltle of post
@pipeline.task(depends_on=json_to_csv)
def extract_titles(filtered_stories_csv):
    """Return generator of post title"""
    reader = csv.reader(filtered_stories_csv) 
    # find index of header 
    csv_header = next(reader)
    title_idx = csv_header.index('title')
    # iterate through reader to extract title    
    titles = (line[title_idx] for line in reader)
    return titles

# clean the titles
@pipeline.task(depends_on=extract_titles)
def clean_titles(titles):
    """Transform titles to lowercase and remove punctuation"""
    for title in titles:
        title = title.lower() 
        title = ''.join(char for char in title if char not in string.punctuation)
        yield title

# build word frequency from cleaned titles 
@pipeline.task(depends_on=clean_titles)
def build_keyword_dict(titles): 
    """Count word frequency of titles excluding stop words 
    and return a dictionary containing the word and frequency count as key-value pair
    """
    frequency = {}
    for title in titles: 
        words = title.split(" ")
        words = filter(lambda x: x not in stop_words, words)
        for word in words: 
                if word not in frequency: 
                    frequency[word] = 0
                frequency[word] += 1 
    return frequency

# sort top words 
@pipeline.task(depends_on=build_keyword_dict)
def find_top_words(frequency): 
    """Sort frequency dictionary based on value to find top words"""
    # sort by frequency in descending order
    sorted_freq = sorted(frequency.items(), key=lambda x: x[1], reverse=True)
    return sorted_freq

In [134]:
# run the pipeline 
result = pipeline.run() 
# get the result for top words mentioned in the posts 
result[find_top_words]  

[('new', 185),
 ('google', 167),
 ('', 159),
 ('bitcoin', 101),
 ('open', 92),
 ('programming', 90),
 ('web', 88),
 ('data', 85),
 ('video', 79),
 ('python', 75),
 ('code', 72),
 ('facebook', 71),
 ('released', 71),
 ('using', 70),
 ('2013', 65),
 ('javascript', 65),
 ('free', 64),
 ('source', 64),
 ('game', 63),
 ('internet', 62),
 ('microsoft', 59),
 ('c', 59),
 ('linux', 58),
 ('app', 57),
 ('pdf', 55),
 ('work', 54),
 ('language', 54),
 ('software', 52),
 ('2014', 52),
 ('startup', 51),
 ('apple', 50),
 ('use', 50),
 ('make', 50),
 ('time', 48),
 ('yc', 48),
 ('security', 48),
 ('nsa', 45),
 ('github', 45),
 ('windows', 44),
 ('world', 41),
 ('way', 41),
 ('like', 41),
 ('1', 40),
 ('project', 40),
 ('computer', 40),
 ('heartbleed', 40),
 ('git', 37),
 ('users', 37),
 ('dont', 37),
 ('design', 37),
 ('ios', 37),
 ('developer', 36),
 ('os', 36),
 ('twitter', 36),
 ('ceo', 36),
 ('vs', 36),
 ('life', 36),
 ('big', 35),
 ('day', 35),
 ('android', 34),
 ('online', 34),
 ('years', 33),
