In [14]:
"""
Hacker News Dataset Analysis
Quick overview of the dataset
"""

from datasets import load_dataset
from collections import Counter
import numpy as np
from tqdm import tqdm

### **Explore Hacker News Data**

In [15]:
# Load the train split of the Hacker News posts dataset.
# Downloads are cached under CACHE_DIR so re-runs do not fetch the data again.
DATASET_NAME = "julien040/hacker-news-posts"
CACHE_DIR = "../mainrun/data"
PREVIEW_ROWS = 100  # number of leading rows converted to pandas for inspection

# The status message previously contained mis-encoded (U+FFFD) characters;
# plain ASCII keeps the output readable in any terminal or renderer.
print("\nLoading dataset...")
dataset = load_dataset(DATASET_NAME, split="train", cache_dir=CACHE_DIR)
print(dataset)

# Take the first PREVIEW_ROWS rows as a preview subset
sub_dataset = dataset.select(range(PREVIEW_ROWS))

# Convert the preview subset to a pandas DataFrame and show its first rows
df = sub_dataset.to_pandas()
df.head()


Loading dataset...
Dataset({
    features: ['id', 'title', 'url', 'score', 'time', 'comments', 'author'],
    num_rows: 4010957
})


Unnamed: 0,id,title,url,score,time,comments,author
0,3404047,Copyright Office Seeks To Make It More Difficu...,http://www.techdirt.com/articles/20111227/1728...,31,1325175114,2,nextparadigms
1,7642295,High Fidelity System Architecture,https://highfidelity.io/blog/2014/04/high-fide...,1,1398367025,0,_pius
2,261704,"Ribbit investor talks about good returns, bad ...",http://venturebeat.com/2008/07/29/ribbit-inves...,1,1217418551,0,paulsb
3,31192205,Why Optimizing Employee Performance Is the Fut...,https://www.forbes.com/sites/forbestechcouncil...,1,1651152663,1,JSeymourATL
4,7624091,Tiling by regular polygons,https://en.wikipedia.org/wiki/Tiling_by_regula...,2,1398116185,0,hnha


In [16]:
# Extract the first 100k titles, stripped of surrounding whitespace.
# Only the "title" column is read: iterating the dataset row by row builds a
# dict with every column for each row, which is needlessly slow when a single
# column is needed.
N_TITLES = 100_000
titles = [title.strip() for title in dataset.select(range(N_TITLES))["title"]]

In [17]:
# Preview the first five extracted titles
titles[:5]

['Copyright Office Seeks To Make It More Difficult To Retain DMCA Safe Harbors',
 'High Fidelity System Architecture',
 'Ribbit investor talks about good returns, bad markets and ‘Communication 2.0′',
 'Why Optimizing Employee Performance Is the Future of Work',
 'Tiling by regular polygons']

In [20]:
from collections import Counter
import pandas as pd

def basic_stats(titles, progress=True):
    """Compute per-title length statistics and a word-frequency table.

    Words are obtained with ``str.split()``, i.e. split on whitespace only, so
    counting is case-sensitive and punctuation stays attached to its token.

    Parameters
    ----------
    titles : iterable of str
        Titles to analyse.
    progress : bool, default True
        If True, wrap the iteration in a ``tqdm`` progress bar. Pass False to
        run silently.

    Returns
    -------
    word_freq : collections.Counter
        Occurrence count of every whitespace-separated token across all titles.
    stats : pandas.DataFrame
        One row per title with columns ``title_lengths`` (characters) and
        ``word_counts`` (tokens).
    """
    title_lengths, word_counts, word_freq = [], [], Counter()

    iterator = tqdm(titles) if progress else titles
    for title in iterator:
        words = title.split()
        title_lengths.append(len(title))
        word_counts.append(len(words))
        word_freq.update(words)

    # Pin the dtype: with empty input pandas would otherwise create object
    # columns, and describe() would no longer produce a numeric summary.
    stats = pd.DataFrame(
        data={'title_lengths': title_lengths, 'word_counts': word_counts},
        dtype='int64',
    )
    return word_freq, stats

# Compute the token frequency table and the per-title length/word-count frame
# for the extracted titles.
word_freq, stat_df = basic_stats(titles)

100%|██████████| 100000/100000 [00:08<00:00, 11388.66it/s]


In [21]:
# Summary statistics of title length (characters) and word count per title
stat_df.describe()

Unnamed: 0,title_lengths,word_counts
count,100000.0,100000.0
mean,48.22232,7.78984
std,17.941885,3.079378
min,1.0,1.0
25%,35.0,6.0
50%,48.0,8.0
75%,62.0,10.0
max,147.0,28.0


In [26]:
# Vocabulary here means distinct whitespace-separated tokens; counting is
# case-sensitive and punctuation stays attached to the token.
vocab_size = len(word_freq)
top_words = word_freq.most_common(10)
print("Vocab size: ", vocab_size)
print('Most common words: ', top_words)

Vocab size:  110474
Most common words:  [('to', 17494), ('the', 14426), ('of', 13208), ('for', 11102), ('in', 10945), ('and', 10849), ('a', 10831), ('The', 8377), ('HN:', 7049), ('on', 5952)]
