# Read in data
CSV file read in.

CSV file downloaded from [Kaggle](https://www.kaggle.com/hacker-news/hacker-news-posts)

In [1]:
#Read in csv
from csv import reader

# Open the file in a context manager so the handle is closed as soon as
# every row has been loaded into memory (the original left it open).
with open('HN_posts_year_to_Sep_26_2016.csv') as opened_file:
    read_file = reader(opened_file)
    hn_data = list(read_file)

# The first row is the header; every remaining row is a post record.
hn_header = hn_data[0]
hn = hn_data[1:]

def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of a dataset, one row at a time.

    Each row in ``dataset[start:end]`` is printed, followed by blank lines
    to separate it visually from the next row.

    Args:
        dataset: A sequence of rows (e.g. a list of lists).
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When True, also print the total number of rows
            and the number of columns (the length of the first row).
            An empty dataset is reported as having 0 columns.
    """
    dataset_slice = dataset[start:end]
    for row in dataset_slice:
        print(row)
        print('\n') # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # An empty dataset has no first row to measure, so report 0 columns
        # instead of raising IndexError on dataset[0].
        num_columns = len(dataset[0]) if len(dataset) > 0 else 0
        print('Number of columns:', num_columns)


# Summary of data

The data set has 7 columns and 293,119 rows of data (excl. the header)

The columns are:

|0|1|2|3|4|5|6|
|:---|:---|:---|:---|:---|:---|:---|
|id|title|url|num_points|num_comments|author|created_at|
|id of the post|title of the post|the url of the item being linked to|the number of upvotes the post received|the number of comments the post received|the name of the account that made the post|the date and time the post was made ("mm/dd/yyyy hh:mm" Eastern Time in the US)

In [2]:
# Report how many fields each record has, then list the field names.
column_count = len(hn_header)
print("Number of columns: ", column_count, "\n")
print(hn_header)

Number of columns:  7 

['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at']


In [3]:
# Show the record count (header excluded), then preview the first five records.
row_count = len(hn)
print("Number of rows of data: ", row_count, "\n")

explore_data(hn, start=0, end=5)

Number of rows of data:  293119 

['12579008', 'You have two days to comment if you want stem cells to be classified as your own', 'http://www.regulations.gov/document?D=FDA-2015-D-3719-0018', '1', '0', 'altstar', '9/26/2016 3:26']


['12579005', 'SQLAR  the SQLite Archiver', 'https://www.sqlite.org/sqlar/doc/trunk/README.md', '1', '0', 'blacksqr', '9/26/2016 3:24']


['12578997', 'What if we just printed a flatscreen television on the side of our boxes?', 'https://medium.com/vanmoof/our-secrets-out-f21c1f03fdc8#.ietxmez43', '1', '0', 'pavel_lishin', '9/26/2016 3:19']


['12578989', 'algorithmic music', 'http://cacm.acm.org/magazines/2011/7/109891-algorithmic-composition/fulltext', '1', '0', 'poindontcare', '9/26/2016 3:16']


['12578979', 'How the Data Vault Enables the Next-Gen Data Warehouse and Data Lake', 'https://www.talend.com/blog/2016/05/12/talend-and-Â\x93the-data-vaultÂ\x94', '1', '0', 'markgainor1', '9/26/2016 3:14']




# Separate data into three lists: ask_posts, show_posts, and other_posts

The three lists are based on whether the title starts with "Ask HN" or "Show HN" (case-insensitive). Where a title does not start with one of these two prefixes, the row is added to the other_posts list of lists.

The three lists of lists have the following lengths:
- ask_posts 9,139
- show_posts 10,158
- other_posts 273,822

These three lists add up to the total number of rows in the original dataset, which is 293,119.

In [4]:
# Partition the posts by title prefix, matched case-insensitively:
# "ask hn" -> ask_posts, "show hn" -> show_posts, anything else -> other_posts.
ask_posts, show_posts, other_posts = [], [], []

for post in hn:
    lowered_title = post[1].lower()  # column 1 holds the post title

    if lowered_title.startswith("ask hn"):
        target = ask_posts
    elif lowered_title.startswith("show hn"):
        target = show_posts
    else:
        target = other_posts
    target.append(post)

print("Number of rows in ask_posts: ", len(ask_posts))
print("Number of rows in show_posts: ", len(show_posts))
print("Number of rows in other_posts: ", len(other_posts))

print("\n", "The first 5 rows of the ask_posts list of lists:", "\n")
explore_data(ask_posts, 0, 5)

Number of rows in ask_posts:  9139
Number of rows in show_posts:  10158
Number of rows in other_posts:  273822

 The first 5 rows of the ask_posts list of lists: 

['12578908', 'Ask HN: What TLD do you use for local development?', '', '4', '7', 'Sevrene', '9/26/2016 2:53']


['12578522', 'Ask HN: How do you pass on your work when you die?', '', '6', '3', 'PascLeRasc', '9/26/2016 1:17']


['12577908', 'Ask HN: How a DNS problem can be limited to a geographic region?', '', '1', '0', 'kuon', '9/25/2016 22:57']


['12577870', 'Ask HN: Why join a fund when you can be an angel?', '', '1', '3', 'anthony_james', '9/25/2016 22:48']


['12577647', 'Ask HN: Someone uses stock trading as passive income?', '', '5', '2', '00taffe', '9/25/2016 21:50']




# Number of comments for each post type

The average number of comments for each post type is:
- ask_post average number of comments: 10.39
- show_post average number of comments: 4.89

It is evident that the ask_posts receive significantly more comments as the average is approximately double that of the show_posts average.

Based on this finding, this project will now focus on ask_posts

In [5]:
# Add up the comment counts (column 4) over every ask post, then average.
total_ask_comments = sum(int(post[4]) for post in ask_posts)
avg_ask_comments = total_ask_comments / len(ask_posts)

print("The average number of comments per ask_post is: ", avg_ask_comments)

The average number of comments per ask_post is:  10.393478498741656


In [6]:
# Add up the comment counts (column 4) over every show post, then average.
total_show_comments = sum(int(post[4]) for post in show_posts)
avg_show_comments = total_show_comments / len(show_posts)

print("The average number of comments per show_post is: ", avg_show_comments)

The average number of comments per show_post is:  4.886099625910612


# Relationship between time of day of post and number of comments

We will now calculate the number of posts created in each hour of the day, along with the number of comments received, and then determine the average number of comments a post receives for each hour of the day.

In [21]:
import datetime as dt

# Pair each ask post's parsed creation timestamp (column 6) with its
# comment count (column 4).
results_list = [
    [dt.datetime.strptime(post[6], "%m/%d/%Y %H:%M"), int(post[4])]
    for post in ask_posts
]

# Tally, per hour of day, how many posts were created and how many comments
# they received. Keys are the "%H"-formatted hour strings.
counts_by_hour = {}
comments_by_hour = {}

for created_at, n_comments in results_list:
    hour_key = created_at.strftime("%H")
    counts_by_hour[hour_key] = counts_by_hour.get(hour_key, 0) + 1
    comments_by_hour[hour_key] = comments_by_hour.get(hour_key, 0) + n_comments



In [25]:
# Build [hour, average comments per post] pairs, one per hour seen.
avg_by_hour = [
    [hour_key, comments_by_hour[hour_key] / post_count]
    for hour_key, post_count in counts_by_hour.items()
]

explore_data(avg_by_hour, 0, 24)

['02', 11.137546468401487]


['01', 7.407801418439717]


['22', 8.804177545691905]


['21', 8.687258687258687]


['19', 7.163043478260869]


['17', 9.449744463373083]


['15', 28.676470588235293]


['14', 9.692007797270955]


['13', 16.31756756756757]


['11', 8.96474358974359]


['10', 10.684397163120567]


['09', 6.653153153153153]


['07', 7.013274336283186]


['03', 7.948339483394834]


['23', 6.696793002915452]


['20', 8.749019607843136]


['16', 7.713298791018998]


['08', 9.190661478599221]


['00', 7.5647840531561465]


['18', 7.94299674267101]


['12', 12.380116959064328]


['04', 9.7119341563786]


['06', 6.782051282051282]


['05', 8.794258373205741]




In [27]:
# Put the average first in each pair so that sorting orders rows by average.
swap_avg_by_hour = [[average, hour_key] for hour_key, average in avg_by_hour]

# Highest average first.
sorted_swap = sorted(swap_avg_by_hour, reverse=True)

explore_data(sorted_swap, 0, 5)

[28.676470588235293, '15']


[16.31756756756757, '13']


[12.380116959064328, '12']


[11.137546468401487, '02']


[10.684397163120567, '10']




In [33]:
print("The top five hours for `Ask Posts` comments are:")
line_template = "{a}: {b:.2f} average comments per post"
for average, hour_key in sorted_swap[:5]:
    # Re-parse the hour string so it can be rendered as "HH:MM".
    clock_time = dt.datetime.strptime(hour_key, "%H").strftime("%H:%M")
    print(line_template.format(a=clock_time, b=average))

The top five hours for `Ask Posts` comments are:
15:00: 28.68 average comments per post
13:00: 16.32 average comments per post
12:00: 12.38 average comments per post
02:00: 11.14 average comments per post
10:00: 10.68 average comments per post


# Summary of findings for relationship between time of day of post and number of comments
From the summary above we can see that the top times to create an "Ask Post" and receive the maximum number of comments are around the middle of the day to mid-afternoon US Eastern Time. We can also see that 2AM appears to be a popular time, which may be a result of the impact of a different timezone.

For those based in Melbourne, Australia (like me), these times convert to:
1. 15:00 --> 05:00
2. 13:00 --> 03:00
3. 12:00 --> 02:00
4. 02:00 --> 16:00
5. 10:00 --> 00:00