In [1]:
import pandas as pd
import numpy as np
from os.path import join
import re
from multiprocessing import Pool
import pickle
import datetime
import _pickle


# 1. Loading variables and dataframes

In [2]:
print('Loading variables and dataframes')
gh_events_input = join('/Users', 'Toavina', 'githubdata',
                       '8.transposing_gh_events', '1.saved_df',
                       'gh_events_bymonthusertype_df.pkl')

hn_users_input = join('/Users', 'Toavina', 'githubdata',
                      '6.combining_hn_and_ghusers', '3.df_w_ghuser_data',
                      'agg_df.pkl')

save_folder = join('/Users', 'Toavina', 'githubdata',
                      '9.combining_gh_events_hn_users', '1.pickles')
save_filename = 'agg_df.pkl'

Loading variables and dataframes


In [3]:
gh_events = pickle.load(open(gh_events_input,'rb'))
hn_users = pickle.load(open(hn_users_input,'rb'))

# 2. Grouping HN Users posts by user - only changing month

In [4]:
print('Grouping HN users by username and adding month when applied')
hn_users_name_date = hn_users[['hn_username','date']]

Grouping HN users by username and adding month when applied


In [5]:
# Grouping by month posted on HN and username
hn_grouped = hn_users_name_date.groupby(['hn_username',pd.Grouper(key='date', freq='1M')]).count()
# Unstacking the date
hn_grouped = hn_grouped.unstack('date')
# Renaming the date column to something more explicit
hn_grouped = hn_grouped.rename(columns={'date':'posted_on_hn_in_month'})
# Filling NAs with 0
hn_grouped = hn_grouped.fillna(int(0))
# Changing type to boolean
hn_grouped = hn_grouped.astype('bool')

# 3. Dropping Duplicate HN Users - Keeping only latest value and exploding month posted

In [6]:
# Keeping only the latest HN post for each user
main_df = hn_users.sort_values(by=['hn_username','date']).drop_duplicates('hn_username').sort_index()
main_df = main_df.drop('date', axis=1)
# Saving the hn_username as that will be used as the index to join the dataframes
main_df['hn_username_bckup'] = main_df['hn_username']

# Joining the dataframe by the HN_username common index
main_df = main_df.set_index('hn_username')
main_df = main_df.join(hn_grouped)

# Resetting the index and restoring the hn_username column
main_df = main_df.set_index('old_index')
main_df['hn_username'] = main_df['hn_username_bckup']
main_df.index.name = ''



# 4. Combining GH events and HN users

In [7]:
# Change columns to int for readability
main_df[[col for col in main_df.columns if 'posted_on_hn' in col[0]]] = main_df[
    [col for col in main_df.columns if 'posted_on_hn' in col[0]]].astype('int')

In [8]:
# Save old index to revert afterwards
main_df['old_index'] = main_df.index

In [9]:
# Change the index of main_df to inferred_ghuser for joining
main_df = main_df.set_index('inferred_ghuser')

In [10]:
# Join the dataframes together
main_df = main_df.join(gh_events)



In [11]:
# Set the index of the dataframe to the previous index and sort it
main_df = main_df.set_index('old_index').sort_index()

In [12]:
# Remove the name of the index
main_df.index.name=''

In [14]:
main_df

Unnamed: 0,contact,github_account,linkedin_account,location_hn,linkedin_mention,github_mention,remote,can_relocate,stack,resume,...,"(WatchEvent, 2016-02-29 00:00:00)","(WatchEvent, 2016-03-31 00:00:00)","(WatchEvent, 2016-04-30 00:00:00)","(WatchEvent, 2016-05-31 00:00:00)","(WatchEvent, 2016-06-30 00:00:00)","(WatchEvent, 2016-07-31 00:00:00)","(WatchEvent, 2016-08-31 00:00:00)","(WatchEvent, 2016-09-30 00:00:00)","(WatchEvent, 2016-10-31 00:00:00)","(WatchEvent, 2016-11-30 00:00:00)"
,,,,,,,,,,,,,,,,,,,,,
6,linus@bomben.se,https://github.com/linus,,"Stockholm, Sweden · Remote · Contract",True,True,,,"JavaScript, Redis, Linux, RasPI, Arduino, Lua",http://careers.stackoverflow.com/linus·https:/...,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,http://flurdy.com/contact,https://github.com/flurdy,,"London/Hampshire/Surrey in UK or Remote, Full...",False,True,,,,,...,1.0,0.0,3.0,4.0,4.0,6.0,12.0,9.0,5.0,6.0
23,patz.hwang@gmail.com,https://github.com/xinhuang,,"China, Relocation, Full Time",False,True,,,"C++, C#, Scala (love & can work with), Lua, x...",http://xinhuang.github.io/resume.pdf,...,14.0,17.0,8.0,11.0,6.0,1.0,5.0,7.0,9.0,2.0
25,pyro@feisty.io,https://github.com/pyrotechnick,,"Brisbane, Australia | Remote | Full Time / Con...",False,True,,,"Node.js, JavaScript/CoffeeScript, WebGL",https://gist.github.com/pyrotechnick/d2a0d8bb8...,...,798.0,164.0,33.0,5.0,1.0,34.0,78.0,205.0,193.0,333.0
30,See GitHub,https://github.com/tombenner,,"San Francisco, Local, Full Time",False,True,,,"Ruby, Python, Rails, Django, Node.js, Postgre...",https://github.com/tombenner,...,0.0,0.0,0.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0
32,mikpanko@gmail.com,https://github.com/mikpanko,,"Boston, Relocation, Full Time",False,True,,,"Python, Javascript, MATLAB, SQL, MeteorJS, Gi...","http://bit.ly/mikpanko-resume,https://github.c...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34,https://github.com/datwelk,https://github.com/datwelk,,"Amsterdam, Relocation, Full Time / Contract",True,True,,,"Objective-C, C, Python, Git",http://nl.linkedin.com/pub/damiaan-twelker/46/...,...,0.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
48,balazsbela@gmail.com,https://github.com/balazsbela,,Anywhere in Sweden or Norway | Relocation | Fu...,True,True,,,"Embedded Linux, Qt, C/C++",http://linkedin.com/in/balazsbela|https://gith...,...,5.0,3.0,1.0,4.0,0.0,0.0,1.0,3.0,0.0,3.0
49,,https://github.com/vishalsodani,,"Mumbai, Remote, Full Time(priority)/Contract/P...",True,True,,,,,...,6.0,1.0,2.0,2.0,10.0,1.0,7.0,17.0,2.0,4.0


In [13]:
# 5. Pickling the data
print('Saving results to ' + join(save_folder,save_filename))
_pickle.dump(main_df,open(join(save_folder,save_filename),'wb'))

Saving results to /Users/Toavina/githubdata/9.combining_gh_events_hn_users/1.pickles/agg_df.pkl
