In [1]:
from os import listdir
from os.path import join
import pandas as pd
import numpy as np
import pickle
import tqdm
import re

# 1. Get variables and load dataframes

In [2]:
# Every folder / filename the notebook reads from or writes to is declared here.
print('Loading variables')

# Root of the project data. NOTE(review): absolute, machine-specific path.
project_folder = join('/Users', 'Toavina', 'githubdata')

# Input: cleaned Hacker News dataframe (joined onto project_folder when loaded).
hn_df_filename = 'cleaned_df.pkl'
hn_df_load_folder = join('6.combining_hn_and_ghusers', '2.cleaned_df')

# Input: previously filtered GitHub users dataframe (joined onto project_folder when loaded).
ghusers_df_filename = 'user_datas_df.pkl'
ghusers_df_load_folder = join('3.gh_users_filter', '1.pickles')

# Input: GitHub users downloaded for HN users; stored next to the dataframe above.
new_gh_df_path = join(ghusers_df_load_folder, 'hn_users_df.pkl')

# Output: aggregate dataframe, plus the variant restricted to rows with GitHub data.
save_folder = '3.df_w_ghuser_data'
save_filename = 'agg_df.pkl'
save_filename_w_gh_only = 'agg_df_gh_only.pkl'

# Only referenced by the commented-out appendix cells (list of GH users to download).
new_gh_users_pickle_filename = 'hnusers_to_dl_from_gh.pkl'
gh_users_pickles_folder = join(project_folder, '2.gh_userinfo_dl', '2.pickles')

Loading variables


In [3]:
# TODO - Merge github user dataframes

# 2. Merging dataframes by inferred github username

In [4]:
print('Loading HN users and Github users dataframes')

# NOTE: pickle.load can execute arbitrary code embedded in the file, so these
# paths must only ever point at pickles written by earlier pipeline steps.
# Context managers close each file as soon as its dataframe has been read;
# the previous bare open() calls left both handles open.
with open(join(project_folder, hn_df_load_folder, hn_df_filename), 'rb') as pickle_file:
    hn_df = pickle.load(pickle_file)
with open(join(project_folder, ghusers_df_load_folder, ghusers_df_filename), 'rb') as pickle_file:
    ghusers_df = pickle.load(pickle_file)

# Drop rows that are identical across every column.
ghusers_df = ghusers_df.drop_duplicates()

# Renumber the remaining rows: reset_index() moves the old labels into a column,
# which is then discarded (relies on that column being called 'index', i.e. an
# unnamed index).
ghusers_df = ghusers_df.reset_index().drop('index', axis =1)

Loading HN users and Github users dataframes


In [5]:
print('Loading downloaded GH users')
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles produced by this pipeline.
# The context manager closes the file once the dataframe is read; the previous
# bare open() call left the handle open.
with open(join(project_folder, new_gh_df_path), 'rb') as pickle_file:
    hn_users_from_gh = pickle.load(pickle_file)

# Remove rows identical across every column, then renumber the rows
# (reset_index() moves the old labels into an 'index' column that is discarded).
hn_users_from_gh = hn_users_from_gh.drop_duplicates()
hn_users_from_gh = hn_users_from_gh.reset_index().drop('index', axis=1)

print('Removing users not found')
# Keep every row whose 'message' column is anything other than 'Not Found'.
# The filter runs after the renumbering above, so the index has gaps from here on.
hn_users_from_gh = hn_users_from_gh[hn_users_from_gh['message'] !='Not Found']

print('There are ' + str(len(hn_users_from_gh)) + ' unique users downloaded that have a Github account')

Loading downloaded GH users
Removing users not found
There are 884 unique users downloaded that have a Github account


In [6]:
# Stack the two GitHub user tables on top of each other; each keeps its own
# index labels for now.
print('Merging GH user dataframes')
gh_users = pd.concat((ghusers_df, hn_users_from_gh))

# Only rows identical across every column are removed here, so the same login
# may still appear on more than one row.
print('Dropping duplicates')
gh_users = gh_users.drop_duplicates()

Merging GH user dataframes
Dropping duplicates


In [7]:
print('Resetting and saving relevant indices')

# Give gh_users a fresh 0..n-1 index: the old labels land in an 'index' column,
# which is dropped straight away.
gh_users = gh_users.reset_index().drop('index', axis='columns')

# Call the HN account column 'hn_username' so it cannot be mistaken for a
# GitHub user column once the two tables are joined.
hn_df = hn_df.rename(columns={'user': 'hn_username'})

# Snapshot the current index labels into a regular column so they stay
# available after later operations replace the index.
hn_df = hn_df.assign(old_index=hn_df.index)

Resetting and saving relevant indices


In [8]:
print('Amending inferred_ghuser column to remove nan string')

def amend_nans(item_in_inferred_ghuser):
    """Map the literal string 'nan' to a real ``np.nan``.

    Any other value (including an existing NaN) is returned unchanged.
    """
    return np.nan if item_in_inferred_ghuser == 'nan' else item_in_inferred_ghuser

# Rewrite the column element by element so 'nan' strings become real NaN values.
hn_df = hn_df.assign(inferred_ghuser=hn_df['inferred_ghuser'].apply(amend_nans))

Amending inferred_ghuser column to remove nan string


In [9]:
print('Joining GH users database to HN database')

# Keep an untouched copy of the inferred GitHub username for later reference.
hn_df['inferred_ghuser_copy'] = hn_df['inferred_ghuser']

# Left join: every HN row is kept and GitHub columns are attached where
# inferred_ghuser equals a GitHub login. Column names present in both frames
# get the '_hn' / '_gh' suffixes.
# NOTE(review): pandas matches NaN keys to NaN keys, so HN rows without an
# inferred user would pick up any gh_users row whose login is NaN - confirm
# that gh_users['login'] has no nulls.
agg_df = pd.merge(
    hn_df,
    gh_users,
    how='left',
    left_on='inferred_ghuser',
    right_on='login',
    suffixes=('_hn', '_gh'),
)

# One HN row can come back several times from the join (presumably because a
# login appears on more than one gh_users row - TODO confirm). Keep only the
# first merged row for each original HN row, identified by old_index.
agg_df = agg_df.drop_duplicates(subset='old_index')

Joining GH users database to HN database


In [10]:
# Count the rows that have both a 'github_account' value and a 'type' value
# by summing the boolean mask.
print(
    'There are '
    + str(int((agg_df['github_account'].notnull() & agg_df['type'].notnull()).sum()))
    + ' events that match in both tables - does not count for multiple messages'
)

There are 1267 events that match in both tables - does not count for multiple messages


In [11]:
print('Dropping duplicates where text and date is the same (i.e. only one entry per month)')
# Two passes, each keeping the first occurrence:
#   1. rows sharing the same ('text', 'days_ago') pair
#   2. rows sharing the same ('hn_username', 'date') pair
# Note the first pass keys on 'days_ago', not on 'date' as the message says.
agg_df = (
    agg_df
    .drop_duplicates(subset=['text', 'days_ago'])
    .drop_duplicates(subset=['hn_username', 'date'])
)

Dropping duplicates where text and date is the same (i.e. only one entry per month)


In [12]:
# NOTE(review): both paths are relative to the current working directory
# (save_folder is not joined onto project_folder, unlike the load paths).
print('Saving aggregate dataframe to '+join(save_folder,save_filename))
# The context manager flushes and closes the file once the dump completes.
with open(join(save_folder, save_filename), 'wb') as pickle_file:
    pickle.dump(agg_df, pickle_file)

print('Saving aggregate dataframe containing only GH users to ' + join(save_folder,
                                                                      save_filename_w_gh_only))
# Rows with a non-null 'url' are the ones that received GitHub data in the join.
# Fix: write them to save_filename_w_gh_only. The original reused save_filename,
# which overwrote the full aggregate saved above and never created the GH-only file.
with open(join(save_folder, save_filename_w_gh_only), 'wb') as pickle_file:
    pickle.dump(agg_df[~pd.isnull(agg_df['url'])], pickle_file)

Saving aggregate dataframe to 3.df_w_ghuser_data/agg_df.pkl
Saving aggregate dataframe containing only GH users to 3.df_w_ghuser_data/agg_df_gh_only.pkl


In [13]:
# Final status message, printed once the cells above have run.
print('Process complete')

Process complete


# 3. Merging dataframes by matching emails in both dataframes

In [14]:
# Note - this step is skipped because there are so few emails to match on; additional users need to be downloaded from GitHub first

#matching_emails = []

#for email in tqdm.tqdm_notebook(ghusers_df['email']):
#     for contact in hn_df['contact']:
#         if email == contact:
#             matching_emails.append(email)

# Appendix. Creating list of Github users to download - Used in Step 2

In [15]:
# print('Creating a list of Github users to download')
# gh_accounts_to_dl = agg_df[~pd.isnull(agg_df['github_account'])]['github_account']

In [16]:
# def get_username(gh_link):
    
#     username = str(gh_link)
#     username = re.sub('https://github.com/','',username)
#     username = re.sub('http://github.com/','',username)
#     username = re.sub('http://github/','',username)
#     username = re.sub('https://github/','',username)
#     username = re.sub('github.com/','',username)
#     username = re.sub('\?tab=repositories','',username)
#     username = re.sub('Github:','',username)
#     username = re.sub('\-$','',username)
#     username = re.sub(' ','',username)
#     username = re.sub('/','',username)
    
    
#     return username

In [17]:
# gh_accounts_to_dl = gh_accounts_to_dl.apply(get_username)

In [18]:
# for index, value in gh_accounts_to_dl.items():
#     print(index,value,'\n')

In [19]:
# print('Create manual changes to some usernames - CAUTION - IF INDEX CHANGED (5404 users) NEED TO CHANGE STEP')
# manual_changes = [(5232, 'remyferre'),(4154, 'mrmans0n'),(4077, 'CaioBianchi'),
#  (4091, np.nan),(3692,'kaymckelly'),(3672, 'joeltaylor'),(3049, 'faun'),
#  (2692, 'traverseda'),(1292, 'zura-kh'),(714, 'siscia'), (625, 'martingallagher')]
    
# for index, value in manual_changes:
#     gh_accounts_to_dl.loc[index] = value

In [20]:
# print('pickling list to download to ' + join(gh_users_pickles_folder,new_gh_users_pickle_filename))
# gh_accounts_to_dl = list(gh_accounts_to_dl)

In [21]:
# for each in gh_accounts_to_dl:
#     each = str(each)

In [22]:
# gh_accounts_to_dl = list(set(gh_accounts_to_dl))

In [23]:
# pickle.dump(gh_accounts_to_dl, open(join(gh_users_pickles_folder,new_gh_users_pickle_filename),'wb'))