In [63]:
import pandas as pd
import numpy as np
from os.path import join
import re
from multiprocessing import Pool
import pickle

# 1. Set parameters and load the input CSV file

In [39]:
# Size of the multiprocessing pool, and the number of row-wise chunks the
# DataFrame is cut into before being handed to that pool.
num_cores = 7
num_partitions = 7

In [60]:
# Common root of this step's input and output folders.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
step_dir = join('/Users', 'Toavina', 'githubdata', '7.regetting_gh_events_data')

# Gzip-compressed CSV that the loading cell reads.
input_file = join(step_dir, '2.big_query_results',
                  'hn_users_2dec16_ghevents_hnusers-2dec16.csv.gz')

# Folder and file name the final DataFrame is pickled to.
save_folder = join(step_dir, '3.ghevents_df')
save_filename = 'hnusers_ghevents_df.pkl'

In [41]:
# Read the gzip-compressed CSV at `input_file` into a single DataFrame.
print('Loading GH events dataframe from zipped csv')
ghevents_df = pd.read_csv(filepath_or_buffer=input_file, compression='gzip')

Loading GH events dataframe from zipped csv


# 2. Classify events as own-repo events and organization events

In [42]:
def classify_own_repo(row):
    """Flag whether the event's repo URL contains the actor's login.

    Args:
        row: A mapping-like record (for example one DataFrame row supplied by
            ``DataFrame.apply(..., axis=1)``) expected to carry
            ``actor_login`` and ``repo_url`` entries.

    Returns:
        The same ``row`` object, with ``own_repo`` set to 1 when
        ``actor_login`` occurs inside ``repo_url`` and to 0 otherwise.
        A missing field or a non-string value (such as NaN) yields 0.
    """
    try:
        # NOTE(review): this is a substring test, so login "bob" also matches
        # a URL owned by "bobby" - confirm that this looseness is acceptable.
        is_own = row['actor_login'] in row['repo_url']
    except (KeyError, TypeError):
        # KeyError: a field is absent. TypeError: a value is not a string
        # (e.g. NaN). Only these are swallowed, so KeyboardInterrupt and
        # SystemExit still propagate and a long run can be interrupted.
        is_own = False
    row['own_repo'] = 1 if is_own else 0
    return row

In [43]:
def classify_org(row):
    """Flag whether the event carries an organization login.

    Args:
        row: A mapping-like record (for example one DataFrame row supplied by
            ``DataFrame.apply(..., axis=1)``) that may carry an ``org_login``
            entry.

    Returns:
        The same ``row`` object, with ``org_event`` set to 1 when
        ``org_login`` is present and not null, and to 0 otherwise.
    """
    try:
        has_org = bool(pd.notnull(row['org_login']))
    except (KeyError, ValueError):
        # KeyError: the field is absent. ValueError: the value is array-like
        # and has no single truth value. Only these are swallowed, so
        # KeyboardInterrupt and SystemExit still propagate.
        has_org = False
    row['org_event'] = 1 if has_org else 0
    return row

In [44]:
def parallelize_dataframe(df, func):
    """Apply ``func`` to row-wise chunks of ``df`` in worker processes.

    Args:
        df: DataFrame to process. It is cut into ``num_partitions``
            consecutive row chunks of near-equal size.
        func: Callable that takes one DataFrame chunk and returns a
            DataFrame. It is passed to ``Pool.map``, so it must be picklable.

    Returns:
        The concatenation, in chunk order, of the DataFrames returned by
        ``func``.

    Uses the module-level ``num_partitions`` and ``num_cores`` settings.
    """
    # Split row positions instead of the frame itself: np.array_split on a
    # DataFrame goes through DataFrame.swapaxes, which recent pandas versions
    # deprecate. The resulting chunks hold the same rows in the same order.
    row_chunks = np.array_split(np.arange(len(df)), num_partitions)
    df_split = [df.iloc[rows] for rows in row_chunks]

    pool = Pool(num_cores)
    try:
        results = pool.map(func, df_split)
    finally:
        # Shut the pool down even when a worker raises, so no worker
        # processes are left behind.
        pool.close()
        pool.join()
    return pd.concat(results)


def parallel_classify_repo(data):
    """Run ``classify_own_repo`` over every row of one DataFrame chunk and return the result."""
    return data.apply(classify_own_repo, axis='columns')


def parallel_classify_org(data):
    """Run ``classify_org`` over every row of one DataFrame chunk and return the result."""
    return data.apply(classify_org, axis='columns')


In [45]:
print('Classifying events as own repo or not')
# Replace the frame with a copy that carries the own_repo flag on every row,
# computed chunk by chunk in the worker pool.
ghevents_df = parallelize_dataframe(df=ghevents_df, func=parallel_classify_repo)

Classifying events as own repo or not


In [46]:
print('Classifying events as org or not')
# Replace the frame with a copy that carries the org_event flag on every row,
# computed chunk by chunk in the worker pool.
ghevents_df = parallelize_dataframe(df=ghevents_df, func=parallel_classify_org)

Classifying events as org or not


# 3. Pickle the dataframe for the next step

In [66]:
# Build the destination once so the log message and the write use the same path.
save_path = join(save_folder, save_filename)
print('Pickling dataframe for modifying - saved to '+ save_path)
# The context manager closes (and thereby flushes) the file handle even if
# pickling fails; the original left the handle open.
with open(save_path, 'wb') as pickle_file:
    pickle.dump(ghevents_df, pickle_file)

Pickling dataframe for modifying - saved to /Users/Toavina/githubdata/7.regetting_gh_events_data/3.ghevents_df/hnusers_ghevents_df.pkl
