In [2]:
import pandas as pd
import numpy as np
import _pickle
from os.path import join
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from functools import reduce
from collections import defaultdict
import re

In [70]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline
# NOTE(review): the next line only references the function object and never calls it,
# so it has no effect. Presumably a call such as sns.set_style('darkgrid') was
# intended -- confirm the desired style before changing it.
sns.set_style
# Allow up to 500 columns to be shown when displaying wide dataframes
pd.options.display.max_columns = 500

# 1. Load dataframe

In [71]:
print('Loading main dataframe with aggregated Linkedin, Github and Hacker News data')

# NOTE(review): absolute, machine-specific path; the notebook only runs where this
# directory layout exists. Consider making the data directory configurable.
inputfile_path = join('/Users/','Toavina','githubdata','11.getting_linkedin_data','4.pickles','merged_df.pkl')

# Open the pickle through a context manager so the file handle is always closed.
# NOTE(review): unpickling executes arbitrary code -- only load files produced by
# the trusted upstream script.
with open(inputfile_path, 'rb') as inputfile:
    main_df = _pickle.load(inputfile)

# Change below if they have been changed in previous script
max_jobs = 6
max_edu = 5

# Education points for use in classifying degrees
# (higher value = higher-ranked degree; used to pick the "highest" degree later on)
edu_points = {'phd':7,
             'master':6,
             'bachelor':5,
             'associate':4,
             'certificate':3,
             'diploma':2,
             'school':1,
             'unknown':1}

# Cutoff dates for final table
min_date = '2012-12-31'
max_date = '2016-12-31'


Loading main dataframe with aggregated Linkedin, Github and Hacker News data


# 2. Creating timeseries dataframe for better charting and analysis user by user

## 2.1 Github events

In [72]:
print('Setting up functions to create time-series-index for Github events per user')

# -------------------------------------------------------------------------------

def select_indiv_eventcol(df,event_type,loc,weight=1):
    """Extract one user's time series for a single Github event type.

    - df must be a dataframe whose event columns are labelled with tuples;
    the first tuple element is matched against event_type and the second
    element becomes the index of the returned series
    (presumably (event name, period) -- verify against the upstream script).
    - event_type can be one of the following:
    {'CreateEvent', 'PushEvent', 'GollumEvent',
    'PullRequestReviewCommentEvent', 'DeleteEvent',
    'PullRequestEvent', 'GistEvent', 'PublicEvent'}
    - loc is the POSITIONAL row of the user in df
    - weight multiplies every value of that event series

    Returns a Series named '<event_type>_<github username>' with the weighted
    values of the matching columns for that user.
    """

    # Look the username up by position so that it always comes from the same
    # row as the data selected with .iloc below (a label lookup would pick a
    # different row, or fail, on a dataframe without a default 0..n-1 index).
    gh_username = df['inferred_ghuser_copy'].iloc[loc]

    # Positions of the tuple-labelled columns whose first element matches the event
    relevant_index = [position for position, col in enumerate(df.columns)
                      if type(col) is tuple and event_type in col[0]]

    # Values of the relevant columns for this user, multiplied by the weight
    event_series = df.iloc[loc, relevant_index] * weight

    # Keep only the second tuple element (the time key) as the index
    event_series.index = event_series.index.map(lambda col: col[1])

    return event_series.rename(event_type + '_' + gh_username)

# -------------------------------------------------------------------------------


def agg_eventcols(df, col_list_weight,loc,colname):
    """Build one weighted aggregate event series for a user.

    - df is the main dataframe (see select_indiv_eventcol for the expected columns)
    - col_list_weight is a non-empty list of tuples of the form
    [(event_type, weight)]; an empty list makes reduce() raise TypeError
    - loc is the POSITIONAL row of the user in df
    - colname is the prefix of the name of the returned series

    Returns a Series named '<colname>_<github username>' holding the sum of the
    weighted per-event series.
    """
    # Positional lookup, consistent with the .iloc row selection done in
    # select_indiv_eventcol
    gh_username = df['inferred_ghuser_copy'].iloc[loc]

    weighted_series = [select_indiv_eventcol(df, event_type, loc, weight)
                       for event_type, weight in col_list_weight]

    # Element-wise sum; pandas aligns the series on their index while adding
    aggregate = reduce(lambda left, right: left + right, weighted_series)

    return aggregate.rename(colname + '_' + gh_username)

# -------------------------------------------------------------------------------


# list of event types to process to add to each user
# Event types for which an individual time-series column is created per user
event_types = ['CreateEvent', 'PushEvent', 'GollumEvent',
    'PullRequestReviewCommentEvent', 'DeleteEvent',
    'PullRequestEvent', 'GistEvent', 'PublicEvent']


# Weights for creating aggregate time series
# equal_weights: every event type counts once
equal_weights = [('CreateEvent', 1), 
                 ('PushEvent', 1),
                 ('DeleteEvent', 1),
                 ('GistEvent', 1),
                 ('GollumEvent', 1),
                 ('PublicEvent', 1),
                 ('PullRequestEvent', 1),
                 ('PullRequestReviewCommentEvent', 1)
                ]
    
# perso_weight_list: Create, Delete and Public events count double
perso_weight_list = [('CreateEvent', 2),
                      ('PushEvent', 1),
                      ('DeleteEvent', 2),
                      ('GistEvent', 1),
                      ('GollumEvent', 1),
                      ('PublicEvent', 2),
                      ('PullRequestEvent', 1),
                      ('PullRequestReviewCommentEvent', 1)
                     ]

# ---------------------------------------------------------------------------------------------------------

# Everything below is computed for the user at row position 0 of main_df only
print('\nCreating time series of Github events')
# One series per event type (unweighted), placed side by side as columns
user0_ghevents = [select_indiv_eventcol(main_df,event,0,1) for event in event_types]
user0_ghevents = pd.concat(user0_ghevents, axis=1)

print('Adding aggregate Github event columns, equal weights and personalised weights')
aggevent_equalcol = agg_eventcols(main_df,equal_weights,0,'AggEventsEqual')
aggevent_weightedcol = agg_eventcols(main_df,perso_weight_list,0,'AggEventsWeighted')
# Append the two aggregate columns to the per-event columns
user0_ghevents = pd.concat([user0_ghevents,aggevent_equalcol,aggevent_weightedcol], axis=1)

# Change at the end as need to add level
# (disabled: would split '<name>_<user>' column names into a two-level column index)
# user0_ghevents.columns = user0_ghevents.columns.str.split('_', expand=True)
# user0_ghevents = user0_ghevents.reorder_levels([1,0], axis=1)

# ---------------------------------------------------------------------------------------------------------

Setting up functions to create time-series-index for Github events per user

Creating time series of Github events
Adding aggregate Github event columns, equal weights and personalised weights


## 2.2 Adding Hacker News Posts

In [73]:
# ---------------------------------------------------------------------------------------------------------

def create_hn_post_series(df, loc=0):
    """Creates a series containing Hacker News Posts for one user.

    - df must contain the columns 'inferred_ghuser_copy' (username) and
    'dates_posted' (an iterable of posting dates per user)
    - loc is the POSITIONAL row of the user in df; defaults to 0, which is the
    row that was previously hard-coded

    Returns an int32 Series named 'HNPosts_<username>' indexed by posting date
    with the value 1 for every distinct date; the series is empty when the
    user has no posting dates.
    """

    username = df['inferred_ghuser_copy'].iloc[loc]

    dates_posted = df['dates_posted'].iloc[loc]
    # A missing cell (None / NaN) means the user never posted
    if dates_posted is None or (np.isscalar(dates_posted) and pd.isnull(dates_posted)):
        dates_posted = []

    # A dict keeps a single entry per date, exactly like the original loop did
    posted_dict = dict.fromkeys(dates_posted, np.int32(1))

    hn_series = pd.Series(posted_dict, dtype='int32')

    return hn_series.rename('HNPosts' + '_' + username)


def merge_hn_ghevents(ghevents_df,hn_df):
    """Left-joins the Hacker News series onto the Github events dataframe.

    Dates without a Hacker News post are set to 0 in every column whose name
    contains 'HNPosts', and the whole result is cast to int32.
    Returns a new dataframe; ghevents_df itself is not modified.
    """

    merged = ghevents_df.join(hn_df)

    # Columns brought in by the Hacker News series
    hn_columns = [name for name in merged.columns if 'HNPosts' in name]
    merged[hn_columns] = merged[hn_columns].fillna(value=0)

    return merged.astype('int32')

# ---------------------------------------------------------------------------------------------------------

print('Adding Hacker News Posts')
# Hacker News posting dates of the user at row 0, joined onto that user's Github events
hn0_series = create_hn_post_series(main_df)
user0_ghevents = merge_hn_ghevents(user0_ghevents,hn0_series)

# NOTE(review): the commented-out lines below are the earlier inline version of the
# same steps now done by create_hn_post_series / merge_hn_ghevents; they are dead
# code and can be deleted.
# posted_dict = {}
# username = main_df['inferred_ghuser_copy'][0]
# for each in main_df['dates_posted'][0]:
#     posted_dict[each] = np.int32(1)
# hn_series = pd.Series(posted_dict)
# hn_series = hn_series.rename('HNPosts_{}'.format(username))

# user0_ghevents = user0_ghevents.join(hn_series)

# user0_ghevents['HNPosts_{}'.format(username)] = user0_ghevents['HNPosts_{}'.format(username)].fillna(value=0)

# user0_ghevents['HNPosts_{}'.format(username)] = user0_ghevents['HNPosts_{}'.format(username)].astype('int32')

Adding Hacker News Posts


## 2.3 Adding Experiences Start Dates, End Dates by Type & with Details

In [74]:
# ---------------------------------------------------------------------------------------------------------

def create_exp_ts(df,loc):
    """Creates an experience time-series dataframe for one user.

    - df must contain 'inferred_ghuser_copy' and 'all_exp'; each item of the
    user's 'all_exp' cell is read through event['dates'][0] (start),
    event['dates'][1] (end) and event['exp_type'] ('job', anything else is
    treated as education)
    - loc is the POSITIONAL row of the user in df

    Returns a dataframe indexed by date with six object columns, in this order:
    JobStart, JobEnd, EduStart, EduEnd, ExpStart, ExpEnd (each suffixed with
    '_<github username>'). A cell holds the list of events starting / ending
    on that date and is missing where nothing happens.
    """

    # Positional lookups so the username and the experiences come from the same row
    ghusername = df['inferred_ghuser_copy'].iloc[loc]

    # Output column prefixes, in the order of the returned columns
    prefixes = ['JobStart', 'JobEnd', 'EduStart', 'EduEnd', 'ExpStart', 'ExpEnd']
    events_by_date = {prefix: defaultdict(list) for prefix in prefixes}

    for event in df['all_exp'].iloc[loc]:
        start_date, end_date = event['dates'][0], event['dates'][1]

        # Every experience is recorded in the overall Exp* columns ...
        events_by_date['ExpStart'][start_date].append(event)
        events_by_date['ExpEnd'][end_date].append(event)

        # ... and in either the Job* or the Edu* columns
        kind = 'Job' if event['exp_type'] == 'job' else 'Edu'
        events_by_date[kind + 'Start'][start_date].append(event)
        events_by_date[kind + 'End'][end_date].append(event)

    # dtype=object keeps the list cells untouched and gives empty series a stable dtype
    frames = [pd.Series(dict(events_by_date[prefix]), dtype=object)
                .rename('{}_{}'.format(prefix, ghusername))
                .to_frame()
              for prefix in prefixes]

    return pd.concat(frames, axis=1)

# ---------------------------------------------------------------------------------------------------------

print('Adding experiences (job and education) with start and end dates')
# Place the experience start/end columns of the user at row 0 next to that user's
# Github / Hacker News columns (pd.concat along the column axis)
user0_allexp = pd.concat([user0_ghevents,create_exp_ts(main_df,0)],axis=1)

Adding experiences (job and education) with start and end dates


In [75]:
def _fill_experience_slots(exp_df, events, prefix, ghusername, n_slots):
    """Write columns '<prefix>Exp<slot>_<user>' holding, for every date of the
    index, the event dict of that slot while the date lies within its dates."""
    ts_index = exp_df.index

    for slot in range(n_slots):
        # Object column, missing everywhere unless the slot's event is ongoing
        column = np.full(len(ts_index), np.nan, dtype=object)

        if slot < len(events):
            event = events[slot]
            start_date, end_date = event['dates'][0], event['dates'][1]
            for position, date in enumerate(ts_index):
                if start_date <= date <= end_date:
                    column[position] = event

        exp_df['{}Exp{}_{}'.format(prefix, slot, ghusername)] = column


def get_numexp(exp_df, maindf, userloc):
    """Adds one column per job / education slot holding the experience that is
    ongoing at each date of the time series.

    - exp_df must contain 'JobStart_<user>' and 'EduStart_<user>', whose
    non-missing cells are lists of experience dicts with a 'dates' entry
    (start, end)
    - maindf / userloc identify the user (positional row) to read the username
    - the number of slots comes from the module-level max_jobs and max_edu

    Columns JobExp0..JobExp<max_jobs-1> and EduExp0..EduExp<max_edu-1> (suffixed
    with '_<user>') are added to exp_df IN PLACE; exp_df is also returned.

    Every experience gets its own slot, in order of start date, including
    experiences that start on the same date. Experiences beyond the number of
    available slots are not written.
    """

    # Get the username to append to each column for final dataframe
    ghusername = maindf['inferred_ghuser_copy'].iloc[userloc]

    def started_events(start_col):
        # np.any(...) because a cell is either a missing scalar or a list of
        # events; a plain truth test fails on lists with more than one event
        return [event
                for cell in exp_df[start_col] if np.any(pd.notnull(cell))
                for event in cell]

    jobs_list = started_events('JobStart_{}'.format(ghusername))
    edu_list = started_events('EduStart_{}'.format(ghusername))

    # Jobs first, then education, to keep the original column order
    _fill_experience_slots(exp_df, jobs_list, 'Job', ghusername, max_jobs)
    _fill_experience_slots(exp_df, edu_list, 'Edu', ghusername, max_edu)

    return exp_df
    
# --------------------------------------------------

# Adds the JobExp*/EduExp* slot columns for the user at row 0 (exp_df is modified in place)
user0_allexp = get_numexp(user0_allexp,main_df,0)

In [76]:
def explode_exp_info(row, maindf, userloc):
    """Explodes the information from each job and education cell.

    To be applied to each row of the experience dataframe. For every job slot
    (max_jobs of them) and education slot (max_edu of them), seven entries are
    added to the row - Dates, Desc, Institution, InstitutionType,
    OverallTenure, Title, TitleType - filled from the experience dict stored
    in '<Job|Edu>Exp<slot>_<user>', or None when that cell is missing.
    Returns the extended row.
    """
    ghusername = maindf['inferred_ghuser_copy'].iloc[userloc]

    # (suffix of the new entry, key read from the experience dict)
    field_sources = [('Dates', 'dates'),
                     ('Desc', 'desc'),
                     ('Institution', 'institution'),
                     ('InstitutionType', 'institution_type'),
                     ('OverallTenure', 'tenure'),
                     ('Title', 'title'),
                     ('TitleType', 'title_type')]

    # Jobs are handled before education so the new entries keep their order
    slot_groups = [('Job', max_jobs), ('Edu', max_edu)]

    # First pass: create every entry with a None placeholder
    for prefix, n_slots in slot_groups:
        for i in range(n_slots):
            for suffix, _ in field_sources:
                row['{}Exp{}{}_{}'.format(prefix, suffix, i, ghusername)] = None

    # Second pass: fill the entries of the slots that hold an experience
    for prefix, n_slots in slot_groups:
        for i in range(n_slots):
            experience = row['{}Exp{}_{}'.format(prefix, i, ghusername)]
            if pd.notnull(experience):
                for suffix, key in field_sources:
                    row['{}Exp{}{}_{}'.format(prefix, suffix, i, ghusername)] = \
                    experience[key]

    return row

# ----------------------------------------------------------------------------

# Row-wise apply: every date row gets the exploded Job*/Edu* detail columns
user0_allexp = user0_allexp.apply(explode_exp_info, args=(main_df,0), axis=1)


In [77]:
# Copy of the index as a regular column; idenfify_bootcamps and idenfify_recent_job
# read the timestamps from this 'Index' column
user0_allexp['Index'] = user0_allexp.index

In [1]:
def get_more_features(exp_df, maindf, userloc):
    """Extract more features - tenure, whether employed, in education, number of current job titles...

    - exp_df must already contain the 'JobExpDates<i>_<user>',
    'JobExpInstitution<i>_<user>', 'EduExpInstitution<i>_<user>' columns and the
    JobStart / JobEnd / EduStart / EduEnd columns
    - maindf / userloc identify the user (positional row) to read the username
    - the number of job slots comes from the module-level max_jobs

    Columns added to exp_df IN PLACE (all suffixed with '_<user>'), in this order:
    JobExpCurrentTenure0..<max_jobs-1>, NumCurrentJobs, NumCurrentEdu,
    NumCurrentJobsAndEdu, EmploymentStatus, StudyingStatus, NEET, NewJobFlag,
    EndJobFlag, StartEduFlag, EndEduFlag. The count / status / flag columns are
    integer columns. exp_df is also returned.
    """
    ghusername = maindf['inferred_ghuser_copy'].iloc[userloc]

    ts_index = exp_df.index

    def user_col(name):
        return '{}_{}'.format(name, ghusername)

    def non_missing_per_date(pattern):
        # Number of non-missing cells, per date, over the columns matching pattern
        matching = [col for col in exp_df.columns if re.search(pattern, col)]
        return exp_df[matching].count(axis=1)

    # Tenure of each job slot at each date: date minus the start date of the job
    for i in range(max_jobs):
        tenure = np.full(len(ts_index), np.nan, dtype=object)
        job_dates = exp_df['JobExpDates{}_{}'.format(i, ghusername)]
        for position, (date, dates) in enumerate(zip(ts_index, job_dates)):
            # np.any(...) because the cell is either missing or a (start, end) pair
            if np.any(pd.notnull(dates)):
                tenure[position] = date - dates[0]
        exp_df['JobExpCurrentTenure{}_{}'.format(i, ghusername)] = \
        pd.Series(tenure, index=ts_index, dtype=object)

    # Number of jobs, education and total experiences ongoing at each date.
    # The trailing \d keeps the 'InstitutionType' columns out of the count.
    num_jobs = non_missing_per_date(r'JobExpInstitution\d')
    num_edu = non_missing_per_date(r'EduExpInstitution\d')
    num_jobs_and_edu = num_edu + num_jobs

    exp_df[user_col('NumCurrentJobs')] = num_jobs
    exp_df[user_col('NumCurrentEdu')] = num_edu
    exp_df[user_col('NumCurrentJobsAndEdu')] = num_jobs_and_edu

    # EmploymentStatus, StudyingStatus (1 = at least one ongoing) and
    # NEET (1 = neither job nor education ongoing)
    exp_df[user_col('EmploymentStatus')] = (num_jobs != 0).astype(int)
    exp_df[user_col('StudyingStatus')] = (num_edu != 0).astype(int)
    exp_df[user_col('NEET')] = (num_jobs_and_edu == 0).astype(int)

    # Flags for dates on which a job / education starts or ends.
    # NOTE(review): these count non-missing cells, so several experiences
    # starting on the same date still count as 1 -- confirm this is intended.
    for flag_name, pattern in [('NewJobFlag', 'JobStart'),
                               ('EndJobFlag', 'JobEnd'),
                               ('StartEduFlag', 'EduStart'),
                               ('EndEduFlag', 'EduEnd')]:
        exp_df[user_col(flag_name)] = non_missing_per_date(pattern)

    return exp_df

# -------------------------------------------------------------

# NOTE(review): the saved output of this cell is a NameError and its execution count
# is In [1], i.e. it was last run on a fresh kernel before the cells above.
# Run the notebook top to bottom so that user0_allexp exists before this call.
user0_allexp = get_more_features(user0_allexp,main_df,0)

NameError: name 'user0_allexp' is not defined

In [79]:
def cum_jobs_edu_todate(exp_df,maindf,userloc):
    """Adds running totals of the job-start and education-start flags.

    Writes 'CumJobsToDate_<user>' (cumulative sum of 'NewJobFlag_<user>') and
    'CumEduToDate_<user>' (cumulative sum of 'StartEduFlag_<user>') into exp_df
    in place. Returns None.
    """
    ghusername = maindf['inferred_ghuser_copy'].iloc[userloc]

    # (new cumulative column, flag column it accumulates)
    cumulative_pairs = (('CumJobsToDate', 'NewJobFlag'),
                        ('CumEduToDate', 'StartEduFlag'))

    for cumulative_name, flag_name in cumulative_pairs:
        running_total = exp_df['{}_{}'.format(flag_name, ghusername)].cumsum()
        exp_df['{}_{}'.format(cumulative_name, ghusername)] = running_total
    
# -------------------------------------------------------------

# Works in place on user0_allexp; the function returns None, so nothing is reassigned
cum_jobs_edu_todate(user0_allexp,main_df,0)
    

In [80]:
def idenfify_highest_degree(exp_df,maindf,userloc):
    """Tracks, date by date, the highest-rated education started so far.

    - exp_df must contain 'EduStart_<user>', whose non-missing cells are lists
    of education dicts with the keys 'title_type', 'institution',
    'institution_type', 'title' and 'dates' (start, end)
    - maindf / userloc identify the user (positional row) to read the username
    - degrees are rated through the module-level edu_points mapping; a
    title_type missing from it raises KeyError

    Columns written to exp_df IN PLACE (all suffixed with '_<user>'):
    HighestDegree, HighestInstitution, HighestInstitutionType,
    HighestDegreeDesc, HighestDegreeStartDate, HighestDegreeEndDate,
    HighestDegreeTimeSinceStartDate, HighestDegreeTimeSinceEndDate.
    The first four are 'unknown' until an education has started; the date and
    time-since columns are missing until then, and TimeSinceEndDate stays
    missing while the end date is not strictly before the row's date.
    exp_df is also returned.
    """

    ghusername = maindf['inferred_ghuser_copy'].iloc[userloc]
    ts_index = exp_df.index
    n_rows = len(ts_index)

    # Running state: best education seen so far
    highest_edu_rating = 0
    highest = {'HighestDegree': 'unknown',
               'HighestInstitution': 'unknown',
               'HighestInstitutionType': 'unknown',
               'HighestDegreeDesc': 'unknown',
               'HighestDegreeStartDate': np.nan,
               'HighestDegreeEndDate': np.nan}

    # Output columns in creation order, all starting out as missing
    output_names = ['HighestDegree', 'HighestInstitution', 'HighestInstitutionType',
                    'HighestDegreeDesc', 'HighestDegreeStartDate', 'HighestDegreeEndDate',
                    'HighestDegreeTimeSinceStartDate', 'HighestDegreeTimeSinceEndDate']
    columns = {name: np.full(n_rows, np.nan, dtype=object) for name in output_names}

    edu_start_cells = exp_df['EduStart_{}'.format(ghusername)]

    for i, (date, cell) in enumerate(zip(ts_index, edu_start_cells)):

        # A non-missing cell means at least one education starts on this date
        education_starts = bool(np.any(pd.notnull(cell)))

        if education_starts:
            for edu in cell:
                edu_rating = edu_points[edu['title_type']]
                # '>=' on purpose kept from the original code: a later education
                # with an EQUAL rating replaces the current one
                if edu_rating >= highest_edu_rating:
                    highest_edu_rating = edu_rating
                    highest['HighestDegree'] = edu['title_type']
                    highest['HighestInstitution'] = edu['institution']
                    highest['HighestInstitutionType'] = edu['institution_type']
                    highest['HighestDegreeDesc'] = edu['title']
                    highest['HighestDegreeStartDate'] = edu['dates'][0]
                    highest['HighestDegreeEndDate'] = edu['dates'][1]

        # Rows without a new education simply carry the current state forward
        for name, value in highest.items():
            columns[name][i] = value

        # Time since start / end is only meaningful once a degree is known.
        # As before, carried-forward rows whose degree type is literally
        # 'unknown' get no time-since values.
        if education_starts or highest['HighestDegree'] != 'unknown':
            start_date = highest['HighestDegreeStartDate']
            end_date = highest['HighestDegreeEndDate']

            columns['HighestDegreeTimeSinceStartDate'][i] = date - start_date

            # Only once the degree has ended; otherwise stays missing
            if end_date < date:
                columns['HighestDegreeTimeSinceEndDate'][i] = date - end_date

    for name in output_names:
        exp_df['{}_{}'.format(name, ghusername)] = \
        pd.Series(columns[name], index=ts_index, dtype=object)

    return exp_df
        
# ----------------------------------------------------------------
    
# Adds the Highest* education columns for the user at row 0 (exp_df is modified in place)
user0_allexp = idenfify_highest_degree(user0_allexp,main_df,0)

In [81]:
def idenfify_bootcamps(exp_df,maindf,userloc):
    """Tracks, date by date, the most recently started bootcamp / certificate.

    - exp_df must contain 'ExpStart_<user>', whose non-missing cells are lists
    of experience dicts with the keys 'institution', 'institution_type',
    'title_type' and 'dates' (start, end). An experience counts as a bootcamp
    when institution_type is 'bootcamp' or title_type is 'certificate'.
    - the timestamp of each row is read from the 'Index' column when present,
    otherwise from the dataframe index
    - maindf / userloc identify the user (positional row) to read the username

    Columns written to exp_df IN PLACE (all suffixed with '_<user>'):
    MostRecentBootCamp, BootCampDates, TimeSinceBootCampStart,
    TimeSinceBootCampEnd. They are missing until a bootcamp has started; from
    then on the two time-since columns are recomputed for EVERY row as the
    row's timestamp minus the bootcamp start / end date.
    exp_df is also returned.
    """

    ghusername = maindf['inferred_ghuser_copy'].iloc[userloc]

    # Plain list of timestamps, read positionally
    if 'Index' in exp_df.columns:
        timestamps = list(exp_df['Index'])
    else:
        timestamps = list(exp_df.index)
    n_rows = len(timestamps)

    # Output columns in creation order, all starting out as missing
    output_names = ['MostRecentBootCamp', 'BootCampDates',
                    'TimeSinceBootCampStart', 'TimeSinceBootCampEnd']
    columns = {name: np.full(n_rows, np.nan, dtype=object) for name in output_names}

    # Running state: latest bootcamp seen so far (None = none yet)
    bootcamp_institution = None
    bootcamp_dates = None

    exp_start_cells = exp_df['ExpStart_{}'.format(ghusername)]

    for i, (date, cell) in enumerate(zip(timestamps, exp_start_cells)):

        # A non-missing cell means at least one experience starts on this date
        if np.any(pd.notnull(cell)):
            for exp in cell:
                if exp['institution_type'] == 'bootcamp' or exp['title_type'] == 'certificate':
                    bootcamp_institution = exp['institution']
                    bootcamp_dates = exp['dates']

        # Nothing to report until the first bootcamp has started
        if bootcamp_dates is None:
            continue

        columns['MostRecentBootCamp'][i] = bootcamp_institution
        columns['BootCampDates'][i] = bootcamp_dates
        # Always relative to the current row, also on rows where some other
        # experience starts (these used to keep the value of an earlier row)
        columns['TimeSinceBootCampStart'][i] = date - bootcamp_dates[0]
        columns['TimeSinceBootCampEnd'][i] = date - bootcamp_dates[1]

    for name in output_names:
        exp_df['{}_{}'.format(name, ghusername)] = \
        pd.Series(columns[name], index=exp_df.index, dtype=object)

    return exp_df
        
# ----------------------------------------------------------------
# Adds the bootcamp columns for the user at row 0 (exp_df is modified in place)
user0_allexp = idenfify_bootcamps(user0_allexp,main_df,0)


In [82]:
def idenfify_recent_job(exp_df,maindf,userloc):
    """Add the user's most recently started job to every row of the time series.

    Rows are visited in order. When a row's ``JobStart_<user>`` cell holds job
    records, the last record whose start date is not earlier than the first
    time-series date becomes the "recent job". Rows without job records keep
    the recent job of the rows before them. Until a job qualifies, the text
    columns hold ``'unknown'`` and the date/elapsed-time columns hold ``None``.

    Parameters
    ----------
    exp_df : pandas.DataFrame
        Time series for one user. Must contain an ``Index`` column with each
        row's date and a ``JobStart_<user>`` column whose cells are either
        null or an iterable of dicts with the keys ``title_type``,
        ``institution``, ``institution_type``, ``title`` and ``dates``
        (a ``(start, end)`` pair).
    maindf : pandas.DataFrame
        Frame whose ``inferred_ghuser_copy`` column supplies the user name
        used as the column suffix.
    userloc : int
        Position of the user in ``maindf``.

    Returns
    -------
    pandas.DataFrame
        ``exp_df`` itself, modified in place, with these object-dtype columns
        (each suffixed ``_<user>``): ``RecentJob``, ``RecentJobInstitution``,
        ``RecentJobInstitutionType``, ``RecentJobDesc``, ``RecentJobStartDate``,
        ``RecentJobEndDate``, ``RecentJobTimeSinceStartDate`` and
        ``RecentJobTimeSinceEndDate``.

    Notes
    -----
    Whole columns are assigned at once instead of writing cell by cell through
    ``DataFrame.ix``, which no longer exists in pandas >= 1.0.
    """
    ghusername = maindf['inferred_ghuser_copy'].iloc[userloc]
    row_dates = list(exp_df['Index'])
    job_cells = list(exp_df['JobStart_{}'.format(ghusername)])

    # State carried from one row to the next. The dates start as None so that
    # a populated row without any qualifying job no longer hits an unbound
    # local variable.
    recent_job = 'unknown'
    recent_institution = 'unknown'
    recent_institution_type = 'unknown'
    recent_title = 'unknown'
    recent_job_start_date = None
    recent_job_end_date = None

    # Output columns, in the order they are added to exp_df
    column_names = ('RecentJob',
                    'RecentJobInstitution',
                    'RecentJobInstitutionType',
                    'RecentJobDesc',
                    'RecentJobStartDate',
                    'RecentJobEndDate',
                    'RecentJobTimeSinceStartDate',
                    'RecentJobTimeSinceEndDate')
    records = []

    for row_date, job_cell in zip(row_dates, job_cells):

        # Rows without job records simply inherit the carried state
        if np.any(pd.notnull(job_cell)):
            for job in job_cell:
                job_start = job['dates'][0]
                # Only jobs that start on/after the first time-series date
                # count; with several in one cell, the last one listed wins.
                if pd.notnull(job_start) and job_start >= row_dates[0]:
                    recent_job = job['title_type']
                    recent_institution = job['institution']
                    recent_institution_type = job['institution_type']
                    recent_title = job['title']
                    recent_job_start_date = job_start
                    recent_job_end_date = job['dates'][1]

        time_since_start = None
        time_since_end = None

        # Keyed off the start date rather than the 'unknown' title sentinel,
        # because a real job may itself carry the title type 'unknown'.
        if pd.notnull(recent_job_start_date):
            time_since_start = row_date - recent_job_start_date

            # Elapsed time since the end only exists once the job has ended;
            # an open-ended job (null end date) leaves it empty.
            if pd.notnull(recent_job_end_date) and recent_job_end_date < row_date:
                time_since_end = row_date - recent_job_end_date

        records.append((recent_job,
                        recent_institution,
                        recent_institution_type,
                        recent_title,
                        recent_job_start_date,
                        recent_job_end_date,
                        time_since_start,
                        time_since_end))

    for position, name in enumerate(column_names):
        # Explicit object dtype keeps the stored Timestamp/Timedelta/None
        # values untouched, as the cell-by-cell writes used to
        exp_df['{}_{}'.format(name, ghusername)] = pd.Series(
            [record[position] for record in records],
            index=exp_df.index,
            dtype=object)

    return exp_df
        
# ----------------------------------------------------------------
    
# Attach the recent-job columns for the user at position 0 of main_df
user0_allexp = idenfify_recent_job(exp_df=user0_allexp, maindf=main_df, userloc=0)

In [83]:
def get_min_max_tenure(row,maindf,userloc):
    """Add the minimum, maximum and mean current job tenure to one row.

    Every entry of ``row`` whose label contains ``'JobExpCurrentTenure'`` is
    treated as a tenure value; missing entries are ignored. The statistics are
    written to ``MinJobTenure_<user>``, ``MaxJobTenure_<user>`` and
    ``MeanJobTenure_<user>``. When the row has no non-null tenure at all
    (including the case of no tenure columns) the three values are 0.

    Parameters
    ----------
    row : pandas.Series
        One row of the time series (as passed by ``DataFrame.apply(axis=1)``).
    maindf : pandas.DataFrame
        Frame whose ``inferred_ghuser_copy`` column supplies the user name
        used as the column suffix.
    userloc : int
        Position of the user in ``maindf``.

    Returns
    -------
    pandas.Series
        The same ``row`` object, extended with the three tenure entries.
    """
    ghusername = maindf['inferred_ghuser_copy'].iloc[userloc]

    relevantcols = [col for col in row.index
                    if isinstance(col, str) and 'JobExpCurrentTenure' in col]

    # Factor to divide result by to get result in days
    # NOTE(review): this is the number of nanoseconds in a day, so the tenure
    # values are presumably nanosecond counts by the time they arrive here -
    # confirm; a pandas Timedelta divided by this integer would not be days.
    divfactor = 3600*24*1000000000

    # Decide on the filtered values themselves: the previous
    # pd.isnull(values.any()) check did not test "all tenures missing" and let
    # an all-null row fall through to min([]).
    tenures = [item for item in row[relevantcols] if pd.notnull(item)]

    if tenures:
        min_tenure = min(tenures)/divfactor
        max_tenure = max(tenures)/divfactor
        mean_tenure = np.mean(tenures)/divfactor
    else:
        min_tenure = max_tenure = mean_tenure = 0

    row['MinJobTenure_{}'.format(ghusername)] = min_tenure
    row['MaxJobTenure_{}'.format(ghusername)] = max_tenure
    row['MeanJobTenure_{}'.format(ghusername)] = mean_tenure

    return row

# ----------------------------------------------------------------    

# Compute the per-row tenure statistics for user 0, then discard the helper
# 'Index' column that the row-wise passes needed.
user0_allexp = (
    user0_allexp
    .apply(get_min_max_tenure, axis=1, args=(main_df, 0))
    .drop('Index', axis=1)
)

## 2.4 Adding All Info from Main Dataframe to TimeSeries

In [84]:
# Profile columns taken from main_df, each paired with the prefix it gets in
# the per-user frame ('<Prefix>_<username>'). The list order is the column
# order; it is the single source for both the selection and the renaming.
profile_fields = [
    ('linkedin_name', 'LinkedInName'),
    ('name', 'Name'),
    ('contact', 'Contact'),
    ('gh_acct_created_at', 'GHAcctCreatedAt'),
    ('updated_at', 'GHAcctUpdatedAt'),
    ('followers', 'GHFollowers'),
    ('following', 'GHFollowing'),
    ('hireable', 'GHHireable'),
    ('email', 'Email'),
    ('inferred_ghuser_copy', 'InferredGHUserCopy'),
    ('login', 'GHLogin'),
    ('github_account', 'GHAcct'),
    ('hn_username', 'HNUsername'),
    ('location_hn', 'HNLocation'),
    ('linkedin_location', 'LinkedInLocation'),
    ('location_gh', 'GHLocation'),
    ('company', 'GHCompany'),
    ('remote', 'Remote'),
    ('can_relocate', 'CanRelocate'),
    ('stack', 'Stack'),
    ('resume', 'Resume'),
    ('links', 'Links'),
    ('text', 'Text'),
    ('body', 'Body'),
    ('bio', 'Bio'),
    ('blog', 'Blog'),
    ('public_gists', 'PublicGists'),
    ('public_repos', 'PublicRepos'),
]

# Number of monthly rows in the time series, starting at 2013-01-31
n_months = 49

# One-row frame for the user stored under label 0 of main_df.
# NOTE(review): the row is pulled out as a Series and transposed back, which
# likely leaves every column as object dtype; add_gh_info calls .to_period on
# the unique creation dates, so confirm before switching to a dtype-preserving
# selection such as main_df.loc[[0], cols].
user_data0 = pd.DataFrame(
    main_df[[source for source, prefix in profile_fields]].loc[0,:]
).transpose()

user_data0['old_index'] = user_data0.index

username = user_data0['inferred_ghuser_copy'][0]

rename_map = {source: '{}_{}'.format(prefix, username)
              for source, prefix in profile_fields}
rename_map['old_index'] = 'OldIndex_{}'.format(username)

user_data0 = user_data0.rename(columns=rename_map)

# Repeat the single profile row once per month and label the rows with the
# monthly dates. (The former intermediate index assignment was overwritten
# right away by this one, so it has been dropped.)
date_index = pd.date_range('01/31/2013', periods=n_months, freq='M')
user_data0 = pd.concat([user_data0]*n_months)
user_data0.index = date_index

In [85]:
def add_gh_info(df,maindf,userloc):
    """Add GitHub account-age columns to a user's time series, in place.

    The first non-null value of ``GHAcctCreatedAt_<user>`` is converted to a
    month-level timestamp with ``to_period('M').to_timestamp('M')`` and
    compared with every label of ``df.index``. Two columns are written:

    - ``GHAccountCreationMonthFlag_<user>``: 1 on rows whose index label
      equals that timestamp, 0 on every other row.
    - ``TimeSinceGHAccountCreated_<user>``: index label minus that timestamp
      (negative before the account existed).

    Parameters
    ----------
    df : pandas.DataFrame
        Time series indexed by date and containing ``GHAcctCreatedAt_<user>``.
    maindf : pandas.DataFrame
        Frame whose ``inferred_ghuser_copy`` column supplies the user name
        used as the column suffix.
    userloc
        Index label of the user in ``maindf`` (looked up with ``.loc``).

    Returns
    -------
    None
        ``df`` is modified in place.

    Raises
    ------
    IndexError
        If ``GHAcctCreatedAt_<user>`` holds no non-null value.
    """
    username = maindf['inferred_ghuser_copy'].loc[userloc]

    known_creation_dates = [item for item in df['GHAcctCreatedAt_{}'.format(username)].unique()
                            if pd.notnull(item)]
    month_GH_created = known_creation_dates[0].to_period('M').to_timestamp('M')

    # Flag only the matching row(s). The previous loop assigned 1 to the whole
    # column as soon as one date matched, which flagged every month.
    df['GHAccountCreationMonthFlag_{}'.format(username)] = \
    [int(date == month_GH_created) for date in df.index]

    # Whole-column assignment replaces the cell-by-cell writes through
    # DataFrame.ix, which no longer exists in pandas >= 1.0.
    df['TimeSinceGHAccountCreated_{}'.format(username)] = \
    [date - month_GH_created for date in df.index]
        
# ------------------------------------

# Adds the GitHub account-age columns to user_data0 in place (no return value)
add_gh_info(df=user_data0, maindf=main_df, userloc=0)


In [86]:
# Place the experience time series and the repeated profile columns side by
# side (column-wise concat on the row index).
user_data0 = pd.concat([user0_allexp,user_data0],axis=1)

# Split each column label on '_' into MultiIndex levels, then swap the first
# two levels so the part after the underscore becomes the outer level.
# NOTE(review): this presumes every label contains exactly one '_'
# ('<Prefix>_<username>'); labels with a different count would yield a
# different number of levels and make reorder_levels([1,0]) fail - confirm.
user_data0.columns = user_data0.columns.str.split('_', expand=True)
user_data0 = user_data0.reorder_levels([1,0], axis=1)

In [87]:
# Keep the rows strictly between the two cutoff dates; rows dated exactly
# min_date or max_date are left out.
user_data0 = user_data0.loc[
    (user_data0.index > min_date) & (user_data0.index < max_date)
]