# Summary

The `RepoHealth` notebook reviews descriptive statistics and finds patterns that seem to be
shared by healthy and unhealthy repositories. This `PredictingRepoHealth` notebook focuses
solely on time-series prediction of repository velocity, $v(t)$, starting from a linear model:

$v(t) = \alpha + \beta_1 x_1 + \epsilon$

In [1]:
ls ~/code/*.parquet

/Users/tylerbrown/code/author_2019-04-20_0.parquet
/Users/tylerbrown/code/commit_file_2019-04-20_0.parquet
/Users/tylerbrown/code/commit_file_2019-04-20_1.parquet
/Users/tylerbrown/code/contrib_2019-04-20_0.parquet
/Users/tylerbrown/code/info_2019-04-20_0.parquet
/Users/tylerbrown/code/meta_2019-04-20_0.parquet


## Dependencies



In [2]:
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## Load Data

Databases have been consolidated and switched to Parquet files with
no more than 2e6 records in each.

In [3]:
# The commit-file table is stored as two Parquet shards; read both and stack
# them into a single frame. `pd.concat` is used because `DataFrame.append`
# was deprecated in pandas 1.4 and removed in pandas 2.0. As with `append`,
# the original row index of each shard is kept (it is not reset).
commits = pd.concat([
    pd.read_parquet('/Users/tylerbrown/code/commit_file_2019-04-20_0.parquet', 'pyarrow'),
    pd.read_parquet('/Users/tylerbrown/code/commit_file_2019-04-20_1.parquet', 'pyarrow'),
])
commits.shape

  labels = getattr(columns, 'labels', None) or [
  return pd.MultiIndex(levels=new_levels, labels=labels, names=columns.names)
  labels, = index.labels


(2327964, 5)

In [4]:
# Load the `meta` table from its single Parquet shard using the pyarrow engine,
# then display (rows, columns) as the cell output.
meta = pd.read_parquet(
    '/Users/tylerbrown/code/meta_2019-04-20_0.parquet',
    engine='pyarrow',
)
meta.shape

(928779, 3)

In [5]:
# Load the `author` table. The engine is pinned to pyarrow so this cell reads
# the file the same way as the `commits` and `meta` cells instead of relying
# on pandas' 'auto' engine selection.
author = pd.read_parquet('/Users/tylerbrown/code/author_2019-04-20_0.parquet', 'pyarrow')
author.shape

(928779, 4)

## Helper functions

These helpers are defined here for now; the plan is to move them into a standalone Python script later.

In [6]:
def acceleration(v):
    """
    Convert a velocity vector to acceleration (first differences).

    `v` must expose `.shape[0]` and integer indexing (e.g. a NumPy array).
    The first element is returned unchanged, i.e. it is treated as a step up
    from zero; every later element is the change from its predecessor.
    Returns a plain Python list with one entry per element of `v`.
    """
    deltas = []
    for idx in range(v.shape[0]):
        if idx == 0:
            # No predecessor: keep the raw velocity.
            deltas.append(v[idx])
        else:
            deltas.append(v[idx] - v[idx - 1])
    return deltas

def acceleration_per_repo(df, period):
    """
    Aggregate line churn per repository and time period, then derive
    velocity and acceleration.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `authored`, `owner_name`, `project_name`,
        `lines_added` and `lines_subtracted`. `authored` may already be a
        datetime column or anything `pd.to_datetime` can parse.
    period : str
        Period alias handed to `Series.dt.to_period` (e.g. 'M', 'Y').

    Returns
    -------
    pandas.DataFrame
        One row per (period, owner_name, project_name), ordered by those
        keys, with columns `authored` (the period), `owner_name`,
        `project_name`, summed `lines_added` / `lines_subtracted`,
        `velocity` (added minus subtracted) and `acceleration` (float).
        Within each repository, `acceleration` is the change in velocity
        from the previous period present in the data; a repository's first
        period keeps its velocity, matching what `acceleration` produces.
    """
    repo_keys = ['owner_name', 'project_name']
    churn_cols = ['lines_added', 'lines_subtracted']

    # to_datetime is a no-op for datetime input and lets string timestamps work.
    per = pd.to_datetime(df['authored']).dt.to_period(period)

    # Only the key and churn columns are selected: summing the raw `authored`
    # datetime column raises TypeError on pandas >= 2.0, and the period key
    # already comes back under the name `authored` after reset_index().
    dfgrp = df[repo_keys + churn_cols].groupby([per] + repo_keys).sum()
    dfgrp['velocity'] = dfgrp.lines_added - dfgrp.lines_subtracted
    dfgrp = dfgrp.reset_index()

    # Rows are sorted by period, so a grouped diff walks each repository in
    # time order. diff() yields NaN for a repo's first row; fill that with the
    # velocity itself. This replaces a loop that re-filtered the whole frame
    # once per repository.
    step = dfgrp.groupby(repo_keys)['velocity'].diff()
    dfgrp['acceleration'] = step.fillna(dfgrp['velocity']).astype(float)

    return dfgrp

# Data Preprocessing

We need to set up a time series.

In [7]:
# Create working table
#
# Join `meta` to `commits`, then attach `author`. No `on=` is given, so each
# merge is an inner join on whatever columns the two frames have in common.
df = meta.merge(commits).merge(author)
df.shape

(2327964, 10)

In [8]:
# Sanity check: the merged table should have exactly as many rows as `commits`,
# i.e. the joins neither dropped nor duplicated rows.
print("Weird things didn't happen: {}".format(len(df) == len(commits)))

Weird things didn't happen: True


In [9]:
# Parse `authored` into a datetime column `ts`; the feature helpers below
# rely on the `.dt` accessor of `ts`.
df['ts'] = pd.to_datetime(df['authored'])

## Feature Engineering

We need to create some features to predict velocity.

In [10]:
# Show which columns the working table offers before building features.
df.columns

Index(['commit_hash', 'owner_name', 'project_name', 'file_id', 'modified_file',
       'lines_added', 'lines_subtracted', 'name', 'email', 'authored', 'ts'],
      dtype='object')

In [21]:
def num_authors(df, period: str):
    """
    Number of distinct authors per repository in each time period.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime column `ts` plus `name`, `owner_name`
        and `project_name`.
    period : str
        Period alias handed to `Series.dt.to_period` (e.g. 'M', 'Y').

    Returns
    -------
    pandas.DataFrame
        Indexed by (`ts` period, `owner_name`, `project_name`) with a single
        column `name` holding the number of distinct, non-null author names.
    """
    per = df['ts'].dt.to_period(period)

    cols = ['name', 'owner_name', 'project_name']
    grp = [per, 'owner_name', 'project_name']

    # nunique(), not count(): an author appears on every row they touched, so
    # count() would report the number of rows rather than the number of people.
    result = df[cols].groupby(grp)['name'].nunique().to_frame()
    return result

In [92]:
def num_mentors(df, period:str, subperiod:str, k:int):
    """
    Number of authors in a larger time period who have also
    made commits in at least k of the smaller time periods.

    For example, with period='Y', subperiod='M' and k=12: the number of
    authors in a year who committed changes in every month of that year.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime column `ts` plus `owner_name`,
        `project_name` and `name`. Rows with a missing value in any of
        these are ignored.
    period : str
        The larger period. Only 'Y' is supported.
    subperiod : str
        Period alias for the smaller period, handed to
        `Series.dt.to_period` (e.g. 'M').
    k : int
        Minimum number of distinct sub-periods an author must be active in.

    Returns
    -------
    pandas.DataFrame
        Columns `year`, `owner_name`, `project_name`, `mentor_count`; one row
        per repository and year that has at least one qualifying author.

    Raises
    ------
    ValueError
        If `period` is anything other than 'Y'.
    """
    # Validate up front so an unsupported period fails before any grouping
    # work is done on a potentially large frame.
    if period != 'Y':
        raise ValueError("Period '{}' not found".format(period))

    repo = ['owner_name', 'project_name']
    author_keys = repo + ['name']

    # One row per (repository, author, sub-period). Authors are identified by
    # `name` only: keying on email as well would count an author who used two
    # addresses in the same sub-period as active in two sub-periods.
    active = (
        df[author_keys]
        .assign(subper=df['ts'].dt.to_period(subperiod))
        .dropna(subset=author_keys + ['subper'])
        .drop_duplicates()
        .assign(year=lambda frame: frame['subper'].dt.year)
    )

    # Distinct sub-periods in which each author was active, per year and repo.
    per_author = (
        active.groupby(['year'] + author_keys)['subper']
        .count()
        .reset_index(name='active_subperiods')
    )

    # Keep authors meeting the threshold, then count them per year and repo.
    mentors = per_author[per_author['active_subperiods'] >= k]
    result = (
        mentors.groupby(['year'] + repo)['name']
        .count()
        .reset_index(name='mentor_count')
    )
    return result


In [93]:
#result = num_authors(df, 'M')
#result.head()

In [98]:
#hmm = num_mentors(df, 'Y', 'M', 6)