In [1]:
import numpy as np
import pandas as pd
import os
import glob
from pathlib import Path

In [2]:
def flatten(listOflists):
    '''
    Concatenates the items of every inner iterable into a single flat list.
    Only one level of nesting is removed; item order is preserved.
    '''
    flat = []
    for inner in listOflists:
        flat.extend(inner)
    return flat

In [3]:
def get_summaryLocation():
    '''
    Uses glob to retrieve function-summary file locations.

    Searches <current working directory>/hubfiles/<dir>/<dir>/ and returns the
    matched paths (as strings) that contain the text 'function-summary'.
    Returns an empty list when nothing matches.

    Caveat: Assumes that the hubfiles directory is in the same working directory as the python kernel is running in.
    Workaround in pipeline: use a shell/bash cd command to point abs path when using an orchestrator.
    '''
    # os.path.join yields the same pattern as the former hard-coded '\\'
    # separators on Windows, and a usable one on every other platform.
    # glob is called without recursive=True, so each '**' matches exactly one
    # directory level, just like '*'.
    # NOTE(review): '[?.json]' is a character class (the name must END in one of
    # the characters ? . j s o n); '*.json' was presumably intended -- confirm
    # before tightening the pattern.
    pattern = os.path.join(os.getcwd(), 'hubfiles', '**', '**', '*[?.json]')
    # Filter once, after globbing; the result is always bound, even when the
    # glob finds no files at all.
    return [filepath for filepath in glob.glob(pattern)
            if 'function-summary' in filepath]

In [4]:
def preprocess(func, user_index=10, week_index=9):
    '''
    Builds a dataframe with username, submission week (string), and
    function_summaries from a collection of summary-file paths.

    func: iterable of paths to JSON files (e.g. the list returned by
          get_summaryLocation()).
    user_index: position, within the path components, of the directory that
          holds the username. Negative values count from the end of the path.
    week_index: position, within the path components, of the directory that
          holds the submission week. Negative values count from the end.

    The defaults (10 and 9) are absolute positions, so they only fit one
    particular depth of the working directory; pass other indices (for example
    negative ones) when the files live at a different depth.

    Returns a dataframe with one row per path and the columns 'username',
    'submission_week' and 'function_summaries'; the latter holds a flat list of
    every value pandas parsed from the JSON file.

    Raises IndexError when a path has fewer components than an index requires.
    '''
    # Materialise once so that any iterable can be walked twice.
    paths = list(func)

    userlist = []
    submission_list = []
    for path in paths:
        brokenPath = Path(path).parts
        userlist.append(brokenPath[user_index])
        submission_list.append(brokenPath[week_index])

    # Files are read only after every path has been split successfully.
    file_values = []
    for path in paths:
        summaries = pd.read_json(path).values
        file_values.append(flatten(summaries))

    tdf = pd.DataFrame({'username': userlist,
                        'submission_week': submission_list})
    tdf['function_summaries'] = file_values
    return tdf

In [5]:
def map_weeks(dataframe):
    '''
    Converts the "submission_week" column of a dataframe to week numbers.

    Each distinct value is split on '-' and its second piece is parsed as an
    integer. Returns a tuple of (the label -> number dictionary, handy for
    debugging, and a new dataframe with the labels replaced by their numbers).
    '''
    labels = dataframe['submission_week'].unique()
    week_mapping = {label: int(label.split('-')[1]) for label in labels}
    return week_mapping, dataframe.replace({'submission_week': week_mapping})

In [8]:
def write_ToCSV(dataframe, filename='function_summaries.csv'):
    '''
    Simple function to write out a dataframe as a CSV file.

    dataframe: the dataframe to write; its index is not written.
    filename: destination path. Defaults to 'function_summaries.csv', which is
              resolved against the current working directory.
    '''
    dataframe.to_csv(filename, index=False)

In [9]:
# Pipeline driver: locate the summary files, load them into a dataframe,
# convert the submission-week labels to integers and write the result to CSV.
data = preprocess(get_summaryLocation())
# The week mapping is only useful for debugging, so it is discarded here.
_ , df = map_weeks(data)
write_ToCSV(df)