In [1]:
import glob
import os
import json
import pandas as pd

from datetime import datetime
from tqdm.notebook import tqdm
from typing import *

In [2]:
def extract_property_into_frame(data: Dict, prop_name: str) -> Optional[pd.DataFrame]:
    """Flatten the list of records stored under ``prop_name`` into a DataFrame.

    Args:
        data: Mapping that holds a list of records under ``prop_name``.
        prop_name: Key whose records are extracted; it is also written to the
            ``type`` column of every returned row.

    Returns:
        A frame in which ``name`` is renamed to ``value``, a ``type`` column is
        added, and the columns ``digital``, ``decimal``, ``text``, ``hours``,
        ``minutes``, ``seconds`` and ``percent`` are removed when present.
        ``None`` is returned when ``prop_name`` is missing or has no records.
    """
    # json_normalize raises KeyError for a missing record path, so a mapping
    # without (or with an empty) ``prop_name`` is answered up front.
    if isinstance(data, dict) and not data.get(prop_name):
        return None

    frame = pd.json_normalize(data, prop_name)
    if len(frame) == 0:
        return None

    frame['type'] = prop_name
    # errors='ignore' tolerates records that lack some of the listed columns
    # instead of failing the whole extraction with a KeyError.
    return (
        frame
        .rename(columns={'name': 'value'})
        .drop(
            columns=['digital', 'decimal', 'text', 'hours', 'minutes', 'seconds', 'percent'],
            errors='ignore',
        )
    )

In [3]:
def read_jsons_into_df(directory: str) -> pd.DataFrame:
    """Collect language, editor and project records from JSON files.

    Every ``*.json`` file below ``directory`` (searched recursively) is loaded.
    The top-level ``end`` timestamp supplies the date, and the first entry of
    the top-level ``data`` list supplies the ``languages``, ``editors`` and
    ``projects`` records.

    Args:
        directory: Root folder to search for JSON files.

    Returns:
        One row per record with the columns produced by
        ``extract_property_into_frame`` plus ``date``. A ``color`` column is
        removed when present. When no file yields any record, an empty frame
        with the columns ``value``, ``total_seconds``, ``type`` and ``date``
        is returned.
    """
    frames = []

    # Sorted so the row order does not depend on filesystem enumeration order.
    json_files = sorted(glob.glob(os.path.join(directory, '**/*.json'), recursive=True))
    for file_path in tqdm(json_files):
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(file_path, 'r', encoding='utf-8') as f:
            payload = json.load(f)

        try:
            date = datetime.strptime(payload.get('end'), "%Y-%m-%dT%H:%M:%SZ").date()
        except (TypeError, ValueError):
            # Missing or malformed timestamp: keep the rows, but with an
            # explicit missing date rather than the raw unparsed value.
            print(f'Failure in {file_path}')
            date = None

        # `or [{}]` also covers an empty or null ``data`` list, not only a
        # missing key.
        day = (payload.get('data') or [{}])[0]
        for prop_name in ('languages', 'editors', 'projects'):
            frame = extract_property_into_frame(day, prop_name)
            if frame is not None:
                frame['date'] = date
                frames.append(frame)

    if not frames:
        # pd.concat raises ValueError on an empty list.
        return pd.DataFrame(columns=['value', 'total_seconds', 'type', 'date'])

    return pd.concat(frames, ignore_index=True).drop(columns=['color'], errors='ignore')

# Load every JSON file below the local `history` folder into one frame;
# the bare expression on the last line renders it as the cell output.
df = read_jsons_into_df(directory='history')
df

  0%|          | 0/2468 [00:00<?, ?it/s]

Unnamed: 0,value,total_seconds,type,date
0,Rust,9091.801,languages,2021-09-01
1,Markdown,476.291,languages,2021-09-01
2,Other,400.559,languages,2021-09-01
3,C#,340.222,languages,2021-09-01
4,TOML,253.322,languages,2021-09-01
...,...,...,...,...
18860,JSON,0.002,languages,2022-12-31
18861,Rider,11755.267,editors,2022-12-31
18862,CLion,10.808,editors,2022-12-31
18863,Nyris.ImageRecognition,11755.267,projects,2022-12-31


In [4]:
# Complete summary: language, editor and project rows.
df.to_csv('summarized.csv')

# Export containing the language rows only (editor and project rows left out).
df.loc[df['type'].eq('languages')].to_csv('redacted.csv')