In [None]:
import pandas as pd
import json
import zipfile
import gzip
import glob
import os

import logging

In [None]:
# Peek at the first entries of the download directory (shell command, not Python).
!ls /disk/download/data/ | head

In [None]:
path = '/disk/download/data/2011-02-12-0.json.gz'

# Pull the whole archive into memory and decode it (default codec of bytes.decode()).
with gzip.open(path, 'rb') as f:
    file_content = f.read().decode()

# The dump holds one JSON document per line; empty lines are dropped.
rows = [json.loads(line) for line in file_content.split('\n') if line]

In [None]:
# Display the first decoded record.
rows[0]

In [None]:
# List the top-level keys of the first decoded record.
rows[0].keys()

In [None]:
!pip3 install treelib

In [None]:
import treelib

In [None]:
# build tree based on keys/attributes


In [None]:
# Scratch check of the treelib API: create an empty tree ...
tree = treelib.Tree()

# ... and add a single node, passing "test" and 0 positionally
# (presumably tag and identifier — the keyword call in dict_to_tree uses those names).
n = tree.create_node("test", 0)

In [None]:
# Display the identifier of the scratch node `n`.
n.identifier

In [None]:

def detect_type(value):
    """Return a short type label for a JSON-decoded value.

    Labels:
        * ``'None'``  for ``None``
        * ``'dict'``  for dictionaries
        * ``'List[]'`` for an empty list
        * ``'List[<label>]'`` for a list whose elements all share one label
          (labels are computed recursively, e.g. ``'List[List[int]]'``)
        * ``'List[Any]'`` for a list with mixed element labels
        * ``'str'``, ``'bool'``, ``'int'``, ``'float'`` for scalars
        * ``''`` (empty string) for any other type

    Only ``isinstance`` checks are used, so the value's own ``__eq__`` is
    never invoked.
    """
    if value is None:
        return 'None'
    if isinstance(value, dict):
        return 'dict'
    if isinstance(value, list):
        if not value:
            return 'List[]'
        # A homogeneous list is labelled with its element type, anything else is List[Any].
        el_types = {detect_type(el) for el in value}
        if len(el_types) == 1:
            return 'List[{}]'.format(next(iter(el_types)))
        return 'List[Any]'
    if isinstance(value, str):
        return 'str'
    # bool must be tested before int: bool is a subclass of int.
    if isinstance(value, bool):
        return 'bool'
    if isinstance(value, int):
        return 'int'
    if isinstance(value, float):
        return 'float'

    # Unknown / unsupported type.
    return ''

class TreeData:
    """Payload stored on each tree node: an attribute name and its type label.

    Instances compare by value: two payloads are equal when both ``name`` and
    ``type`` match. This lets ``node_a.data != node_b.data`` be used to compare
    nodes that belong to different trees. ``count`` does not take part in the
    comparison.
    """

    def __init__(self, name, type=None):
        """Store the attribute ``name`` and its ``type`` label (``None`` if unknown)."""
        self.name = name
        self.type = type
        # Counter initialised to 1; nothing in this class updates it.
        self.count = 1

    @property
    def desc(self):
        """Human-readable label of the form ``'<name>: <type>'``."""
        return '{}: {}'.format(self.name, self.type)

    def toJSON(self):
        """Return a JSON *string* ``{"type": <type>}``; the name is not included."""
        return json.dumps({'type': self.type}, sort_keys=True)

    def __eq__(self, other):
        """Value equality on ``(name, type)``; other types are left to Python's fallback."""
        if not isinstance(other, TreeData):
            return NotImplemented
        return (self.name, self.type) == (other.name, other.type)

    def __hash__(self):
        """Hash consistent with ``__eq__`` so instances stay usable in sets and as dict keys."""
        return hash((self.name, self.type))
    
def dict_to_tree(d):
    """Build a treelib tree mirroring the key structure of the dictionary ``d``.

    The root node has tag ``"root"`` and identifier ``0``. Every key becomes a
    child node tagged with the key and carrying a ``TreeData(key, type-label)``
    payload. Nested dictionaries are expanded recursively; for a list value
    only its first element is inspected, and only when that element is a
    dictionary.
    """
    schema_tree = treelib.Tree()
    root = schema_tree.create_node(tag="root", identifier=0, data=TreeData("root", 'dict'))

    def recursive_helper(tree, parent, d):
        """Attach one child per key of ``d`` below ``parent`` and descend into nested values."""
        for key, value in d.items():
            payload = TreeData(key, detect_type(value))
            child = tree.create_node(key, parent=parent.identifier, data=payload)

            if isinstance(value, dict):
                recursive_helper(tree, child, value)
                continue

            # Lists are represented by their first element, if that is a dictionary.
            if isinstance(value, list) and len(value) > 0 and isinstance(value[0], dict):
                recursive_helper(tree, child, value[0])

    recursive_helper(schema_tree, root, d)
    return schema_tree

class ComplexEncoder(json.JSONEncoder):
    """JSON encoder that also accepts ``TreeData`` payloads.

    A ``TreeData`` object is replaced by the result of its ``toJSON()``
    method; every other unsupported object is handed to the base class.
    """

    def default(self, z):
        """Fallback hook called by ``json`` for objects it cannot serialise itself."""
        if not isinstance(z, TreeData):
            return super().default(z)
        return z.toJSON()

In [None]:
# Build the structure tree of the first record ...
t1 = dict_to_tree(rows[0])
# ... and serialise it to a JSON string with sorted keys; TreeData payloads go through ComplexEncoder.
key = json.dumps(t1.to_dict(with_data=True), cls=ComplexEncoder, sort_keys=True)

In [None]:
# Exploration aid: print the documentation of the tree's to_json method.
help(t1.to_json)

In [None]:
# Render the structure tree of the first record, labelling nodes with TreeData.desc ("name: type").
dict_to_tree(rows[0]).show(data_property="desc")

In [None]:
# Same rendering for the second record, for comparison.
dict_to_tree(rows[1]).show(data_property="desc")

In [None]:
# Keep the structure tree of the first record in `t` for the experiments that follow.
t = dict_to_tree(rows[0])

In [None]:
# Experiment: use the tree object itself as the dictionary key.
# NOTE(review): unless treelib.Tree implements value-based hashing, this keys on
# object identity, so two structurally equal trees would get separate entries — confirm.
counts = {}

counts[t] = 1

In [None]:
# count different trees and how often they occur!

In [None]:
def compare_trees(t1, t2):
    """Return True when both trees have the same shape and pairwise equal node payloads.

    Starting at the two roots, nodes are compared position by position:
    the ``data`` attributes are checked with ``!=`` (so the payload type needs
    value-based equality for separately built trees to match), the number of
    children must agree, and children are paired in the order returned by
    ``Tree.children``.
    """

    def nodes_match(left_tree, right_tree, left, right):
        """Compare two nodes and, recursively, their subtrees."""
        # Adapt this check if a different notion of node equality is needed.
        if left.data != right.data:
            return False

        left_kids = left_tree.children(left.identifier)
        right_kids = right_tree.children(right.identifier)
        if len(left_kids) != len(right_kids):
            return False

        # Every pair of children (in order) has to match as well.
        for left_kid, right_kid in zip(left_kids, right_kids):
            if not nodes_match(left_tree, right_tree, left_kid, right_kid):
                return False
        return True

    root1 = t1.get_node(t1.root)
    root2 = t2.get_node(t2.root)
    return nodes_match(t1, t2, root1, root2)

In [None]:
from tqdm import tqdm

In [None]:
# Maps the canonical JSON form of a structure tree to (first tree seen, number of occurrences).
counts = {}

for row in tqdm(rows):
    t = dict_to_tree(row)

    # Sorted-key JSON dump of the tree serves as the grouping key.
    key = json.dumps(t.to_dict(with_data=True), cls=ComplexEncoder, sort_keys=True)

    if key in counts:
        # Known structure: keep the stored tree and bump its counter.
        t, c = counts[key]
        counts[key] = (t, c + 1)
    else:
        counts[key] = (t, 1)

In [None]:
# (key, (tree, count)) pairs ordered from most to least frequent structure.
res = sorted(counts.items(), key=lambda item: item[1][1], reverse=True)

In [None]:
# Structure trees of the first two records, used by the rendering experiments below.
t1 = dict_to_tree(rows[0])
t2 = dict_to_tree(rows[1])

In [None]:
# Try to capture the rendered tree.
# NOTE(review): whether show() returns the text or only prints it (returning None)
# depends on the treelib version — confirm before relying on `s`.
s = t1.show(data_property="desc")

In [None]:
# Write the rendered tree (nodes labelled via TreeData.desc) to test.txt in the working directory.
# NOTE(review): save2file presumably appends when the file already exists — confirm.
t1.save2file('test.txt', data_property="desc")

In [None]:
import io

In [None]:
import tempfile

def tree_to_string(tree):
    """Render ``tree`` as text by round-tripping through a temporary file.

    ``tree.save2file(..., data_property="desc")`` writes the rendering to a
    uniquely named temporary file, whose content is read back and returned.
    The temporary file is removed afterwards, also when rendering or reading
    fails.

    Args:
        tree: object offering ``save2file(filename, data_property=...)``
            (a ``treelib.Tree`` in this notebook).

    Returns:
        str: the rendered tree.
    """
    # Reserve a unique file name; the handle is closed right away so it does not leak.
    with tempfile.NamedTemporaryFile(delete=False) as tf:
        temp_name = tf.name

    try:
        # Start from a non-existing file so the output holds only this rendering
        # (save2file presumably appends to existing files — TODO confirm).
        if os.path.exists(temp_name):
            os.remove(temp_name)
        tree.save2file(temp_name, data_property="desc")
        # The rendering contains box-drawing characters; read it as UTF-8 instead of
        # the locale default (assumes treelib writes UTF-8 — TODO confirm).
        with open(temp_name, 'r', encoding='utf-8') as fp:
            return fp.read()
    finally:
        if os.path.exists(temp_name):
            os.remove(temp_name)

In [None]:
import tempfile

In [None]:
# Scratch check of the tempfile API: create a named temporary file and show its path.
# NOTE: delete=False and no close()/remove in this cell, so the file stays on disk
# and the handle stays open until `tf` is closed or garbage-collected.
tf = tempfile.NamedTemporaryFile(delete=False)

tf.name

In [None]:
# Render the first record's structure tree to a string.
s_tree = tree_to_string(t1)

In [None]:
# Display the rendered tree string (shown as a repr, i.e. with escaped newlines).
s_tree

In [None]:
# Display the full structure -> (tree, count) mapping; the output can be large.
counts

In [None]:
# the big transform function to analyze a single json.gz file

def analyze_github_file(path, output_name, max_examples=5):
    """Profile the record structures found in one gzip-compressed JSON-lines file.

    Every non-blank line of ``path`` is parsed as a JSON object and converted
    into a structure tree. Records are grouped by the sorted-key JSON dump of
    that tree; for each group the number of occurrences and up to
    ``max_examples`` sample records are kept.

    The result is written to ``output_name`` as JSON lines, one line per
    distinct structure, ordered by descending count. Each line has the keys
    ``path``, ``count``, ``tree`` (the grouping key), ``formatted_tree`` (text
    rendering from ``tree_to_string``) and ``examples``.

    Args:
        path: path of the ``.json.gz`` input file.
        output_name: path of the report file to (over)write.
        max_examples: maximum number of sample records stored per structure;
            ``0`` stores none.

    Returns:
        None. The report file is the only result.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    logging.info('Analyzing {}'.format(path))

    logging.info('Loading data')

    with gzip.open(path, 'rb') as f:
        file_content = f.read().decode()
    # Blank and whitespace-only lines carry no record and would break json.loads.
    rows = [json.loads(line) for line in file_content.split('\n') if line.strip()]

    logging.info('Found {} rows'.format(len(rows)))
    logging.info('Counting how often each structure exists')

    # key -> [first tree seen, occurrence count, example rows]
    counts = {}
    for row in tqdm(rows):
        t = dict_to_tree(row)

        key = json.dumps(t.to_dict(with_data=True), cls=ComplexEncoder, sort_keys=True)

        entry = counts.get(key)
        if entry is None:
            entry = counts[key] = [t, 0, []]
        entry[1] += 1
        # Honour max_examples for the first record as well (max_examples=0 keeps none).
        if len(entry[2]) < max_examples:
            entry[2].append(row)
    logging.info('Found {} different trees'.format(len(counts)))
    logging.info('Converting to JSON')

    ret = [{'path': path,
            'count': count,
            'tree': key,
            'formatted_tree': tree_to_string(tree),
            'examples': examples}
           for key, (tree, count, examples) in counts.items()]
    logging.info("Writing result to {}".format(output_name))
    # Most frequent structures first.
    ret = sorted(ret, key=lambda x: -x['count'])
    with open(output_name, 'w') as fp:
        for r in ret:
            fp.write(json.dumps(r) + '\n')

    logging.info('Done.')

In [None]:
# Analyse the sample file; the report goes to "<input file name>.stats.json" in the working directory.
analyze_github_file(path, os.path.basename(path) + ".stats.json")

In [None]:
# Check how a stored tree rendering looks when printed (the \u25xx escapes are box-drawing characters).
# NOTE(review): the literal ends with a stray double quote after the final newline,
# which is printed as well — looks like a copy/paste leftover.
s = 'root: dict\n\u251c\u2500\u2500 actor: dict\n\u2502   \u251c\u2500\u2500 avatar_url: str\n\u2502   \u251c\u2500\u2500 gravatar_id: str\n\u2502   \u251c\u2500\u2500 id: int\n\u2502   \u251c\u2500\u2500 login: str\n\u2502   \u2514\u2500\u2500 url: str\n\u251c\u2500\u2500 created_at: str\n\u251c\u2500\u2500 id: str\n\u251c\u2500\u2500 payload: dict\n\u2502   \u251c\u2500\u2500 action: str\n\u2502   \u251c\u2500\u2500 actor: str\n\u2502   \u251c\u2500\u2500 actor_gravatar: str\n\u2502   \u2514\u2500\u2500 repo: str\n\u251c\u2500\u2500 public: bool\n\u251c\u2500\u2500 repo: dict\n\u2502   \u251c\u2500\u2500 id: int\n\u2502   \u251c\u2500\u2500 name: str\n\u2502   \u2514\u2500\u2500 url: str\n\u2514\u2500\u2500 type: str\n"'
print(s)

In [None]:
# All *.json.gz files of the download directory, sorted by path.
paths = sorted(glob.glob('/disk/download/data/*.json.gz'))

In [None]:
# Run analyze_github_file over many input files in parallel with joblib.
# Each input gets its own report "<output_dir>/<input file name>.stats.json".
output_dir = 'github_stats'
os.makedirs(output_dir, exist_ok=True)

from joblib import Parallel, delayed
# NOTE(review): Memory is imported but not used in this cell.
from joblib import Memory
import time

start = time.time()
# 32 worker jobs; verbose=10 asks joblib for progress messages.
# NOTE(review): only paths[9500:] are processed — presumably resuming an earlier
# partial run; confirm the offset before re-running on a different data set.
# analyze_github_file returns None, so `results` is a list of None values.
results = Parallel(n_jobs=32, verbose=10)(
    delayed(analyze_github_file)(path, os.path.join(output_dir, os.path.basename(path) + ".stats.json")) for path in paths[9500:])
stop = time.time()

print('Elapsed time for the entire processing: {:.2f} s'
      .format(stop - start))

In [192]:
# Switch the root logger to DEBUG level and emit a test message to verify it shows up.
import logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logging.debug("test")

DEBUG:root:test
