In [9]:
#!/usr/bin/env python
# encoding: utf-8

import sys
import os
import math
import pprint
import json
import sqlite3
import time
import datetime
from os import listdir
from os.path import isfile, join

# Earlier absolute-path configuration, kept for reference (disabled):
#dbpath = '/Users/mszell/data/githubarchive/derived/'
#dbname = 'githubarchive.db'
#datapath = '/Users/mszell/data/githubarchive/raw/'

# Prefix that is concatenated verbatim with dbname when the database is
# opened (sqlite3.connect(dbpath + dbname)); '' resolves to the current
# working directory. A non-empty value must end with a path separator.
dbpath = ''
# File name of the sqlite database in which the build_* functions drop and
# recreate their tables.
dbname = 'REPO_TEST4.sqlite'
# Directory that is listed to discover the githubarchive_*.json input files.
datapath = os.getcwd()


In [10]:
def date_from_filename(fname):
    """Return the ``YYYY-MM-DD`` date embedded in a daily archive file name.

    The name is expected to look like ``githubarchive_<date>.json``; the
    fixed-length prefix and suffix are cut off and the remainder is parsed
    and re-formatted, which also normalises the zero padding.

    Raises ``ValueError`` when the remainder is not a ``%Y-%m-%d`` date.
    """
    prefix_length = len('githubarchive_')
    suffix_length = len('.json')
    date_part = fname[prefix_length:-suffix_length]
    parsed = datetime.datetime.strptime(date_part, '%Y-%m-%d')
    return parsed.strftime('%Y-%m-%d')

In [11]:
def date_from_json(creation_date):
    """Parse an ISO-like event timestamp into a naive ``datetime``.

    Only the first 19 characters (``YYYY-MM-DDTHH:MM:SS``) are interpreted;
    anything after them, such as a ``Z`` or a UTC offset, is discarded
    without adjusting the time.

    Raises ``ValueError`` when those characters do not match the format.
    """
    timestamp_text = creation_date[:19]
    return datetime.datetime.strptime(timestamp_text, '%Y-%m-%dT%H:%M:%S')

In [12]:

"""Parse all data from githubarchive (daily files) to build a new sqlite db

The database contains the following tables:
repository: (repositoryid INTEGER PRIMARY KEY, repositoryname TEXT, userid TEXT, created_at DATE, description TEXT)
push: (repositoryid INTEGER PRIMARY KEY, pushed_at DATE, userlogin TEXT)
watch: (repositoryid INTEGER PRIMARY KEY, watched_at DATE,  userid TEXT, watchers INTEGER)
memberadd: (repositoryid INTEGER PRIMARY KEY, added_at DATE, repositoryid INTEGER, userid INTEGER, byuserid INTEGER)
follow: (repositoryid INTEGER PRIMARY KEY, followed_at DATE, targetuserid TEXT, followers INTEGER, repos  byuserid TEXT)
star:()

The tables contain only data relevant to repositories that were created within the timespan that the input files cover. The exception is the follow table, which contains all follow events.

Note: This script will only work with githubarchive data starting at 2012-09-22. Reason: Before this time, the repositoryid does not exist for PushEvents.


Parameters
----------
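Set as module-level variables in the configuration cell (there are no command-line arguments):
dbpath: string, prefix prepended to dbname to locate the sqlite file
dbname: string, file name of the sqlite database
datapath: string, directory containing the githubarchive_*.json files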

Dependencies
------------
see imports below

Returns
-------
n/a

Example
-------
$ python build_all.py

See Also
--------


Created on  2016-10-27
Last update 2016-10-27
"""

In [None]:
def main():
    """Rebuild the database by running each enabled table builder in order.

    Only the ``repository`` and ``push`` tables are built; each builder
    drops and recreates its own table before parsing the input files.
    """
    build_repository()
    build_push()
    # Disabled builders: no definitions for these appear in the cells shown
    # here, so they must stay commented out until they are implemented.
    #build_watch()
    #build_memberadd()
    #build_follow()

In [13]:
def _dict_or_empty(value):
    """Return ``value`` when it is a dict, otherwise a fresh empty dict."""
    return value if isinstance(value, dict) else {}


def parse_create_event(event):
    """Extract the repository row described by a repository CreateEvent.

    Three event layouts are recognised and tried in this order:

    1. ``payload['object'] == 'repository'`` together with ``repo`` and
       ``actor`` dicts. The name comes from ``payload['name']``, the
       timestamp from the event, and the description is fixed to ``'NA'``.
    2. ``payload['ref_type'] == 'repository'`` together with a
       ``repository`` dict. Id, name and timestamp come from that dict and
       ``actor`` is returned exactly as found in the event.
    3. ``payload['ref_type'] == 'repository'`` together with a ``repo`` dict
       and an ``actor`` dict that has a ``login`` key.

    Parameters
    ----------
    event : dict
        One decoded event.

    Returns
    -------
    tuple or None
        ``(repoid, repo_name, user, created_at, description)`` where
        ``created_at`` is whatever ``date_from_json`` returns, or ``None``
        when the event matches no layout or lacks a required field.
        Missing keys no longer raise ``KeyError``/``AttributeError``; they
        yield ``None`` like every other unparseable event.

    Raises
    ------
    ValueError
        Propagated from ``date_from_json`` when a timestamp is present but
        malformed.
    """
    payload = _dict_or_empty(event.get('payload'))
    repo = _dict_or_empty(event.get('repo'))
    legacy_repo = _dict_or_empty(event.get('repository'))
    actor = event.get('actor')
    actor_fields = _dict_or_empty(actor)
    login = actor_fields.get('login')
    event_created = event.get('created_at')

    # Layout 1: creation flagged through payload['object'].
    if (payload.get('object') == 'repository' and repo.get('id')
            and payload.get('name') and login and event_created):
        return (repo['id'], payload['name'], login,
                date_from_json(event_created), 'NA')

    # Layouts 2 and 3 both need the ref_type marker and some actor value.
    if payload.get('ref_type') != 'repository' or not actor:
        return None
    description = payload.get('description')

    # Layout 2: the repository dict carries its own creation timestamp and
    # the actor is stored as found (not unpacked into a login).
    if ('id' in legacy_repo and 'name' in legacy_repo
            and legacy_repo.get('created_at')):
        return (legacy_repo['id'], legacy_repo['name'], actor,
                date_from_json(legacy_repo['created_at']), description)

    # Layout 3: repo dict plus an actor dict with a login.
    if ('id' in repo and 'name' in repo and 'login' in actor_fields
            and event_created):
        return (repo['id'], repo['name'], actor_fields['login'],
                date_from_json(event_created), description)

    return None

In [14]:
def parse(event, merging_matter):
    """Extract ``(user, repository id, timestamp)`` from one decoded event.

    Parameters
    ----------
    event : object
        A decoded event; anything that is not a dict yields ``None``.
    merging_matter : bool
        When false, the timestamp is the event's own ``created_at``. When
        true, the timestamp is taken from
        ``event['payload']['pull_request']['created_at']`` instead.

    Returns
    -------
    tuple or None
        ``(user, repository id, timestamp)`` with the timestamp produced by
        ``date_from_json``, or ``None`` when no user can be determined.

    Notes
    -----
    Events with a ``repo`` dict that has an ``id`` use the ``user`` /
    ``actor`` lookups; events that only have a ``repository`` dict return
    ``event['actor']`` unchanged as the user. Missing keys inside the
    selected branch still raise (``KeyError``/``TypeError``), exactly as
    before; callers are expected to treat that as "skip this event".
    """
    if not isinstance(event, dict):
        return None

    repo = event.get('repo')
    if repo and repo.get('id'):
        repo_id = repo['id']

        if merging_matter:
            # Pass the full timestamp: the previous 10-character slice could
            # never match the '%Y-%m-%dT%H:%M:%S' format date_from_json uses
            # and therefore always raised ValueError.
            when = date_from_json(event['payload']['pull_request']['created_at'])
            if 'user' in event:
                if event['user']['type'] == 'User':
                    return (event['user']['login'], repo_id, when)
            elif 'actor' in event:
                return (event['actor']['login'], repo_id, when)
            return None

        when = date_from_json(event.get('created_at'))
        user = event.get('user')
        actor = event.get('actor')
        if user:
            # Only accounts of type 'User' are reported.
            if user['type'] == 'User':
                return (user['login'], repo_id, when)
        elif actor:
            if isinstance(actor, dict) and 'login' in actor:
                return (actor['login'], repo_id, when)
            payload = event.get('payload')
            if payload and payload.get('actor'):
                return (payload['actor'], repo_id, when)
        return None

    if event.get('repository'):
        # This branch previously ended in a truncated, undefined name and
        # raised NameError. The event's own created_at is used, mirroring
        # the branch above.
        # NOTE(review): confirm this is the intended timestamp source.
        return (event['actor'],
                event['repository']['id'],
                date_from_json(event.get('created_at')))

    return None

In [15]:
def _iter_json_objects(text):
    """Yield every JSON value that can be decoded from ``text`` in order.

    Handles several JSON documents written back-to-back on one line. When
    decoding fails, scanning resumes at the next ``{`` so that one corrupt
    span does not hide the documents after it.
    """
    decoder = json.JSONDecoder()
    pos, end = 0, len(text)
    while pos < end:
        if text[pos].isspace():
            # raw_decode does not skip leading whitespace itself.
            pos += 1
            continue
        try:
            value, pos = decoder.raw_decode(text, pos)
        except ValueError:
            pos = text.find('{', pos + 1)
            if pos == -1:
                return
            continue
        yield value


def _is_repository_create_event(event):
    """Tell whether ``event`` is a CreateEvent that announces a repository."""
    if not isinstance(event, dict) or event.get("type") != "CreateEvent":
        return False
    payload = event.get('payload')
    if not isinstance(payload, dict):
        return False
    return (payload.get('object') == 'repository'
            or payload.get('ref_type') == 'repository')


def build_repository(file_prefix="githubarchive_2011"):
    """Recreate the ``repository`` table and fill it from the archive files.

    Reads the module-level ``dbpath``, ``dbname`` and ``datapath`` settings.
    The table is dropped and recreated, then every file in ``datapath``
    whose name starts with ``file_prefix`` and ends with ``.json`` is
    scanned in sorted order. Each repository CreateEvent that
    ``parse_create_event`` can turn into a row is written with
    ``INSERT OR REPLACE``, so a repository id seen again overwrites its
    earlier row. One commit and one progress line are issued per file.

    Parameters
    ----------
    file_prefix : str, optional
        File-name prefix selecting the input files. The default keeps the
        previous hard-coded selection.

    Notes
    -----
    * Files are opened relative to ``datapath`` (previously they were
      opened by bare name, which only worked when ``datapath`` was the
      working directory).
    * Lines are decoded as UTF-8, ignoring undecodable bytes; the previous
      ASCII decoding silently removed every non-ASCII character.
    * Events that cannot be parsed or inserted are skipped. ``Exception``
      is caught rather than using a bare ``except`` so that interrupting
      the run still works.
    """
    db_file = dbpath + dbname

    # Create table
    conn = sqlite3.connect(db_file)
    try:
        c = conn.cursor()
        c.execute('''DROP TABLE IF EXISTS repository''')
        c.execute('''CREATE TABLE IF NOT EXISTS repository
                 (repositoryid INTEGER PRIMARY KEY, repositoryname TEXT, userid TEXT, created_at DATE, description TEXT)''')
        conn.commit()
    finally:
        conn.close()

    # Parse files
    filenames = sorted(name for name in os.listdir(datapath)
                       if name.startswith(file_prefix) and name.endswith('.json'))
    query = 'INSERT OR REPLACE INTO repository VALUES (?,?,?,?,?)'
    for filename in filenames:
        conn = sqlite3.connect(db_file)
        try:
            c = conn.cursor()
            i = 0
            # Binary mode plus explicit decoding behaves the same on
            # Python 2 and Python 3.
            with open(os.path.join(datapath, filename), 'rb') as jsonfile:
                for raw_line in jsonfile:
                    text = raw_line.decode('utf-8', 'ignore').strip()
                    for event in _iter_json_objects(text):
                        if not _is_repository_create_event(event):
                            continue
                        try:
                            row = parse_create_event(event)
                            if row is None:
                                continue
                            c.execute(query, tuple(row))
                        except Exception:
                            # Best effort: a malformed event must not
                            # abort the whole import.
                            continue
                        i = i + 1
            conn.commit()
            print("Done file " + filename + ". Found and inserted " + str(i) + " valid CreationEvents.")
        finally:
            conn.close()

In [16]:
def _iter_line_events(text):
    """Yield every JSON value that can be decoded from ``text`` in order.

    Copes with several JSON documents concatenated on one line; after a
    decoding error the scan resumes at the next ``{``.
    """
    decoder = json.JSONDecoder()
    pos, end = 0, len(text)
    while pos < end:
        if text[pos].isspace():
            # raw_decode does not skip leading whitespace itself.
            pos += 1
            continue
        try:
            value, pos = decoder.raw_decode(text, pos)
        except ValueError:
            pos = text.find('{', pos + 1)
            if pos == -1:
                return
            continue
        yield value


def build_push(file_prefix="githubarchive_2011"):
    """Recreate the ``push`` table and fill it from the archive files.

    Reads the module-level ``dbpath``, ``dbname`` and ``datapath`` settings.
    The table is dropped and recreated, then every file in ``datapath``
    whose name starts with ``file_prefix`` and ends with ``.json`` is
    scanned in sorted order. For each PushEvent, ``parse(event, False)``
    supplies ``(user, repository id, timestamp)`` and the row is written
    with ``INSERT OR REPLACE``. One commit and one progress line are issued
    per file.

    Parameters
    ----------
    file_prefix : str, optional
        File-name prefix selecting the input files. The default keeps the
        previous hard-coded selection.

    Notes
    -----
    * NOTE(review): ``repositoryid`` is the table's PRIMARY KEY, so together
      with ``INSERT OR REPLACE`` only the most recently processed push per
      repository survives, while the printed counter counts every insert.
      The schema is left unchanged here; confirm this is intended.
    * Files are opened relative to ``datapath`` and decoded as UTF-8 (the
      previous ASCII decoding dropped all non-ASCII characters).
    * Events that cannot be parsed or inserted are skipped. ``Exception``
      is caught rather than using a bare ``except`` so that interrupting
      the run still works.
    """
    db_file = dbpath + dbname

    # Create table
    conn = sqlite3.connect(db_file)
    try:
        c = conn.cursor()
        c.execute('''DROP TABLE IF EXISTS push''')
        c.execute('''CREATE TABLE IF NOT EXISTS push
                 (repositoryid INTEGER PRIMARY KEY, pushed_at DATE, userlogin TEXT)''')
        conn.commit()
    finally:
        conn.close()

    # Parse files
    filenames = sorted(name for name in os.listdir(datapath)
                       if name.startswith(file_prefix) and name.endswith('.json'))
    query = 'INSERT OR REPLACE INTO push VALUES (?,?,?)'
    for filename in filenames:
        conn = sqlite3.connect(db_file)
        try:
            c = conn.cursor()
            i = 0
            # Binary mode plus explicit decoding behaves the same on
            # Python 2 and Python 3.
            with open(os.path.join(datapath, filename), 'rb') as jsonfile:
                for raw_line in jsonfile:
                    text = raw_line.decode('utf-8', 'ignore').strip()
                    for event in _iter_line_events(text):
                        if not isinstance(event, dict) or event.get("type") != "PushEvent":
                            continue
                        try:
                            # parse() returns None when the repository id or
                            # user is missing; unpacking then raises and the
                            # event is skipped.
                            user, project, date = parse(event, False)
                            c.execute(query, (project, date, user))
                        except Exception:
                            continue
                        i = i + 1
            conn.commit()
            print("Done file " + filename + ". Found and inserted " + str(i) + " valid PushEvents.")
        finally:
            conn.close()

In [17]:
# Entry point: run the full build when the file is executed as a script.
# The recorded output of this cell shows that the guard also passes when
# the cell is run in the notebook, so executing it rebuilds the tables.
if __name__ == '__main__':
    main()

Done file githubarchive_2011-02-12.json. Found and inserted 1204 valid CreationEvents.
Done file githubarchive_2011-02-13.json. Found and inserted 1346 valid CreationEvents.
Done file githubarchive_2011-02-14.json. Found and inserted 1565 valid CreationEvents.
Done file githubarchive_2011-03-14.json. Found and inserted 1656 valid CreationEvents.
Done file githubarchive_2011-03-15.json. Found and inserted 1639 valid CreationEvents.
Done file githubarchive_2011-03-28.json. Found and inserted 1799 valid CreationEvents.
Done file githubarchive_2011-03-30.json. Found and inserted 1833 valid CreationEvents.
Done file githubarchive_2011-04-06.json. Found and inserted 1857 valid CreationEvents.
Done file githubarchive_2011-04-08.json. Found and inserted 1766 valid CreationEvents.
Done file githubarchive_2011-02-12.json. Found and inserted 15378 valid PushEvents.
Done file githubarchive_2011-02-13.json. Found and inserted 16889 valid PushEvents.
Done file githubarchive_2011-02-14.json. Found an

In [None]:
#TEST PARSER
# Smoke test for parse(): print user, repository id and timestamp for every
# PushEvent of one day's file. The file is looked up in datapath, and lines
# are decoded as UTF-8 so that non-ASCII characters are not dropped.
# Parsing errors are deliberately NOT caught here so that they stay visible.
with open(os.path.join(datapath, 'githubarchive_2011-03-14.json'), 'rb') as jsonfile:
    for raw_line in jsonfile:
        jsonline = json.loads(raw_line.decode('utf-8', 'ignore').strip())
        if jsonline.get("type") == "PushEvent":
            parsed = parse(jsonline, False)
            if parsed is None:
                # parse() returns None for events without a usable user;
                # unpacking None would raise TypeError.
                continue
            user, project, date = parsed
            print("%s %s %s" % (user, project, date))