# Notebook to convert a raw git revision log into a CSV-formatted event table

In [None]:
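# Command used to produce the raw log: one tab-separated header line per commit (author timestamp, author date, hash, author name, author email, subject) followed by its --stat lines.
# git log --date=iso --pretty=format:"%at%x09%ad%x09%H%x09%an%x09%ae%x09%s" --stat --no-merges > raw.revlog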

In [1]:
# Location of the raw `git log --stat` dump that the following cells read and parse.
raw_revlog_filename = '../raw.revlog'

In [2]:
# Split the raw log into one list of stripped lines per commit. A blank line
# in the file marks the boundary between two commits.
grouped_lines = []

with open(raw_revlog_filename) as f:
    group = []
    for line in f:
        if line == '\n':
            # Close the current commit. Empty groups (e.g. from consecutive
            # blank lines) are skipped because they carry no header line.
            if group:
                grouped_lines.append(group)
            group = []
        else:
            group.append(line.strip())
    # The file may end without a trailing blank line; flush the pending group
    # so the last commit in the log is not silently dropped.
    if group:
        grouped_lines.append(group)

In [3]:
# Sanity check: how many commit groups were read from the raw log.
print(len(grouped_lines))

608796


In [4]:
import re

def file_stats_from(description):
    """Parse one per-file line of ``git log --stat`` output.

    The line is expected to look like ``<path> | <total> <graph>``, where
    ``<total>`` is the number of changed lines and ``<graph>`` is the run of
    ``+`` / ``-`` characters printed after it. The total is split between
    additions and deletions in proportion to the share of ``+`` and ``-``
    characters in the graph. Each share is rounded down, so the two numbers
    are estimates and may add up to less than ``<total>``.

    Args:
        description: a single stat line (surrounding whitespace is tolerated).

    Returns:
        ``[filename, n_add, n_del]`` where the filename is stripped of
        surrounding whitespace and both counts are decimal strings.

    Raises:
        ValueError: if the line does not have the ``<path> | <total> <graph>``
            shape.
    """
    match = re.search(r'(.*) +\| +(\d+) +(.*)', description)
    if match is None:
        raise ValueError('not a --stat file line: {!r}'.format(description))

    filename = match.group(1).strip()
    n_lines_changed = int(match.group(2))
    changes = match.group(3)

    graph_width = len(changes)
    if graph_width == 0:
        # Nothing to apportion (and nothing to divide by).
        return [filename, '0', '0']

    # Integer floor division gives the same result as truncating the float
    # quotient for these non-negative values.
    n_add = n_lines_changed * changes.count('+') // graph_width
    n_del = n_lines_changed * changes.count('-') // graph_width
    return [filename, str(n_add), str(n_del)]


In [9]:
import re

# Column accumulators: every list receives one entry per (commit, changed file)
# pair; they are combined into a DataFrame further down in the notebook.
timestamps = []
commit_dates = []
hashes = []
names = []
emails = []
subjs = []
filenames = []
n_adds = []
n_dels = []

for i, group in enumerate(grouped_lines):
    if i % 50000 == 0:
        print('{}% done'.format(100.0*i/len(grouped_lines)))
    if not group:
        # No header line to parse for this group.
        continue

    # Header fields: author timestamp, ISO date, hash, author name, author
    # email, subject. Capping the split at five tabs keeps a subject that
    # itself contains tabs in one piece instead of cutting it at the first tab.
    tsv, remaining = group[0].split("\t", 5), group[1:]
    ts_author, commit_date_iso, commit_hash = tsv[0], tsv[1], tsv[2]
    author_name, author_email = tsv[3], tsv[4]
    # empty subjects: the header line was stripped when the groups were built,
    # so a missing subject leaves only five fields.
    subj = tsv[5] if len(tsv) > 5 else "empty"

    # replace separator values from subject and drop bytes that are not valid UTF-8
    subj = subj.replace(",", " ").decode('utf-8','ignore').encode("utf-8")

    for stat_line in remaining:
        # no renames, no binary changes: keep only "<path> | ..." lines that
        # contain neither '=>' nor '->' ...
        if '|' not in stat_line or '=>' in stat_line or '->' in stat_line:
            continue
        # ... and whose part after the first '|' shows at least one '+' or '-'.
        after_pipe = ''.join(stat_line.split('|')[1:])
        if '+' not in after_pipe and '-' not in after_pipe:
            continue

        filename, n_add, n_del = file_stats_from(stat_line)

        timestamps.append(int(ts_author))
        commit_dates.append(commit_date_iso)
        hashes.append(commit_hash)
        names.append(author_name)
        emails.append(author_email)
        subjs.append(subj)
        filenames.append(filename)
        n_adds.append(int(n_add))
        n_dels.append(int(n_del))

print(len(timestamps))

0.0% done
8.21293175382% done
16.4258635076% done
24.6387952615% done
32.8517270153% done
41.0646587691% done
49.2775905229% done
57.4905222768% done
65.7034540306% done
73.9163857844% done
82.1293175382% done
90.342249292% done
98.5551810459% done
1429544


In [10]:
import pandas as pd
import dateutil.parser
def tz_from(date_str):
    """Return the UTC offset of ``date_str`` in whole hours.

    The string is parsed with ``dateutil.parser.parse``; the offset of the
    resulting datetime is converted to hours and truncated toward zero by
    ``int`` (so a half-hour offset loses its fractional part).
    """
    parsed = dateutil.parser.parse(date_str)
    offset_seconds = parsed.utcoffset().total_seconds()
    return int(offset_seconds / 3600.0)

# Compute the UTC offset column first, then assemble the event table from the
# parallel column lists (one row per commit/file pair).
utc_offset_hours = [tz_from(date_str) for date_str in commit_dates]

df = pd.DataFrame(dict(
    author_timestamp=timestamps,
    commit_utc_offset_hours=utc_offset_hours,
    commit_hash=hashes,
    author_name=names,
    author_email=emails,
    subject=subjs,
    filename=filenames,
    n_additions=n_adds,
    n_deletions=n_dels,
))

# Preview a handful of columns from the first rows.
preview_columns = ['commit_hash', 'filename', 'n_additions', 'n_deletions', 'subject']
df[preview_columns].head()

Unnamed: 0,commit_hash,filename,n_additions,n_deletions,subject
0,f201ebd87652cf1519792f8662bb3f862c76aa33,mm/z3fold.c,7,3,mm/z3fold.c: limit first_num to the actual ran...
1,083fb8edda0487d192e8c117f625563b920cf7a4,include/linux/pagemap.h,0,1,mm: fix <linux/pagemap.h> stray kernel-doc not...
2,c87d1655c29500b459fb135258a93f8309ada9c7,Documentation/ABI/obsolete/sysfs-block-zram,0,119,zram: remove obsolete sysfs attrs
3,c87d1655c29500b459fb135258a93f8309ada9c7,Documentation/ABI/testing/sysfs-block-zram,8,92,zram: remove obsolete sysfs attrs
4,c87d1655c29500b459fb135258a93f8309ada9c7,Documentation/blockdev/zram.txt,34,39,zram: remove obsolete sysfs attrs


In [11]:
# Summarise the event table: column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1429544 entries, 0 to 1429543
Data columns (total 9 columns):
author_email               1429544 non-null object
author_name                1429544 non-null object
author_timestamp           1429544 non-null int64
commit_hash                1429544 non-null object
commit_utc_offset_hours    1429544 non-null int64
filename                   1429544 non-null object
n_additions                1429544 non-null int64
n_deletions                1429544 non-null int64
subject                    1429544 non-null object
dtypes: int64(4), object(5)
memory usage: 98.2+ MB


In [12]:
# unique identifiers for our author mapping: an author is identified by the
# combination "<name> <email>". The key is built with a vectorised string
# concatenation rather than a row-wise apply, which yields the same strings
# without a Python-level call per row.
author_keys = df['author_name'] + ' ' + df['author_email']
unique_author_ids = author_keys.unique()
# Ids are assigned in order of first appearance in the table.
translation = {author_id: index for index, author_id in enumerate(unique_author_ids)}
df['author_id'] = author_keys.map(translation)
# The name and email columns are replaced by the numeric author_id.
df.drop(['author_email', 'author_name'], axis=1, inplace=True)
df.head()


Unnamed: 0,author_timestamp,commit_hash,commit_utc_offset_hours,filename,n_additions,n_deletions,subject,author_id
0,1487807211,f201ebd87652cf1519792f8662bb3f862c76aa33,-8,mm/z3fold.c,7,3,mm/z3fold.c: limit first_num to the actual ran...,0
1,1487807208,083fb8edda0487d192e8c117f625563b920cf7a4,-8,include/linux/pagemap.h,0,1,mm: fix <linux/pagemap.h> stray kernel-doc not...,1
2,1487807205,c87d1655c29500b459fb135258a93f8309ada9c7,-8,Documentation/ABI/obsolete/sysfs-block-zram,0,119,zram: remove obsolete sysfs attrs,2
3,1487807205,c87d1655c29500b459fb135258a93f8309ada9c7,-8,Documentation/ABI/testing/sysfs-block-zram,8,92,zram: remove obsolete sysfs attrs,2
4,1487807205,c87d1655c29500b459fb135258a93f8309ada9c7,-8,Documentation/blockdev/zram.txt,34,39,zram: remove obsolete sysfs attrs,2


In [13]:
# Write the event table to CSV; index=False leaves the row index out of the file.
df.to_csv('../kaggle_linux_kernel_git_revlog.csv', index=False)