# Notebook to convert raw revision log to CSV formatted event table

In [None]:
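# Generate the input for this notebook by running the following inside the git repository to analyse:
# git log --date=iso --pretty=format:"%at%x09%ad%x09%H%x09%an%x09%ae%x09%s" --stat --no-merges > raw.revlog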

In [None]:
# Path of the raw revision log file that is opened and parsed below.
raw_revlog_filename = 'raw.revlog'

In [82]:
# Split the raw log into one list of stripped lines per commit.
# A blank line marks the boundary between two commits.
grouped_lines = []

with open(raw_revlog_filename) as f:
    group = []
    for line in f:
        if line == '\n':
            # Consecutive blank lines must not emit an empty group: it would
            # have no header line for the parsing step to read.
            if group:
                grouped_lines.append(group)
            group = []
        else:
            group.append(line.strip())
    # If the file does not end with a blank line, the lines collected for the
    # final commit are still pending here; flush them so the commit is not
    # silently dropped.
    if group:
        grouped_lines.append(group)

In [122]:
import re

def file_stats_from(description):
    """Parse one per-file line of ``git log --stat`` output.

    A regular line looks like ``path/to/file | 10 +++++++---``. The number is
    the total count of changed lines; the ``+``/``-`` graph only shows the
    proportion of additions to deletions, so the returned counts are estimates
    obtained by splitting the total according to that proportion (rounded
    down).

    Lines without a numeric count (e.g. ``file | Bin 0 -> 1234 bytes``) and
    lines with a count but no graph (e.g. ``file | 0``) yield zero additions
    and zero deletions instead of raising.

    Args:
        description: a single file line of the ``--stat`` output.

    Returns:
        ``[filename, n_additions, n_deletions]`` where the filename is
        stripped of surrounding whitespace and both counts are decimal
        strings.

    Raises:
        ValueError: if the line contains no ``|`` separator at all.
    """
    # Raw string so that ``\|`` and ``\d`` are regex escapes, not string
    # escapes. The graph part is optional to accept "file | 0".
    match = re.search(r'(.*) +\| +(\d+)(?: +(.*))?$', description)
    if match is None:
        # No numeric count after the separator: treat as a line-less change.
        filename, separator, _ = description.rpartition('|')
        if not separator:
            raise ValueError('not a --stat file line: %r' % (description,))
        return [filename.strip(), '0', '0']

    filename = match.group(1)
    n_lines_changed = int(match.group(2))
    changes = match.group(3) or ''
    n_plus = changes.count('+')
    n_minus = changes.count('-')
    # Only the graph marks take part in the ratio, so stray characters such as
    # trailing whitespace cannot skew it.
    n_marks = n_plus + n_minus
    if n_marks == 0:
        return [filename.strip(), '0', '0']

    n_add = n_lines_changed * n_plus // n_marks
    n_del = n_lines_changed * n_minus // n_marks
    return [filename.strip(), str(n_add), str(n_del)]


In [123]:
import re

# Flatten the per-commit groups into parallel column lists that hold one entry
# per (commit, changed file) pair.
timestamps = []
commit_dates = []
hashes = []
names = []
emails = []
subjs = []
filenames = []
n_adds = []
n_dels = []

for group in grouped_lines:
    if not group:
        # No header line to parse (e.g. produced by consecutive blank lines).
        continue

    # The first line is the tab-separated commit header. maxsplit=5 keeps any
    # tab inside the subject (the last field) instead of truncating it.
    tsv, remaining = group[0].split("\t", 5), group[1:]
    if len(tsv) == 5:
        # The grouped lines were stripped when read, which removes the
        # trailing tab of a header whose subject is empty.
        tsv.append('')
    if len(tsv) != 6:
        raise ValueError('malformed commit header: %r' % (group[0],))
    ts_author, commit_date_iso, commit_hash, author_name, author_email, subj = tsv

    # Lines containing "(+)" or "(-)" are the per-commit summary
    # ("N files changed, X insertions(+), ..."), not per-file entries.
    file_changes = [r for r in remaining if not ('(-)' in r or '(+)' in r)]
    details = [file_stats_from(changes) for changes in file_changes]

    # Repeat the commit-level fields once per changed file.
    for detail in details:
        timestamps.append(ts_author)
        commit_dates.append(commit_date_iso)
        hashes.append(commit_hash)
        names.append(author_name)
        emails.append(author_email)
        subjs.append(subj)
        filenames.append(detail[0])
        n_adds.append(detail[1])
        n_dels.append(detail[2])


In [129]:
import pandas as pd
import dateutil.parser
def tz_from(date_str):
    """Return the UTC offset carried by a date string, in whole hours.

    The string is parsed with ``dateutil.parser.parse`` and the offset of the
    resulting timezone is converted from seconds to hours.

    NOTE: ``int()`` truncates toward zero, so a fractional offset such as
    +05:30 is reported as 5.
    """
    parsed = dateutil.parser.parse(date_str)
    offset_seconds = parsed.tzinfo.utcoffset(parsed).total_seconds()
    return int(offset_seconds / 3600.0)

# Assemble the event table: one row per (commit, changed file) pair.
#
# List comprehensions and `int` are used instead of `map` and `long` so the
# cell produces the same columns on Python 2 and also runs on Python 3, where
# `long` no longer exists and `map` returns a lazy iterator.
df = pd.DataFrame(
    {
        'author_timestamp': [int(e) for e in timestamps],
        'commit_utc_offset_hours': [tz_from(e) for e in commit_dates],
        'commit_hash': hashes,
        'author_name': names,
        'author_email': emails,
        'subject': subjs,
        'filename': filenames,
        'n_additions': [int(e) for e in n_adds],
        'n_deletions': [int(e) for e in n_dels]
    },
    # Pin the column order to the one shown in the recorded output so it does
    # not depend on how the installed pandas version orders dict keys.
    columns=[
        'author_email',
        'author_name',
        'author_timestamp',
        'commit_hash',
        'commit_utc_offset_hours',
        'filename',
        'n_additions',
        'n_deletions',
        'subject',
    ]
)

df.head()

Unnamed: 0,author_email,author_name,author_timestamp,commit_hash,commit_utc_offset_hours,filename,n_additions,n_deletions,subject
0,zhongjiang@huawei.com,zhong jiang,1487807211,f201ebd87652cf1519792f8662bb3f862c76aa33,-8,mm/z3fold.c,7,3,mm/z3fold.c: limit first_num to the actual ran...
1,rdunlap@infradead.org,Randy Dunlap,1487807208,083fb8edda0487d192e8c117f625563b920cf7a4,-8,include/linux/pagemap.h,0,1,mm: fix <linux/pagemap.h> stray kernel-doc not...
2,sergey.senozhatsky.work@gmail.com,Sergey Senozhatsky,1487807205,c87d1655c29500b459fb135258a93f8309ada9c7,-8,Documentation/ABI/obsolete/sysfs-block-zram,0,119,zram: remove obsolete sysfs attrs
3,sergey.senozhatsky.work@gmail.com,Sergey Senozhatsky,1487807205,c87d1655c29500b459fb135258a93f8309ada9c7,-8,Documentation/ABI/testing/sysfs-block-zram,8,92,zram: remove obsolete sysfs attrs
4,sergey.senozhatsky.work@gmail.com,Sergey Senozhatsky,1487807205,c87d1655c29500b459fb135258a93f8309ada9c7,-8,Documentation/blockdev/zram.txt,34,39,zram: remove obsolete sysfs attrs


In [125]:
# Print the column dtypes and non-null counts of the assembled table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47 entries, 0 to 46
Data columns (total 9 columns):
author_email               47 non-null object
author_name                47 non-null object
author_timestamp           47 non-null int64
commit_hash                47 non-null object
commit_utc_offset_hours    47 non-null int64
filename                   47 non-null object
n_additions                47 non-null int64
n_deletions                47 non-null int64
subject                    47 non-null object
dtypes: int64(4), object(5)
memory usage: 3.4+ KB


In [130]:
# Replace the author name and e-mail columns with an integer `author_id`.
# The "<name> <email>" key is built once with a vectorized string
# concatenation rather than row by row with `df.apply`, which is slow on large
# logs and does not return a Series for an empty frame.
author_keys = df['author_name'] + ' ' + df['author_email']

# unique identifiers for our author mapping; ids follow first appearance order
unique_author_ids = author_keys.unique()
translation = {author_id: index for index, author_id in enumerate(unique_author_ids)}
df['author_id'] = author_keys.map(translation)

# Rebind instead of dropping in place.
df = df.drop(['author_email', 'author_name'], axis=1)
df.head()


Unnamed: 0,author_timestamp,commit_hash,commit_utc_offset_hours,filename,n_additions,n_deletions,subject,author_id
0,1487807211,f201ebd87652cf1519792f8662bb3f862c76aa33,-8,mm/z3fold.c,7,3,mm/z3fold.c: limit first_num to the actual ran...,0
1,1487807208,083fb8edda0487d192e8c117f625563b920cf7a4,-8,include/linux/pagemap.h,0,1,mm: fix <linux/pagemap.h> stray kernel-doc not...,1
2,1487807205,c87d1655c29500b459fb135258a93f8309ada9c7,-8,Documentation/ABI/obsolete/sysfs-block-zram,0,119,zram: remove obsolete sysfs attrs,2
3,1487807205,c87d1655c29500b459fb135258a93f8309ada9c7,-8,Documentation/ABI/testing/sysfs-block-zram,8,92,zram: remove obsolete sysfs attrs,2
4,1487807205,c87d1655c29500b459fb135258a93f8309ada9c7,-8,Documentation/blockdev/zram.txt,34,39,zram: remove obsolete sysfs attrs,2


In [127]:
# Write the final table to CSV; index=False leaves the row index out of the file.
df.to_csv('kaggle_linux_kernel_git_revlog.csv', index=False)