#Sample Dataset of git commit related data for OpenStack

Copyright Doug Williams - 2014, 2015

###Updated: 3/12/2015

###Sources:
- Launchpad (https://launchpad.net/nova)
- Gerrit (https://review.openstack.org/#/q/status:open,n,z)
- Git (https://github.com/openstack/nova)

###Data Structures:
- combined_commits.jsonz : Basic record for each commit, integrating data from Launchpad, Gerrit and Git, as stored on disk
- Enhanced combined_commits:  Dataset enhanced with 'guilt' data based on selected bug severity threshold.
- Extracted features for use by dict vectorizer


Note:  All .jsonz files are gzipped json, with individual json entries for each record.  Apply json.loads() to each individual line.

###Examples:
- Sample entry from combined_commits.jsonz
- Sample entry from all_blame.jsonz
- Example of blame normalization

###History
- 3/12/2015: Major update

# Imports

In [1]:
from pprint import pprint
from collections import defaultdict

import numpy as np
import numpy as np
import math
import matplotlib.pyplot as plt


import sys
sys.path.append('../dev')

from commit_analysis import load_combined_commits

from Git_Extract import  filter_bug_fix_combined_commits
from Git_Extract import get_commit_ordering_min_max

from commit_analysis import load_core_analysis_data
from commit_analysis import fit_features
from commit_analysis import extract_features
from commit_analysis import extract_features_helper
from commit_analysis import compute_guilt
from commit_analysis import autoset_threshold
from BugFixWorkflow import compute_selected_bug_fixes

from BugFixWorkflow import commit_postprocessing
from BugFixWorkflow import find_legacy_cutoff

# from commit_analysis import blame_compute_normalized_guilt
# from commit_analysis import normalize_blame_by_file

# Configuration

In [2]:
# Project whose dataset is loaded; this value is passed to
# load_combined_commits() and commit_postprocessing() in later cells.
# Uncomment exactly one of the alternatives below to switch projects.
# PROJECT = 'nova'
# PROJECT = 'swift'
# PROJECT = 'cinder'
PROJECT = 'heat'
# PROJECT = 'glance'

# Value handed to commit_postprocessing(importance=...) below.
# NOTE(review): presumably the bug-severity threshold used when computing
# guilt ('med+' = medium and above) -- confirm against BugFixWorkflow.
# IMPORTANCE = 'high+'
IMPORTANCE = 'med+'

# Sample entry from < project >_combined_commits.jsonz

In [3]:
combined_commits = load_combined_commits(PROJECT)

### Basic Commit entry (dict)

###### Structure

In [4]:
pprint(combined_commits.values()[0].keys())

[u'files',
 u'committer',
 u'ancestors',
 u'unfiltered_files',
 u'blueprint',
 u'author',
 u'change_details',
 u'cid',
 u'on_mainline',
 u'date',
 u'parents',
 u'bug_details',
 u'msg',
 u'distance_from_mainline',
 u'children',
 u'is_master_commit',
 u'on_master_branch']


###### Sample data

In [5]:
pprint(combined_commits.items()[0])

(u'a7d10dc43b0d2baa48cabd97781098d14208ecec',
 {u'ancestors': False,
  u'author': u'<git.Actor "Jenkins <jenkins@review.openstack.org>">',
  u'blueprint': False,
  u'bug_details': {},
  u'change_details': False,
  u'children': [u'ad039c26828cecaa9a657bedb5106304b58634b0',
                u'b0903a5598b6812342e230a03ff0310d068cb450',
                u'8faef30895d5b3550d50672fc454acc37992008a'],
  u'cid': u'a7d10dc43b0d2baa48cabd97781098d14208ecec',
  u'committer': u'<git.Actor "Gerrit Code Review <review@openstack.org>">',
  u'date': 1411637242,
  u'distance_from_mainline': 0,
  u'files': [u'heat/api/openstack/v1/stacks.py'],
  u'is_master_commit': False,
  u'msg': u'Merge "Convert parse error to text type"',
  u'on_mainline': True,
  u'on_master_branch': True,
  u'parents': [u'b8ed38807e227ab3a28ad5576615656b8f781fdf',
               u'58a094238bbfa4c652d63a000854e77cf894d345'],
  u'unfiltered_files': [u'heat/api/openstack/v1/stacks.py']})


### Commit with associated bug

###### Dict structure has data in the 'bugs' and 'bug_details' fields

In [6]:
# Keep only the commit records whose 'bug_details' entry is present and
# non-empty, then show the field names of the first such record.
sample_commits = [commit for commit in combined_commits.values()
                  if commit.get('bug_details')]
pprint(sample_commits[0].keys())

[u'files',
 u'committer',
 u'ancestors',
 u'unfiltered_files',
 u'blueprint',
 u'author',
 u'change_id',
 u'is_master_commit',
 u'bugs',
 u'on_mainline',
 u'msg',
 u'cid',
 u'parents',
 u'bug_details',
 u'change_details',
 u'date',
 u'distance_from_mainline',
 u'children',
 u'on_master_branch']


In [7]:
pprint (sample_commits[0]['bugs'] )

[u'1268614']


###### bug_details is a dict with one entry per bug

In [8]:
pprint (sample_commits[0]['bug_details'].keys() )

[u'1268614']


In [9]:
pprint (sample_commits[0]['bug_details'].values()[0].keys())

[u'status',
 u'security_related',
 u'description',
 u'tags',
 u'importance',
 u'commits',
 u'title',
 u'messages',
 u'name',
 u'message_count',
 u'heat',
 u'date_created',
 u'activity',
 u'self_link',
 u'information_type',
 u'cves',
 u'duplicate_of',
 u'id',
 u'resource_type',
 u'attachments']


###### Detailed bug entry

In [10]:
pprint (sample_commits[0]['bug_details'])

{u'1268614': {u'activity': [],
              u'attachments': [],
              u'commits': [],
              u'cves': [],
              u'date_created': u'2014-01-13 14:46:21.161906+00:00',
              u'description': u"I see several changes, including https://review.openstack.org/#/c/63735/ , failed pep8 gating with error from check_uptodate tool:\n\n\n2014-01-13 14:06:39.643 | pep8 runtests: commands[1] | /home/jenkins/workspace/gate-nova-pep8/tools/config/check_uptodate.sh\n2014-01-13 14:06:39.649 |   /home/jenkins/workspace/gate-nova-pep8$ /home/jenkins/workspace/gate-nova-pep8/tools/config/check_uptodate.sh \n2014-01-13 14:06:43.581 | 2741,2746d2740\n2014-01-13 14:06:43.581 | < # (optional) indicate whether to set the X-Service-Catalog\n2014-01-13 14:06:43.581 | < # header. If False, middleware will not ask for service\n2014-01-13 14:06:43.581 | < # catalog on token validation and will not set the X-Service-\n2014-01-13 14:06:43.581 | < # Catalog header. (boolean value)\n2014-01

### Sample commit with Gerrit Data

###### Contains data in the 'change_id' and 'change_details' fields

In [11]:
# Keep only the commit records whose 'change_details' entry is present and
# non-empty, then show the field names of the first such record.
sample_commits = [commit for commit in combined_commits.values()
                  if commit.get('change_details')]
pprint(sample_commits[0].keys())

[u'files',
 u'committer',
 u'ancestors',
 u'unfiltered_files',
 u'blueprint',
 u'author',
 u'change_id',
 u'is_master_commit',
 u'bugs',
 u'on_mainline',
 u'msg',
 u'cid',
 u'parents',
 u'bug_details',
 u'change_details',
 u'date',
 u'distance_from_mainline',
 u'children',
 u'on_master_branch']


In [12]:
pprint (sample_commits[0]['change_id'] )

u'I3caf012d1ecfd852b736f52a1269f742449ffee9'


###### Structure of change detail - data extracted from Gerrit

In [13]:
pprint (sample_commits[0]['change_details'].keys() )

[u'status',
 u'project',
 u'updated',
 u'created',
 u'change_id',
 u'labels',
 u'messages',
 u'kind',
 u'topic',
 u'owner',
 u'branch',
 u'_sortkey',
 u'_number',
 u'id',
 u'subject']


###### Sample change_details entry

In [14]:
pprint (sample_commits[0]['change_details'] )

{u'_number': 66428,
 u'_sortkey': u'002a6e090001037c',
 u'branch': u'master',
 u'change_id': u'I3caf012d1ecfd852b736f52a1269f742449ffee9',
 u'created': u'2014-01-13 20:52:50.000000000',
 u'id': u'openstack%2Fheat~master~I3caf012d1ecfd852b736f52a1269f742449ffee9',
 u'kind': u'gerritcodereview#change',
 u'labels': {u'Code-Review': {u'all': [{u'_account_id': 4257,
                                        u'name': u'Zane Bitter',
                                        u'value': 2}],
                              u'approved': {u'_account_id': 4257,
                                            u'name': u'Zane Bitter'}},
             u'Verified': {u'all': [{u'_account_id': 3,
                                     u'name': u'Jenkins',
                                     u'value': 2}],
                           u'approved': {u'_account_id': 3,
                                         u'name': u'Jenkins'}},
             u'Workflow': {u'all': [{u'_account_id': 4257,
                              

# Data after guilt processing

In [15]:
combined_commits = commit_postprocessing(PROJECT, importance=IMPORTANCE)

loading bug data
  total LP bugs: 1388
  Entries annotated: 536
loading Git commit data
  total git_commits: 7566
  bug fix commits: 1353
  commits with change_id: 4084
  bug fix with change_id: 1353
loading change data
  total gerrit changes with detail: 4126
  all_change_details: 4126
  total gerrit changes: 4132
  all_changes: 4132
combined_commits: 7566
Determining legacy cut-off
  Setting cutoff to: 12/11/2012
Collecting data on commits with bug fixes
  Mainline Commits ignored due to legacy: 995  out of: 4507
  Total commite requiring blame computation: 1113

Computing Blame
Loaded blame
  Initial Blame cache size: 939
  bug fix commits: 1113
. . . . . . . . . . 100 . . . . . . . . . . 200 . . . . . . . . . . 300 . . . . . . . . . . 400 . . . . . . . . . . 500 . . . . . . . . . . 600 . . . . . . . . . . 700 . . . . . . . . . . 800 . . . . . . . . . . 900 . . . . . . . . . . 1000 . . . . . . . . . . 1100 .
  Saving updated Blame Cache

Annotating Guilt
Identify reachable commits
 

###### Commit entries are now annotated based on reachability (visibility within the master branch) and imputed guilt

Commit entries are also annotated with additional feature data

In [16]:
# Keep only the commit records flagged with a truthy 'reachable' value,
# then show the field names of the first such record.
sample_commits = [commit for commit in combined_commits.values()
                  if commit.get('reachable')]
pprint(sample_commits[0].keys())

[u'committer',
 'loc_add',
 u'children',
 u'on_master_branch',
 u'ancestors',
 u'blueprint',
 u'author',
 u'on_mainline',
 u'parents',
 'loc_detail',
 u'msg',
 'reachable',
 u'files',
 'tagged_bug_fix',
 u'unfiltered_files',
 'file_order',
 'is_tracked_change',
 u'date',
 'loc_change',
 u'distance_from_mainline',
 u'cid',
 u'change_id',
 u'is_master_commit',
 u'bugs',
 'guilt',
 u'bug_details',
 u'change_details',
 'author_order',
 'order',
 'file_order_for_author']


###### Actual data

In [17]:
pprint (sample_commits[0] )

{u'ancestors': False,
 u'author': u'<git.Actor "Roman Podoliaka <rpodolyaka@mirantis.com>">',
 'author_order': 1,
 u'blueprint': False,
 u'bug_details': {u'1245863': {u'activity': [],
                               u'attachments': [],
                               u'commits': [{u'change_id': u'I364da88f1c581b406d51e1b584778c7dd4e2564f',
                                             u'cid': u'11ac4dd3f899dc0f28d356bad89ec559e9d1e81f'}],
                               u'cves': [],
                               u'date_created': u'2013-10-29 12:07:38.147412+00:00',
                               u'description': u'Change https://review.openstack.org/#/c/54063/ makes Fn::Select accept only string selector values. At the same time, our doc states, that the selector can be either of type string or of type integer (http://docs.openstack.org/developer/heat/template_guide/functions.html#fn-select)\n\nIf this is considered to be the correct behavior, we must update Heat docs and fix templates we 

#Features

In [18]:
# Derive the legacy cut-off and the commit-ordering bounds from the
# post-processed commits; both feed the remaining steps of this cell.
legacy_cutoff = find_legacy_cutoff(combined_commits)
min_order, max_order = get_commit_ordering_min_max(combined_commits)

# Select the bug-fix commits inside the ordering window.
actual_bugs = compute_selected_bug_fixes(
    combined_commits,
    legacy_cutoff=legacy_cutoff,
    min_order=min_order,
    max_order=max_order,
)

# Pick a guilt threshold automatically from the selected bug fixes.
guilt_threshold, labeled_bugs = autoset_threshold(
    combined_commits,
    actual_bugs,
)

# Fit the feature-extraction state that later cells reuse
# (e.g. extract_state['feat_kwargs']).
extract_state = fit_features(
    combined_commits,
    min_order=min_order,
    max_order=max_order,
)

  Total Alias: 37
Total features: 2374


###### Note: Ordinarily one would not call extract_features_helper directly; it is called here to expose the raw feature vectors

In [19]:
# Call the helper directly to obtain the raw (un-vectorized) feature dicts.
# NOTE(review): the meaning of the two positional zeros is not visible in
# this notebook -- confirm against extract_features_helper's signature.
cid, Y, features = extract_features_helper(
    combined_commits,
    min_order,
    max_order,
    0,
    0,
    **extract_state['feat_kwargs']
)

  Total Alias: 37


###### Sample Feature Vector - for input to Scikit-Learn scaler and DictVectorizer

In [20]:
features[0]

{'author': u'sbaker@redhat.com',
 'author_order': 3.2188758248682006,
 'author_org': u'redhat.com',
 'blueprint': True,
 'cherry_picked_from': False,
 'cherry_picked_to': False,
 'committer': 'same',
 'gerrit_approved_code': u'Angus Salkeld',
 'gerrit_approved_workflow': u'Angus Salkeld',
 'gerrit_has_data': True,
 u'gerrit_reviewer_Angus Salkeld': 2,
 'gerrit_revision': 1,
 'gerrit_votes': 2,
 u'includes_file_heat/engine/identifier.py': 1,
 u'includes_file_heat/engine/service.py': 1,
 'lauchpad_bugs': 0,
 'loc_add': 2,
 u'loc_add_heat/engine/identifier.py': 0,
 u'loc_add_heat/engine/service.py': 2,
 'loc_change': 15,
 u'loc_changes_heat/engine/identifier.py': 10,
 u'loc_changes_heat/engine/service.py': 5,
 'log_max_file_order': 1.3862943611198906,
 'log_min_file_order': 1.3862943611198906,
 'log_order': 5.958424693029782}

### Generation of data for use with Scikit-Learn

In [21]:
# Build the commit ids, labels, feature matrix and column names for
# Scikit-Learn. Y is rebound here, replacing the value assigned by the
# earlier extract_features_helper cell.
all_cid, Y, X, col_names = extract_features(
    combined_commits,
    extract_state,
    min_order=min_order,
    max_order=max_order,
    threshold=guilt_threshold,
)


  Total Alias: 37
Total feature vectors: 2586
  bugs based on threshold: 703


###### Commit ID corresponding to each feature vector

In [22]:
all_cid[:10]

(u'89caca42d693a686fc190bfd9e35606b4a21f208',
 u'9096b586e2246cbc13b13141216a775720770c80',
 u'bc2c6dbcfe097b45f53b162f6433e6fdd2ca9fa7',
 u'a425ae02af7a25aa3654e644dc966d07f408a60c',
 u'ba00a025d4a5b72638a002ce4e123f6e824d9aa9',
 u'40ed0ade378bb0289c546e7c2cc16891275c3e2b',
 u'bd8edd25fa8423ded878b4f3d0d0403e64c3cd17',
 u'2622c2ac4ecfc499f947d749b76ea1538148d4bb',
 u'27c5ea86f3c888be909788e40688624cc9b71b10',
 u'ad1e45573ece5bb5ca5d6b2273d0d3ab8faba806')

###### Labels for each feature vector

In [23]:
Y

array([ True, False,  True, ..., False, False, False], dtype=bool)

###### Matrix of feature vectors, one row per commit

In [24]:
X

array([[ 0.        ,  0.        ,  0.        , ...,  0.24982436,
         0.25143961,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.35067282,
         0.1257198 ,  0.00126591],
       [ 0.        ,  0.        ,  0.        , ...,  0.64578659,
         0.29191234,  0.00252857],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.24982436,
         0.1257198 ,  0.99966978],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.99983492],
       [ 0.        ,  0.        ,  0.        , ...,  0.80686167,
         0.        ,  1.        ]])

###### Column Heading for feature matrix

In [25]:
col_names

[u'author=Niu.ZGlinux@gmail.com',
 u'author=achudnovets@mirantis.com',
 u'author=adrien.verge@numergy.com',
 u'author=agordeev@mirantis.com',
 u'author=aigerim.sametkhanova@gmail.com',
 u'author=aignatov@mirantis.com',
 u'author=aivanitskiy@mirantis.com',
 u'author=aj@suse.de',
 u'author=akurilin@mirantis.com',
 u'author=akuznetsova@mirantis.com',
 u'author=alex.gaynor@gmail.com',
 u'author=amagarw3@cisco.com',
 u'author=anant.patil@hp.com',
 u'author=andersonvom@gmail.com',
 u'author=andrea.rosa@hp.com',
 u'author=andrew.plunk@rackspace.com',
 u'author=apevec@redhat.com',
 u'author=asalkeld@mirantis.com',
 u'author=barnold@us.ibm.com',
 u'author=bartosz.gorski@ntti3.com',
 u'author=berendt@b1-systems.de',
 u'author=bk@theboxes.org',
 u'author=bknudson@us.ibm.com',
 u'author=bnemec@redhat.com',
 u'author=bwiedemann@suse.de',
 u'author=cbjchen@cn.ibm.com',
 u'author=chenxiao@cn.ibm.com',
 u'author=chmouel@enovance.com',
 u'author=chris.armstrong@rackspace.com',
 u'author=chrisroberts.co