In [14]:
# Standard library
import re
from datetime import datetime

# Numerical / tabular stack
import numpy as np
import pandas as pd

# autotime prints a `time: ...` line after every executed cell
%load_ext autotime

# Plotly (graph objects + offline notebook helpers) for the interactive charts below
import plotly
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Switch Plotly into notebook mode so figures render inline.
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead of embedding it — confirm.
init_notebook_mode(connected=True)

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime


time: 3.99 ms


In [30]:
# Load the raw commit export. The former `parse_dates=True` was dropped: with no
# `index_col` it only targets the default integer index, so it parsed nothing
# (the date columns still arrive as strings and are converted explicitly later).
commits = pd.read_csv('data/commits.csv')

time: 151 ms


In [31]:
# Summarise the frame: number of rows, column names, non-null counts and dtypes.
commits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4702 entries, 0 to 4701
Data columns (total 62 columns):
Unnamed: 0                       4702 non-null int64
sha                              4702 non-null object
node_id                          4702 non-null object
url                              4702 non-null object
html_url                         4702 non-null object
comments_url                     4702 non-null object
parents                          4702 non-null object
repo_name                        4702 non-null object
owner                            4702 non-null object
commit.author.name               4702 non-null object
commit.author.email              4702 non-null object
commit.author.date               4702 non-null object
commit.committer.name            4702 non-null object
commit.committer.email           4702 non-null object
commit.committer.date            4702 non-null object
commit.message                   4702 non-null object
commit.tree.sha               

## Data Preprocessing

In [32]:
# Drop the `parents` column. Unlike `del commits['parents']`, this cell is safe to
# re-run: errors='ignore' turns it into a no-op once the column is already gone.
commits = commits.drop(columns=['parents'], errors='ignore')

time: 3.01 ms


In [33]:
# Parse the committer timestamp strings straight into timezone-aware UTC datetimes.
# Passing utc=True at parse time means inputs with differing UTC offsets cannot leave
# the column as generic objects, which would break the `.dt` accessors used for the
# derived commit_* columns.
commits['date'] = pd.to_datetime(commits['commit.committer.date'], utc=True)

time: 12 ms


In [34]:
# Coerce `date` to timezone-aware UTC so every derived commit_* column shares one clock.
commits = commits.assign(date=pd.to_datetime(commits['date'], utc=True))

time: 2 ms


In [35]:
# Calendar day of each commit (time-of-day stripped), taken from the UTC `date` column.
commits = commits.assign(commit_date=commits['date'].dt.date)

time: 4 ms


In [48]:
# ISO-8601 week number of each commit.
# `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
# `dt.isocalendar().week` is the supported replacement (requires pandas >= 1.1).
# It yields the nullable UInt32 dtype, so cast back to the plain int64 that
# `.dt.week` produced (the source column has no missing timestamps).
commits['commit_week'] = commits['date'].dt.isocalendar().week.astype('int64')

time: 3 ms


In [36]:
# Hour of day (0-23, in UTC) at which each commit was made.
commits = commits.assign(commit_hour=commits['date'].dt.hour)

time: 3 ms


In [37]:
# Month number (1-12) of each commit.
commits = commits.assign(commit_month=commits['date'].dt.month)

time: 2 ms


In [38]:
# Four-digit year of each commit.
commits = commits.assign(commit_year=commits['date'].dt.year)

time: 3 ms


In [39]:
# Preview the first five rows to sanity-check the derived date columns.
commits.head(n=5)

Unnamed: 0.1,Unnamed: 0,sha,node_id,url,html_url,comments_url,repo_name,owner,commit.author.name,commit.author.email,...,committer.received_events_url,committer.type,committer.site_admin,author,committer,date,commit_date,commit_hour,commit_month,commit_year
0,0,191aca1fb01b797dae21657021d8fcd1e836c40a,MDY6Q29tbWl0Mzk0NjQwMTg6MTkxYWNhMWZiMDFiNzk3ZG...,https://api.github.com/repos/apache/incubator-...,https://github.com/apache/incubator-superset/c...,https://api.github.com/repos/apache/incubator-...,incubator-superset,apache,Daniel Vaz Gaspar,danielvazgaspar@gmail.com,...,https://api.github.com/users/web-flow/received...,User,False,,,2019-12-22 22:07:50+00:00,2019-12-22,22,12,2019
1,1,36c6f4ca3adfb237e1a4931563b782a4d7e7932a,MDY6Q29tbWl0Mzk0NjQwMTg6MzZjNmY0Y2EzYWRmYjIzN2...,https://api.github.com/repos/apache/incubator-...,https://github.com/apache/incubator-superset/c...,https://api.github.com/repos/apache/incubator-...,incubator-superset,apache,Evan Rusackas,evan@preset.io,...,https://api.github.com/users/mistercrunch/rece...,User,False,,,2019-12-20 20:58:36+00:00,2019-12-20,20,12,2019
2,2,d0efd0e4c925207f7c7ae51c32a3f65b126bff4b,MDY6Q29tbWl0Mzk0NjQwMTg6ZDBlZmQwZTRjOTI1MjA3Zj...,https://api.github.com/repos/apache/incubator-...,https://github.com/apache/incubator-superset/c...,https://api.github.com/repos/apache/incubator-...,incubator-superset,apache,Chan Chak Shing,ccspasu@gmail.com,...,https://api.github.com/users/dpgaspar/received...,User,False,,,2019-12-20 10:57:33+00:00,2019-12-20,10,12,2019
3,3,3a468a53d9dbf9d8fc44c3e1c396a8d7eb1e8f27,MDY6Q29tbWl0Mzk0NjQwMTg6M2E0NjhhNTNkOWRiZjlkOG...,https://api.github.com/repos/apache/incubator-...,https://github.com/apache/incubator-superset/c...,https://api.github.com/repos/apache/incubator-...,incubator-superset,apache,David Aaron Suddjian,1858430+suddjian@users.noreply.github.com,...,https://api.github.com/users/dpgaspar/received...,User,False,,,2019-12-20 10:46:25+00:00,2019-12-20,10,12,2019
4,4,3d9181d27047f919109701ef5007654c494614c4,MDY6Q29tbWl0Mzk0NjQwMTg6M2Q5MTgxZDI3MDQ3ZjkxOT...,https://api.github.com/repos/apache/incubator-...,https://github.com/apache/incubator-superset/c...,https://api.github.com/repos/apache/incubator-...,incubator-superset,apache,Daniel Vaz Gaspar,danielvazgaspar@gmail.com,...,https://api.github.com/users/web-flow/received...,User,False,,,2019-12-20 10:16:57+00:00,2019-12-20,10,12,2019


time: 24 ms


## Data Analysis

In [40]:
# Number of distinct author names. dropna=False keeps parity with `.unique()`,
# which reports a missing name as a value of its own.
commits['commit.author.name'].nunique(dropna=False)

454

time: 2.99 ms


In [41]:
# Commits per hour of day: count SHAs within each `commit_hour` group and
# expose the tally as a one-column frame named `commit_count`.
commits_by_hour = (
    commits
    .groupby('commit_hour')['sha']
    .count()
    .to_frame('commit_count')
)

time: 13 ms


In [42]:
# Bar chart of commit counts per hour of day; each bar is labelled with its count.
hourly_counts = commits_by_hour['commit_count']
hourly_bars = go.Bar(
    x=commits_by_hour.index,
    y=hourly_counts,
    text=hourly_counts,
    textposition='auto',
)
fig = go.Figure(data=[hourly_bars])
fig.update_layout(
    title='Commits by Hour',
    xaxis_title='Hour',
    yaxis_title='Commits Count',
    xaxis_tickmode='linear',
)
fig.show()

time: 164 ms


In [49]:
# Commits per calendar day: count SHAs within each `commit_date` group and
# expose the tally as a one-column frame named `commit_count`.
commits_by_day = (
    commits
    .groupby('commit_date')['sha']
    .count()
    .to_frame('commit_count')
)

time: 8 ms


In [47]:
# Area chart (line filled down to y=0) of commit counts per calendar day.
daily_counts = commits_by_day['commit_count']
daily_area = go.Scatter(
    x=commits_by_day.index,
    y=daily_counts,
    text=daily_counts,
    fill='tozeroy',
)
fig = go.Figure(data=[daily_area])
fig.update_layout(
    title='Commits by Day',
    xaxis_title='Day',
    yaxis_title='Commits Count',
)
fig.show()

time: 389 ms
