In [None]:
%pylab inline

In [None]:
from __future__ import print_function
from __future__ import division
from IPython.display import display, HTML

In [None]:
import seaborn as sns
import pandas as pd
import MySQLdb as mdb
import bs4
import datetime
from collections import defaultdict
from matplotlib import pyplot as plt
from ipywidgets import widgets

### Load the Data

In [None]:
def table_to_dataframe(name, connection):
    """Read every row of the table ``name`` into a pandas DataFrame.

    ``name`` is interpolated verbatim into the SQL text, so it must be a
    trusted identifier. ``connection`` is handed to ``pd.read_sql`` as ``con``.
    """
    query = "SELECT * FROM {};".format(name)
    frame = pd.read_sql(query, con=connection)
    return frame

def project_table_to_dataframe(name, connection, project_id=1):
    """Read the rows of table ``name`` that belong to one project.

    Args:
        name: table name; interpolated verbatim into the SQL text, so it
            must be a trusted identifier.
        connection: database connection passed to ``pd.read_sql`` as ``con``.
        project_id: value matched against the table's ``project_id`` column.
            Defaults to 1, which is monorail.

    Returns:
        A pandas DataFrame with every column of the matching rows.
    """
    # int() guarantees that only a number is ever spliced into the query text.
    query = "SELECT * FROM {} where project_id = {:d};".format(name, int(project_id))
    return pd.read_sql(query, con=connection)

In [None]:
# Open a MySQLdb connection to the "monorail" database on localhost as user
# "root"; no password argument is supplied.
connection = mdb.connect(host="localhost", user="root", db="monorail")

In [None]:
# Raw DB-API cursor on the connection. The queries in the cells shown here go
# through pd.read_sql with `connection` rather than through this cursor.
cursor = connection.cursor()

In [None]:
# Only look at monorail issues (project_id = 1) that are newer than a fixed
# cutoff. The cutoff is a Unix timestamp in seconds (2015-07-08 UTC); it was
# originally described as "the past year" but it is NOT relative to today.
CUTOFF_TIMESTAMP = 1436396241

# Issue.id is renamed to issue_id so it matches the key used by the Issue2* /
# IssueUpdate tables loaded below.
issue = pd.read_sql(
    "SELECT * FROM Issue where project_id = 1 and opened > {:d};".format(CUTOFF_TIMESTAMP),
    con=connection,
).rename(columns={"id": "issue_id"})
comment = pd.read_sql(
    "SELECT * FROM Comment where project_id = 1 and created > {:d};".format(CUTOFF_TIMESTAMP),
    con=connection,
)
status_def = project_table_to_dataframe("StatusDef", connection)
issue_summary = table_to_dataframe("IssueSummary", connection)
# The original (misspelled) name is kept as an alias so existing cells keep working.
issue_summarny = issue_summary
issue_label = table_to_dataframe("Issue2Label", connection)
issue_component = table_to_dataframe("Issue2Component", connection)
issue_update = table_to_dataframe("IssueUpdate", connection)

In [None]:
# Report how many rows the two main tables contributed.
print("Number of Issues", len(issue.index))
print("Number of IssueUpdates", len(issue_update.index))


### Associate IssueUpdates with their Issues
This next step is resource intensive and can take a while.

In [None]:
# Bucket every IssueUpdate row under the id of the issue it belongs to.
updates_by_issue = defaultdict(list)
for i, (_row_label, row) in enumerate(issue_update.iterrows()):
    updates_by_issue[row["issue_id"]].append(row)
    if i % 1000000 == 0:
        print(i)  # progress marker, once per million rows

In [None]:
# Index the issue rows by issue id so later cells can look an issue up directly.
issues_by_id = {}
for i, (_row_label, row) in enumerate(issue.iterrows()):
    issues_by_id[row["issue_id"]] = row
    if i % 1000000 == 0:
        print(i)  # progress marker, once per million rows

In [None]:
# Index the StatusDef rows by their id so status ids can be turned into names.
status_by_id = {}
for i, (_row_label, row) in enumerate(status_def.iterrows()):
    status_by_id[row["id"]] = row
    if i % 1000000 == 0:
        print(i)  # progress marker, once per million rows

In [None]:
# Attach to every issue the list of its updates (ordered by update id) and the
# length of that list.
issue["updates"] = issue["issue_id"].apply(
    lambda issue_id: sorted(updates_by_issue[issue_id], key=lambda update: update.id)
)
issue["num_updates"] = issue["updates"].apply(len)

In [None]:
# Histogram (kde=False, so no density curve) of the number of updates per issue.
sns.distplot(issue["num_updates"], kde=False)

In [None]:
def _status_text(value):
    """Return ``value`` as text, decoding byte strings as UTF-8 (bad bytes replaced)."""
    if isinstance(value, bytes):
        return value.decode('utf-8', 'replace')
    if isinstance(value, type(u'')):
        # Already text: decoding again would fail (Python 3) or go through an
        # implicit ASCII encode (Python 2).
        return value
    return u'{}'.format(value)


def _status_label(value):
    """Text for one side of a status change; missing values become 'none'."""
    # A missing value may show up as None, NaN or an empty string. NaN is
    # truthy, so it has to be tested for explicitly.
    if value is None or pd.isnull(value) or not value:
        return u'none'
    return _status_text(value)


def StatusPath(i_id, updates):
    """Describe the statuses an issue went through as one 'a->b->c' string.

    Args:
        i_id: issue id. Only used to look the issue up in ``issues_by_id``
            when ``updates`` contains no status change.
        updates: iterable of update records exposing ``field``, ``old_value``
            and ``new_value`` attributes, in the order they should appear in
            the path (callers in this notebook sort them by update id).

    Returns:
        A text string. If there are status updates: the first update's old
        value followed by every new value, with missing values shown as
        'none'. Otherwise: the name of the issue's current status,
        'mystery status id: N' when that id is not in ``status_by_id``, or
        'never had status' when the issue has no status id at all.
    """
    statuses = []
    for update in updates:
        if _status_text(update.field) != u'status':
            continue
        if not statuses:
            # The path starts from the state before the first recorded change.
            statuses.append(_status_label(update.old_value))
        statuses.append(_status_label(update.new_value))

    if not statuses:
        # No status change was ever recorded: fall back to the issue's
        # current status_id, which may be missing (None / NaN).
        raw_status_id = issues_by_id[i_id].status_id
        if pd.isnull(raw_status_id):
            statuses = [u'never had status']
        else:
            status_id = int(raw_status_id)
            if status_id in status_by_id:
                statuses = [_status_text(status_by_id[status_id].status)]
            else:
                statuses = [u'mystery status id: %d' % status_id]
    return u'->'.join(statuses)



In [None]:
# Build each issue's status-history string from its updates, ordered by update id.
issue["status_path"] = issue["issue_id"].apply(
    lambda issue_id: StatusPath(
        issue_id,
        sorted(updates_by_issue[issue_id], key=lambda update: update.id),
    )
)

In [None]:
# Tall figure: the bar chart has one horizontal bar per distinct status path.
plt.rcParams['figure.figsize'] = (10, 25)
# Count the issues per status path, ordered ascending by count.
# The in-place Series.sort() was deprecated and then removed from pandas;
# sort_values() returns the sorted Series instead.
by_path = issue.groupby(["status_path"]).size().sort_values()
by_path.plot(kind='barh')

In [None]:
# Find distributions of time-to-close for various closed states.

# .copy() gives an independent frame, so adding columns below neither touches
# `issue` nor triggers pandas' SettingWithCopyWarning.
closed_issue = issue[issue["closed"] > 0].copy()

# Difference between the closed and opened timestamps of each issue, computed
# column-wise instead of one dictionary lookup per row.
closed_issue["time_to_close"] = closed_issue["closed"] - closed_issue["opened"]
# Name of the status each closed issue currently has.
closed_issue["issue_state"] = closed_issue["status_id"].apply(lambda s_id: status_by_id[s_id].status)
print("Number of closed issues %d" % closed_issue.shape[0])

In [None]:
# Back to a wide, short figure for the histogram.
plt.rcParams['figure.figsize'] = (10, 5)
# Histogram (no density curve) of time_to_close, restricted to values below 1e7.
sns.distplot(
    closed_issue.loc[closed_issue["time_to_close"] < 1e7, "time_to_close"],
    kde=False,
)

In [None]:
# Box plot of time_to_close (timestamps are seconds) for each issue_state,
# drawn over ALL closed issues.
# NOTE: no `time_to_close < 1e7` filter is applied in this cell; that cut-off
# (1e7 seconds is ~116 days, not ~11) is only used by the histogram cell.
# Consider filtering here as well, since the time_to_close distribution skews
# waaaay out.
sns.boxplot(data=closed_issue, x="time_to_close", y="issue_state", palette="colorblind")