# MSR 2018 Challenge Notebook
This notebook contains the analysis for Tyson Bulmer's MSR 2018 Challenge paper, using the supplied dataset.

In [None]:
import psycopg2
import psycopg2.extras
import pandas as pd
import numpy as np

# To deal with right skewness we can take the log of the values
import math

import matplotlib.pyplot as plt

from tqdm import tqdm

# Configure display of dataframe visualizations.
# The pandas option `display.mpl_style` was deprecated (pandas 0.18) and no
# longer exists in current pandas releases, so assigning to it raises an
# OptionError there. The matplotlib 'ggplot' style sheet is the replacement
# the pandas documentation points to for a similar look.
plt.style.use('ggplot')

## Connect to database and get event data

In [None]:
# Open the PostgreSQL connection used by the cells below.
# Every setting can be overridden through an environment variable so that
# credentials do not have to be edited into (and committed with) the notebook;
# the defaults reproduce the original hard-coded local setup.
import os

conn = psycopg2.connect(
    dbname=os.environ.get('MSR2018_DB_NAME', 'msr2018'),
    user=os.environ.get('MSR2018_DB_USER', 'tbulmer'),
    host=os.environ.get('MSR2018_DB_HOST', 'localhost'),
    password=os.environ.get('MSR2018_DB_PASSWORD', 'msr2018'),
)

In [None]:
# Cursor on the open connection; used below to run the event query and read its rows.
cur = conn.cursor()

In [None]:
# Select the event columns that the following cells work with.
event_query = """select event_type, idesessionuuid, triggeredat, commandid, cancelled, terminatedstate, action, typeofnavigation, wasaborted from events"""
cur.execute(event_query)

In [None]:
# Take the first field of each cursor description entry as the column name.
colnames = [column[0] for column in cur.description]

In [None]:
# Read all rows returned by the query into memory.
results = cur.fetchall()

In [None]:
# Turn the fetched rows into a DataFrame labelled with the query's column names.
rows_as_lists = [list(record) for record in results]
results = pd.DataFrame(rows_as_lists, columns=colnames)

In [None]:
# Clean up events column strings
def _short_event_type(raw_name):
    """Return the text before the first comma without its first four dotted parts."""
    qualified_name = raw_name.split(',')[0]
    return '.'.join(qualified_name.split('.')[4:])


results['event_type'] = results['event_type'].apply(_short_event_type)

In [None]:
# Filter results to only applicable events
events_to_use = [
    'CommandEvent',
    'NavigationEvent',
    'VisualStudio.WindowEvent',
    'CompletionEvents.CompletionEvent',
    'VisualStudio.EditEvent',
    'VisualStudio.DocumentEvent',
    'VisualStudio.DebuggerEvent',
    'VisualStudio.SolutionEvent',
    'VisualStudio.IDEStateEvent',
    'VisualStudio.BuildEvent',
    'TestRunEvents.TestRunEvent',
    'VersionControlEvents.VersionControlEvent',
    'VisualStudio.FindEvent',
]
is_applicable = results['event_type'].isin(events_to_use)
# Missing values in the remaining rows are replaced by empty strings.
results = results[is_applicable].fillna('')

In [None]:
# Keep only the sessions which have multiple entries.
# keep=False marks every row whose session id occurs more than once,
# so rows belonging to single-entry sessions are dropped.
in_repeated_session = results.duplicated(subset=['idesessionuuid'], keep=False)
results = results[in_repeated_session]

In [None]:
# Clean up commandid column strings
def _short_command_id(raw_id):
    """Return the last two dotted parts of the text after the final colon."""
    after_colon = raw_id.split(':')[-1]
    return '.'.join(after_colon.split('.')[-2:])


results['commandid'] = results['commandid'].apply(_short_command_id)

In [None]:
# Order the rows by their `triggeredat` value, then show the frame.
results = results.sort_values(by='triggeredat')
results

In [None]:
# For these event types, the named column supplies a detail value that is
# appended to the event type to form a more specific label.
d = {
    'CommandEvent': 'commandid',
    'CompletionEvents.CompletionEvent': 'terminatedstate',
    'VisualStudio.DocumentEvent': 'action',
    'VisualStudio.FindEvent': 'cancelled',
    'VisualStudio.SolutionEvent': 'action',
    'VisualStudio.WindowEvent': 'action',
    'NavigationEvent': 'typeofnavigation',
    'TestRunEvents.TestRunEvent': 'wasaborted',
}

d_keys = d.keys()


def _complete_event_type(row):
    """Return '<event_type>-<detail>' for mapped event types, else the bare event type."""
    event_type = row['event_type']
    if event_type not in d_keys:
        return event_type
    detail_column = d[event_type]
    return event_type + '-' + str(row[detail_column])


results['event_type_complete'] = results.apply(_complete_event_type, axis=1)

In [None]:
# Show the frame with the new event_type_complete column as the cell output.
results

In [None]:
# Keep only the combined event label, the session id and the trigger time.
columns_to_keep = ['event_type_complete', 'idesessionuuid', 'triggeredat']
results = results[columns_to_keep]

In [None]:
# Show the reduced frame as the cell output.
results

In [None]:
# Save to a file so we can cache load it.
# The row index is not written to the file.
cache_path = 'results.csv'
results.to_csv(cache_path, index=False)

In [None]:
# Reload the cached events from disk and group them by IDE session id.
cached_events = pd.read_csv('results.csv')
groups = cached_events.groupby('idesessionuuid')

In [None]:
import os

# Write each session's events to its own CSV file under ./sessions/.
# The directory is created first: to_csv does not create missing parent
# directories, so without this the first write fails on a fresh checkout.
os.makedirs('./sessions/', exist_ok=True)
for sessionid, group in groups:
    # The DataFrame index is written as well (to_csv default), as before.
    group.to_csv('./sessions/'+sessionid + '.csv')

In [None]:
from os import walk

# List the files sitting directly in ./sessions/ (top level only).
# os.walk yields nothing when the directory is missing, so a default triple is
# supplied; previously `filenames` only existed as a leaked loop variable and
# was undefined in that case, although the next cell iterates over it.
dirpath, dirnames, filenames = next(walk('./sessions/'), ('./sessions/', [], []))

# os.walk returns names in arbitrary order; sort them so that later cells
# process the sessions in the same order on every run.
filenames = sorted(filenames)

# `f` is kept as an alias of the listing, matching the original cell.
f = filenames

In [None]:
# Build (preceding events, command) pairs from every per-session CSV file.
# Each row of `data` holds the space-joined labels of the events that led up
# to a command event ('events') and the label of that command ('command').
MAX_CHAIN_LENGTH = 10  # Max length of event chains

data = []
for session_file in filenames:
    group = pd.read_csv('./sessions/' + session_file)
    events = group['event_type_complete'].tolist()

    for ind, event in enumerate(events):
        # Only command events are prediction targets.
        if event.split('-')[0] != "CommandEvent":
            continue
        # Keep the MAX_CHAIN_LENGTH events immediately before the command.
        # The previous slice `events[:ind][:-10]` did the opposite: it threw
        # away the 10 closest events and kept everything older, so chains were
        # unbounded and commands near the session start produced no rows.
        x = events[max(0, ind - MAX_CHAIN_LENGTH):ind]
        if x:
            data.append([' '.join(x), event])

# The explicit `del group/events/indices` cleanup was dropped: it raised a
# NameError whenever `filenames` was empty, and only one session's data is
# held at a time anyway.
data = pd.DataFrame(data, columns=['events', 'command'])

In [None]:
# Show the resulting events/command frame as the cell output.
data