# MSR 2018 Challenge Notebook
This notebook contains the analysis for Tyson Bulmer's MSR 2018 Challenge paper, carried out on the supplied dataset.

In [None]:
import psycopg2
import psycopg2.extras
import pandas as pd
import numpy as np
import math

import matplotlib.pyplot as plt

# Configure display of dataframe visualizations.
# The pandas option `display.mpl_style` was deprecated in pandas 0.18 and has
# since been removed, so assigning to it raises an OptionError on current
# pandas. The pandas guidance is to use a matplotlib style sheet instead.
plt.style.use('ggplot')

## Connect to database and get event data

In [None]:
# Open a psycopg2 connection to the 'msr2018' database.
# NOTE(review): user, host and password are blank in this DSN — presumably
# scrubbed before sharing; confirm and fill them in before running.
conn = psycopg2.connect("dbname='msr2018' user='' host='' password=''")

In [None]:
cur = conn.cursor()

In [None]:
# Fetch the event columns in a deterministic order. Without ORDER BY the
# database may return rows in any order, while the per-session sequence and
# successive-pair analyses further down depend on the order of events within
# each session (assumes `triggeredat` is the event timestamp — TODO confirm).
cur.execute("""
    select event_type, idesessionuuid, triggeredat, duration
    from events
    order by idesessionuuid, triggeredat
""")

In [None]:
# Column names of the executed query: the first field of every entry in
# cur.description is the column name (DB-API 2.0 cursor contract).
colnames = [desc[0] for desc in cur.description]

In [None]:
rows = cur.fetchall()

In [None]:
# Build the frame straight from the fetched rows; pandas accepts a list of
# tuples, so converting every row to a list first only copied the result set.
results = pd.DataFrame(rows, columns=colnames)

In [None]:
# Clean up events column strings: keep only the text before the first comma,
# then drop its first four dot-separated components.
def _shorten_event_type(raw_name):
    type_path = raw_name.split(',')[0]
    return '.'.join(type_path.split('.')[4:])


results['event_type'] = results['event_type'].apply(_shorten_event_type)

In [None]:
results

## Filter events to user-induced events

In [None]:
# Shortened event type names that are kept as user-induced events.
user_events = [
    'CommandEvent',
    'NavigationEvent',
    'VisualStudio.WindowEvent',
    'ActivityEvent',
    'CompletionEvents.CompletionEvent',
    'VisualStudio.EditEvent',
    'VisualStudio.DocumentEvent',
    'VisualStudio.DebuggerEvent',
    'SystemEvent',
    'VisualStudio.SolutionEvent',
    'VisualStudio.IDEStateEvent',
    'UserProfiles.UserProfileEvent',
    'VisualStudio.BuildEvent',
    'TestRunEvents.TestRunEvent',
    'VersionControlEvents.VersionControlEvent',
    'VisualStudio.FindEvent',
]

In [None]:
# Keep only the rows whose event type appears in user_events, then display them.
is_user_event = results['event_type'].isin(user_events)
results = results[is_user_event]
results

## Define some helper functions for analyzing the data

In [None]:
def take_log(x):
    """Return ln(x + 1) for a value that ``float()`` can convert.

    The +1 shift keeps a zero input finite (``take_log(0) == 0.0``).
    ``math.log1p`` is used rather than ``math.log(x + 1)`` so that very small
    inputs do not lose their precision in the addition.

    Raises:
        ValueError: if ``x`` is a string ``float()`` cannot parse, or if the
            converted value is <= -1 (the logarithm is undefined there).
    """
    return math.log1p(float(x))

In [None]:
def plot_distribution(data, *, x_label, y_label, title, logged_distribution=True, save_as=None):
    """Draw a histogram of ``data``, optionally save it, and show it.

    Args:
        data: pandas Series of values to plot (``.apply`` and ``.hist`` are
            called on it).
        x_label: text drawn horizontally, centred near the bottom of the figure.
        y_label: text drawn vertically, centred near the left of the figure.
        title: title of the histogram axes.
        logged_distribution: when true, every value is passed through
            ``take_log`` before plotting.
        save_as: optional path; when truthy the figure is saved there.
    """
    # No plt.clf() beforehand: it acts on whatever figure is current (creating
    # an empty one if none exists), not on the figure created here.
    fig, axes = plt.subplots(nrows=1, ncols=1)
    if logged_distribution:
        data = data.apply(take_log)
    data.hist(ax=axes)
    axes.set_title(title)
    # Labels are placed in figure coordinates rather than as axis labels.
    fig.text(0.5, 0.04, x_label, ha='center', rotation='horizontal')
    fig.text(.04, 0.5, y_label, va='center', rotation='vertical')
    if save_as:
        fig.savefig(save_as)
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

## Look into the data

In [None]:
unique_events = results['event_type'].unique()

In [None]:
print("Number of unique events:", len(unique_events))

In [None]:
unique_events

## Group the sessions and analyze counts of events per session

In [None]:
grouped_sessions = results.groupby('idesessionuuid')

In [None]:
grouped_sessions.size().describe(percentiles=[.1, .25, .5, .75, .9])

In [None]:
grouped_sessions.size().apply(take_log).describe()

In [None]:
# Histogram of the log-scaled number of events per session, also saved as a PNG.
# The axis labels fix the misspelling "Occurences" and the missing "of".
plot_distribution(
    grouped_sessions.size(),
    x_label='Logged number of events in session',
    y_label='Occurrences',
    title='Events Per Session',
    save_as='eventspersession.png',
)

In [None]:
grouped_sessions.size().plot(kind='box')

## Group the data by event_type to get an idea of its distribution

In [None]:
grouped_events = results.groupby('event_type')

In [None]:
grouped_events.size().describe()

In [None]:
# Horizontal bar chart of the log-scaled row count per event type, saved to disk.
logged_event_counts = grouped_events.size().apply(take_log)
logged_event_counts.plot(kind='barh', title='Distribution of Events')
plt.savefig('eventtypedistibution.png', bbox_inches='tight')

## The cells past this point explore finding patterns, but none of that work is included in the report

## Identifying patterns

### Method 1: See if any sessions did have exact same patterns

In [None]:
from collections import defaultdict

# Count how many sessions share exactly the same lower-cased event sequence.
naive_pattern_dict = defaultdict(int)
for _, session_events in grouped_sessions:
    lowered_events = session_events['event_type'].str.lower()
    naive_pattern_dict['-'.join(lowered_events)] += 1

In [None]:
print("Unique session patterns: ", len(naive_pattern_dict))

In [None]:
sorted(naive_pattern_dict.items(), key=lambda x: x[1], reverse=True)[:20]

### Method 2: Abstract down patterns
Reduce the patterns of events to things such as command+, selection, command ...

A few possible options here:

1) abstract with the + notation

2) abstract with essentially the * notation

3) instead use sets, so order doesn't matter, then see which commands are used together more often than not

In [None]:
# Option 1: collapse each run of identical consecutive events into one token
# and mark runs longer than one event with a trailing '+'.
from itertools import groupby

abstracted_pattern_dict_1 = defaultdict(int)
for session, group in grouped_sessions:
    collapsed = []
    # groupby yields one (token, run) pair per run of equal neighbours, so no
    # '' sentinel is needed (an empty first token used to hit pop() on an
    # empty list).
    for token, run in groupby(group['event_type'].str.lower()):
        run_length = sum(1 for _ in run)
        collapsed.append(token + '+' if run_length > 1 else token)
    event_string = '-'.join(collapsed)
    abstracted_pattern_dict_1[event_string] += 1

In [None]:
print("Option 1: Unique session patterns: ", len(abstracted_pattern_dict_1))

In [None]:
sorted(abstracted_pattern_dict_1.items(), key=lambda x: x[1], reverse=True)[:10]

In [None]:
# Option 2: collapse each run of identical consecutive events into one token,
# without marking whether the event was repeated.
from itertools import groupby

abstracted_pattern_dict_2 = defaultdict(int)
for session, group in grouped_sessions:
    tokens = group['event_type'].str.lower()
    # groupby yields one key per run of equal neighbours, so no '' sentinel is
    # needed (a leading empty token used to be dropped silently).
    collapsed = [token for token, _ in groupby(tokens)]
    event_string = '-'.join(collapsed)
    abstracted_pattern_dict_2[event_string] += 1

In [None]:
print("Option 2: Unique session patterns: ", len(abstracted_pattern_dict_2))

In [None]:
sorted(abstracted_pattern_dict_2.items(), key=lambda x: x[1], reverse=True)[:10]

In [None]:
# Option 3: ignore order and repetition — describe each session by the sorted
# distinct event types it contains.
abstracted_pattern_dict_3 = defaultdict(int)
for _, session_events in grouped_sessions:
    distinct_types = sorted(session_events['event_type'].unique())
    combination_key = '-'.join(distinct_types).lower()
    abstracted_pattern_dict_3[combination_key] += 1

In [None]:
# Given that the number of possible unique event combinations is 2^16 - 1 -> 65535 
print("Option 3: Unique session patterns: ", len(abstracted_pattern_dict_3))

In [None]:
sorted(abstracted_pattern_dict_3.items(), key=lambda x: x[1], reverse=True)[:10]

### Look deeper into Option 3
Only 407 unique combinations of events occur. We can investigate which events rarely happen together and which happen together frequently.

In [None]:
# For every unordered pair of event types, sum the number of sessions whose
# event combination contains both. Keys are 'first-second', in the order the
# two events appear inside the Option 3 pattern key.
from itertools import combinations

occurrence_dict = defaultdict(int)
for key, value in abstracted_pattern_dict_3.items():
    # combinations() visits each unordered pair exactly once per pattern, so
    # every pattern containing the pair adds its session count. Previously the
    # first pattern in which a pair was seen only registered the pair and its
    # count was never added.
    for first, second in combinations(key.split('-'), 2):
        if first == second:
            continue
        occurrence_dict[first + '-' + second] += value

occurrence_dict

In [None]:
sorted(occurrence_dict.items(), key=lambda x:x[1], reverse=True)

In [None]:
# Successive actions for tools only: count each pair of directly consecutive
# events within a session, then list the pairs from most to least frequent.
pair_dict = defaultdict(int)
for _, session_events in grouped_sessions:
    sequence = list(session_events['event_type'])
    for current, following in zip(sequence, sequence[1:]):
        pair_dict[current + '-' + following] += 1
sorted(pair_dict.items(), key=lambda item: item[1], reverse=True)

In [None]:
# Successive actions for all: count each pair of directly consecutive events
# within a session, then list the pairs from most to least frequent.
# NOTE(review): `results` was already filtered to user_events earlier in the
# notebook, so this appears to give the same counts as the grouped_sessions
# cell — confirm whether unfiltered events were intended here.
pair_dict = defaultdict(int)
for _, session_events in results.groupby('idesessionuuid'):
    sequence = list(session_events['event_type'])
    for position in range(1, len(sequence)):
        pair_dict[sequence[position - 1] + '-' + sequence[position]] += 1
sorted(pair_dict.items(), key=lambda entry: entry[1], reverse=True)

In [None]:
# NOTE(review): this cell was left unfinished — the inner loop had no body,
# which is a SyntaxError. `pass` keeps the notebook runnable until the intended
# per-pair analysis is filled in.
for index, group in results.groupby('idesessionuuid'):
    l = list(group['event_type'])
    for i, event in enumerate(l[1:]):
        pass  # TODO: intended loop body is missing

### Method 3: Find sequences of patterns
Find sequences from small patterns in large ones