In [None]:
import glob
import json
import pandas as pd
import datetime
import numpy as np
import scipy.stats as sp
import matplotlib.pyplot as plt

In [None]:
# Collect every session log in the logs/ directory. glob returns paths in an
# arbitrary, filesystem-dependent order, so sort them to keep everything
# downstream reproducible across machines and kernel restarts.
all_logs = sorted(glob.glob("logs/*.json"))

In [None]:
# Formats all logs properly.
# key = session ID
# value = json w/ same format as what was originally held in nb metadata
formatted_logs = {}

# Users whose sessions are left out of the analysis (Jerry and Doris and
# potentially others). Built once rather than on every loop iteration.
exclude_list = ['bca87887a1cc89312f7d073fd007ea68', '1a735d0ee6a6f9d7fdab573b50851da7']


def _merge_posted_bodies(raw_text):
    """Merge the JSON bodies stored back to back in one log file.

    Each HTTP POST appended one JSON object to the file. The objects are
    decoded one after another with ``raw_decode`` instead of splitting the
    text on '}{', so a '}{' inside a string value (for example in logged
    user code) or whitespace between objects cannot corrupt the parse.

    Parameters
    ----------
    raw_text : str
        Full contents of a log file.

    Returns
    -------
    dict
        The first decoded object, with the 'history' entries of every
        following object appended to its own 'history' list.

    Raises
    ------
    ValueError
        If the text holds no JSON object, or holds malformed JSON
        (json.JSONDecodeError is a ValueError subclass).
    """
    decoder = json.JSONDecoder()
    merged = None
    pos = 0
    end = len(raw_text)
    while True:
        # raw_decode does not skip leading whitespace on its own.
        while pos < end and raw_text[pos].isspace():
            pos += 1
        if pos >= end:
            break
        body, pos = decoder.raw_decode(raw_text, pos)
        if merged is None:
            merged = body
        else:
            merged['history'].extend(body['history'])
    if merged is None:
        raise ValueError("log file contains no JSON object")
    return merged


for log in all_logs:
    # The context manager closes the file even if parsing fails.
    with open(log, 'r') as f:
        formatted_log = _merge_posted_bodies(f.read())

    # get time user started and ended using ipynb
    formatted_log['startTime'] = formatted_log['history'][0]['time']
    formatted_log['endTime'] = formatted_log['history'][-1]['time']

    # separates user hash from file name (sessionID);
    # [5:] drops the leading "logs/" and [:-5] drops the trailing ".json"
    names = log.split('_')
    user_hash = names[0][5:]
    formatted_log['userHash'] = user_hash

    if user_hash not in exclude_list:
        formatted_logs[names[1][:-5]] = formatted_log


In [None]:
# One row per session: the dict keys (session hashes) become the index after
# transposing, rows are ordered by notebook name, and the index is then
# exposed as a regular "sessionHash" column.
df = (
    pd.DataFrame(formatted_logs)
    .T
    .sort_values('nbName')
    .rename_axis("sessionHash")
    .reset_index()
)

In [None]:
# Creates userGroups based on when notebook was first used
def groupDate(x):
    """Return the user-group label for one session row.

    Parameters
    ----------
    x : mapping
        Row whose 'startTime' value exposes a ``.date()`` method.

    Returns
    -------
    str
        'RiseCamp' when the session started on or before 2020-11-01,
        'USF' when it started on 2020-11-04 or 2020-11-05,
        'Other' for every other date.
    """
    started_on = x['startTime'].date()
    if started_on <= datetime.date(2020, 11, 1):
        return 'RiseCamp'
    if datetime.date(2020, 11, 3) < started_on <= datetime.date(2020, 11, 5):
        return 'USF'
    return 'Other'
# Parse both timestamp columns, label each session with its user group,
# then order the sessions chronologically by start time.
for time_col in ('startTime', 'endTime'):
    df[time_col] = pd.to_datetime(df[time_col])
df['userGroup'] = df.apply(groupDate, axis=1)
df = df.sort_values('startTime')

In [None]:
# Keep only the sessions recorded in the Data Playground notebook. The
# explicit copy makes playground_df an independent frame, so columns added to
# it later do not trigger pandas' SettingWithCopyWarning.
playground_df = df[df['nbName'] == '4-Data-Playground.ipynb'].copy()
playground_df

In [None]:
# Session duration = time between the first and the last logged event.
time_spent = playground_df['endTime'] - playground_df['startTime']
# assign() builds a new frame instead of writing into what may be a filtered
# slice of df, which avoids SettingWithCopyWarning and keeps the cell
# safe to re-run.
playground_df = playground_df.assign(timeSpent=time_spent)
lst = list(time_spent)
# NOTE(review): the % 3600 discards whole hours, so a 61-minute session is
# reported as 1 minute -- confirm this wrap-around is intended.
lst2 = [x.total_seconds()%3600 / 60 for x in lst]
# NOTE(review): this drops the second session (position 1) from the list --
# presumably a known outlier; confirm and document the reason.
lst2 = [lst2[0]] + lst2[2:]

In [None]:
# Box plot of the per-session minutes computed in lst2.
fig, ax = plt.subplots()
ax.boxplot(lst2)
ax.set_ylabel("Minutes")
ax.set_title("Time Spent in Playground")
plt.show()

In [None]:
# Count how often each event type was logged across all playground sessions.
# Takes care of the scrolling issue where, if the user stops scrolling for a
# moment, the log records separate scrolls: a startScroll that immediately
# follows a stopScroll is treated as the same scroll and is not counted.

function_freq = {}
timing = {}  # not used in this cell; kept in case other cells rely on it
for log in playground_df['history']:
    prev = ''
    for entry in log:
        event_type = entry['type']
        # Skip only the resumed startScroll. Skipping every event that follows
        # a stopScroll would silently undercount clicks, tab switches, etc.
        if not (prev == 'stopScroll' and event_type == 'startScroll'):
            function_freq[event_type] = function_freq.get(event_type, 0) + 1
        prev = event_type

# Report as many stops as (merged) starts; guarded so logs without any scroll
# events do not raise KeyError.
if 'startScroll' in function_freq:
    function_freq['stopScroll'] = function_freq['startScroll']
# NOTE(review): initWidget is halved -- presumably each widget display logs the
# event twice; confirm against the logger.
if 'initWidget' in function_freq:
    function_freq['initWidget'] = function_freq['initWidget']/2

event_names = list(function_freq.keys())
plt.bar(event_names, list(function_freq.values()))
plt.xticks(event_names, rotation=90)
plt.title('Log Event Frequency')
plt.ylabel('Counts')
plt.show()
        

In [None]:
# Event types emitted by the Lux widget (as opposed to generic notebook events).
lux_events = ['initWidget', 'startScroll', 'stopScroll', 'openWarning', 'switchTab', 'clickVis', 'unclickVis', 'toggleBtnClick', 'intentBtnClick', 'closeWarning', 'exportBtnClick']
# An event type that never occurred has no key in function_freq; plot it as 0
# instead of failing with KeyError.
lux_counts = [function_freq.get(event, 0) for event in lux_events]
plt.bar(lux_events, lux_counts)
plt.xticks(lux_events, rotation=90)
plt.title('Lux Event Frequency')
plt.ylabel('Counts')
plt.show()

In [None]:
def print_nb_events(history):
    """Print a condensed trace of the events in one session history.

    For each event, in order, exactly one of these rules applies:

    * a 'startScroll' not directly preceded by 'stopScroll' is printed;
    * an event other than 'startScroll' that directly follows 'stopScroll'
      causes 'stopScroll' (the previous type) to be printed;
    * otherwise the event's type is printed when the previous type does not
      contain 'initWidget', the type is listed in the module-level
      ``lux_events``, and the type does not contain 'Scroll'.

    Parameters
    ----------
    history : iterable of dict
        Logged events; each must have a 'type' key.
    """
    last_type = ''
    for record in history:
        current = record['type']
        starts_scroll = current == 'startScroll'
        follows_stop = last_type == 'stopScroll'
        if starts_scroll and not follows_stop:
            print(current)
        elif follows_stop and not starts_scroll:
            print(last_type)
        elif 'initWidget' not in last_type and current in lux_events and 'Scroll' not in current:
            print(current)
        last_type = current



In [None]:
# Trace the events of the first playground session.
print_nb_events(playground_df['history'].iloc[0])

In [None]:
# Modify the position passed to .iloc HERE to step through the sessions one
# at a time and print every single solution.
submission = playground_df["history"].iloc[16]

In [None]:
# Collect the code attached to every event except markdown-cell selections.
relevant_event = [
    event["code"]
    for event in submission
    if event["type"] != 'selectMarkdownCell' and "code" in event
]

# Lots of repetitive code, so deduplicate. dict.fromkeys keeps the first
# occurrence of each snippet in its original order; a set would print the
# iterations in an arbitrary order that changes between kernel restarts.
deduplicated_relevant_event = list(dict.fromkeys(relevant_event))
print ("Number of unique number of attempts:", len(deduplicated_relevant_event))
for idx,event in enumerate(deduplicated_relevant_event):
    print (f'\n--Iteration {idx+1}--')
    print (event)

Measures to compute: 

- Average number of iterations per notebook
- How often are they creating Vis? 
- How often are they creating VisList?
- How often are they setting intent? 
- How often do they switch from Pandas view to Lux view, and vice versa? (look at `param` in `toggleBtnClick` events)
    - `{'type': 'toggleBtnClick', 'time': '2020-11-04T22:25:33.438Z', 'param': 'pandas'}`

In [None]:
# Parse two timestamps in the log's format and compute the seconds between them.
x, y = (
    datetime.datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S.%fZ')
    for stamp in ('2020-11-04T22:25:33.438Z', '2020-11-04T22:26:33.438Z')
)
z = y - x
z.total_seconds()

Other interesting things: 
  - Some users brought in their own datasets beyond the provided one (coffee dataset `4wrrsyero`, AIDS dataset `z31s78hs1`)
  - Several users (e.g. `mnf9invp3`) attempted to convert a column via `pd.to_datetime`

In [None]:
# Per-session metrics: entry k of each list belongs to row k of playground_df.
num_iters = []          # unique code snippets logged in the session
num_intent = []         # unique snippets containing '.intent'
num_vis = []            # unique snippets containing 'Vis('
num_vis_list = []       # unique snippets containing 'VisList('
num_toggle_pandas = []  # toggleBtnClick events whose param is 'pandas'
num_toggle_lux = []     # toggleBtnClick events with any other param
# Totals across all sessions, keyed by the event's 'param' value.
scroll_times = {}       # seconds between a startScroll and each later stopScroll
tab_freq = {}           # number of switchTab events

time_format = '%Y-%m-%dT%H:%M:%S.%fZ'

for submission in playground_df["history"]:
    relevant_event = []
    toggle_pandas_count = 0
    toggle_lux_count = 0
    start_time = None
    for event in submission:
        event_type = event['type']
        if event_type != 'selectMarkdownCell' and "code" in event:
            relevant_event.append(event["code"])

        if event_type == 'toggleBtnClick':
            if event['param'] == 'pandas':
                toggle_pandas_count += 1
            else:
                toggle_lux_count += 1
        elif event_type == 'startScroll':
            start_time = datetime.datetime.strptime(event['time'], time_format)
        elif event_type == 'stopScroll':
            # A stopScroll with no earlier startScroll in this session has no
            # measurable duration; skipping it avoids `datetime - None`.
            if start_time is not None:
                stopped_at = datetime.datetime.strptime(event['time'], time_format)
                scroll_time = (stopped_at - start_time).total_seconds()
                scroll_times[event['param']] = scroll_times.get(event['param'], 0) + scroll_time
        elif event_type == 'switchTab':
            tab_freq[event['param']] = tab_freq.get(event['param'], 0) + 1

    # lots of repetitive code, deduplicate event (only counts are needed here,
    # so the set's arbitrary ordering does not matter)
    deduplicated_relevant_event = set(relevant_event)
    intent_count = sum('.intent' in code for code in deduplicated_relevant_event)
    vis_count = sum('Vis(' in code for code in deduplicated_relevant_event)
    vis_list_count = sum('VisList(' in code for code in deduplicated_relevant_event)

    num_iters.append(len(deduplicated_relevant_event))
    num_intent.append(intent_count)
    num_vis.append(vis_count)
    num_vis_list.append(vis_list_count)
    num_toggle_pandas.append(toggle_pandas_count)
    num_toggle_lux.append(toggle_lux_count)

In [None]:
# Bar chart of the total scrolling seconds accumulated per scroll_times key.
scrolled_tabs = list(scroll_times.keys())
scrolled_seconds = list(scroll_times.values())
plt.bar(scrolled_tabs, scrolled_seconds)
plt.xticks(scrolled_tabs, rotation=45)
plt.title("Time Spent Scrolling in Tab")
plt.ylabel("Seconds")
plt.show()

In [None]:
# Bar chart of how many switchTab events were logged per tab_freq key.
switched_tabs = list(tab_freq.keys())
plt.bar(switched_tabs, list(tab_freq.values()))
plt.xticks(switched_tabs, rotation=45)
# Axis label spelling corrected (was "Occurances").
plt.ylabel("Occurrences")
plt.title("Frequency of Switching to Tab")
plt.show()

In [None]:
# Attach the per-session metrics as columns. assign() returns a new frame
# rather than writing into what may be a filtered slice of df, which avoids
# SettingWithCopyWarning; re-running the cell simply overwrites the columns.
playground_df = playground_df.assign(
    num_iters=num_iters,
    num_intent=num_intent,
    num_vis=num_vis,
    num_vis_list=num_vis_list,
    num_toggle_pandas=num_toggle_pandas,
    num_toggle_lux=num_toggle_lux,
)




In [None]:
# Display the playground sessions together with the per-session metric columns.
playground_df

