In [None]:
import datetime
import glob
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as sp

In [None]:
# gets every log from directory
# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# the order in which sessions are merged identical from run to run
all_logs = sorted(glob.glob("../logs/*.json"))

In [None]:
# formats all logs properly
# key = session ID
# value = json w/ same format as what was originally held in nb metadata
formatted_logs = {}

# excluding Jerry and Doris and potentially others
# (defined once here: the list does not change from file to file)
exclude_list = ['bca87887a1cc89312f7d073fd007ea68', '1a735d0ee6a6f9d7fdab573b50851da7']


def _decode_concatenated_json(text):
    """Decode every JSON document found in `text`.

    A log file holds the bodies of several HTTP POSTs written one after the
    other (``{...}{...}``), which is not a single valid JSON document.
    Decoding document by document avoids splitting on the literal ``}{``,
    which would also cut through any logged string containing that sequence.

    Returns a list of decoded documents (empty when `text` is blank).
    Raises json.JSONDecodeError on malformed content.
    """
    decoder = json.JSONDecoder()
    documents = []
    position = 0
    end = len(text)
    while position < end:
        # raw_decode rejects leading whitespace, so step over it explicitly
        if text[position].isspace():
            position += 1
            continue
        document, position = decoder.raw_decode(text, position)
        documents.append(document)
    return documents


for log in all_logs:
    # `with` closes the handle even when decoding raises
    with open(log, 'r') as f:
        entries = _decode_concatenated_json(f.read())

    if not entries:
        print('skipping ' + log + ': file is empty')
        continue

    # merges body sent from various HTTP Posts: the first body is the base
    # record, later bodies only contribute additional history entries
    formatted_log = entries[0]
    history = formatted_log['history']
    for entry in entries[1:]:
        history.extend(entry['history'])

    if not history:
        print('skipping ' + log + ': no history entries')
        continue

    # get time user started ipynb (first event) and time of the last event
    formatted_log['startTime'] = history[0]['time']
    formatted_log['endTime'] = history[-1]['time']

    # separates user hash from file name (sessionID)
    # File names are expected to look like <userHash>_<sessionID>.json.
    # Work on the base name so the result does not depend on how long the
    # directory prefix of the glob pattern happens to be.
    stem = os.path.splitext(os.path.basename(log))[0]
    user_hash, _separator, session_id = stem.partition('_')
    formatted_log['userHash'] = user_hash

    if user_hash not in exclude_list:
        formatted_logs[session_id] = formatted_log


In [None]:
# One row per session: the dict-of-dicts yields one column per session ID, so
# flip it before ordering the rows by user hash.
logs_by_field = pd.DataFrame(formatted_logs)
df = logs_by_field.T.sort_values(by='userHash')

In [None]:
# Display the per-session table built from the merged logs.
df

In [None]:
# Number of distinct userHash values present in the logs
# (presumably one hash per IP address data was sent from — TODO confirm how the hash is derived)
df['userHash'].nunique()

In [None]:
# Creates userGroups based on when notebook was first used
def groupDate(x):
    """Label a session row with a user group derived from its start date.

    x: mapping/row whose 'startTime' value exposes a .date() method.
    Returns 'RiseCamp' for dates up to and including 2020-11-01, 'USF' for
    2020-11-04 and 2020-11-05, and 'Other' for everything else.
    """
    started_on = x['startTime'].date()
    if started_on <= datetime.date(2020, 11, 1):
        return 'RiseCamp'
    # NOTE(review): 2020-11-02 and 2020-11-03 belong to neither window and
    # end up in 'Other' — confirm this gap is intended.
    if datetime.date(2020, 11, 3) < started_on <= datetime.date(2020, 11, 5):
        return 'USF'
    return 'Other'
# Parse both timestamp columns so date arithmetic works on them.
for time_col in ('startTime', 'endTime'):
    df[time_col] = pd.to_datetime(df[time_col])
# Tag every session with its user group, then order sessions chronologically.
df['userGroup'] = df.apply(groupDate, axis=1)
df = df.sort_values(by='startTime')

In [None]:
# Show the sessions ordered by user group (display only; the result is not assigned back to df).
df.sort_values('userGroup')

In [None]:
# Sessions from the 'Other' user group, and how many of them used each notebook.
nbs = df[df['userGroup'] == 'Other']
nb_map = {}
for nb in nbs['nbName']:
    # start from 0 the first time a notebook name is seen
    nb_map[nb] = nb_map.get(nb, 0) + 1
nb_map

In [None]:
# Number of sessions whose userGroup is 'Other'.
len(nbs)

In [None]:
# Length of every 'Other' session in minutes (last logged event minus first).
# The total number of seconds is converted directly: taking it modulo 3600
# first would throw away whole hours, e.g. a 90-minute session would be
# reported as 30 minutes.
session_lengths = nbs['endTime'] - nbs['startTime']
diff = [x.total_seconds() / 60 for x in session_lengths]

In [None]:
# Histogram of session lengths, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.hist(diff)
ax.set_xlabel("time in minutes")
ax.set_ylabel('counts')
ax.set_title('time spent in notebooks from new users')
plt.show()



In [None]:
def _any_name_contains(nbNames, digit):
    """Return 1 if any string in `nbNames` contains `digit`, otherwise 0.

    Entries that are not strings (e.g. NaN for a session whose log carried no
    notebook name) are ignored instead of raising TypeError.
    """
    return int(any(isinstance(nbName, str) and digit in nbName for nbName in nbNames))


# The five wrappers below stay separate, individually named functions so they
# can keep being passed to groupby(...).agg(...) exactly as before.
def has_0(nbNames):
    """Return 1 if any notebook name in `nbNames` contains '0', else 0."""
    return _any_name_contains(nbNames, '0')


def has_1(nbNames):
    """Return 1 if any notebook name in `nbNames` contains '1', else 0."""
    return _any_name_contains(nbNames, '1')


def has_2(nbNames):
    """Return 1 if any notebook name in `nbNames` contains '2', else 0."""
    return _any_name_contains(nbNames, '2')


def has_3(nbNames):
    """Return 1 if any notebook name in `nbNames` contains '3', else 0."""
    return _any_name_contains(nbNames, '3')


def has_4(nbNames):
    """Return 1 if any notebook name in `nbNames` contains '4', else 0."""
    return _any_name_contains(nbNames, '4')

In [None]:
# Groups users by hash and computes information as to whether user attempt notebook
def _most_common(values):
    """Return the most frequent value of a Series (the smallest one on ties).

    Used instead of scipy.stats.mode, which no longer accepts non-numeric
    data such as the user group labels and whose result layout differs
    between SciPy versions.
    """
    # Series.mode() returns the modes sorted, so position 0 is deterministic
    return values.mode().iloc[0]


grouped_df = df.groupby('userHash').agg(
    userGroup=('userGroup', _most_common),  # group the user was seen in most often
    startTime=('startTime', 'min'),         # first activity across all sessions
    endTime=('endTime', 'max'),             # last activity across all sessions
    tried_0=('nbName', has_0),
    tried_1=('nbName', has_1),
    tried_2=('nbName', has_2),
    tried_3=('nbName', has_3),
    tried_4=('nbName', has_4),
)
grouped_df.sort_values('startTime')

In [None]:
# Per user group, count the non-null entries in each column of grouped_df.
grouped_df.groupby("userGroup").count()

In [None]:
# Plots user attempts at tutorial notebooks
def plot_nb_progression(df, userGroup, n):
    """Draw a bar chart of how many users attempted tutorial notebooks 1-4.

    df: frame with 'tried_1' ... 'tried_4' columns, each summed to get a bar height.
    userGroup: label placed at the start of the chart title.
    n: number shown after 'n=' in the title.
    """
    nbNames = [
        '1-Specify-Intent.ipynb',
        '2-Quick-Vis.ipynb',
        '3-widget-vis-export.ipynb',
        '4-Data-Playground.ipynb',
    ]
    # one total per notebook, in the same order as nbNames
    counts = [sum(df[column]) for column in ('tried_1', 'tried_2', 'tried_3', 'tried_4')]
    plt.bar(nbNames, counts)
    plt.xticks(nbNames, rotation=45)
    plt.title('{} Attempts of Tutorial Notebooks n={}'.format(userGroup, n))
    plt.ylabel('Counts')
    plt.show()


In [None]:
# Attempt counts across every user, regardless of user group.
plot_nb_progression(grouped_df, 'Overall', len(grouped_df))

In [None]:
# Attempt counts restricted to users in the 'RiseCamp' group.
in_riseCamp = grouped_df['userGroup'] == 'RiseCamp'
riseCamp_df = grouped_df.loc[in_riseCamp]
plot_nb_progression(riseCamp_df, 'RiseCamp', len(riseCamp_df))

In [None]:
# Attempt counts restricted to users in the 'USF' group.
in_usf = grouped_df['userGroup'] == 'USF'
usf_df = grouped_df.loc[in_usf]
plot_nb_progression(usf_df, 'USF', len(usf_df))

In [None]:
# Keep only the sessions recorded in the data playground notebook.
is_playground = df['nbName'] == '4-Data-Playground.ipynb'
playground_df = df.loc[is_playground]
playground_df

In [None]:
# takes care of scrolling issue where if they stop scrolling for a moment, it treats as separate scrolls

function_freq = {}
timing = {}
for log in playground_df['history']:
    prev = ''
    for entry in log:
        event = entry['type']
        # A startScroll that directly follows a stopScroll is the same scroll
        # resuming after a pause, so it is not counted again. Only that case
        # is skipped: any other event right after a stopScroll still counts.
        resumed_scroll = event == 'startScroll' and 'stopScroll' in prev
        if not resumed_scroll:
            function_freq[event] = function_freq.get(event, 0) + 1
        prev = event

# Report one stopScroll per counted startScroll (merged scrolls end once).
# Guarded so sessions without any scrolling do not raise KeyError.
if 'startScroll' in function_freq:
    function_freq['stopScroll'] = function_freq['startScroll']
# presumably every widget initialisation is logged twice — TODO confirm
if 'initWidget' in function_freq:
    function_freq['initWidget'] = function_freq['initWidget'] / 2
# del function_freq['selectCodeCell']
event_names = list(function_freq.keys())
event_counts = list(function_freq.values())
plt.bar(event_names, event_counts)
plt.xticks(event_names, rotation=90)
plt.title('Log Event Frequency')
plt.ylabel('Counts')
plt.show()
        

In [None]:
# Event types listed as Lux events, one per line for easier diffing.
lux_events = [
    'initWidget',
    'startScroll',
    'stopScroll',
    'openWarning',
    'switchTab',
    'clickVis',
    'unclickVis',
    'toggleBtnClick',
    'intentBtnClick',
    'closeWarning',
    'exportBtnClick',
]
lux_events

In [None]:
# Elapsed time between the first and last logged event of each data playground session.
playground_df['endTime'] - playground_df['startTime']

In [None]:
# Export the per-user summary without the timestamp columns; userHash becomes
# a regular column and no row index is written.
user_groups_export = grouped_df.reset_index().drop(columns=["startTime", "endTime"])
user_groups_export.to_csv("user_groups.csv", index=None)

In [None]:
# Attempt counts restricted to users in the 'Other' group.
in_other = grouped_df['userGroup'] == 'Other'
other_df = grouped_df.loc[in_other]
plot_nb_progression(other_df, 'Other', len(other_df))

In [None]:
# Display the per-user summary rows of the 'Other' group.
other_df