In [None]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

In [None]:
# Load the complete dataset (every user in a single CSV file).
df = pd.read_csv('./../freetext/FreeText-Dataset-31-USERS.csv')

# Split the rows into one DataFrame per user id (1..31).
# List index k therefore holds the rows whose 'user' column equals k + 1.
user_data = []
for user_id in range(1, 32):
    user_data.append(df[df['user'] == user_id])

In [None]:
# Plot the raw timestamp curve of the users under inspection.
# Only list indices 13, 18 and 26 are drawn (printed as users 14, 19 and 27).
for idx, frame in enumerate(user_data):
    if idx not in (13, 18, 26):
        continue

    print(idx, "user: ", idx + 1)
    timestamp = np.array(frame['timestamp'])

    # x axis: running index of the recorded input, y axis: its timestamp
    input_index = np.arange(timestamp.size)
    plt.xlabel('Number of Inputs')
    plt.ylabel('Time')
    plt.plot(input_index, timestamp)
    plt.show()



In [None]:
# analyse user profiles
# every profile is indexed with the keys 'digraphs', 'trigraphs' and 'fourgraphs'


# n-graph collections, one list entry per user profile
digraphs, trigraphs, fourgraphs = [], [], []

# NOTE: pickle.load can execute arbitrary code while deserialising;
# only load profile files that come from a trusted source.
with open("./../freetext/__DATA/original_data_profiles", "rb") as fp:
    user_profiles = pickle.load(fp)

# extract the n-graphs of every profile into one list per n-graph length
digraphs = [profile['digraphs'] for profile in user_profiles]
trigraphs = [profile['trigraphs'] for profile in user_profiles]
fourgraphs = [profile['fourgraphs'] for profile in user_profiles]


def compact_data(ngraphs:list[list[dict]], n_users:int = 31, n_sets:int = 15) -> list[dict]:
    '''
    Compacts data from all user profiles into key indicators.

    For every user, the values recorded for each n-graph are pooled across
    that user's sets and reduced to summary statistics.

    Parameters
    ----------
    ngraphs:
        Nested data accessed as ``ngraphs[user][set][ngraph_key] -> value``.
        Values are assumed to be numeric (they are fed to ``np.array`` and
        averaged) -- TODO confirm against the profile generator.
    n_users:
        Maximum number of users to process, counted from index 0.
        Defaults to 31, the value that used to be hard-coded.
    n_sets:
        Maximum number of sets to read per user, counted from index 0.
        Defaults to 15, the value that used to be hard-coded.

    Both limits are clamped to the amount of data actually present, so
    shorter inputs are processed instead of raising ``IndexError``.

    Returns
    -------
    list[dict]
        One dict per processed user. Each maps an n-graph key to
        ``{"mean", "median", "std", "len"}`` where ``std`` is numpy's default
        (population) standard deviation and ``len`` is the number of pooled
        values.
    '''

    compacted = []

    for user in range(min(n_users, len(ngraphs))):
        user_sets = ngraphs[user]

        # pool the values of every n-graph over this user's sets
        pooled = {}
        for set_index in range(min(n_sets, len(user_sets))):
            for key, value in user_sets[set_index].items():
                pooled.setdefault(key, []).append(value)

        # reduce every pooled list to its key indicators
        # (keys keep the order in which they were first seen)
        summary = {}
        for key, values in pooled.items():
            samples = np.array(values)
            summary[key] = {
                "mean": samples.mean(),
                "median": np.median(samples),
                "std": samples.std(),
                "len": len(samples)
            }

        compacted.append(summary)

    return compacted


# per-user summary statistics for each n-graph length
compact_digraphs, compact_trigraphs, compact_fourgraphs = map(
    compact_data, (digraphs, trigraphs, fourgraphs)
)


In [None]:
# visualize the compacted n-graph statistics per user (bar charts of mean / median / std)

def visualize(ngraphs_compact:list[dict], all: bool, highlight_users=(13, 18, 26)):
    '''
    Draws bar charts of the compacted n-graph statistics, one figure per user.

    Each figure has three stacked panels ("Means", "Medians", "Stds") with
    one bar per n-graph key of that user, in the key order of the user's dict.

    Parameters
    ----------
    ngraphs_compact:
        One dict per user, mapping an n-graph key to a dict that provides at
        least the entries 'mean', 'median' and 'std'.
    all:
        When true, every user is plotted; otherwise only the users listed in
        ``highlight_users``. (The name shadows the builtin inside this
        function; it is kept so existing calls remain valid.)
    highlight_users:
        List indices of the users to plot when ``all`` is false. Defaults to
        the indices that used to be hard-coded: 13, 18 and 26.
    '''

    for (user, data) in enumerate(ngraphs_compact):

        if not (all or user in highlight_users):
            continue

        print("user index: ", user)

        keys = list(data.keys())
        panels = (
            ("Means", [data[k]['mean'] for k in keys]),
            ("Medians", [data[k]['median'] for k in keys]),
            ("Stds", [data[k]['std'] for k in keys]),
        )

        figure, axis = plt.subplots(len(panels))
        for ax, (title, values) in zip(axis, panels):
            ax.set_title(title)
            ax.bar(range(len(values)), values)

        # keep the panel titles from overlapping the tick labels above them
        figure.tight_layout()
        plt.show()


#visualize(compact_digraphs, False)
#visualize(compact_data(trigraphs), False)
#visualize(compact_data(fourgraphs), False)

In [None]:
# average / export
def average(ngraphs_compact:list[dict]) -> list[dict]:
    '''
    Averages each user's compacted n-graph statistics into three numbers.

    Parameters
    ----------
    ngraphs_compact:
        One dict per user, mapping an n-graph key to a dict that provides at
        least the entries 'mean', 'median' and 'std'.

    Returns
    -------
    list[dict]
        One record per user with the keys
        ``"user"`` (list index of the user),
        ``"means_mean"``, ``"meadians_means"`` and ``"stds_mean"``: the
        rounded average of the respective statistic over all of the user's
        n-graphs. For a user without any n-graph the three averages are
        ``None`` instead of raising.
    '''

    def _rounded_mean(values):
        # An empty list has no mean: numpy would return NaN, and round()
        # cannot turn NaN into an integer.
        if not values:
            return None
        return round(np.array(values).mean())

    out = []
    for (user, data) in enumerate(ngraphs_compact):

        stats = list(data.values())

        # The misspelt key "meadians_means" is kept on purpose: the records
        # are exported to CSV, so it is a column name consumers may rely on.
        out.append({
            "user": user,
            "means_mean": _rounded_mean([s['mean'] for s in stats]),
            "meadians_means": _rounded_mean([s['median'] for s in stats]),
            "stds_mean": _rounded_mean([s['std'] for s in stats])
        })

    return out



# Average the compacted statistics per user for each n-graph length.
digraph_avgs = average(compact_digraphs)
trigraphs_avgs = average(compact_trigraphs)
fourgraphs_avgs = average(compact_fourgraphs)

# Write one CSV per n-graph length into the working directory.
# NOTE: `df` is rebound on every pass and no longer refers to the raw
# dataset frame that was read from the dataset CSV earlier in the notebook.
for records, csv_path in (
    (digraph_avgs, "./user_data_digraphs.csv"),
    (trigraphs_avgs, "./user_data_trigraphs.csv"),
    (fourgraphs_avgs, "./user_data_fourgraphs.csv"),
):
    df = pd.DataFrame.from_records(records)
    df.to_csv(csv_path, index=False)