In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
from retentioneering import analysis, utils

import matplotlib.pyplot as plt
import matplotlib.colors as colors
%matplotlib inline


def ser2list(ser):
    '''Return the values of ``ser`` as a plain Python list (via ``tolist()``).'''
    as_list = ser.tolist()
    return as_list

def CalcDeltas(df):
    '''
    Add row-to-row deltas to ``df`` in place and return the same frame.

    Columns written:
      * ``dt``         - difference between consecutive ``event_timestamp`` values
      * ``dn``         - difference between consecutive ``rank`` values
      * ``prev_event`` - ``event_name`` of the preceding row

    The first row has no predecessor, so all three values are missing there.
    '''
    timestamps = df['event_timestamp']
    ranks = df['rank']
    names = df['event_name']

    df['dt'] = timestamps.diff()
    df['dn'] = ranks.diff()
    df['prev_event'] = names.shift(1)

    return df

def GetIntervalStats(targetOnly, split_by='kuni'):
    '''
    Collect interval statistics for every pair of adjacent target events.

    Rows of ``targetOnly`` are grouped by ``split_by``; inside each group
    ``CalcDeltas`` attaches the previous event name and the deltas to it.
    The rows are then regrouped by the ``(event_name, prev_event)`` pair.

    Parameters
    ----------
    targetOnly : DataFrame with at least ``event_name``, ``event_timestamp``,
        ``rank``, ``user_pseudo_id`` and the ``split_by`` column. It is copied,
        not modified.
    split_by : column whose values delimit the sequences in which adjacency
        is evaluated.

    Returns
    -------
    DataFrame with one row per ``(event_name, prev_event)`` pair and columns
    ``user_pseudo_id`` (number of distinct users), ``dn`` and ``dt`` (lists of
    the individual deltas).
    '''
    meta = targetOnly.copy()
    # Placeholder columns; CalcDeltas overwrites them group by group.
    meta['dt'] = np.nan
    meta['dn'] = np.nan
    meta['prev_event'] = np.nan

    # NOTE(review): dropna() removes rows with a missing value in ANY column,
    # not only the first row of each group (which has no predecessor).
    # Confirm that the input has no unrelated NaNs.
    meta = meta.groupby(split_by).apply(CalcDeltas).dropna()

    result = meta.groupby(['event_name', 'prev_event']).agg({
        'user_pseudo_id': lambda x: len(set(x)), 'dn': ser2list, 'dt': ser2list}).reset_index()

    return result

def get_stats_from_list(lst):
    '''
    Summarise ``lst`` as six strings formatted with two decimals.

    Order of the result: min, max, median, mean, standard deviation (``np.std``),
    number of items.
    '''
    reducers = (min, max, np.median, np.mean, np.std, len)
    formatted = []
    for reducer in reducers:
        formatted.append('{:.2f}'.format(reducer(lst)))
    return formatted
    
def get_stats_from_timedelta(lst):
    '''
    Same summary as ``get_stats_from_list``, computed on durations.

    Every item of ``lst`` must offer ``total_seconds()``; the statistics are
    taken over those second counts.
    '''
    seconds = [delta.total_seconds() for delta in lst]
    return get_stats_from_list(seconds)

def GetSingleEventFromSessions(df):
    '''
    Get target events that were the only target event in session and count such sessions

    For every mechanic listed in the notebook-level ``mechanics`` frame, the rows
    of ``df`` with that mechanic are grouped by ``kuni``; only groups with exactly
    one non-null ``rank`` are kept and their ``event_name`` values are counted.

    Returns a DataFrame indexed by event name with one column per mechanic;
    columns that contain no counts at all are dropped.
    '''
    counts_by_mechanic = {}
    # ``tqdm`` is not imported by this notebook, so the former progress wrapper
    # raised NameError; plain iteration is used instead.
    # ``unique()`` keeps first-appearance order, which makes the column order
    # reproducible (iterating a set of strings is not stable between kernels).
    for mech in mechanics['mechanics'].unique():
        lone_events = (
            df[df['mechanics'] == mech]
            .groupby('kuni')
            .filter(lambda session: session['rank'].count() == 1)
        )
        counts_by_mechanic[mech] = lone_events['event_name'].value_counts()

    return pd.DataFrame(counts_by_mechanic).dropna(how='all', axis=1)
def GetOccurFromList(df):
    '''
    Return the last entry of every ``num_stats`` list as a float Series.

    In ``get_stats_from_list`` ordering the last entry is the item count.
    '''
    def _last_item(stats):
        return stats[-1]

    occurrences = df['num_stats'].map(_last_item)
    return occurrences.astype(float)

def GetJaccardOccurenceMatrix(df, split_by='kuni'):
    '''
    Build the pairwise Jaccard index of events by the groups they occur in.

    For every event the set of distinct ``split_by`` values it appears with is
    collected; the entry for events A and B is ``|A & B| / |A | B|`` of those
    sets. Labels are ``'<mechanic>-<event>'`` (mechanic looked up in the
    notebook-level ``mechMap``) and are sorted.

    Returns
    -------
    (mat, line_positions) : the square float DataFrame and the separator
    positions produced by ``GetLinesPos`` for the same labels.
    '''
    groups_by_event = df.groupby('event_name')[split_by].unique()
    groups_by_event.index = groups_by_event.index.map(lambda x: mechMap[x] + '-' + x)
    groups_by_event = groups_by_event.sort_index()
    cats = groups_by_event.index

    # Build each set once instead of on every inner-loop iteration.
    member_sets = [set(members) for members in groups_by_event]
    size = len(member_sets)

    # Fill a plain float array and wrap it afterwards: chained assignment such as
    # mat[i][j] = value does not write through under pandas Copy-on-Write.
    overlap = np.empty((size, size), dtype=float)
    for a in range(size):
        for b in range(a, size):
            shared = len(member_sets[a] & member_sets[b])
            combined = len(member_sets[a] | member_sets[b])
            # The index is symmetric, so one computation fills both cells.
            overlap[a, b] = overlap[b, a] = shared / combined

    mat = pd.DataFrame(overlap, index=cats, columns=cats)

    return mat, GetLinesPos(cats)

def GetLinesPos(lst):
    '''
    Return cumulative label counts per prefix, ordered by prefix.

    The prefix of a label is the text before its first ``'-'``. The resulting
    array holds, for each prefix in sorted order, how many labels belong to that
    prefix or an earlier one.
    '''
    prefixes = []
    for label in lst:
        prefixes.append(label.split('-')[0])

    per_prefix = pd.Series(prefixes).value_counts(sort=False)
    ordered = per_prefix.sort_index()
    return ordered.cumsum().values

def ExtendStats(mechStats):
    '''
    Add label and summary columns to ``mechStats`` in place (returns nothing).

    Columns written:
      * ``mech_prev`` / ``mech_event`` - ``'<mechanic>-<event>'`` labels for
        ``prev_event`` / ``event_name``, mechanic taken from ``mechMap``
      * ``time_stats`` - ``get_stats_from_timedelta`` applied to each ``dt`` list
      * ``num_stats``  - ``get_stats_from_list`` applied to each ``dn`` list
    '''
    def _with_mechanic(event):
        return mechMap[event] + '-' + event

    mechStats['mech_prev'] = mechStats['prev_event'].map(_with_mechanic)
    mechStats['mech_event'] = mechStats['event_name'].map(_with_mechanic)

    mechStats['time_stats'] = mechStats['dt'].map(get_stats_from_timedelta)
    mechStats['num_stats'] = mechStats['dn'].map(get_stats_from_list)

def CalculateJaccardUsers(mechStats):
    '''
    Add a ``jaccard`` column to ``mechStats`` in place (returns nothing).

    Each event gets a weight: the sum of ``user_pseudo_id`` counts over all rows
    where it is the ``prev_event`` plus the sum over all rows where it is the
    ``event_name``. For a row A -> B with count ``n`` the value written is
    ``n / (weight[B] + weight[A] - n)``.
    '''
    users_as_prev = mechStats.groupby('prev_event')['user_pseudo_id'].sum()
    users_as_next = mechStats.groupby('event_name')['user_pseudo_id'].sum()

    # An event missing from one of the two roles contributes 0 for that role.
    # DataFrame.sum replaces .agg(sum, ...), whose use of the builtin is deprecated.
    weight_node = (
        pd.DataFrame({'a': users_as_prev, 'b': users_as_next})
        .fillna(0)
        .sum(axis=1)
        .to_dict()
    )

    pair_users = mechStats['user_pseudo_id']
    union_size = (mechStats['event_name'].map(weight_node.get)
                  + mechStats['prev_event'].map(weight_node.get)
                  - pair_users)
    mechStats['jaccard'] = pair_users / union_size

def FillMetrics(row, jaccMat):
    '''
    Write ``row.jaccard`` into ``jaccMat`` in place.

    The target cell is column ``row.mech_prev``, row ``row.mech_event``.
    '''
    # A single .loc write replaces jaccMat[col][row] = value: chained assignment
    # does not modify the frame under pandas Copy-on-Write.
    jaccMat.loc[row.mech_event, row.mech_prev] = row.jaccard

def GetAdjMat(df, metric):
    '''
    Build a square matrix over ``extMechList`` filled by ``metric``.

    ``metric(row, matrix)`` is invoked for every row of ``df`` and is expected to
    write its value into ``matrix``. Cells left empty afterwards are set to
    0.00001.
    '''
    adjacency = pd.DataFrame(index=extMechList, columns=extMechList, dtype=float)

    # apply() is used only for its side effect on ``adjacency``.
    df.apply(metric, axis=1, args=(adjacency,))

    filled = adjacency.fillna(0.00001)
    return filled

def PlotHeatmapWithLines(mat, barPos, filename, label, clip=False, gamma=1.0):
    '''
    Draw ``mat`` as a grey-scale heatmap with separator lines, save and show it.

    Parameters
    ----------
    mat : matrix passed to ``sns.heatmap``.
    barPos : positions at which horizontal and vertical separator lines are drawn.
    filename : suffix of the output file,
        ``./experiments/test_target_action_aj_<filename>.svg``.
    label : figure title.
    clip : forwarded to ``sns.heatmap`` as ``robust``.
    gamma : exponent of the ``PowerNorm`` colour normalisation.
    '''
    plt.figure(figsize=(20,15))
    plt.title(label)
    ax = sns.heatmap(mat, cmap='Greys', square=True, robust=clip, norm=colors.PowerNorm(gamma=gamma))

    # Separator lines span the full extent of the opposite axis.
    x_start, x_end = ax.get_xlim()
    y_start, y_end = ax.get_ylim()
    ax.hlines(barPos, x_start, x_end, color='tab:gray')
    ax.vlines(barPos, y_start, y_end, color='tab:gray')

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs('./experiments/', exist_ok=True)
    plt.savefig('./experiments/test_target_action_aj_{}.svg'.format(filename), bbox_inches='tight', pad_inches=0.5)
    plt.show()
    plt.close()

In [None]:
path = '../example_datasets/train.csv'
df = pd.read_csv(path, sep=',')

df = df.sort_values(by=['user_pseudo_id', 'event_timestamp'])

# Convert time from µs to DateTime.
# unit='us' converts the raw values directly instead of going through a float
# division by 1e6.
df['event_timestamp'] = pd.to_datetime(df['event_timestamp'], unit='us')

# Split data in sessions
model = utils.preparing.SessionSplitter(n_components=3)
model.fit(df, columns_config = {'event_name_col': 'event_name',
                                'event_timestamp_col': 'event_timestamp',
                                'user_id_col': 'user_pseudo_id'})
data = model.predict(df, thr_prob=0.95, sort=True)

# kuni stands for "kompletly unique": the (user_pseudo_id, session) pair
# NOTE(review): the 'session' column is presumably added by model.predict - confirm
data['kuni'] = data[['user_pseudo_id', 'session']].apply(tuple, 1)
data = data.sort_values(by=['kuni', 'event_timestamp'])

# put ranks: running event number within each user
data['rank'] = data.groupby('user_pseudo_id').cumcount()

# Load table with triples mechanics - event - label
labelMap = pd.read_csv('../example_datasets/event_label_map.csv').astype(str)

# Rows with a label become one '__'-joined name; rows without a label
# (the string 'nan' after astype(str)) keep the bare event name.
toTake = labelMap[labelMap.label != 'nan'].apply(lambda x: '__'.join(tuple(x)), 1)
# pd.concat replaces Series.append, which was removed in pandas 2.0.
targetEvents = pd.concat([toTake, labelMap[labelMap.label == 'nan'].event])

# construct dataframe with "mechanics - extended event" correspondence
# (the two columns are aligned on labelMap's row index)
mechanics = pd.DataFrame({'mechanics': labelMap['mechanics'], 'event_name': targetEvents}, columns=['mechanics', 'event_name'])

# extended event name -> mechanics
mechMap = dict(zip(mechanics['event_name'], mechanics['mechanics']))

# select only target events
targetOnly = data[data.event_name.isin(targetEvents)]


# analyse adjacent target events
mechStats = GetIntervalStats(targetOnly, split_by='user_pseudo_id')

# add statistics to DataFrame
ExtendStats(mechStats)

# get sorted list of mechanics + events
extMechList = sorted(set(mechStats.mech_prev) | set(mechStats.mech_event))

In [None]:
# Adds a 'jaccard' column to mechStats in place; the call itself returns nothing.
CalculateJaccardUsers(mechStats)

def JaccardUsersMetrics(row, jaccMat):
    '''
    Write ``row.jaccard`` into ``jaccMat`` in place.

    The target cell is column ``row.mech_prev``, row ``row.mech_event``.
    '''
    # A single .loc write replaces jaccMat[col][row] = value: chained assignment
    # does not modify the frame under pandas Copy-on-Write.
    jaccMat.loc[row.mech_event, row.mech_prev] = row.jaccard

# Separator positions depend only on the label list, so they are computed first.
barPos = GetLinesPos(extMechList)
mat = GetAdjMat(mechStats, JaccardUsersMetrics)

PlotHeatmapWithLines(
    mat,
    barPos,
    'test',
    label='Jaccard index for users: |had A→B| / |had A or B|',
    gamma=0.25,
)

In [None]:
# Co-occurrence of target events; the default split_by='kuni' is used here.
mat, barPos = GetJaccardOccurenceMatrix(targetOnly)

PlotHeatmapWithLines(
    mat,
    barPos,
    'jac_occ_test',
    label='Jaccard index for occurence in sessions',
    gamma=0.25,
)

In [None]:
def MeanEventDistanceMetrics(row, jaccMat):
    '''
    Write the mean event distance of ``row`` into ``jaccMat`` in place.

    ``row.num_stats`` follows the ``get_stats_from_list`` layout
    (min, max, median, mean, std, count), so index 3 is the mean. The target
    cell is column ``row.mech_prev``, row ``row.mech_event``.
    '''
    # num_stats holds formatted strings ('2.50'); convert so the float matrix
    # is not polluted with text. A single .loc write replaces the chained
    # jaccMat[col][row] = value, which does not write through under
    # pandas Copy-on-Write.
    jaccMat.loc[row.mech_event, row.mech_prev] = float(row.num_stats[3])
    
# Separator positions depend only on the label list, so they are computed first.
barPos = GetLinesPos(extMechList)
mat = GetAdjMat(mechStats, MeanEventDistanceMetrics)

PlotHeatmapWithLines(
    mat,
    barPos,
    'test_mean_dn',
    label='Mean distance in events',
)