In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from copy import deepcopy
import networkx as nx
import os

# The relative "flixster_data/..." paths used below are resolved against the working
# directory, and the `utils` / `InfluenceDiffusion` imports are presumably resolved against
# it too (NOTE(review): confirm they are not installed packages), so it must be set first.
# Set GLTM_EXPERIMENTS_DIR to run on another machine; the default keeps the original path.
os.chdir(os.environ.get("GLTM_EXPERIMENTS_DIR", "/Users/amkagan/Desktop/gltm_experiments"))

from collections import Counter

from itertools import product, chain
from tqdm import tqdm
from typing import *
import pickle
import dill

from utils.pseudo_trace_utils import compute_edge_stats_from_pseudo_traces, construct_pseudo_traces
from InfluenceDiffusion.Graph import Graph
from InfluenceDiffusion.Trace import Trace

## Open data

In [2]:
# Edge table (turned into a `Graph` below) and action log
# (columns used later in this notebook: "user", "action", "time").
edge_csv_path = "flixster_data/connected_comp_action_trunc_edge_df_new.csv"
action_csv_path = "flixster_data/trunc_popular_action_df_new.csv"

trunc_edge_df = pd.read_csv(edge_csv_path)
trunc_action_df = pd.read_csv(action_csv_path)

In [3]:
# Build the graph from the raw edge array with directed=False;
# the trailing expression displays the edge count as the cell output.
edge_array = trunc_edge_df.values
graph = Graph(edge_array, directed=False)
graph.count_edges()

101030

## Extract pseudo-traces

In [4]:
# Build the pseudo-traces from the action log and the graph (slow: the recorded run
# of this cell took about five minutes).
pseudo_traces = construct_pseudo_traces(trunc_action_df, graph)

# Persist the result. Create the target directory first so that `open` does not fail
# with FileNotFoundError when the directory does not exist yet.
pseudo_traces_path = "flixster_data/pseudo_traces_simplified/pseudo_traces.pkl"
os.makedirs(os.path.dirname(pseudo_traces_path), exist_ok=True)
with open(pseudo_traces_path, "wb") as f:
    pickle.dump(pseudo_traces, f, protocol=pickle.HIGHEST_PROTOCOL)

100%|██████████| 7041/7041 [04:57<00:00, 23.63it/s]


## Compute some statistics

In [None]:
def compute_num_active_parents_before_activation(action_df, graph):
    """Count, for every (user, action) pair, how many of the user's parents acted earlier.

    Args:
        action_df: DataFrame with columns "user", "action" and "time". Each
            (user, action) pair must occur exactly once.
        graph: object exposing `get_parents(user)`. The returned value is intersected
            with a `set` of users, so it must support `&` with a set.

    Returns:
        dict mapping each user (in order of first appearance in `action_df`) to a list
        with one entry per action of that user (in order of first appearance). Each entry
        is the number of the user's parents whose time for the same action is strictly
        smaller than the user's own time.

    Raises:
        ValueError: if a user has more than one row for the same action (raised by
            `.item()`).
    """
    # Split the frame by action once, instead of re-filtering the whole frame for
    # every (user, action) pair inside the loops below.
    action_2_users_and_times = {
        action: (action_subdf["user"], action_subdf["time"])
        for action, action_subdf in action_df.groupby("action", sort=False)
    }

    user_2_num_active_parents_across_actions = {}
    for user in tqdm(action_df["user"].unique()):
        user_subdf = action_df[action_df["user"] == user]
        parents = graph.get_parents(user)
        num_active_parents = []
        for action in user_subdf["action"].unique():
            user_time = user_subdf["time"][user_subdf["action"] == action].item()
            action_users, action_times = action_2_users_and_times[action]
            users_active_before = set(action_users[action_times < user_time])
            num_active_parents.append(len(parents & users_active_before))
        user_2_num_active_parents_across_actions[user] = num_active_parents
    return user_2_num_active_parents_across_actions

# Per user: one active-parent count for each action the user performed.
user_2_num_active_parents_across_actions = compute_num_active_parents_before_activation(
    trunc_action_df, graph
)

In [None]:
# Per user: number of actions for which at least one parent was already active.
user_2_num_pseuodtrace = {}
for user, num_active_parents in user_2_num_active_parents_across_actions.items():
    had_active_parent = np.array(num_active_parents) != 0
    user_2_num_pseuodtrace[user] = np.sum(had_active_parent)

num_pseudotraces = np.array(list(user_2_num_pseuodtrace.values()))

# Number and share of users that have at least one such action.
has_pseudotrace = num_pseudotraces > 0
print(has_pseudotrace.sum(), (num_pseudotraces != 0).mean())

In [None]:
# The original passed `train_pseudo_traces`, which is not defined in the visible part of
# this notebook, so the cell raised NameError on a fresh kernel. Use the pseudo-traces
# built earlier in this notebook instead.
# NOTE(review): if these statistics are meant to cover a train split only, define that
# split explicitly before this cell and pass it here.
pseudo_trace_stats = compute_edge_stats_from_pseudo_traces(pseudo_traces, trunc_edge_df.values)

In [None]:
# Stack the per-edge statistics into one row per edge; an edge whose row sums to zero
# carries no information.
edge_stats_matrix = np.vstack(list(pseudo_trace_stats.values()))
edge_has_no_info = edge_stats_matrix.sum(1) == 0
prop_edges_with_no_info = np.mean(edge_has_no_info)

In [None]:
print("Prop edges with no information", prop_edges_with_no_info)

# Column 0 of the stacked statistics is what the message below calls positive appearances;
# stack once outside the loop because it does not depend on the threshold.
first_stat_per_edge = np.vstack(list(pseudo_trace_stats.values()))[:, 0]
for pos_app in [1, 2, 5]:
    reaches_threshold = first_stat_per_edge >= pos_app
    prop_pos_app = np.mean(reaches_threshold)
    print(f"Proportion of edges with at least {pos_app} positive appearences", prop_pos_app)

In [None]:
# NOTE(review): this cell prints the same report as the cell directly above it;
# one of the two copies can be deleted.
print("Prop edges with no information", prop_edges_with_no_info)

stacked_edge_stats = np.vstack(list(pseudo_trace_stats.values()))
for pos_app in (1, 2, 5):
    prop_pos_app = np.mean(stacked_edge_stats[:, 0] >= pos_app)
    print(f"Proportion of edges with at least {pos_app} positive appearences", prop_pos_app)

In [None]:
# sorted(list(pseudo_trace_stats.values()), key=lambda tup: tup[0], reverse=True)

In [None]:
# Per vertex: number of its pseudo-traces whose element at index 1 is non-empty.
user_2_num_traces_active = {}
for vertex, vertex_traces in pseudo_traces.items():
    second_element_non_empty = [len(trace[1]) > 0 for trace in vertex_traces]
    user_2_num_traces_active[vertex] = np.sum(second_element_non_empty)


## Analyze activation time deltas between neighbor activations

In [None]:
def extract_parent_activation_time_deltas(action_df, graph: "Graph", scale=1):
    """Collect the time gaps between consecutive activations of each user's parents.

    For every user and every action the user performed, the parents of the user that
    performed the same action strictly earlier are selected, their times are sorted, and
    the differences between consecutive times (divided by `scale`) are recorded.

    Args:
        action_df: DataFrame with columns "user", "action" and "time". Each
            (user, action) pair must occur exactly once.
        graph: object exposing `get_parents(user)` that returns an iterable of users.
        scale: divisor applied to every time difference (e.g. to change the unit).

    Returns:
        dict mapping each user (in order of first appearance in `action_df`) to a flat
        list of scaled time differences, concatenated over the user's actions.

    Raises:
        ValueError: if a user has more than one row for the same action (raised by
            `.item()`).
    """
    # Split the frame by action once, instead of re-filtering the whole frame for
    # every (user, action) pair inside the loops below.
    action_2_users_and_times = {
        action: (action_subdf["user"], action_subdf["time"])
        for action, action_subdf in action_df.groupby("action", sort=False)
    }

    user_2_time_deltas = {}
    for user in tqdm(action_df["user"].unique()):
        user_subdf = action_df[action_df["user"] == user]
        # Materialize as a list so `isin` accepts whatever iterable `get_parents` returns.
        parents = list(graph.get_parents(user))
        time_deltas = []
        for action in user_subdf["action"].unique():
            user_time = user_subdf["time"][user_subdf["action"] == action].item()
            action_users, action_times = action_2_users_and_times[action]
            # The previous version fetched `parents` but never used it, so it measured
            # gaps between *all* earlier activations. Restrict to the user's parents.
            is_parent_active_before = (action_times < user_time) & action_users.isin(parents)
            parent_times = action_times[is_parent_active_before].sort_values()
            time_deltas += list(np.diff(parent_times) / scale)
        user_2_time_deltas[user] = time_deltas
    return user_2_time_deltas

# `user_2_time_deltas` only exists as a local variable inside
# `extract_parent_activation_time_deltas`, so it has to be computed here first;
# without this call the cell raised NameError on a fresh kernel.
user_2_time_deltas = extract_parent_activation_time_deltas(trunc_action_df, graph)

# Users without any recorded delta would yield np.median([]) == nan (with a RuntimeWarning),
# and a NaN in the input corrupts the box-plot statistics, so skip them.
med_deltas = [np.median(deltas) for deltas in user_2_time_deltas.values() if len(deltas) > 0]

plt.boxplot(med_deltas)
plt.title("Median activation time delta per user")
plt.ylabel("Median time delta")
plt.show()