In [13]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
%autoreload 2

In [15]:
cd /raid/home/myang349/recsys-filterbubbles/

/raid/home/myang349/recsys-filterbubbles


# Imports

In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import torch.optim as optim
from os import listdir
from os.path import isfile, join
from tracin.tracin import (
    save_tracin_checkpoint,
    load_tracin_checkpoint,
    calculate_tracin_influence,
)
import pandas as pd
from LSTM_clean.utils import filter_and_split_data, sequence_generator, load_community_dict, get_communities 
from LSTM_clean.model import LSTM 
from collections import Counter
import numpy as np
import pickle
from collections import defaultdict
import copy
from torch.optim import SGD
from lstm_wrapper import train_model, get_topk_predictions
from dataclasses import dataclass

# Loading Data

In [22]:
# Where the serialized dataset splits live.
# SAVE_PREFIX distinguishes dataset variants; every file name below is
# the prefix immediately followed by a fixed suffix.
SAVE_PREFIX = "104k"
SAVE_FOLDER = "/raid/home/myang349/recsys-filterbubbles/data/twitch_sequence/"

SAVE_TRAIN_NAME = f"{SAVE_PREFIX}train.data"
SAVE_VALID_NAME = f"{SAVE_PREFIX}valid.data"
SAVE_TEST_NAME = f"{SAVE_PREFIX}test.data"
SAVE_COMMUNITY_NAME = f"{SAVE_PREFIX}lstm_idx_communities.data"

In [23]:
# Load the three dataset splits from SAVE_FOLDER in train / valid / test order.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, which
# can execute code embedded in the file — only use on files you trust.
train_data, valid_data, test_data = (
    np.load(os.path.join(SAVE_FOLDER, split_name), allow_pickle=True)
    for split_name in (SAVE_TRAIN_NAME, SAVE_VALID_NAME, SAVE_TEST_NAME)
)

# Training

In [None]:
# Train the LSTM via lstm_wrapper.train_model using its default arguments.
# The recorded log below shows it printing per-epoch metrics and saving checkpoints.
# NOTE(review): train_data / valid_data loaded above are not passed in, so the
# wrapper presumably loads its own data — TODO confirm.
model = train_model()

Device is cuda
Train: 74049, Valid: 9202
Model is  LSTM(
  (item_emb): Embedding(3312, 128, padding_idx=0)
  (LSTM): LSTM(128, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=3312, bias=True)
)

Training and testing
train # = 74049, test # = 9202

Epoch 0	Train Loss: 0.01587924685014117	Test MRR: 0.006441187991632963	Test Recall@10: 0.010215170615083678	Elapsed time: 2.4003243446350098
saving checkpoint to /raid/home/myang349/recsys-filterbubbles/checkpoints/lstm_checkpoint_epoch0.pt
Epoch 5	Train Loss: 0.015569483266807082	Test MRR: 0.05171927357058986	Test Recall@10: 0.11562703760052162	Elapsed time: 8.231169939041138
Epoch 10	Train Loss: 0.014784718266869511	Test MRR: 0.05384537881947244	Test Recall@10: 0.10910671593131928	Elapsed time: 8.797767639160156
saving checkpoint to /raid/home/myang349/recsys-filterbubbles/checkpoints/lstm_checkpoint_epoch10.pt
Epoch 15	Train Loss: 0.014091254752061149	Test MRR: 0.06597926250997509	Test Recall@10: 0.13660073896978916	Elaps

# Experimentation

## Verify the recall

In [9]:
# Split every validation example into its input sequence (element 0) and its
# ground truth, "gt" (element 1).
# An alternative that kept only the 10 most recent items, newest first, was tried here:
#   sequences = [list(reversed(pt[0]))[:10] for pt in valid_data]
sequences, gt = [], []
for example in valid_data:
    sequences.append(example[0])
    gt.append(example[1])

In [10]:
# Generate predictions
# One entry per input sequence (asserted in the next cell); k=50 is passed to the helper.
# NOTE(review): later cells slice predictions[i][:n] as "top-n", so each entry is
# presumably ordered best-first — TODO confirm against lstm_wrapper.get_topk_predictions.
predictions = get_topk_predictions(model, sequences, k=50)

In [11]:
# Fraction of validation examples whose ground-truth item appears anywhere in
# that example's prediction list.
assert len(sequences) == len(predictions)
total = len(gt)
hit = sum(1 for truth, topk in zip(gt, predictions) if truth in topk)
hit/total

0.7622256031297544

## Computing community vectors

In [12]:
# Load the community data stored next to the dataset splits.
# NOTE(review): presumably a mapping from item index to community id; in this notebook
# it is only ever handed to get_communities — TODO confirm in LSTM_clean.utils.
community_dict = load_community_dict(os.path.join(SAVE_FOLDER, SAVE_COMMUNITY_NAME))

In [13]:
# # Converting prev and predictions to be equal length and in community embedding
# prev_comm = []
# pred_comm = []
# for i in range(len(sequences)):
#     prev = sequences[i]
#     n = len([x for x in prev if x != 0])
#     pred = predictions[i][:n]
    
#     prev_comm.append(get_communities(prev, community_dict))
#     pred_comm.append(get_communities(pred, community_dict))

In [14]:
LOOKBACK = 10   # how many of the most recent history positions to inspect
THRESHOLD = 10  # minimum number of non-zero items required among those positions

# Alternate version where we just get the last lookback from prev.
# For every sequence: take its last LOOKBACK entries (newest first), count the
# non-zero ones, skip the sequence if there are fewer than THRESHOLD, and
# otherwise convert both the history slice and an equally long prefix of the
# predictions through get_communities.
prev_comm, pred_comm = [], []
for idx, history in enumerate(sequences):
    recent = list(reversed(history))[:LOOKBACK]
    num_real = sum(1 for item in recent if item != 0)

    if num_real >= THRESHOLD:
        top_preds = predictions[idx][:num_real]
        prev_comm.append(get_communities(recent, community_dict))
        pred_comm.append(get_communities(top_preds, community_dict))

In [15]:
# Sanity check: the history and prediction community lists must line up one-to-one,
# and each pair must contain the same number of community labels.
# Checks every pair instead of a hard-coded first 100, which raised IndexError when
# fewer than 100 samples survived filtering and ignored everything after the 100th.
assert len(prev_comm) == len(pred_comm)
for hist_comm, rec_comm in zip(prev_comm, pred_comm):
    assert len(hist_comm) == len(rec_comm)

In [16]:
# Dictionaries whose missing keys are initialised to Info(0, 0).
# NOTE(review): `Info` is not defined or imported anywhere in the visible notebook
# (presumably a dataclass from a removed cell, given the `dataclass` import — TODO confirm).
# The lambda only runs on a missing-key lookup, so this cell itself succeeds, but the
# first such lookup raises NameError unless `Info` exists in the kernel.
# Neither dictionary is read again in the visible cells.
hm_hist = defaultdict(lambda: Info(0,0))
hm_pred = defaultdict(lambda: Info(0,0))

In [17]:
# Total number of distinct communities per sample, summed over all samples,
# once for the watch history and once for the predictions.
n = len(prev_comm)
threshold = 0  # samples whose history has fewer labels than this are ignored

kept_indices = [idx for idx in range(n) if len(prev_comm[idx]) >= threshold]
prev_num_comm = sum(len(set(prev_comm[idx])) for idx in kept_indices)
pred_num_comm = sum(len(set(pred_comm[idx])) for idx in kept_indices)

In [18]:
prev_num_comm

16524

In [19]:
pred_num_comm

16614

# Analyzing recommendation distribution

## Quantifying Diversity Based on Watch History vs Recommendation

In [24]:
def count_unique_communities(c):
    """Return the number of distinct values in `c`.

    Args:
        c: iterable of hashable community labels.
    """
    return len(set(c))


def is_single_community_dominated(c, ratio=0.9):
    """Return 1 if a single value makes up at least `ratio` of `c`, else 0.

    Args:
        c: sized iterable of hashable community labels.
        ratio: minimum share (of len(c)) one label must reach; defaults to 0.9.

    Returns:
        1 when some label occurs at least `ratio * len(c)` times, otherwise 0.
        An empty `c` yields 0.
    """
    cutoff = ratio * len(c)
    return int(any(occurrences >= cutoff for occurrences in Counter(c).values()))


# Both statistics used to be defined as `f`, so the second silently replaced the
# first. They now have their own names; `f` stays bound to the one that was
# actually in effect so cells calling `f(c)` behave as before.
f = is_single_community_dominated

In [25]:
# Apply the single-argument statistic `f` to each sample's history communities
# and to the matching prediction communities.
prev_comm_stats, pred_comm_stats = [], []
n = len(prev_comm)
threshold = 0  # samples whose history has fewer labels than this are ignored

for hist_comm, rec_comm in zip(prev_comm, pred_comm):
    if len(hist_comm) >= threshold:
        prev_comm_stats.append(f(hist_comm))
        pred_comm_stats.append(f(rec_comm))

In [26]:
sum(prev_comm_stats)

4947

In [27]:
sum(pred_comm_stats)

5394

# Quantifying Filter-Bubble Based on Combination of Watch History + Recommendation

## Quantify Functions

In [28]:
def any_top_pred_in_history(prev, pred, threshold=3):
    """Does one of the first `threshold` predictions belong to a community seen in the history?

    Args:
        prev: community labels of the history.
        pred: community labels of the predictions, sliceable.
        threshold: how many leading entries of `pred` to inspect.

    Returns:
        1 if any inspected prediction label occurs in `prev`, else 0.
    """
    return int(any(comm in prev for comm in pred[:threshold]))


def any_top_pred_in_majority(prev, pred, threshold=3):
    """Does one of the first `threshold` predictions belong to the history's most common community?

    Ties for "most common" are resolved in favour of the label that appears
    first in `prev`. An empty history has no majority community, so the result is 0
    (previously this raised ValueError from max()).

    Returns:
        1 if any inspected prediction label equals the majority label, else 0.
    """
    hm = Counter(prev)
    if not hm:
        return 0
    most_common = max(hm, key=hm.get)
    return int(any(comm == most_common for comm in pred[:threshold]))


def frac_top_pred_in_majority(prev, pred, threshold=10):
    """Share of the first `threshold` predictions that fall in the history's most common community.

    The denominator is always `threshold`, even if `pred` holds fewer entries,
    so `threshold` must be non-zero. Ties for "most common" go to the label that
    appears first in `prev`. An empty history yields 0.0 (previously ValueError).

    Returns:
        float in [0, 1] when `pred` has at most... `threshold` matching entries counted
        over `threshold`.
    """
    hm = Counter(prev)
    if not hm:
        return 0.0
    most_common = max(hm, key=hm.get)
    matches = sum(1 for comm in pred[:threshold] if comm == most_common)
    return matches / threshold


# All three metrics used to be defined as `f`, each silently replacing the one
# before it. They now have their own names; `f` stays bound to the last one so
# cells calling `f(prev, pred)` right after this cell behave as before.
f = frac_top_pred_in_majority

In [30]:
def frac_top_pred_in_history(prev, pred, threshold=5):
    """Share of the first `threshold` predictions whose community occurs anywhere in the history.

    Args:
        prev: community labels of the history.
        pred: community labels of the predictions, sliceable.
        threshold: how many leading entries of `pred` to inspect; also the
            denominator, so it must be non-zero.

    Returns:
        Number of inspected prediction labels found in `prev`, divided by `threshold`.
    """
    matches = sum(1 for comm in pred[:threshold] if comm in prev)
    return matches / threshold


def frac_top_pred_in_recent_history(prev, pred, threshold=10):
    """Share of the first `threshold` predictions whose community occurs in the last `threshold` history items.

    Args:
        prev: community labels of the history; must support reversed().
        pred: community labels of the predictions, sliceable.
        threshold: how many leading entries of `pred` to inspect and how many
            trailing entries of `prev` to compare against; also the denominator,
            so it must be non-zero.

    Returns:
        Number of inspected prediction labels found among the last `threshold`
        entries of `prev`, divided by `threshold`.
    """
    # Build the recent-history window once instead of on every loop iteration.
    recent = list(reversed(prev))[:threshold]
    matches = sum(1 for comm in pred[:threshold] if comm in recent)
    return matches / threshold


# Both metrics used to be defined as `f`, the second silently replacing the
# first. They now have their own names; `f` stays bound to the last one so
# cells calling `f(prev, pred)` behave as before.
f = frac_top_pred_in_recent_history

## Filter functions

In [460]:
def filter(c):
    """Return True when a single value makes up at least 90% of `c`, else False.

    An empty `c` yields False.

    NOTE(review): this name shadows Python's built-in `filter` for the rest of
    the notebook; it is kept because the (commented-out) caller refers to it.
    """
    cutoff = 0.9 * len(c)
    return any(occurrences >= cutoff for occurrences in Counter(c).values())

## This cell actually computes values

In [31]:
# Evaluate the currently bound two-argument metric `f` on every
# (history communities, prediction communities) pair.
# Optional extra restriction that was tried here: also skip samples for which
# filter(history communities) is False.
n = len(prev_comm)
threshold = 0  # samples whose history has fewer labels than this are ignored

counts = [
    f(hist_comm, rec_comm)
    for hist_comm, rec_comm in zip(prev_comm, pred_comm)
    if len(hist_comm) >= threshold
]

In [32]:
sum(counts)/len(counts)

0.9367967332123275

In [33]:
sum(counts)/len(counts)

0.9367967332123275

In [1]:
# NOTE(review): `train` is not defined anywhere in the visible notebook, so this cell
# raises NameError (see the recorded output below). Presumably a leftover scratch
# cell — remove it or replace it with the intended name (e.g. train_data).
train

NameError: name 'train' is not defined

In [10]:
np.array(train_data[:5])

  np.array(train_data[:5])


array([[list([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
        1],
       [list([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]),
        2],
       [list([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2]),
        2],
       [list([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2]),
        3],
       [list([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 3]),
        4]], dtype=object)

In [None]:
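# Scratch notes (not executable code), kept as comments so this cell does not raise a SyntaxError:
# 1 -> 1
# 12 -> 2
# 122 -> 3
# 1223 -> 4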

In [None]:
valid_data