In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
cd /raid/home/myang349/recsys-filterbubbles/

/raid/home/myang349/recsys-filterbubbles


# Imports

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import torch.optim as optim
from os import listdir
from os.path import isfile, join
from tracin.tracin import (
    save_tracin_checkpoint,
    load_tracin_checkpoint,
    calculate_tracin_influence,
)
import pandas as pd
from LSTM_clean.utils import filter_and_split_data, sequence_generator, load_community_dict, get_communities 
from LSTM_clean.model import LSTM 
from collections import Counter
import numpy as np
import pickle
from collections import defaultdict
import copy
from torch.optim import SGD
from lstm_wrapper import train_model, get_topk_predictions
from dataclasses import dataclass

# Loading Data

In [46]:
# Where the pre-processed Twitch sequence data lives.
# SAVE_PREFIX tags the dataset variant; every file name below starts with it.
SAVE_PREFIX = "104k"
# NOTE(review): absolute, user-specific folder -- adjust when running on another machine.
SAVE_FOLDER = "/raid/home/myang349/recsys-filterbubbles/data/twitch_sequence/"
SAVE_TRAIN_NAME = f"{SAVE_PREFIX}train.data"
SAVE_VALID_NAME = f"{SAVE_PREFIX}valid.data"
SAVE_TEST_NAME = f"{SAVE_PREFIX}test.data"
SAVE_COMMUNITY_NAME = f"{SAVE_PREFIX}lstm_idx_communities.data"

In [47]:
# Read the train / validation / test splits from SAVE_FOLDER in one pass.
# NOTE: allow_pickle=True makes np.load unpickle object arrays, which can run
# arbitrary code -- only point this at files you trust.
train_data, valid_data, test_data = (
    np.load(os.path.join(SAVE_FOLDER, split_file), allow_pickle=True)
    for split_file in (SAVE_TRAIN_NAME, SAVE_VALID_NAME, SAVE_TEST_NAME)
)

# Training

In [None]:
# Train a model through lstm_wrapper.train_model, called with its default arguments.
# NOTE(review): this cell has no execution count in the saved notebook; `model` is
# re-created and loaded from a checkpoint in the "Experimentation" cells below.
model = train_model()

# Experimentation

In [54]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device is", device)
# Build the LSTM with explicit sizes.
# NOTE(review): presumably these sizes must match the architecture stored in the
# checkpoint that is loaded a few cells below -- TODO confirm.
model = LSTM(input_size=128, output_size=3312, hidden_dim=64, n_layers=1, device=device) 
model.LSTM.flatten_parameters()
# In the visible notebook this optimizer is only passed to load_tracin_checkpoint;
# no training step is run with it in this section.
optimizer = optim.SGD(model.parameters(), lr=5e-2, momentum=0.9)

Device is cuda


In [55]:
ls checkpoints

lstm_checkpoint_epoch0.pt    lstm_checkpoint_epoch370.pt
lstm_checkpoint_epoch100.pt  lstm_checkpoint_epoch380.pt
lstm_checkpoint_epoch10.pt   lstm_checkpoint_epoch390.pt
lstm_checkpoint_epoch110.pt  lstm_checkpoint_epoch400.pt
lstm_checkpoint_epoch120.pt  lstm_checkpoint_epoch40.pt
lstm_checkpoint_epoch130.pt  lstm_checkpoint_epoch410.pt
lstm_checkpoint_epoch140.pt  lstm_checkpoint_epoch420.pt
lstm_checkpoint_epoch150.pt  lstm_checkpoint_epoch430.pt
lstm_checkpoint_epoch160.pt  lstm_checkpoint_epoch440.pt
lstm_checkpoint_epoch170.pt  lstm_checkpoint_epoch450.pt
lstm_checkpoint_epoch180.pt  lstm_checkpoint_epoch460.pt
lstm_checkpoint_epoch190.pt  lstm_checkpoint_epoch470.pt
lstm_checkpoint_epoch200.pt  lstm_checkpoint_epoch480.pt
lstm_checkpoint_epoch20.pt   lstm_checkpoint_epoch490.pt
lstm_checkpoint_epoch210.pt  lstm_checkpoint_epoch500.pt
lstm_checkpoint_epoch220.pt  lstm_checkpoint_epoch50.pt
lstm_checkpoint_epoch230.pt  lstm_checkpoint_epoch510.pt
lstm_checkpoint_epoch240.pt  lstm

In [56]:
# Load the epoch-600 checkpoint; the second returned value is discarded via `_`.
# NOTE(review): presumably the helper returns (model, optimizer, epoch, loss) -- TODO confirm.
# NOTE(review): absolute, user-specific checkpoint path.
model, _, epoch, loss = load_tracin_checkpoint(model, optimizer, "/raid/home/myang349/recsys-filterbubbles/checkpoints/lstm_checkpoint_epoch600.pt")

In [57]:
# Move the model to the selected device; as the cell's last expression this also
# displays the module summary.
model.to(device)

LSTM(
  (item_emb): Embedding(3312, 128, padding_idx=0)
  (LSTM): LSTM(128, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=3312, bias=True)
)

## Verify the recall

In [58]:
# Split every validation point into its input (index 0) and its target (index 1).
# NOTE(review): presumably pt[0] is the item-id history and pt[1] the next item to
# predict -- confirm against the code that generated the data.
sequences = [pt[0] for pt in valid_data]
# sequences = [list(reversed(pt[0]))[:10] for pt in valid_data]
gt = [pt[1] for pt in valid_data]

In [59]:
# Ask the model for k=50 predictions per validation sequence. The next cell asserts
# that there is exactly one prediction list per sequence.
# NOTE(review): presumably each list is ordered best-first -- TODO confirm.
predictions = get_topk_predictions(model, sequences, k=50)

In [60]:
# Recall on the validation split: the share of points whose ground-truth item
# appears among that point's predictions.
assert len(sequences) == len(predictions)
total = len(gt)
hit = sum(1 for idx, target in enumerate(gt) if target in predictions[idx])
hit / total

0.7596174744620735

## Computing community vectors

In [61]:
# Load the community lookup stored next to the sequence data.
# NOTE(review): presumably maps an LSTM item index to its community id -- TODO confirm.
community_dict = load_community_dict(os.path.join(SAVE_FOLDER, SAVE_COMMUNITY_NAME))

In [62]:
# # Converting prev and predictions to be equal length and in community embedding
# prev_comm = []
# pred_comm = []
# for i in range(len(sequences)):
#     prev = sequences[i]
#     n = len([x for x in prev if x != 0])
#     pred = predictions[i][:n]
    
#     prev_comm.append(get_communities(prev, community_dict))
#     pred_comm.append(get_communities(pred, community_dict))

In [99]:
# Turn each watch history and its recommendations into equal-length lists of
# communities.
LOOKBACK = 10   # how many items from the end of each sequence are considered
THRESHOLD = 10  # minimum number of those items that must be usable (see below)

# Variant that only looks at the last LOOKBACK entries of the history.
prev_comm = []
pred_comm = []
for idx, seq in enumerate(sequences):
    history = list(reversed(seq))[:LOOKBACK]
    # Usable = non-zero id that has an entry in community_dict.
    n_usable = sum(1 for item in history if item != 0 and item in community_dict)

    if n_usable >= THRESHOLD:
        # Keep as many predictions as there are usable history items.
        topk = predictions[idx][:n_usable]
        prev_comm.append(get_communities(history, community_dict))
        pred_comm.append(get_communities(topk, community_dict))

# Sanity checks: both sides line up and stay within the configured window.
for hist_comm, rec_comm in zip(prev_comm, pred_comm):
    assert len(hist_comm) ==len(rec_comm)
    assert THRESHOLD <= len(hist_comm) <= LOOKBACK

In [100]:
# Number of distinct communities per entry, summed over all entries, for the
# histories (prev) and for the recommendations (pred).
n = len(prev_comm)
threshold = 0  # minimum history length to include; 0 keeps every entry

kept = [idx for idx in range(n) if len(prev_comm[idx]) >= threshold]
prev_num_comm = sum(len(set(prev_comm[idx])) for idx in kept)
pred_num_comm = sum(len(set(pred_comm[idx])) for idx in kept)

In [101]:
prev_num_comm

16524

In [102]:
pred_num_comm

16773

# Analyzing recommendation distribution

## Quantifying Diversity Based on Watch History vs Recommendation

In [123]:
def num_unique_communities(c):
    """Return the number of distinct communities that occur in ``c``.

    Args:
        c: iterable of hashable community labels.

    Returns:
        int: size of the set of labels (0 for an empty input).
    """
    return len(set(c))


def has_dominant_community(c, ratio=0.8):
    """Tell whether a single community accounts for most of ``c``.

    Args:
        c: sized iterable of hashable community labels.
        ratio: minimum share of ``c`` one community must reach (default 0.8).

    Returns:
        int: 1 if some community occurs at least ``ratio * len(c)`` times,
        otherwise 0 (including when ``c`` is empty).
    """
    cutoff = ratio * len(c)
    return int(any(count >= cutoff for count in Counter(c).values()))


# Statistic applied by the following cell. Both candidates were originally
# defined under the same name `f`, so only the second one was ever in effect;
# the binding below keeps that behaviour while leaving both metrics callable.
f = has_dominant_community

In [124]:
# Apply the single-argument statistic `f` to every history and to the matching
# recommendation list.
n = len(prev_comm)
threshold = 0  # minimum history length to include; 0 keeps every entry

kept_pairs = [
    (prev_comm[idx], pred_comm[idx])
    for idx in range(n)
    if len(prev_comm[idx]) >= threshold
]
prev_comm_stats = [f(hist_comm) for hist_comm, _ in kept_pairs]
pred_comm_stats = [f(rec_comm) for _, rec_comm in kept_pairs]

In [125]:
sum(prev_comm_stats)

6063

In [122]:
# Total of the statistic over all recommendation lists.
# NOTE(review): this cell's saved execution count (122) is lower than that of the
# cell defining `f` (123), so the displayed value may come from an earlier `f`;
# re-run top to bottom before comparing it with sum(prev_comm_stats).
sum(pred_comm_stats)

8809

# Quantifying Filter-Bubble Based on Combination of Watch History + Recommendation

## Quantify Functions

In [126]:
def _majority_community(prev):
    """Return the most frequent label in a non-empty ``prev`` (first seen wins ties)."""
    tally = Counter(prev)
    return max(tally, key=tally.get)


def any_pred_in_history(prev, pred, threshold=3):
    """Does one of the top-``threshold`` predictions share a community with the history?

    Args:
        prev: communities of the previous history.
        pred: communities of the predictions; the first ``threshold`` are inspected.
        threshold: how many leading predictions to look at (default 3).

    Returns:
        int: 1 if any inspected prediction's community occurs in ``prev``, else 0.
    """
    return int(any(comm in prev for comm in pred[:threshold]))


def any_pred_in_majority_community(prev, pred, threshold=3):
    """Does one of the top-``threshold`` predictions fall in the history's majority community?

    Args:
        prev: communities of the previous history.
        pred: communities of the predictions; the first ``threshold`` are inspected.
        threshold: how many leading predictions to look at (default 3).

    Returns:
        int: 1 if any inspected prediction equals the most frequent community of
        ``prev``, else 0. An empty ``prev`` has no majority community and gives 0.
    """
    if len(prev) == 0:
        return 0
    most_common = _majority_community(prev)
    return int(any(comm == most_common for comm in pred[:threshold]))


def frac_pred_in_majority_community(prev, pred, threshold=10):
    """What share of the top-``threshold`` predictions is in the history's majority community?

    Args:
        prev: communities of the previous history.
        pred: communities of the predictions; the first ``threshold`` are inspected.
        threshold: how many leading predictions to look at (default 10). It is
            also the denominator, even when ``pred`` holds fewer entries, and
            must be non-zero.

    Returns:
        float: matches divided by ``threshold``; 0.0 when ``prev`` is empty.
    """
    if len(prev) == 0:
        return 0 / threshold
    most_common = _majority_community(prev)
    matches = sum(1 for comm in pred[:threshold] if comm == most_common)
    return matches / threshold


# All three metrics were originally defined under the same name `f`, so only the
# last one was ever in effect after running this cell; the binding below keeps
# that behaviour while leaving the other two callable by name.
f = frac_pred_in_majority_community

In [127]:
def frac_pred_in_history(prev, pred, threshold=5):
    """What share of the top-``threshold`` predictions has a community seen in the history?

    Args:
        prev: communities of the previous history.
        pred: communities of the predictions; the first ``threshold`` are inspected.
        threshold: how many leading predictions to look at (default 5). It is
            also the denominator, even when ``pred`` holds fewer entries, and
            must be non-zero.

    Returns:
        float: number of inspected predictions whose community occurs anywhere
        in ``prev``, divided by ``threshold``.
    """
    return sum(1 for comm in pred[:threshold] if comm in prev) / threshold


def frac_pred_in_recent_history(prev, pred, threshold=10):
    """What share of the top-``threshold`` predictions matches the tail of the history?

    Only the last ``threshold`` entries of ``prev`` (taken from its end) count
    as history here.

    NOTE(review): the cell that builds ``prev_comm`` already reverses each
    sequence before truncating it. If ``get_communities`` preserves order,
    ``prev`` arrives newest-first and the tail taken here would be the *oldest*
    entries whenever ``threshold < len(prev)`` -- confirm the intended order.

    Args:
        prev: communities of the previous history.
        pred: communities of the predictions; the first ``threshold`` are inspected.
        threshold: window size for both ``prev`` and ``pred`` (default 10). It
            is also the denominator and must be non-zero.

    Returns:
        float: matches divided by ``threshold``.
    """
    # Build the history window once instead of once per prediction.
    recent = list(reversed(prev))[:threshold]
    return sum(1 for comm in pred[:threshold] if comm in recent) / threshold


# Both metrics were originally defined under the same name `f`, so only the
# second one was ever in effect after running this cell; the binding below keeps
# that behaviour while leaving the first callable by name.
f = frac_pred_in_recent_history

## Filter functions

In [128]:
def filter(c, ratio=0.9):
    """Tell whether one community makes up at least ``ratio`` of ``c``.

    NOTE(review): this name shadows Python's builtin ``filter`` for the rest of
    the notebook session. It is kept so existing call sites continue to work;
    consider renaming it together with its callers.

    Args:
        c: sized iterable of hashable community labels.
        ratio: minimum share of ``c`` one community must reach (default 0.9,
            the value that used to be hard-coded).

    Returns:
        bool: True if some community occurs at least ``ratio * len(c)`` times,
        otherwise False (including when ``c`` is empty).
    """
    cutoff = ratio * len(c)
    return any(count >= cutoff for count in Counter(c).values())

## This cell actually computes values

In [129]:
# Score every (history, recommendation) pair with the currently bound
# two-argument metric `f`.
n = len(prev_comm)
threshold = 0  # minimum history length to include; 0 keeps every entry

# Optional extra restriction (disabled): skip pairs where `not filter(c1)`,
# i.e. only keep histories accepted by `filter`.
counts = [
    f(prev_comm[idx], pred_comm[idx])
    for idx in range(n)
    if len(prev_comm[idx]) >= threshold
]

In [130]:
# Mean of the per-pair scores in `counts` (raises ZeroDivisionError when `counts` is empty).
sum(counts)/len(counts)

0.9331102540834695

In [131]:
# Same expression as the previous cell, evaluated again; the saved output is identical.
sum(counts)/len(counts)

0.9331102540834695

In [None]:
1 -> 1
12 -> 2
122 -> 3
1223 -> 4

In [None]:
# Displays the whole validation array (not executed in the saved notebook).
# NOTE(review): consider showing a small slice such as valid_data[:5] instead.
valid_data