In [10]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
%autoreload 2

In [12]:
cd /raid/home/myang349/recsys-filterbubbles/

/raid/home/myang349/recsys-filterbubbles


# Imports

In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import torch.optim as optim
from os import listdir
from os.path import isfile, join
from tracin.tracin import (
    save_tracin_checkpoint,
    load_tracin_checkpoint,
    calculate_tracin_influence,
)
import pandas as pd
from LSTM_clean.utils import filter_and_split_data, sequence_generator, printl, reindex_and_save_communities
from LSTM_clean.model import LSTM 
from collections import Counter
import numpy as np
import pickle
from collections import defaultdict
import copy
from torch.optim import SGD

In [14]:
# Setting Cuda
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device is ", device)

Device is  cuda


# Config

In [18]:
DATA_FOLDER = "/raid/home/myang349/recsys-filterbubbles/data/"
DATA_NAME = "twitch100k.csv"
SAVE_FOLDER = "/raid/home/myang349/recsys-filterbubbles/data/twitch_sequence/"
SAVE_TRAIN_NAME = "train.data"
SAVE_VALID_NAME = "valid.data"
SAVE_TEST_NAME = "test.data"

# Load in data

In [27]:
# The format is:
# N x 2 x (sequence, 
train_data = np.load(os.path.join(SAVE_FOLDER, SAVE_TRAIN_NAME), allow_pickle=True)
valid_data = np.load(os.path.join(SAVE_FOLDER, SAVE_VALID_NAME), allow_pickle=True)
test_data = np.load(os.path.join(SAVE_FOLDER, SAVE_TEST_NAME), allow_pickle=True)

# Concat all data together

In [17]:
# NOTE: very important that this is NOT a copy
# We use all-data to change the indices for all the data
all_data = train_data + valid_data + test_data
assert len(all_data) == len(train_data) + len(valid_data) + len(test_data)

In [10]:
len(all_data)

90609

# Obtaining Mapping Info

In [12]:
# Union all items from sequence
unique_items = set()
for data_point in all_data:
    sequence = data_point[0]
    unique_items |= set(sequence)
print(len(unique_items))

# Union all GT items
unique_items = unique_items.union(data_point[1] for data_point in all_data)
print(len(unique_items))

5367
5400


In [13]:
0 in unique_items

True

In [18]:
# Remove gaps from items
item_to_lstm_idx = {item:idx for (idx,item) in enumerate(unique_items)}
lstm_idx_to_item = {v: k for k,v in item_to_lstm_idx.items()}

# Apply mapping
for data_point in all_data:
    sequence = data_point[0]
    gt = data_point[1]
    for i, item in enumerate(sequence):
        sequence[i] = item_to_lstm_idx[item]
    data_point[1] = item_to_lstm_idx[gt]


In [None]:
lstm_idx_to_item

In [15]:
for i in range(len(train_data)):
    assert train_data[i] == all_data[i]

AssertionError: 

In [14]:
print("Train: {}, Test: {}".format(len(train_data),len(test_data)))

## We don't have to add 1 to output size because we don't exclude 0 from len(unique_items)
model = LSTM(input_size=128, output_size=len(unique_items), hidden_dim=64, n_layers=1, device=device).to(device)
model.LSTM.flatten_parameters()
print("Model is ", model)
print("Training and testing")
original_prediction = model.traintest(train=train_data,test=valid_data, epochs=100)
print("Finished")

Train: 71465, Test: 10231
Model is  LSTM(
  (item_emb): Embedding(5400, 128, padding_idx=0)
  (LSTM): LSTM(128, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=5400, bias=True)
)
Training and testing
train # = 71465	test # = 8913
Epoch 0	Train Loss: 0.016842331176023323	Test MRR: 0.0023048682363635974	Test Recall@10: 0.0030292830696735107	Elapsed time: 0.9526858329772949
saving checkpoint to /raid/home/myang349/recsys-filterbubbles/checkpoints/lstm_checkpoint_epoch0.pt
Epoch 5	Train Loss: 0.016645551046447393	Test MRR: 0.03504889731721179	Test Recall@10: 0.06181981375518905	Elapsed time: 3.5559792518615723
Epoch 10	Train Loss: 0.01615201595372853	Test MRR: 0.04133498552676434	Test Recall@10: 0.08392236059688096	Elapsed time: 3.790386199951172
saving checkpoint to /raid/home/myang349/recsys-filterbubbles/checkpoints/lstm_checkpoint_epoch10.pt
Epoch 15	Train Loss: 0.01543171929402339	Test MRR: 0.043249023377024985	Test Recall@10: 0.08268820823516213	Elapsed time: 3.5517

# Exploring Data

## Loading Model from checkpoint

In [19]:
model = LSTM(input_size=128, output_size=len(unique_items), hidden_dim=64, n_layers=1, device=device).to(device)
optimizer = SGD(model.parameters(), lr=5e-2, momentum=0.9)

In [21]:
load_tracin_checkpoint(model, optimizer, '/raid/home/myang349/recsys-filterbubbles/checkpoints/lstm_checkpoint_epoch90.pt')

(LSTM(
   (item_emb): Embedding(5400, 128, padding_idx=0)
   (LSTM): LSTM(128, 64, batch_first=True)
   (fc): Linear(in_features=64, out_features=5400, bias=True)
 ),
 SGD (
 Parameter Group 0
     dampening: 0
     lr: 0.005
     momentum: 0.9
     nesterov: False
     weight_decay: 0
 ),
 90,
 895.6995186805725)

## Converting data to embedding space

In [20]:
train = copy.deepcopy(train_data)
for i in range(len(train)):
    train[i][0] = model.item_emb(torch.LongTensor(train[i][0]).to(model.device))

## Examining single vectors to manually inspect filter bubble

In [22]:
i = 300

In [23]:
## dataset format
full_hist = train_data[i][0]
hist = [x for x in full_hist if x != 0]
gt = train_data[i][1]
print(hist, gt)

[166, 167, 168, 169, 16, 170, 171, 113, 16, 16] 172


In [24]:
# Rig data here if not using i (above)
hist = [667, 667, 667, 667, 667]
print(hist)

[667, 667, 667, 667, 667]


In [25]:
## model format
# curr = torch.stack((train[i][0],), dim=0).detach()
# convert single vector to embed
curr = model.item_emb(torch.LongTensor(hist).to(model.device))
curr = torch.stack((curr,), dim=0).detach()

## Item Extraction

In [9]:
output, hidden = model.forward(curr)
output1 = output.view(-1, model.num_items)
prob = nn.functional.softmax(output1, dim=1).data.cpu()
k = 50

NameError: name 'model' is not defined

## Item Extraction Method 1: Torch

In [27]:
topk = torch.topk(output, k).indices.tolist()
print(topk[0])

[44, 113, 212, 41, 1954, 69, 667, 537, 3220, 460, 16, 1741, 54, 43, 427, 147, 39, 353, 888, 647, 463, 553, 428, 277, 431, 573, 502, 66, 341, 629, 169, 140, 569, 165, 462, 709, 536, 1412, 559, 46, 2234, 21, 148, 64, 149, 928, 215, 457, 1000, 2359]


## Item Extraction Method 2: Numpy

In [49]:
print(list(np.argsort(-prob.numpy())[0][:k]))

[44, 113, 212, 41, 1954, 667, 3220, 537, 69, 147, 16, 277, 502, 39, 341, 353, 66, 647, 140, 43, 888, 629, 460, 169, 1741, 70, 410, 54, 553, 64, 71, 215, 431, 2234, 428, 819, 46, 536, 928, 463, 380, 573, 925, 427, 1958, 1371, 72, 307, 2359, 711]


In [68]:
np_prob

array([[4.6355075e-05, 1.1106163e-02, 5.6383542e-05, ..., 2.5252750e-05,
        2.7392212e-05, 3.5436089e-05]], dtype=float32)

# TESTING THE MAPPING

In [28]:
print(f"Starting data retrieval for: {DATA_NAME}")
df = pd.read_csv(os.path.join(DATA_FOLDER, DATA_NAME))

Starting data retrieval for: twitch100k.csv


In [29]:
lstm_idx_to_community, unique_items, item_to_lstm_idx, lstm_idx_to_df_item = reindex_and_save_communities(train_data, valid_data, test_data, df)

In [30]:
all_data = train_data + valid_data + test_data

In [31]:
max(unique_items)

5672

In [32]:
max(lstm_idx_to_df_item.values())

5671

In [33]:
max(item_to_lstm_idx.keys())

5672

In [35]:
# Max of df is 1 lower than our dataset
assert max(lstm_idx_to_df_item.values()) == max(item_to_lstm_idx.keys()) - 1

In [39]:
# Max of df is 1 lower than our dataset
assert max(lstm_idx_to_df_item.values()) == max(item_to_lstm_idx.keys()) - 1
# Min of df is 0
assert min(lstm_idx_to_df_item.values()) == 0
# Min of lstm_idx mapping is 1
assert min(lstm_idx_to_df_item.keys()) == 1

In [42]:
Counter(df['community'])

Counter({5: 4915,
         10: 6514,
         7: 5858,
         6: 8952,
         9: 19021,
         0: 19746,
         12: 5484,
         3: 3461,
         1: 11309,
         4: 5087,
         11: 4444,
         13: 831,
         2: 2679,
         8: 1699})

In [45]:
hm = defaultdict(int)
for data_point in all_data:
    gt = data_point[1]
    hm[gt] += 1

In [48]:
comms = defaultdict(int)
for k, v in hm.items():
    community = lstm_idx_to_community[k]
    comms[community] += v

In [49]:
comms

defaultdict(int,
            {5: 4519,
             10: 6039,
             7: 5349,
             6: 8432,
             9: 17281,
             0: 17072,
             12: 5072,
             3: 3090,
             1: 10405,
             11: 4181,
             13: 757,
             2: 2490,
             4: 4421,
             8: 1501})

In [58]:
with open('/raid/home/myang349/recsys-filterbubbles/myang_code_refactor/test.data', "wb+") as f:
    pickle.dump(lstm_idx_to_community, f)

In [59]:
with open('/raid/home/myang349/recsys-filterbubbles/myang_code_refactor/test.data', "rb") as f:
    hm2 = pickle.load(f)

In [61]:
hm2

{1: 5,
 2: 5,
 3: 5,
 4: 5,
 5: 5,
 6: 5,
 7: 5,
 8: 5,
 9: 5,
 10: 5,
 11: 5,
 12: 5,
 13: 5,
 14: 5,
 15: 5,
 16: 10,
 17: 5,
 18: 5,
 19: 7,
 20: 7,
 21: 7,
 22: 7,
 23: 7,
 24: 7,
 25: 7,
 26: 7,
 27: 7,
 28: 7,
 29: 6,
 30: 9,
 31: 9,
 32: 6,
 33: 9,
 34: 0,
 35: 5,
 36: 5,
 37: 0,
 38: 5,
 39: 0,
 40: 12,
 41: 0,
 42: 5,
 43: 5,
 44: 0,
 45: 5,
 46: 0,
 47: 0,
 48: 7,
 49: 7,
 50: 7,
 51: 9,
 52: 7,
 53: 7,
 54: 7,
 55: 7,
 56: 7,
 57: 7,
 58: 7,
 59: 7,
 60: 7,
 61: 0,
 62: 0,
 63: 0,
 64: 0,
 65: 0,
 66: 0,
 67: 0,
 68: 0,
 69: 0,
 70: 0,
 71: 0,
 72: 0,
 73: 3,
 74: 3,
 75: 3,
 76: 3,
 77: 10,
 78: 10,
 79: 10,
 80: 5,
 81: 5,
 82: 6,
 83: 6,
 84: 6,
 85: 6,
 86: 6,
 87: 6,
 88: 6,
 89: 6,
 90: 6,
 91: 6,
 92: 6,
 93: 6,
 94: 6,
 95: 6,
 96: 6,
 97: 6,
 98: 6,
 99: 6,
 100: 6,
 101: 6,
 102: 3,
 103: 6,
 104: 6,
 105: 6,
 106: 6,
 107: 6,
 108: 1,
 109: 1,
 110: 10,
 111: 6,
 112: 9,
 113: 0,
 114: 6,
 115: 6,
 116: 3,
 117: 6,
 118: 9,
 119: 9,
 120: 1,
 121: 9,
 122: 1,
 123

In [54]:
'/raid/home/myang349/recsys-filterbubbles/myang_code_refactor/test.data'

'/raid/home/myang349/recsys-filterbubbles/myang_code_refactor'