In [None]:
import pickle
import numpy as np
import pandas as pd
import os
from ivis import Ivis
from scipy.spatial import distance_matrix
import sys

# Put the parent directory on the module search path; the project import in
# the next cell presumably relies on this — TODO confirm.
sys.path.append('../')

In [2]:
from userdata_mining.embedding import Embedding

In [None]:
def popt20(y_true, y_pred):
    """
    Returns the popt20 metric. Lower is better.

    Samples are visited in order of decreasing predicted probability until
    the number of positives seen reaches 20% of the total number of samples
    (or until every sample has been visited). The result is the fraction of
    samples that had to be visited.

    :param y_true: The true labels; any array-like of truthy/falsy values
        (list, ndarray, pandas Series, ...).
    :param y_pred: The prediction probabilities, aligned position-by-position
        with y_true.
    :return: A float in (0, 1]: visited samples / total samples.
    :raises ZeroDivisionError: If y_true is empty.
    """
    # Work on flat positional arrays. Indexing a pandas Series with the
    # positions produced by argsort would do a label lookup and fail (or pick
    # the wrong row) whenever the index is not 0..n-1, and column-vector
    # predictions of shape (n, 1) would be sorted along the wrong axis.
    labels = np.asarray(y_true).ravel()
    scores = np.asarray(y_pred).ravel()

    total = len(labels)
    target = 0.2 * total
    found = 0
    used = 0

    # Highest predicted probability first.
    for idx in np.argsort(scores)[::-1]:
        used += 1
        if labels[idx]:
            found += 1

        if found >= target:
            break

    return used / total

## In n-d space

In [2]:
# Load the pickled embeddings dictionary (one entry per data source).
# NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
with open('../saved/embeddings/rahul.pickle', 'rb') as handle:
    data = pickle.load(handle)

In [3]:
# List the data sources available in the loaded dictionary.
data.keys()

dict_keys(['Autofill', 'Browser History', 'Hangouts', 'Travel', 'Nearby Places', 'Email', 'YouTube comments', 'YouTube subscriptions', 'YouTube liked videos', 'YouTube watch history'])

In [4]:
# Pull out the four data sources compared against the products below.
hangouts, yt_history, email, yt_liked = [
    data[source]
    for source in ('Hangouts', 'YouTube watch history', 'Email', 'YouTube liked videos')
]

In [26]:
# Read the first two columns of the labels CSV.
products_df = pd.read_csv('../data/rahul/labels.csv', 
                          usecols=range(2), 
                          header=0, 
                          lineterminator='\n')
# NOTE(review): when cells run top to bottom, `products` is rebound to a plain
# list by the next code cell, so this Series is not what later cells consume.
products = products_df['product']

In [29]:
# Build the product texts from every file in ../products/, dropping each
# file's first line (presumably a header — TODO confirm) and replacing commas
# with spaces.
# os.listdir returns entries in arbitrary order, so sort them: later cells
# address products by position (e.g. products[12]), which is only meaningful
# if this order is reproducible.
# NOTE(review): sorting may change the order relative to previously recorded
# outputs — re-check any hard-coded product indices after re-running.
products = []
for file in sorted(os.listdir('../products/')):
    # Explicit encoding: the product texts contain non-ASCII punctuation and
    # the platform default encoding is not guaranteed to be UTF-8.
    # NOTE(review): assumes the files are UTF-8 encoded — confirm.
    with open(f'../products/{file}', 'r', encoding='utf-8') as f:
        lines = f.readlines()[1:]
    lines = [x.replace(',', ' ') for x in lines]

    products.extend(lines)

In [30]:
# Inspect the assembled product texts.
products

['Fire TV Stick 4K streaming device with Alexa Voice Remote | Dolby Vision | 2018 release Watch favorites from Netflix  YouTube  Prime Video  Disney+  Apple TV+  HBO Max  and more. Stream for free with Pluto TV  IMDb TV  and more.\n',
 'Echo Dot (3rd Gen) - Smart speaker with Alexa - Charcoal Meet Echo Dot - Our most popular smart speaker with a fabric design. It is our most compact smart speaker that fits perfectly into small spaces.\n',
 'Roku Express | HD Streaming Media Player with High Speed HDMI Cable and Simple Remote Streaming made easy: Roku express lets you stream free  live and premium TV over the internet right to your TV; it’s perfect for new users  secondary TVs and easy gifting but powerful enough for seasoned pros\n',
 'Amazon Smart Plug  works with Alexa – A Certified for Humans Device Amazon Smart Plug works with Alexa to add voice control to any outlet.\n',
 'All-new Echo Buds (2nd Gen) | Wireless earbuds with active noise cancellation and Alexa | Black Dynamic audio

In [32]:
# Embed every product text with the project's embedder, configured with the
# 'bert-base-uncased' model.
model = Embedding(model='bert-base-uncased')
product_embeddings = [model.embed(product) for product in products]

In [33]:
# Stack the per-product vectors into one array and check its shape.
product_embeddings = np.array(product_embeddings)
product_embeddings.shape

(50, 768)

In [34]:
# Minkowski distance with p=1 (Manhattan / L1) between every Hangouts
# embedding (rows) and every product embedding (columns).
distances = distance_matrix(hangouts, product_embeddings, p=1)

In [35]:
# Sanity check: one row per Hangouts embedding, one column per product.
distances.shape

(24105, 50)

In [40]:
# (row, product) coordinates of the 10 smallest entries of the distance matrix.
np.unravel_index(distances.argsort(axis=None)[:10], distances.shape)

(array([ 7388,  5051, 15046,  7019, 14513, 19974, 20611, 21418, 18864,
        10621]),
 array([12, 12, 12, 12, 12, 12, 12, 12, 12, 12]))

In [37]:
# The product that every one of the ten closest pairs points to.
products[12]

'Beautiful Scars Merry Clayton\n'

In [41]:
# L1 (p=1) distances from every Email embedding to every product embedding,
# then the (row, product) coordinates of the 10 closest pairs.
distances = distance_matrix(email, product_embeddings, p=1)
np.unravel_index(distances.argsort(axis=None)[:10], distances.shape)

(array([21513, 15122, 21514, 14286, 13040,  5010, 15990, 23532, 25783,
        19833]),
 array([12, 12, 12, 12, 12, 12, 12, 12, 12, 12]))

In [45]:
# L1 (p=1) distances from every YouTube watch-history embedding to every
# product embedding, then the (row, product) coordinates of the 100 closest pairs.
distances = distance_matrix(yt_history, product_embeddings, p=1)
np.unravel_index(distances.argsort(axis=None)[:100], distances.shape)

(array([4072,  263, 2671, 2171, 3799, 5643, 3920, 3924, 1720, 5352, 4020,
        4196, 5464, 4406, 4590, 4640, 4228, 4271, 4123,  867, 1592, 1069,
        3729, 5332, 1215,  959,  961,  863,  955,  963,  981,  997, 1021,
        1297,  182, 1081, 1131, 5623, 5661, 5679, 5694, 5696, 2349, 2372,
          14, 1971, 4573, 4682, 4277, 3212, 4782, 2599,  362, 1876, 4302,
        4752, 3821, 4080, 3606, 4429, 4751, 4171, 4431, 3596, 4585, 4167,
        4739, 4613, 4531, 4759, 4611, 4529, 4441, 4793, 3294, 4615, 4761,
        4523, 3614, 3602, 3553, 4778, 3983, 4550, 3229, 5204,  483,  368,
        4743, 4934,  174, 3810, 5649, 3099,  874, 5375, 1024, 3782, 3646,
        2367]),
 array([12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
        12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
        12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
        12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
        12, 12

In [46]:
# L1 (p=1) distances from every YouTube liked-video embedding to every
# product embedding, then the (row, product) coordinates of the 100 closest pairs.
distances = distance_matrix(yt_liked, product_embeddings, p=1)
np.unravel_index(distances.argsort(axis=None)[:100], distances.shape)

(array([101, 150, 268, 266, 334, 356, 277,  97,  34, 291, 228, 371, 139,
        169, 118, 193, 322, 327, 399,  37, 383, 198, 304, 157, 227,  56,
        280, 419, 158, 380, 237, 276, 375, 200, 230, 336, 185, 366, 295,
         96, 234, 111, 354, 404,  41, 178, 190, 100,  48, 119, 117, 216,
        382, 352, 391,  43,  66, 256, 290, 182, 197, 222, 137, 349, 195,
        239, 350, 251, 326, 365, 134,  81, 106, 292, 305, 342, 402,  42,
        283, 324, 325,  26, 373, 400, 186, 309, 348, 297, 170, 102, 196,
         36,  88,  35,  84, 116, 110, 120, 335, 113]),
 array([12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
        12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
        12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
        12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
        12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
        12, 12, 12, 12, 12, 12, 18, 12, 12, 12, 12, 12

## In 2-d space

In [47]:
# Rebind the four source variables to the contents of the numbered pickle
# files in the working directory (presumably the 2-d versions of the
# embeddings, given the section heading — TODO confirm).
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
loaded = []
for pickle_name in ('2.pickle', '4.pickle', '7.pickle', '8.pickle'):
    with open(pickle_name, 'rb') as f:
        loaded.append(pickle.load(f))
hangouts, email, yt_liked, yt_history = loaded

In [56]:
def _reduce_dims(arg):
    """
    Uses ivis to reduce dimensionality to 2.

    The neighbour count k scales with the number of rows m: 1% of m above
    200 rows, 10% above 50, 20% above 10, and max(40% of m, m - 3) otherwise.

    :param {Iterable} arg - an array-like object with one row per sample
    :return {np.ndarray} embedded object
    :raises ValueError: if there are too few rows to choose at least one
        neighbour
    """
    # Plain sequences have no .shape; convert them so the documented
    # "array-like" contract holds. Objects that already expose .shape are
    # passed through untouched.
    if not hasattr(arg, 'shape'):
        arg = np.asarray(arg)

    m = arg.shape[0]
    if m > 200:
        k = int(0.01 * m)
    elif m > 50:
        k = int(0.1 * m)
    elif m > 10:
        k = int(0.2 * m)
    else:
        k = max(int(0.4 * m), m - 3)

    # For fewer than 3 rows the rule above yields k == 0, which is not a
    # usable neighbour count; fail early with a clear message.
    if k < 1:
        raise ValueError(
            f'Cannot reduce dimensions of {m} row(s): at least 3 rows are '
            'needed to choose k >= 1.'
        )

    reducer = Ivis(embedding_dims=2, k=k, batch_size=2)
    return reducer.fit_transform(arg)

In [61]:
# Fit a fresh ivis model on the product embeddings and project them to 2-d.
# NOTE(review): if the 2-d user embeddings loaded from the pickles came from a
# separately fitted model, they may not share a coordinate space with these
# points, which would make the distances computed below hard to interpret —
# TODO confirm.
prod_embeddings = _reduce_dims(product_embeddings)

100%|██████████| 50/50 [00:00<00:00, 3584.20it/s]

Building KNN index
Extracting KNN neighbours



100%|██████████| 50/50 [00:17<00:00,  2.90it/s] 


Training neural network
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000


In [62]:
# L1 (p=1) distances from every Hangouts point to every reduced product
# point, then the (row, product) coordinates of the 10 closest pairs.
distances = distance_matrix(hangouts, prod_embeddings, p=1)
np.unravel_index(distances.argsort(axis=None)[:10], distances.shape)

(array([15403, 18855, 19427,  6401,  4368, 12331,  2368, 18077, 20544,
         8789]),
 array([19, 19, 12, 12, 19, 14, 12, 11, 10, 12]))

In [63]:
# Look up the distinct product indices returned for Hangouts.
products[19], products[12], products[14], products[11], products[10]

('Where Have You Gone Alan Jackson',
 'Beautiful Scars Merry Clayton\n',
 'In Another World Cheap Trick\n',
 "Fearless (Taylor's Version) [2 CD] Taylor Swift\n",
 'My Savior Carrie Underwood\n')

2/5 — two of the five distinct products retrieved for Hangouts are relevant.

In [65]:
# L1 (p=1) distances from every Email point to every reduced product point,
# then the (row, product) coordinates of the 10 closest pairs.
distances = distance_matrix(email, prod_embeddings, p=1)
np.unravel_index(distances.argsort(axis=None)[:10], distances.shape)

(array([ 8150, 13632, 18120, 11538, 13261, 15086, 19994, 22009, 13688,
        25708]),
 array([43, 49, 49, 49, 49, 49, 49, 49, 49, 49]))

In [66]:
# Look up the distinct product indices returned for Email.
products[43], products[49]

('Hammermill Printer Paper  20 lb Copy Paper  11 x 17 - 1 Ream (500 Sheets) - 92 Bright  Made in the USA  HAMMERMILL’S BEST SELLING PRINTER PAPER – You will receive one ream of paper with 500 sheets of 20lb  92 bright  11" x 17" white copy paper. Ledger Size  99.99% JAM-FREE PRINTER PAPER - Everyone hates paper jams...You can trust Hammermill paper quality to keep your printer running smoothly. Scroll down to view the Product Description for details  COLORLOK TECHNOLOGY INCLUDED - Colors on Hammermill copy paper are 30% brighter; blacks are up to 60% bolder and inks dry 3 times faster for less smearing. Acid-free Hammermill paper ensures long-lasting archival quality  MADE IN USA - Hammermill copying and printing papers are 100% made in the USA  helping to support 2. 4 million sustainable forestry jobs in America  including family tree farmers. Hammermill is more than just paper  MAKE SURE IT’S ORIGINAL HAMMERMILL QUALITY - Look for “Ships from and sold by ” at the top of the page  und

1/2 — one of the two distinct products retrieved for Email is relevant.

In [67]:
# L1 (p=1) distances from every YouTube liked-video point to every reduced
# product point, then the (row, product) coordinates of the 10 closest pairs.
distances = distance_matrix(yt_liked, prod_embeddings, p=1)
np.unravel_index(distances.argsort(axis=None)[:10], distances.shape)

(array([357, 385, 221, 221, 219, 262, 357, 357, 211, 254]),
 array([41, 37, 41, 35, 41, 41, 35, 34,  2, 34]))

In [68]:
# Look up the distinct product indices returned for YouTube liked videos.
products[41], products[37], products[35], products[34], products[2]

('Scotch Heavy Duty Shipping Packaging Tape  6 Rolls with Dispenser \n',
 'Twin Sheet Set - 3 Piece College Dorm Room Bed Sheets - Hotel Luxury Bed Sheets - Extra Soft Sheets - Deep Pockets - Easy Fit - Breathable & Cooling Sheets – Bed Sheets - Twin - Twin Mattress Sheets\n',
 'LiBa PEVA 8G Bathroom Small Shower Stall Curtain Liner "36"" W x 72"" H Narrow Size  Clear  8G Heavy Duty Waterproof Shower Stall Curtain Liner"\n',
 'Beckham Hotel Collection Bed Pillows for Sleeping  "Queen Size  Set of 2 - Soft Allergy Friendly  Cooling  Luxury Gel Pillow for Back  Stomach or Side Sleepers"\n',
 'Roku Express | HD Streaming Media Player with High Speed HDMI Cable and Simple Remote Streaming made easy: Roku express lets you stream free  live and premium TV over the internet right to your TV; it’s perfect for new users  secondary TVs and easy gifting but powerful enough for seasoned pros\n')

2/5 — two of the five distinct products retrieved for YouTube liked videos are relevant.

In [69]:
# L1 (p=1) distances from every YouTube watch-history point to every reduced
# product point, then the (row, product) coordinates of the 10 closest pairs.
distances = distance_matrix(yt_history, prod_embeddings, p=1)
np.unravel_index(distances.argsort(axis=None)[:10], distances.shape)

(array([5202,  636,   44, 2747, 3699, 3863, 3736, 3769, 2569, 3363]),
 array([43, 42, 46,  7,  2, 37, 46,  5, 30, 44]))

In [70]:
# Look up a subset of the product indices returned for the YouTube watch history.
products[42], products[46], products[7], products[5], products[30], products[44]

('HP 63 | Ink Cartridge | Black | F6U62AN   6 Rolls with Dispenser Make sure this fits by entering your model number.  HP ENVY 4511  4512  4513  4520  4522  4523  4524. HP Officejet 3830  3831  3833  4650  4652  4654  4655  5220  5222  5230  5232  5255  5260  5264. HP 63 ink cartridges work with: HP Deskjet 1112  2130  2132  3630  3631  3632  3633  3634  3636  3637  Up to 2x more prints with Original HP ink vs refill cartridges.  Cartridge yield (approx.): 190 pages  Original HP ink cartridges: genuine ink for your HP printer.\n',
 'Amazon Basics 6-Sheet Cross-Cut Paper and Credit Card Home Office Shredder  6 sheet crosscut paper/credit card shredder  Auto Start and overheat protection  Thermal Protection to prevent overheating  manual reverse to clear paper jams  shreds credit cards; small paper clips; staples  Important Note: Do not spray or keep any aerosol products in or around the shredder and do not shred items like metallic credit cards\n',
 'Roku Streaming Stick+ | HD/4K/HDR St

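1/6 — one of the six newly retrieved products for the YouTube watch history is relevant (products 43, 2 and 37 were already counted above).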

precision = 6 / 18 ≈ 0.33 (relevant products over all distinct retrieved products, pooled across the four sources)

In [73]:
# Print every product next to its positional index, for reading the index
# arrays returned by the nearest-pair lookups.
from pprint import pprint
pprint(list(enumerate(products)))

[(0,
  'Fire TV Stick 4K streaming device with Alexa Voice Remote | Dolby Vision | '
  '2018 release Watch favorites from Netflix  YouTube  Prime Video  Disney+  '
  'Apple TV+  HBO Max  and more. Stream for free with Pluto TV  IMDb TV  and '
  'more.\n'),
 (1,
  'Echo Dot (3rd Gen) - Smart speaker with Alexa - Charcoal Meet Echo Dot - '
  'Our most popular smart speaker with a fabric design. It is our most compact '
  'smart speaker that fits perfectly into small spaces.\n'),
 (2,
  'Roku Express | HD Streaming Media Player with High Speed HDMI Cable and '
  'Simple Remote Streaming made easy: Roku express lets you stream free  live '
  'and premium TV over the internet right to your TV; it’s perfect for new '
  'users  secondary TVs and easy gifting but powerful enough for seasoned '
  'pros\n'),
 (3,
  'Amazon Smart Plug  works with Alexa – A Certified for Humans Device Amazon '
  'Smart Plug works with Alexa to add voice control to any outlet.\n'),
 (4,
  'All-new Echo Buds (2nd Ge

In [3]:
# Product indices treated as the ground-truth relevant set by the metric
# cells below (hard-coded; presumably hand-labelled — TODO confirm).
actual = [0, 1, 2, 6, 7, 9, 10, 11, 16, 40, 41, 43]

In [4]:
# Hard-coded product indices used as the retrieved set; np.unique drops the
# duplicates and sorts them.
preds = np.unique([19, 12, 14, 10, 11, 12, 49, 43, 41, 37, 35, 34, 2, 42, 46, 2, 7, 37, 5, 30, 44])

In [5]:
# Tally the confusion-matrix cells by comparing the retrieved product indices
# (`preds`) with the relevant ones (`actual`) over the index range 0..49.

# Every (prediction, label) pair that agrees counts as a true positive.
tp = sum(1 for p in preds for a in actual if a == p)
# Predictions that match no label are false positives.
fp = sum(1 for p in preds if not any(a == p for a in actual))
# Labels that were never predicted are false negatives.
fn = sum(1 for a in actual if not any(a == p for p in preds))
# Indices that are neither labelled nor predicted are true negatives.
tn = sum(1 for i in range(50) if i not in actual and i not in preds)

In [6]:
# Retrieval metrics derived from the confusion-matrix counts; `/` is true
# division, so no float coercion is needed.
precision = tp / (tp + fp)
recall = tp / (tp + fn)
acc = (tp + tn) / (tp + fp + fn + tn)

In [7]:
# Show the three metrics together.
precision, recall, acc

(0.3333333333333333, 0.5, 0.64)

In [8]:
# Harmonic mean of precision and recall; raises ZeroDivisionError when both are 0.
f1 = 2 * precision * recall / (precision + recall)

In [9]:
# Show the F1 score.
f1

0.4