# Import necessary modules

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch.nn as nn
import pytorch_lightning as pl
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch
from collections import Counter
from tensorflow.keras.callbacks import TensorBoard

from sklearn.metrics import roc_auc_score, ndcg_score
from sklearn.preprocessing import LabelBinarizer

import os
os.environ['CRYPTOGRAPHY_OPENSSL_NO_LEGACY'] = '1'
import psycopg2


2023-12-04 04:35:25.270932: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-04 04:35:25.271019: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-04 04:35:25.278648: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Establish a connection to the PostgreSQL database
conn = psycopg2.connect(
    dbname="recome",
    user="postgres",
    password="postgres",
    host="localhost",
    port="5432"
)

# Create a cursor object
cur = conn.cursor()

# Close the connection
# cur.close()
# conn.close()

## Manual pre-processing of data
### behaviors.tsv

#### From documentation:  
The behaviors.tsv file contains the impression logs and users' news click histories. It has 5 columns divided by the tab symbol:
- Impression ID. The ID of an impression.  
- User ID. The anonymous ID of a user.  
- Time. The impression time with format "MM/DD/YYYY HH:MM:SS AM/PM".  
- History. The news click history (ID list of clicked news) of this user before this impression. The clicked news articles are ordered by time.  
- Impressions. List of news displayed in this impression and user's click behaviors on them (1 for click and 0 for non-click). The orders of news in a impressions have been shuffled.  

In [78]:
# Execute a query for behaviours
cur.execute("SELECT * FROM behaviours")
# Fetch all the rows for news
rows = cur.fetchall()
column_names = [desc[0] for desc in cur.description] # Get the column names 
raw_behaviour = pd.DataFrame(rows, columns=column_names) # Create a DataFrame from the rows, with the column names

print(f"The dataset originally consists of {len(raw_behaviour)} number of interactions.")
raw_behaviour.head()

The dataset originally consists of 156966 number of interactions.


Unnamed: 0,id,userid,timestamp,click_history,impressions
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...


In [79]:
## Indexize users
unique_userIds = raw_behaviour['userid'].unique()
# Allocate a unique index for each user, but let the zeroth index be a UNK index:
ind2user = {idx +1: itemid for idx, itemid in enumerate(unique_userIds)}
user2ind = {itemid : idx for idx, itemid in ind2user.items()}
print(f"We have {len(user2ind)} unique users in the dataset")

# Create a new column with userIdx:
raw_behaviour['userIdx'] = raw_behaviour['userid'].map(lambda x: user2ind.get(x,0))
raw_behaviour.head()

We have 50001 unique users in the dataset


Unnamed: 0,id,userid,timestamp,click_history,impressions,userIdx
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0,1
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...,2
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...,3
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0,4
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...,5


## Load article data
We also need to get the content information of each article. We will use the news.tsv file to index the items.

In [80]:
# Execute a query for news
cur.execute("SELECT * FROM news1")
# Fetch all the rows for news
rows = cur.fetchall()
column_names = [desc[0] for desc in cur.description] # Get the column names 
news = pd.DataFrame(rows, columns=column_names) # Create a DataFrame from the rows, with the column names
print(f"The news data consist in total of {len(news)} number of news.")

# Build index of items
ind2item = {idx +1: itemid for idx, itemid in enumerate(news['id'].values)}
item2ind = {itemid : idx for idx, itemid in ind2item.items()}

news.head()

The news data consist in total of 51282 number of news.


Unnamed: 0,id,category,subcategory,title,abstract,url,title_entities,abstract_entities,author,date
0,N22592,finance,personalfinance,How much do you need to be 'rich' in each state?,Here's what 'rich' is in each state,https://assets.msn.com/labs/mind/BBWKtgZ.html,[],[],Bob Wilson,"May 28, 2019"
1,N41390,weather,weathertopstories,WBZ Forecast For November 9,Dave Epstein has your latest weather forecast.,https://assets.msn.com/labs/mind/BBWvTFM.html,[],[],Eleanor Miller,"November 28, 2019"
2,N30900,sports,more_sports,"The Day in Sports: Thursday, Nov 7, 2019","The Day in Sports: Thursday, Nov 7, 2019",https://assets.msn.com/labs/mind/BBWqH4M.html,[],[],Bob Miller,"February 2, 2019"
3,N4581,sports,football_nfl,"Dolphins at Colts final score, recap, and imme...",,https://assets.msn.com/labs/mind/BBWyJ7j.html,"[{""Label"": ""Indianapolis Colts"", ""Type"": ""O"", ...",[],Alice Smith,"April 1, 2019"
4,N50311,sports,more_sports,Stolen gun found in car at Union County footba...,,https://assets.msn.com/labs/mind/BBWvUCR.html,[],[],Eleanor Jones,"May 3, 2019"


Now we need to process the click history and impressions. We need to both indexize the strings, but also to decode impressions into clicks and non-clicks.

In [81]:
# Indexize click history field
def process_click_history(s):
    list_of_strings = str(s).split(" ")
    return [item2ind.get(l, 0) for l in list_of_strings]
        
raw_behaviour['click_history_idx'] = raw_behaviour.click_history.map(lambda s:  process_click_history(s))
raw_behaviour.head()

Unnamed: 0,id,userid,timestamp,click_history,impressions,userIdx,click_history_idx
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0,1,"[23289, 10193, 15030, 19962, 23924, 5757, 4571..."
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...,2,"[23471, 10198, 28316, 28306, 13020, 13870, 125..."
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...,3,"[6226, 5079, 11612, 23917, 8527, 20061, 21160,..."
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0,4,"[30014, 18078, 41100, 7543, 20856, 15901, 1506..."
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...,5,"[21031, 5662, 19933, 27864]"


In [82]:
# collect one click and one no-click from impressions:
def process_impression(s):
    list_of_strings = s.split(" ")
    itemid_rel_tuple = [l.split("-") for l in list_of_strings]
    noclicks = []
    for entry in itemid_rel_tuple:
        # print(entry[1])
        if entry[1] == '0':
            noclicks.append(entry[0])
        if entry[1] == '1':
            click = entry[0]
    return noclicks, click

raw_behaviour['noclicks'], raw_behaviour['click'] = zip(*raw_behaviour['impressions'].map(process_impression))
# We can then indexize these two new columns:
raw_behaviour['noclicks'] = raw_behaviour['noclicks'].map(lambda list_of_strings: [item2ind.get(l, 0) for l in list_of_strings])
raw_behaviour['click'] = raw_behaviour['click'].map(lambda x: item2ind.get(x,0))

raw_behaviour.head()

Unnamed: 0,id,userid,timestamp,click_history,impressions,userIdx,click_history_idx,noclicks,click
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0,1,"[23289, 10193, 15030, 19962, 23924, 5757, 4571...",[875],35475
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...,2,"[23471, 10198, 28316, 28306, 13020, 13870, 125...","[32799, 38100, 30822, 31130, 39081, 32645, 379...",28611
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...,3,"[6226, 5079, 11612, 23917, 8527, 20061, 21160,...","[34426, 29564, 33673, 37551, 1974, 33337, 4694...",6473
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0,4,"[30014, 18078, 41100, 7543, 20856, 15901, 1506...","[875, 29412, 523]",892
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...,5,"[21031, 5662, 19933, 27864]","[3090, 29486, 34194, 32645, 39616, 600, 31036,...",44994


In [44]:
# convert timestamp value to hours since epoch
raw_behaviour['epochhrs'] = pd.to_datetime(raw_behaviour['timestamp']).values.astype(np.int64)/(1e6)/1000/3600
raw_behaviour['epochhrs'] = raw_behaviour['epochhrs'].round()
raw_behaviour[['click','epochhrs']].groupby("click").min("epochhrs").reset_index()

Unnamed: 0,click,epochhrs
0,65,437045.0
1,73,437029.0
2,196,437044.0
3,207,437022.0
4,319,437020.0
...,...,...
6346,51007,437018.0
6347,51102,437101.0
6348,51181,437097.0
6349,51199,437096.0


In [45]:
raw_behaviour

Unnamed: 0,impressionId,userId,timestamp,click_history,impressions,userIdx,click_history_idx,noclicks,click,epochhrs
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0,1,"[4331, 42517, 47354, 1004, 4966, 38081, 26760,...",[33199],16517,437073.0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...,2,"[4513, 42522, 9358, 9348, 45344, 46194, 44866,...","[13841, 19142, 11864, 12172, 20123, 13687, 189...",9653,437106.0
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...,3,"[38550, 37403, 43936, 4959, 40851, 1103, 2202,...","[15468, 10606, 14715, 18593, 34298, 14379, 279...",38797,437143.0
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0,4,"[11056, 50402, 22142, 39867, 1898, 48225, 4738...","[33199, 10454, 32847]",33216,437069.0
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...,5,"[2073, 37986, 975, 8906]","[35414, 10528, 15236, 13687, 20658, 32924, 120...",26036,437104.0
...,...,...,...,...,...,...,...,...,...,...
156960,156961,U21593,11/14/2019 10:24:05 PM,N7432 N58559 N1954 N43353 N14343 N13008 N28833...,N2235-0 N22975-0 N64037-0 N47652-0 N11378-0 N4...,7282,"[174, 51041, 42969, 46194, 47898, 1219, 47929,...","[17534, 14451, 27958, 9276, 15631, 17826, 2860...",29309,437158.0
156961,156962,U10123,11/13/2019 6:57:04 AM,N9803 N104 N24462 N57318 N55743 N40526 N31726 ...,N3841-0 N61571-0 N58813-0 N28213-0 N4428-0 N25...,1036,"[5564, 39144, 50528, 39807, 26818, 58, 49391, ...","[15255, 18776, 21346, 38856, 16343, 11670, 207...",19750,437119.0
156962,156963,U75630,11/14/2019 10:58:13 AM,N29898 N59704 N4408 N9803 N53644 N26103 N812 N...,N55913-0 N62318-0 N53515-0 N10960-0 N9135-0 N5...,4834,"[23648, 5371, 8375, 5564, 34288, 41875, 45447,...","[14213, 16700, 13343, 3249, 26786, 12751, 1131...",29461,437147.0
156963,156964,U44625,11/13/2019 2:57:02 PM,N4118 N47297 N3164 N43295 N6056 N38747 N42973 ...,N6219-0 N3663-0 N31147-0 N58363-0 N4107-0 N457...,30674,"[29379, 47380, 35643, 36911, 15789, 34625, 479...","[19959, 18716, 27498, 26442, 11994, 27473, 192...",17670,437127.0


# Modeling
We want to make a matrix factorization model where each user $u$ has a d-dimensional parameter vector $z_u$ and each item $i$ has a parameter vector $v_i$. This implies that we do not need the click_history_idx column when we train the model (why?).

Second, to simplify the computation of things we assume that the user only considers two items in each interaction: The item the user clicked on, and the first item in the noclicks list (what are we missing by this assumption?).

Hence, our dataset consist of a `behaviour` dataframe and a `news` dataframe with content information of the news articles. We will use a language model to interpret our article data, and for simplicity we will only use the title as the text input here.

In [46]:
raw_behaviour['noclick'] = raw_behaviour['noclicks'].map(lambda x : x[0])
behaviour = raw_behaviour[['epochhrs','userIdx','click_history_idx','noclick','click']]
behaviour.head()

Unnamed: 0,epochhrs,userIdx,click_history_idx,noclick,click
0,437073.0,1,"[4331, 42517, 47354, 1004, 4966, 38081, 26760,...",33199,16517
1,437106.0,2,"[4513, 42522, 9358, 9348, 45344, 46194, 44866,...",13841,9653
2,437143.0,3,"[38550, 37403, 43936, 4959, 40851, 1103, 2202,...",15468,38797
3,437069.0,4,"[11056, 50402, 22142, 39867, 1898, 48225, 4738...",33199,33216
4,437104.0,5,"[2073, 37986, 975, 8906]",35414,26036


In [47]:
# Let us use the last 10pct of the data as our validation data:
test_time_th = behaviour['epochhrs'].quantile(0.9)
train = behaviour[behaviour['epochhrs']< test_time_th]
valid =  behaviour[behaviour['epochhrs']>= test_time_th]

In [48]:
class MindDataset(Dataset):
    # A fairly simple torch dataset module that can take a pandas dataframe (as above), 
    # and convert the relevant fields into a dictionary of arrays that can be used in a dataloader
    def __init__(self, df):
        # Create a dictionary of tensors out of the dataframe
        self.data = {
            'userIdx' : torch.tensor(df.userIdx.values),
            'click' : torch.tensor(df.click.values),
            'noclick' : torch.tensor(df.noclick.values)
        }
    def __len__(self):
        return len(self.data['userIdx'])
    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.data.items()}

In [53]:
# Build datasets and dataloaders of train and validation dataframes:
bs = 1024
ds_train = MindDataset(train)
train_loader = DataLoader(ds_train, batch_size=bs, shuffle=True)
ds_valid = MindDataset(valid)
valid_loader = DataLoader(ds_valid, batch_size=bs, shuffle=False)

batch = next(iter(train_loader))

In [54]:
batch["noclick"]

tensor([20021, 17444,  9575,  ..., 27984, 21075, 13531])

## Model

#### Framework
We will use pytorch-lightning to define and train our model. It is a high-level framework (similar to fastAI) but with a slightly different way of defining things. It is my personal go-to framework and is very flexible. For more information, see https://pytorch-lightning.readthedocs.io/.

#### The model
We assume that each interaction goes as follow: the user is presented with two items: the click and no-click item. After the user reviewed both items, she will choose the most relevant one. This can be modeled as a categorical distirbution with two options (yes, you could do binomial). There is a loss function in pytorch for this already, called the `F.cross_entropy` that we will use.

In [55]:
# Build a matrix factorization model
class NewsMF(pl.LightningModule):
    def __init__(self, num_users, num_items, dim = 10):
        super().__init__()
        self.dim=dim
        self.useremb = nn.Embedding(num_embeddings=num_users, embedding_dim=dim)
        self.itememb = nn.Embedding(num_embeddings=num_items, embedding_dim=dim)
    
    def forward(self, user, item):
        batch_size = user.size(0)
        uservec = self.useremb(user)
        itemvec = self.itememb(item)

        score = (uservec*itemvec).sum(-1).unsqueeze(-1)
        
        return score
    
    def training_step(self, batch, batch_idx):
        batch_size = batch['userIdx'].size(0)

        score_click = self.forward(batch['userIdx'], batch['click'])
        score_noclick = self.forward(batch['userIdx'], batch['noclick'])
        
        scores_all = torch.concat((score_click, score_noclick), dim=1)
        # Compute loss as cross entropy (categorical distribution between the clicked and the no clicked item)
        loss = F.cross_entropy(input=scores_all, target=torch.zeros(batch_size, device=scores_all.device).long())
        
        return loss
    
    def validation_step(self, batch, batch_idx):
        # for now, just do the same computation as during training
        loss = self.training_step(batch, batch_idx)
        self.log("val_loss", loss, prog_bar=True, on_step=False, on_epoch=True)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer
    

In [56]:
# Build and train the model
# mf_model = NewsMF(num_users=len(ind2user)+1, num_items = len(ind2item)+1)
# trainer = pl.Trainer(max_epochs=10, accelerator="gpu", logger=logger)
# trainer.fit(model=mf_model, train_dataloaders=train_loader, val_dataloaders=valid_loader)

# Save the model
# trainer.save_checkpoint("model_user.ckpt")

# Load the trained model
mf_model = NewsMF.load_from_checkpoint(checkpoint_path="model_user.ckpt", num_users=len(ind2user) + 1, num_items=len(ind2item) + 1)

# Sense Check

In [49]:
USER_ID = 1

# Suggested Items

In [57]:
# Create item_ids and user ids list
item_id = list(ind2item.keys())
userIdx =  [USER_ID]*len(item_id)


preditions = mf_model.forward(torch.IntTensor(userIdx), torch.IntTensor(item_id))

# Select top 10 argmax
top_index = torch.topk(preditions.flatten(), 10).indices

# Filter for top 10 suggested items
filters = [ind2item[ix.item()] for ix in top_index]
news[news["id"].isin(filters)]

Unnamed: 0,id,category,subcategory,title,abstract,url,title_entities,abstract_entities,author,date
4179,N17104,news,newscrime,"Veterans suffered, investors lost millions in ...",People lost hundreds of millions of dollars. B...,https://assets.msn.com/labs/mind/AAJUAA8.html,[],[],Jack Smith,"May 12, 2019"
13719,N53056,sports,mma,Dana White confirms former title challenger Al...,,https://assets.msn.com/labs/mind/BBWq6BK.html,"[{""Label"": ""Alexander Gustafsson"", ""Type"": ""P""...",[],Alice Miller,"August 3, 2019"
19327,N35089,news,newscrime,Lamorris Robinson's family releases video of h...,"The video shows Charles Kalb, 56, shooting and...",https://assets.msn.com/labs/mind/BBWIel9.html,[],[],Eleanor Wilson,"June 19, 2019"
19623,N56498,sports,football_nfl,Where would the Browns be without Nick Chubb? ...,Nick Chubb had a big day on Sunday. (John Kunt...,https://assets.msn.com/labs/mind/BBWBPwm.html,"[{""Label"": ""Cleveland Browns"", ""Type"": ""O"", ""W...","[{""Label"": ""Cleveland Browns"", ""Type"": ""O"", ""W...",Charlie Smith,"June 1, 2019"
20437,N49962,sports,football_nfl,Buccaneers vs Cardinals day after reactions,The offense moved the ball all game while the ...,https://assets.msn.com/labs/mind/BBWBfaI.html,"[{""Label"": ""Tampa Bay Buccaneers"", ""Type"": ""O""...",[],Felix Brown,"February 6, 2019"
27433,N1525,video,news,AP Top Stories November 11 A,Here's the latest for Monday November 11th: Ho...,https://assets.msn.com/labs/mind/BBWB3Vt.html,"[{""Label"": ""Middle East Economic Survey"", ""Typ...","[{""Label"": ""Hong Kong Police Force"", ""Type"": ""...",Jack Miller,"October 12, 2019"
33607,N43731,travel,travelnews,Military training plane goes off runway at New...,A Contract Military Aircraft crashed at the Ne...,https://assets.msn.com/labs/mind/AAJ7ezF.html,"[{""Label"": ""Newport News/Williamsburg Internat...","[{""Label"": ""Newport News/Williamsburg Internat...",Alice Williams,"September 30, 2019"
39728,N30271,sports,football_nfl,Five winners and three losers from the Chiefs'...,The Chiefs had a few guys turn in winning perf...,https://assets.msn.com/labs/mind/AAJrX37.html,"[{""Label"": ""Kansas City Chiefs"", ""Type"": ""O"", ...","[{""Label"": ""Kansas City Chiefs"", ""Type"": ""O"", ...",Alice Williams,"November 30, 2019"
41170,N36704,sports,football_nfl,H.S. FOOTBALL: Wagner-Veterans Memorial showdo...,On a collision course since the start of the s...,https://assets.msn.com/labs/mind/AAJF4GD.html,[],"[{""Label"": ""Judson Independent School District...",Bob Miller,"October 21, 2019"
43786,N2172,finance,financenews,"Voters approve I-976, the $30 car tab measure....","Results are still trickling in, but in the fir...",https://assets.msn.com/labs/mind/AAJXIHA.html,[],[],Felix Wilson,"August 27, 2019"


# Historical Items

In [None]:
click_ids = behaviour[behaviour["userIdx"]==USER_ID]["click"].values
ll = lambda x: "N"+str(x)

click_ids = [ll(each) for each in click_ids]

news[news["itemId"].isin(click_ids)]

Unnamed: 0,itemId,category,subcategory,title,abstract,url,title_entities,abstract_entities
20810,N33900,sports,football_nfl,Patriots Vs. Jets Live: New England Steamrolls...,Final Patriots - : The New England Patriots im...,https://assets.msn.com/labs/mind/AAJ7aqU.html,"[{""Label"": ""New England Patriots"", ""Type"": ""O""...","[{""Label"": ""New England Patriots"", ""Type"": ""O""..."
28754,N32675,news,newsus,Northbound Texas 130 in Pflugerville closing f...,Northbound Texas 130 at Pflugerville Parkway w...,https://assets.msn.com/labs/mind/AAJpxWP.html,"[{""Label"": ""Texas"", ""Type"": ""G"", ""WikidataId"":...","[{""Label"": ""Texas"", ""Type"": ""G"", ""WikidataId"":..."
33795,N34112,travel,travelarticle,Budweiser Clydesdale to strut stuff at Jackson...,Residents can get an up-close glimpse or selfi...,https://assets.msn.com/labs/mind/BBWLjXu.html,"[{""Label"": ""Winn-Dixie"", ""Type"": ""O"", ""Wikidat...","[{""Label"": ""Winn-Dixie"", ""Type"": ""O"", ""Wikidat..."
33846,N32603,news,newsus,Joke Made About President Trump During North S...,An inappropriate joke made during a North Surr...,https://assets.msn.com/labs/mind/BBWJlkC.html,[],"[{""Label"": ""White House"", ""Type"": ""F"", ""Wikida..."
43516,N39247,news,newsus,Allegheny County Controller Chelsa Wagner Goes...,Allegheny County Controller Chelsa Wagner goes...,https://assets.msn.com/labs/mind/BBWBTtb.html,[],"[{""Label"": ""Detroit"", ""Type"": ""G"", ""WikidataId..."
44747,N45975,video,news,Desperation? Trump's top aide now suing Trump ...,The potential linchpin to the impeachment case...,https://assets.msn.com/labs/mind/BBWBZ1g.html,[],"[{""Label"": ""Ari Melber"", ""Type"": ""P"", ""Wikidat..."
45916,N43459,sports,football_nfl,Will Cain Ranks Three Teams Ahead Of Patriots ...,https://www.dailymotion.com/embed/video/x ns u...,https://assets.msn.com/labs/mind/BBWwsaG.html,"[{""Label"": ""New England Patriots"", ""Type"": ""O""...","[{""Label"": ""New England Patriots"", ""Type"": ""O""..."
