# Import necessary modules

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch.nn as nn
import pytorch_lightning as pl
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch
from collections import Counter
from tensorflow.keras.callbacks import TensorBoard

from sklearn.metrics import roc_auc_score, ndcg_score
from sklearn.preprocessing import LabelBinarizer

import os
os.environ['CRYPTOGRAPHY_OPENSSL_NO_LEGACY'] = '1'
import psycopg2


2023-12-04 04:35:25.270932: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-04 04:35:25.271019: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-04 04:35:25.278648: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Establish a connection to the PostgreSQL database
conn = psycopg2.connect(
    dbname="recome",
    user="postgres",
    password="postgres",
    host="localhost",
    port="5432"
)

# Create a cursor object
cur = conn.cursor()

# Close the connection
# cur.close()
# conn.close()

## Manual pre-processing of data
### behaviors.tsv

#### From documentation:  
The behaviors.tsv file contains the impression logs and users' news click histories. It has 5 columns divided by the tab symbol:
- Impression ID. The ID of an impression.  
- User ID. The anonymous ID of a user.  
- Time. The impression time with format "MM/DD/YYYY HH:MM:SS AM/PM".  
- History. The news click history (ID list of clicked news) of this user before this impression. The clicked news articles are ordered by time.  
- Impressions. List of news displayed in this impression and user's click behaviors on them (1 for click and 0 for non-click). The orders of news in a impressions have been shuffled.  

In [83]:
# Execute a query for behaviours
cur.execute("SELECT * FROM behaviours")
# Fetch all the rows for news
rows = cur.fetchall()
column_names = [desc[0] for desc in cur.description] # Get the column names 
raw_behaviour_users = pd.DataFrame(rows, columns=column_names) # Create a DataFrame from the rows, with the column names

print(f"The dataset originally consists of {len(raw_behaviour_users)} number of interactions.")
raw_behaviour_users.head()

The dataset originally consists of 156966 number of interactions.


Unnamed: 0,id,userid,timestamp,click_history,impressions
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...


In [84]:
## Indexize users
unique_userIds_user = raw_behaviour_users['userid'].unique()
# Allocate a unique index for each user, but let the zeroth index be a UNK index:
ind2user_user = {idx +1: itemid for idx, itemid in enumerate(unique_userIds_user)}
user2ind_user = {itemid : idx for idx, itemid in ind2user_user.items()}
print(f"We have {len(user2ind_user)} unique users in the dataset")

# Create a new column with userIdx:
raw_behaviour_users['userIdx'] = raw_behaviour_users['userid'].map(lambda x: user2ind_user.get(x,0))
raw_behaviour_users.head()

We have 50001 unique users in the dataset


Unnamed: 0,id,userid,timestamp,click_history,impressions,userIdx
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0,1
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...,2
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...,3
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0,4
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...,5


## Load article data
We also need to get the content information of each article. We will use the news.tsv file to index the items.

In [85]:
# Execute a query for news
cur.execute("SELECT * FROM news1")
# Fetch all the rows for news
rows = cur.fetchall()
column_names = [desc[0] for desc in cur.description] # Get the column names 
news = pd.DataFrame(rows, columns=column_names) # Create a DataFrame from the rows, with the column names
print(f"The news data consist in total of {len(news)} number of news.")

# Build index of items
ind2item = {idx +1: itemid for idx, itemid in enumerate(news['id'].values)}
item2ind = {itemid : idx for idx, itemid in ind2item.items()}

news.head()

The news data consist in total of 51282 number of news.


Unnamed: 0,id,category,subcategory,title,abstract,url,title_entities,abstract_entities,author,date
0,N31311,sports,football_nfl,"Eagles coach Doug Pederson's press conference,...",Eagles coach Doug Pederson will hold his usual...,https://assets.msn.com/labs/mind/BBWHKiV.html,"[{""Label"": ""Doug Pederson"", ""Type"": ""P"", ""Wiki...","[{""Label"": ""Doug Pederson"", ""Type"": ""P"", ""Wiki...",Grace Brown,"March 30, 2019"
1,N49514,news,newsus,Supreme Court says Ginsburg home sick with sto...,WASHINGTON (AP) The Supreme Court says Justi...,https://assets.msn.com/labs/mind/BBWHKmN.html,"[{""Label"": ""Ruth Bader Ginsburg"", ""Type"": ""P"",...","[{""Label"": ""Ruth Bader Ginsburg"", ""Type"": ""P"",...",Grace Taylor,"August 18, 2019"
2,N62104,travel,travelnews,Top Jacksonville news: Pub to close; man with ...,Here's the most recent top news in Jacksonvill...,https://assets.msn.com/labs/mind/BBWL8zg.html,"[{""Label"": ""Jacksonville, Florida"", ""Type"": ""G...","[{""Label"": ""Jacksonville, Florida"", ""Type"": ""G...",Felix Brown,"June 4, 2019"
3,N8020,video,news,What's pulling America apart?,Americans are more politically divided than ev...,https://assets.msn.com/labs/mind/BBWBMTs.html,[],[],Bob Johnson,"April 15, 2019"
4,N41002,news,newsus,"Woman, 23, killed by falling branch brightened...",The Boca Raton woman killed by a falling branc...,https://assets.msn.com/labs/mind/BBWEqVS.html,[],"[{""Label"": ""Boca Raton, Florida"", ""Type"": ""G"",...",Henry Smith,"January 25, 2019"


Now we need to process the click history and impressions. We need to both indexize the strings, but also to decode impressions into clicks and non-clicks.

In [86]:
# Indexize click history field
def process_click_history_user(s):
    list_of_strings = str(s).split(" ")
    return [item2ind.get(l, 0) for l in list_of_strings]
        
raw_behaviour_users['click_history_idx'] = raw_behaviour_users.click_history.map(lambda s:  process_click_history_user(s))
raw_behaviour_users.head()

Unnamed: 0,id,userid,timestamp,click_history,impressions,userIdx,click_history_idx
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0,1,"[38120, 25024, 29861, 34793, 38755, 20588, 926..."
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...,2,"[38302, 25029, 43147, 43137, 27851, 28701, 273..."
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...,3,"[21057, 19910, 26443, 38748, 23358, 34892, 359..."
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0,4,"[44845, 32909, 4649, 22374, 35687, 30732, 2989..."
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...,5,"[35862, 20493, 34764, 42695]"


In [87]:
# collect one click and one no-click from impressions:
def process_impression_user(s):
    list_of_strings = s.split(" ")
    itemid_rel_tuple = [l.split("-") for l in list_of_strings]
    noclicks = []
    for entry in itemid_rel_tuple:
        # print(entry[1])
        if entry[1] == '0':
            noclicks.append(entry[0])
        if entry[1] == '1':
            click = entry[0]
    return noclicks, click

raw_behaviour_users['noclicks'], raw_behaviour_users['click'] = zip(*raw_behaviour_users['impressions'].map(process_impression_user))
# We can then indexize these two new columns:
raw_behaviour_users['noclicks'] = raw_behaviour_users['noclicks'].map(lambda list_of_strings: [item2ind.get(l, 0) for l in list_of_strings])
raw_behaviour_users['click'] = raw_behaviour_users['click'].map(lambda x: item2ind.get(x,0))

raw_behaviour_users.head()

Unnamed: 0,id,userid,timestamp,click_history,impressions,userIdx,click_history_idx,noclicks,click
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0,1,"[38120, 25024, 29861, 34793, 38755, 20588, 926...",[15706],50306
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...,2,"[38302, 25029, 43147, 43137, 27851, 28701, 273...","[47630, 1649, 45653, 45961, 2630, 47476, 1453,...",43442
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...,3,"[21057, 19910, 26443, 38748, 23358, 34892, 359...","[49257, 44395, 48504, 1100, 16805, 48168, 1049...",21304
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0,4,"[44845, 32909, 4649, 22374, 35687, 30732, 2989...","[15706, 44243, 15354]",15723
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...,5,"[35862, 20493, 34764, 42695]","[17921, 44317, 49025, 47476, 3165, 15431, 4586...",8543


In [88]:
# convert timestamp value to hours since epoch
raw_behaviour_users['epochhrs'] = pd.to_datetime(raw_behaviour_users['timestamp']).values.astype(np.int64)/(1e6)/1000/3600
raw_behaviour_users['epochhrs'] = raw_behaviour_users['epochhrs'].round()
raw_behaviour_users[['click','epochhrs']].groupby("click").min("epochhrs").reset_index()

Unnamed: 0,click,epochhrs
0,1,437121.0
1,3,437149.0
2,7,437141.0
3,16,437102.0
4,18,437123.0
...,...,...
6347,51274,437147.0
6348,51275,437031.0
6349,51279,437039.0
6350,51280,437083.0


In [89]:
raw_behaviour_users

Unnamed: 0,id,userid,timestamp,click_history,impressions,userIdx,click_history_idx,noclicks,click,epochhrs
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0,1,"[38120, 25024, 29861, 34793, 38755, 20588, 926...",[15706],50306,437073.0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...,2,"[38302, 25029, 43147, 43137, 27851, 28701, 273...","[47630, 1649, 45653, 45961, 2630, 47476, 1453,...",43442,437106.0
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...,3,"[21057, 19910, 26443, 38748, 23358, 34892, 359...","[49257, 44395, 48504, 1100, 16805, 48168, 1049...",21304,437143.0
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0,4,"[44845, 32909, 4649, 22374, 35687, 30732, 2989...","[15706, 44243, 15354]",15723,437069.0
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...,5,"[35862, 20493, 34764, 42695]","[17921, 44317, 49025, 47476, 3165, 15431, 4586...",8543,437104.0
...,...,...,...,...,...,...,...,...,...,...
156961,156962,U10123,11/13/2019 6:57:04 AM,N9803 N104 N24462 N57318 N55743 N40526 N31726 ...,N3841-0 N61571-0 N58813-0 N28213-0 N4428-0 N25...,1080,"[39353, 21651, 33035, 22314, 9325, 33847, 3189...","[49044, 1283, 3853, 21363, 50132, 45459, 3208,...",2257,437119.0
156962,156963,U75630,11/14/2019 10:58:13 AM,N29898 N59704 N4408 N9803 N53644 N26103 N812 N...,N55913-0 N62318-0 N53515-0 N10960-0 N9135-0 N5...,1948,"[6155, 39160, 42164, 39353, 16795, 24382, 2795...","[48002, 50489, 47132, 37038, 9293, 46540, 4510...",11968,437147.0
156963,156964,U44625,11/13/2019 2:57:02 PM,N4118 N47297 N3164 N43295 N6056 N38747 N42973 ...,N6219-0 N3663-0 N31147-0 N58363-0 N4107-0 N457...,30681,"[11886, 29887, 18150, 19418, 49578, 17132, 304...","[2466, 1223, 10005, 8949, 45783, 9980, 1768, 4...",177,437127.0
156964,156965,U64800,11/14/2019 3:25:49 PM,N22997 N48742,N61233-0 N33828-1 N19661-0 N41934-0,13401,"[32392, 29043]","[50241, 4987, 11344]",44956,437151.0


# Modeling
We want to make a matrix factorization model where each user $u$ has a d-dimensional parameter vector $z_u$ and each item $i$ has a parameter vector $v_i$. This implies that we do not need the click_history_idx column when we train the model (why?).

Second, to simplify the computation of things we assume that the user only considers two items in each interaction: The item the user clicked on, and the first item in the noclicks list (what are we missing by this assumption?).

Hence, our dataset consist of a `behaviour` dataframe and a `news` dataframe with content information of the news articles. We will use a language model to interpret our article data, and for simplicity we will only use the title as the text input here.

In [90]:
raw_behaviour_users['noclick'] = raw_behaviour_users['noclicks'].map(lambda x : x[0])
behaviour_user = raw_behaviour_users[['epochhrs','userIdx','click_history_idx','noclick','click']]
behaviour_user.head()

Unnamed: 0,epochhrs,userIdx,click_history_idx,noclick,click
0,437073.0,1,"[38120, 25024, 29861, 34793, 38755, 20588, 926...",15706,50306
1,437106.0,2,"[38302, 25029, 43147, 43137, 27851, 28701, 273...",47630,43442
2,437143.0,3,"[21057, 19910, 26443, 38748, 23358, 34892, 359...",49257,21304
3,437069.0,4,"[44845, 32909, 4649, 22374, 35687, 30732, 2989...",15706,15723
4,437104.0,5,"[35862, 20493, 34764, 42695]",17921,8543


In [91]:
# Let us use the last 10pct of the data as our validation data:
test_time_th_user = behaviour_user['epochhrs'].quantile(0.9)
train_user = behaviour_user[behaviour_user['epochhrs']< test_time_th_user]
valid_user =  behaviour_user[behaviour_user['epochhrs']>= test_time_th_user]

In [92]:
class MindDataset_user(Dataset):
    # A fairly simple torch dataset module that can take a pandas dataframe (as above), 
    # and convert the relevant fields into a dictionary of arrays that can be used in a dataloader
    def __init__(self, df):
        # Create a dictionary of tensors out of the dataframe
        self.data = {
            'userIdx' : torch.tensor(df.userIdx.values),
            'click' : torch.tensor(df.click.values),
            'noclick' : torch.tensor(df.noclick.values)
        }
    def __len__(self):
        return len(self.data['userIdx'])
    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.data.items()}

In [93]:
# Build datasets and dataloaders of train and validation dataframes:
bs_user = 1024
ds_train_user = MindDataset_user(train_user)
train_loader_user = DataLoader(ds_train_user, batch_size=bs_user, shuffle=True)
ds_valid_user = MindDataset_user(valid_user)
valid_loader_user = DataLoader(ds_valid_user, batch_size=bs_user, shuffle=False)

batch_user = next(iter(train_loader_user))

In [94]:
batch_user["noclick"]

tensor([12207, 47630,  7876,  ..., 13586, 44369, 48292])

## Model

#### Framework
We will use pytorch-lightning to define and train our model. It is a high-level framework (similar to fastAI) but with a slightly different way of defining things. It is my personal go-to framework and is very flexible. For more information, see https://pytorch-lightning.readthedocs.io/.

#### The model
We assume that each interaction goes as follow: the user is presented with two items: the click and no-click item. After the user reviewed both items, she will choose the most relevant one. This can be modeled as a categorical distirbution with two options (yes, you could do binomial). There is a loss function in pytorch for this already, called the `F.cross_entropy` that we will use.

In [95]:
# Build a matrix factorization model
class NewsMF_user(pl.LightningModule):
    def __init__(self, num_users, num_items, dim = 10):
        super().__init__()
        self.dim=dim
        self.useremb = nn.Embedding(num_embeddings=num_users, embedding_dim=dim)
        self.itememb = nn.Embedding(num_embeddings=num_items, embedding_dim=dim)
    
    def forward(self, user, item):
        batch_size = user.size(0)
        uservec = self.useremb(user)
        itemvec = self.itememb(item)

        score = (uservec*itemvec).sum(-1).unsqueeze(-1)
        
        return score
    
    def training_step(self, batch, batch_idx):
        batch_size = batch['userIdx'].size(0)

        score_click = self.forward(batch['userIdx'], batch['click'])
        score_noclick = self.forward(batch['userIdx'], batch['noclick'])
        
        scores_all = torch.concat((score_click, score_noclick), dim=1)
        # Compute loss as cross entropy (categorical distribution between the clicked and the no clicked item)
        loss = F.cross_entropy(input=scores_all, target=torch.zeros(batch_size, device=scores_all.device).long())
        
        return loss
    
    def validation_step(self, batch, batch_idx):
        # for now, just do the same computation as during training
        loss = self.training_step(batch, batch_idx)
        self.log("val_loss", loss, prog_bar=True, on_step=False, on_epoch=True)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer
    

In [97]:
# Build and train the model
mf_model_user = NewsMF_user(num_users=len(ind2user_user)+1, num_items = len(ind2item)+1)
trainer_user = pl.Trainer(max_epochs=10, accelerator="gpu")
trainer_user.fit(model=mf_model_user, train_dataloaders=train_loader_user, val_dataloaders=valid_loader_user)

# Save the model
trainer_user.save_checkpoint("model_user.ckpt")

# Load the trained model
mf_model_user = NewsMF_user.load_from_checkpoint(checkpoint_path="model_user.ckpt", num_users=len(ind2user_user) + 1, num_items=len(ind2item) + 1)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type      | Params
--------------------------------------
0 | useremb | Embedding | 500 K 
1 | itememb | Embedding | 512 K 
--------------------------------------
1.0 M     Trainable params
0         Non-trainable params
1.0 M     Total params
4.051     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/yvesito/.local/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


                                                                           

/home/yvesito/.local/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Epoch 9: 100%|██████████| 138/138 [00:04<00:00, 29.32it/s, v_num=3, val_loss=1.680]

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 138/138 [00:04<00:00, 29.08it/s, v_num=3, val_loss=1.680]


# Sense Check

In [98]:
USER_ID = 1

# Suggested Items

In [99]:
# Create item_ids and user ids list
item_id_user = list(ind2item.keys())
userIdx_user =  [USER_ID]*len(item_id_user)


preditions_user = mf_model_user.forward(torch.IntTensor(userIdx_user), torch.IntTensor(item_id_user))

# Select top 10 argmax
top_index_user = torch.topk(preditions_user.flatten(), 10).indices

# Filter for top 10 suggested items
filters_user = [ind2item[ix.item()] for ix in top_index_user]
recommendedNews = news[news["id"].isin(filters_user)]

# Historical Items

In [None]:
click_ids = behaviour[behaviour["userIdx"]==USER_ID]["click"].values
ll = lambda x: "N"+str(x)

click_ids = [ll(each) for each in click_ids]

news[news["itemId"].isin(click_ids)]

Unnamed: 0,itemId,category,subcategory,title,abstract,url,title_entities,abstract_entities
20810,N33900,sports,football_nfl,Patriots Vs. Jets Live: New England Steamrolls...,Final Patriots - : The New England Patriots im...,https://assets.msn.com/labs/mind/AAJ7aqU.html,"[{""Label"": ""New England Patriots"", ""Type"": ""O""...","[{""Label"": ""New England Patriots"", ""Type"": ""O""..."
28754,N32675,news,newsus,Northbound Texas 130 in Pflugerville closing f...,Northbound Texas 130 at Pflugerville Parkway w...,https://assets.msn.com/labs/mind/AAJpxWP.html,"[{""Label"": ""Texas"", ""Type"": ""G"", ""WikidataId"":...","[{""Label"": ""Texas"", ""Type"": ""G"", ""WikidataId"":..."
33795,N34112,travel,travelarticle,Budweiser Clydesdale to strut stuff at Jackson...,Residents can get an up-close glimpse or selfi...,https://assets.msn.com/labs/mind/BBWLjXu.html,"[{""Label"": ""Winn-Dixie"", ""Type"": ""O"", ""Wikidat...","[{""Label"": ""Winn-Dixie"", ""Type"": ""O"", ""Wikidat..."
33846,N32603,news,newsus,Joke Made About President Trump During North S...,An inappropriate joke made during a North Surr...,https://assets.msn.com/labs/mind/BBWJlkC.html,[],"[{""Label"": ""White House"", ""Type"": ""F"", ""Wikida..."
43516,N39247,news,newsus,Allegheny County Controller Chelsa Wagner Goes...,Allegheny County Controller Chelsa Wagner goes...,https://assets.msn.com/labs/mind/BBWBTtb.html,[],"[{""Label"": ""Detroit"", ""Type"": ""G"", ""WikidataId..."
44747,N45975,video,news,Desperation? Trump's top aide now suing Trump ...,The potential linchpin to the impeachment case...,https://assets.msn.com/labs/mind/BBWBZ1g.html,[],"[{""Label"": ""Ari Melber"", ""Type"": ""P"", ""Wikidat..."
45916,N43459,sports,football_nfl,Will Cain Ranks Three Teams Ahead Of Patriots ...,https://www.dailymotion.com/embed/video/x ns u...,https://assets.msn.com/labs/mind/BBWwsaG.html,"[{""Label"": ""New England Patriots"", ""Type"": ""O""...","[{""Label"": ""New England Patriots"", ""Type"": ""O""..."
