In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch.nn as nn
import pytorch_lightning as pl
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch
from collections import Counter
print(torch.cuda.is_available())

import psycopg2
      


ModuleNotFoundError: No module named 'torch._dynamo'

In [None]:
# Establish a connection to the PostgreSQL database
conn = psycopg2.connect(
    dbname="recome",
    user="postgres",
    password="postgres",
    host="localhost",
    port="5432"
)

# Create a cursor object
cur = conn.cursor()

# Close the connection
# cur.close()
# conn.close()


In [None]:
# Execute a query for behaviours
cur.execute("SELECT * FROM behaviours")
# Fetch all the rows for news
rows = cur.fetchall()
column_names = [desc[0] for desc in cur.description] # Get the column names 
raw_behaviour = pd.DataFrame(rows, columns=column_names) # Create a DataFrame from the rows, with the column names

print(f"The dataset originally consists of {len(raw_behaviour)} number of interactions.")
raw_behaviour.head()

The dataset originally consists of 156966 number of interactions.


Unnamed: 0,id,userid,timestamp,click_history,impressions
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...


In [None]:
# Execute a query for news
cur.execute("SELECT * FROM news1")
# Fetch all the rows for news
rows = cur.fetchall()
column_names = [desc[0] for desc in cur.description] # Get the column names 
news = pd.DataFrame(rows, columns=column_names) # Create a DataFrame from the rows, with the column names



print(f"The news data consist in total of {len(news)} number of news.")
news.head()

The news data consist in total of 51282 number of news.


Unnamed: 0,id,category,subcategory,title,abstract,url,title_entities,abstract_entities,author,date
0,N49531,news,newspolitics,Some Republicans inch closer to Trump impeachm...,Several Republicans grew more receptive this w...,https://assets.msn.com/labs/mind/AAIZARM.html,"[{""Label"": ""Mick Mulvaney"", ""Type"": ""P"", ""Wiki...","[{""Label"": ""Mick Mulvaney"", ""Type"": ""P"", ""Wiki...",Diana Moore,"April 6, 2019"
1,N37701,news,newsworld,Russia patrolling between Turkish and Syrian f...,Battles in northern Syrian towns Tuesday were ...,https://assets.msn.com/labs/mind/AAIOmbO.html,"[{""Label"": ""United States"", ""Type"": ""G"", ""Wiki...","[{""Label"": ""Turkey"", ""Type"": ""G"", ""WikidataId""...",Henry Smith,"July 5, 2019"
2,N11071,sports,football_nfl,Burning questions for Lions vs Packers on 'Mon...,The Green Bay Packers host the Detroit Lions f...,https://assets.msn.com/labs/mind/AAILr0Y.html,"[{""Label"": ""Detroit Lions"", ""Type"": ""O"", ""Wiki...","[{""Label"": ""Detroit Lions"", ""Type"": ""O"", ""Wiki...",Ivy Davis,"July 9, 2019"
3,N7360,news,newscrime,"Just Out of Prison, Brooklyn Gunman Is Killed ...",Life for Nasheem Prioleau had become a series ...,https://assets.msn.com/labs/mind/AAIQxFd.html,"[{""Label"": ""Just Out"", ""Type"": ""M"", ""WikidataI...","[{""Label"": ""Brooklyn"", ""Type"": ""G"", ""WikidataI...",Ivy Taylor,"April 22, 2019"
4,N11987,sports,football_nfl,"NFL power rankings, Week 8: 49ers challenging ...",With eight weeks of the 2019 NFL regular seaso...,https://assets.msn.com/labs/mind/AAJvUKj.html,"[{""Label"": ""New York Jets"", ""Type"": ""O"", ""Wiki...","[{""Label"": ""New Orleans Saints"", ""Type"": ""O"", ...",Diana Taylor,"September 16, 2019"


Now we need to process the click history and impressions. We first need to decode impressions into clicks and non-clicks.

In [None]:
# Function to split the impressions and clicks into two seperate lists
def process_impression(impression_list):
    list_of_strings = impression_list.split()
    click = [x.split('-')[0] for x in list_of_strings if x.split('-')[1] == '1']
    non_click = [x.split('-')[0] for x in list_of_strings if x.split('-')[1] == '0']
    return click,non_click

# We can then indexize these two new columns:
raw_behaviour['click'], raw_behaviour['noclicks'] = zip(*raw_behaviour['impressions'].map(process_impression))

In [None]:
# Convert timestamp value to hours since epoch
raw_behaviour['epochhrs'] = pd.to_datetime(raw_behaviour['timestamp']).values.astype(np.int64)/(1e6)/1000/3600
raw_behaviour['epochhrs'] = raw_behaviour['epochhrs'].round()

# Click History
In the dataset we can see that a large number of items and users does not have sufficent amount of clicks. This is since we are working with a smaller version of the MIND dataset that contains 50k users instead of the full version of 1 million users. Therefore it will be hard to learn the user and item embeddings by only relying on the interactions e.g. the <b>clicks</b> and <b>noclicks</b>.

To resolve this issue in the lab, we will expand the click_history column, which will add about 7 times more interactions than the original data. However, note that these events don't have any information about which articles were shown to the user e.g. the impressions or noclicks.

In [None]:
# If there exists several clicks in one session, expand to new observation
raw_behaviour = raw_behaviour.explode("click").reset_index(drop=True)

# Extract the clicks from the previous clicks
click_history = raw_behaviour[["userid","click_history"]].drop_duplicates().dropna()
click_history["click_history"] = click_history.click_history.map(lambda x: x.split())
click_history = click_history.explode("click_history").rename(columns={"click_history":"click"})

# Dummy time set to earlies epochhrs in raw_behaviour as we don't know when these events took place.
click_history["epochhrs"] = raw_behaviour.epochhrs.min() 
click_history["noclicks"] = pd.Series([[] for _ in range(len(click_history.index))])

# concatenate historical clicks with the raw_behaviour
raw_behaviour = pd.concat([raw_behaviour,click_history],axis=0).reset_index(drop=True)
print(f"The dataset after pre-processing consist of {len(raw_behaviour)} number of interactions.")

The dataset after pre-processing consist of 1162404 number of interactions.


# Cold start problem
Still after doing our pre-processing and adding the <b>click_history</b> to the <b>click column</b>, we can see that a large number of items does not have sufficent amount of clicks. This can be thought of as a <u>cold start problem</u>. To adjust for this we will remove items from the raw_behaviour that falls under the min_click_cutoff.

In [None]:
min_click_cutoff = 100
print(f'Number of items that have less than {min_click_cutoff} clicks make up',np.round(np.mean(raw_behaviour.groupby("click").size() < min_click_cutoff)*100,3),'% of the total, and these will be removed.') 

Number of items that have less than 100 clicks make up 93.852 % of the total, and these will be removed.


In [None]:
# remove items with less clicks than min_click_cutoff
raw_behaviour = raw_behaviour[raw_behaviour.groupby("click")["userid"].transform('size') >= min_click_cutoff].reset_index(drop=True)
# Get a set with all the unique items
click_set = set(raw_behaviour['click'].unique())

# remove items for impressions that is not avaiable in the click set (the items that we will be training on)
raw_behaviour['noclicks'] = raw_behaviour['noclicks'].apply(lambda impressions: [impression for impression in impressions if impression in click_set])

<h2> Output of data preprocessing (START HERE IF YOU ARE DOING AI ACADEMY) </h2>
In this preprocessing we have processed behaviour data, article data and user data. The main component is <b>behaviour</b>, and for collaborative filtering purposes this is all we need. However, if we want to utilize content data on the news items some additional preprocessing on the news dataframe must be applied.

In [None]:
## Select the columns that we now want to use for further analysis
behaviour = raw_behaviour[['epochhrs','userid','click','noclicks']].copy()

print('Number of interactions in the behaviour dataset:', behaviour.shape[0])
print('Number of users in the behaviour dataset:', behaviour.userid.nunique())
print('Number of articles in the behaviour dataset:', behaviour.click.nunique())

behaviour.head()

Number of interactions in the behaviour dataset: 781873
Number of users in the behaviour dataset: 49833
Number of articles in the behaviour dataset: 2451


Unnamed: 0,epochhrs,userid,click,noclicks
0,437073.0,U13740,N55689,[N35729]
1,437106.0,U91836,N17059,"[N20678, N39317, N58114, N20495, N42977, N1459..."
2,437143.0,U73700,N23814,"[N23877, N35389, N49712, N16844, N59685, N2344..."
3,437069.0,U34670,N49685,"[N35729, N33632, N27581]"
4,437083.0,U19739,N33619,[]


# Train / Test Split + indexing
Before we carry on to define our first model we first need to apply indexizing for the users and items in the behaviour dataframe, as pytorch requires integer indicies instead of strings for user and item IDs.

We do this by two dictionaries:

<b>ind2item </b>: mapping the item indicies given in behaviour to the real item Id given in the dataset.
<b>ind2user</b>: mapping the user indicies given in behaviour to the real user Id given in the dataset.
Note that we also create <b> item2ind </b> and <b> user2ind </b> to do the reverse.

The indexing will be created based on the training data, where new unseen articles in the validation set will get the index 0. We will use 90% for training 10% for validation, when we split the data it's important to make use of temporal epochhrs to divide the data, as a regular random split in this case does not make sense in recommender systems.

In [None]:
# Let us use the last 10pct of the data as our validation data:
test_time_th = behaviour['epochhrs'].quantile(0.9)
train = behaviour[behaviour['epochhrs']< test_time_th].copy()

## Indexize items
# Allocate a unique index for each item, but let the zeroth index be a UNK index:
ind2item = {idx +1: itemid for idx, itemid in enumerate(train.click.unique())}
item2ind = {itemid : idx for idx, itemid in ind2item.items()}

train['noclicks'] = train['noclicks'].map(lambda list_of_items: [item2ind.get(l, 0) for l in list_of_items])
train['click'] = train['click'].map(lambda item: item2ind.get(item, 0))

## Indexize users
# Allocate a unique index for each user, but let the zeroth index be a UNK index:
ind2user = {idx +1: userid for idx, userid in enumerate(train['userid'].unique())}
user2ind = {userid : idx for idx, userid in ind2user.items()}

# Create a new column with userIdx:
train['userIdx'] = train['userid'].map(lambda x: user2ind.get(x,0))

# Repeat for validation
valid =  behaviour[behaviour['epochhrs']>= test_time_th].copy()
valid["click"] = valid["click"].map(lambda item: item2ind.get(item, 0))
valid["noclicks"] = valid["noclicks"].map(lambda list_of_items: [item2ind.get(l, 0) for l in list_of_items])
valid["userIdx"] = valid["userid"].map(lambda x: user2ind.get(x,0))

In [None]:
valid

Unnamed: 0,epochhrs,userid,click,noclicks,userIdx
1,437106.0,U91836,0,"[456, 326, 0, 0, 0, 477, 220, 143, 0]",7912
2,437143.0,U73700,0,"[0, 0, 0, 0, 0, 0, 0, 262, 0, 0, 0, 0, 0, 0, 0...",33857
6,437110.0,U46596,0,"[143, 0, 0, 0]",8708
7,437122.0,U79199,0,"[419, 0, 0]",13979
9,437145.0,U89744,0,"[0, 0, 0, 0, 308, 0, 321, 0, 262, 94, 0, 103, ...",943
...,...,...,...,...,...
168643,437105.0,U17467,0,"[465, 147, 477, 357, 489, 143, 206, 13, 304, 4...",21122
168644,437152.0,U72015,279,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 226...",49396
168645,437127.0,U44625,0,"[114, 134, 0, 0, 0, 0, 0, 357, 237, 284, 27, 1...",22123
168646,437151.0,U64800,0,"[0, 0, 0]",6481


# Modeling & Negative sampling
We want to make a matrix factorization model where each user $u$ has a d-dimensional parameter vector $z_u$ and each item $i$ has a parameter vector $v_i$.

Second, to simplify the computation of things and as we do not have a `noclicks` for every `click` interaction we will only utilize two **known** things in the training phase: The item the `userIdx` and `click`. However, as we want to model the binary behavior in terms of clicks and non-clicks we will make use of something called negative sampling. With negative sampling - we will draw a sample a random negative item for each known user-click combination to express  the lack of preference by the user for the sampled item.

In [None]:
class MindDataset(Dataset):
    # A fairly simple torch dataset module that can take a pandas dataframe (as above), 
    # and convert the relevant fields into a dictionary of arrays that can be used in a dataloader
    def __init__(self, df):
        # Create a dictionary of tensors out of the dataframe
        self.data = {
            'userIdx' : torch.tensor(df.userIdx.values.astype(np.int64)),
            'click' : torch.tensor(df.click.values.astype(np.int64))
        }
    def __len__(self):
        return len(self.data['userIdx'])
    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.data.items()}

In [None]:
# Build datasets and dataloaders of train and validation dataframes:
bs = 1024
ds_train = MindDataset(train)
train_loader = DataLoader(ds_train, batch_size=bs, shuffle=True)
ds_valid = MindDataset(valid)
valid_loader = DataLoader(ds_valid, batch_size=bs, shuffle=False)

batch = next(iter(train_loader))

## Model

#### Framework
We will use pytorch-lightning to define and train our model. It is a high-level framework (similar to fastAI) but with a slightly different way of defining things. It is my personal go-to framework and is very flexible. For more information, see https://pytorch-lightning.readthedocs.io/.

#### The model
We assume that each interaction goes as follow: the user is presented with two items: the click and no-click item, where the no-click item will be randomly chosen with negative sampling. After the user reviewed both items, she will choose the most relevant one. This can be modeled as a categorical distirbution with two options (yes, you could do binomial). There is a loss function in pytorch for this already, called the `F.binary_cross_entropy` that we will use.

In [None]:
# Build a matrix factorization model
class NewsMF(pl.LightningModule):
    def __init__(self, num_users, num_items, dim = 10):
        super().__init__()
        self.dim=dim
        self.num_users = num_users
        self.num_items = num_items
        
        self.useremb = nn.Embedding(num_embeddings=num_users, embedding_dim=dim)
        self.itememb = nn.Embedding(num_embeddings=num_items, embedding_dim=dim)

        
    def step(self, batch, batch_idx, phase="train"):
        batch_size = batch['userIdx'].size(0)
        uservec = self.useremb(batch['userIdx'])       
        itemvec_click = self.itememb(batch['click'])
        
        # For each positive interaction,sample a random negative
        neg_sample = torch.randint_like(batch["click"],1,self.num_items)
        itemvec_noclick = self.itememb(neg_sample)
        
        score_click = torch.sigmoid((uservec*itemvec_click).sum(-1).unsqueeze(-1))
        score_noclick =  torch.sigmoid((uservec*itemvec_noclick).sum(-1).unsqueeze(-1))

        # Compute loss as binary cross entropy (categorical distribution between the clicked and the no clicked item)
        scores_all = torch.concat((score_click, score_noclick), dim=1)
        target_all = torch.concat((torch.ones_like(score_click), torch.zeros_like(score_noclick)),dim=1)
        loss = F.binary_cross_entropy(scores_all, target_all)
        return loss
    
    
    def training_step(self, batch, batch_idx):
        return self.step(batch, batch_idx, "train")
    
    def validation_step(self, batch, batch_idx):
        # for now, just do the same computation as during training
        return self.step(batch, batch_idx, "val")

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

In [None]:
# Define and train model
mf_model = NewsMF(num_users=len(ind2user) + 1, num_items = len(ind2item) + 1, dim = 50)
trainer = pl.Trainer(max_epochs=50, accelerator="gpu")
trainer.fit(model=mf_model, train_dataloaders=train_loader)

# Save the model
trainer.save_checkpoint("model_news.ckpt")

# Load the trained model
# mf_model = NewsMF.load_from_checkpoint(checkpoint_path="model_news.ckpt", num_users=len(ind2user) + 1, num_items=len(ind2item) + 1, dim = 50)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/yvesito/.local/lib/python3.10/site-packages/pytorch_lightning/trainer/configuration_validator.py:74: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type      | Params
--------------------------------------
0 | useremb | Embedding | 2.5 M 
1 | itememb | Embedding | 113 K 
--------------------------------------
2.6 M     Trainable params
0         Non-trainable params
2.6 M     Total params
10.335    Total estimated model params size (MB)
/home/yvesito/.local/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.

Epoch 9: 100%|██████████| 686/686 [00:18<00:00, 36.21it/s, v_num=2]

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 686/686 [00:19<00:00, 36.08it/s, v_num=2]


In [None]:
## Add more information to the article data 
# The item index
news["ind"] = news["id"].map(item2ind)
news = news.sort_values("ind").reset_index(drop=True)
# Number of clicks in training data per article, investigate the cold start issue
news["n_click_training"] = news["ind"].map(dict(Counter(train.click)))
# 5 most clicked articles
# news.sort_values("n_click_training",ascending=False).head()
trendingnews = news.sort_values("n_click_training",ascending=False)
trendingnews.head()

Unnamed: 0,id,category,subcategory,title,abstract,url,title_entities,abstract_entities,author,date,ind,n_click_training
597,N306,movies,movies-celebrity,Kevin Spacey Won't Be Charged in Sexual Assaul...,The Los Angeles County District Attorney's Off...,https://assets.msn.com/labs/mind/AAJy6rv.html,"[{""Label"": ""Kevin Spacey"", ""Type"": ""P"", ""Wikid...","[{""Label"": ""Kevin Spacey"", ""Type"": ""P"", ""Wikid...",Eleanor Jones,"April 27, 2019",598.0,4802.0
0,N55689,sports,football_nfl,"Charles Rogers, former Michigan State football...","Charles Rogers, the former Michigan State foot...",https://assets.msn.com/labs/mind/BBWAPO6.html,"[{""Label"": ""Charles Rogers (American football)...","[{""Label"": ""2003 NFL Draft"", ""Type"": ""U"", ""Wik...",Charlie Davis,"February 9, 2019",1.0,4316.0
656,N42620,lifestyle,lifestylebuzz,Heidi Klum's 2019 Halloween Costume Transforma...,You might say she's scary good at playing dres...,https://assets.msn.com/labs/mind/AAJFlhi.html,"[{""Label"": ""Heidi Klum"", ""Type"": ""P"", ""Wikidat...","[{""Label"": ""Heidi Klum"", ""Type"": ""P"", ""Wikidat...",Grace Wilson,"September 3, 2019",657.0,4047.0
10,N47020,news,newsopinion,The News In Cartoons,News as seen through the eyes of the nation's ...,https://assets.msn.com/labs/mind/AAJ7oYd.html,[],[],Charlie Davis,"February 25, 2019",11.0,3545.0
9,N35729,news,newsus,Porsche launches into second story of New Jers...,The Porsche went airborne off a median in Toms...,https://assets.msn.com/labs/mind/BBWyjM9.html,"[{""Label"": ""Porsche"", ""Type"": ""O"", ""WikidataId...","[{""Label"": ""Porsche"", ""Type"": ""O"", ""WikidataId...",Charlie Moore,"September 1, 2019",10.0,3346.0
...,...,...,...,...,...,...,...,...,...,...,...,...
604,N40509,news,newscrime,One of FBI's Most Wanted fugitives offers surr...,One of the FBI's Most Wanted fugitives says he...,https://assets.msn.com/labs/mind/BBWqn9T.html,"[{""Label"": ""FBI Ten Most Wanted Fugitives"", ""T...","[{""Label"": ""FBI Ten Most Wanted Fugitives"", ""T...",Henry Williams,"November 18, 2019",605.0,1014.0
771,N54842,news,newsus,Mississippi woman found after being missing fo...,S.O.S spelled out with rocks saved a woman mis...,https://assets.msn.com/labs/mind/AAJvHE4.html,"[{""Label"": ""Mississippi"", ""Type"": ""G"", ""Wikida...",[],Grace Davis,"September 21, 2019",772.0,1014.0
942,N56753,news,newspolitics,Fox News contributor: 'Most likely' outcome is...,Fox News contributor Christopher Hahn predicte...,https://assets.msn.com/labs/mind/AAJrEwW.html,"[{""Label"": ""Donald Trump"", ""Type"": ""P"", ""Wikid...","[{""Label"": ""Christopher Hahn"", ""Type"": ""P"", ""W...",Eleanor Williams,"December 10, 2019",943.0,1008.0
1055,N4593,movies,movies-celebrity,"Emily Ratajkowski Is Being Sued for $150,000 O...",What's more? The photographer is also asking f...,https://assets.msn.com/labs/mind/AAJga7m.html,"[{""Label"": ""Emily Ratajkowski"", ""Type"": ""P"", ""...",[],Alice Williams,"February 10, 2019",1056.0,1004.0


In [None]:
# store the learned item embedding into a seperate tensor
itememb = mf_model.itememb.weight.detach()
print(itememb.shape)

torch.Size([2279, 50])


In [None]:
# Investigate different rows of the item embedding (articles embeddings) to see if the model works
## some examples N13259, N16636, N10272
## Can you find some examples that does not work good? Why?

ind = item2ind.get("N10272") 
SelectedNews = news[news['id'] == "N306"].category.values[0]
print(SelectedNews)
# This calculates the cosine similarity and outputs the 10 most similar articles w.r.t to ind in descending order
similarity = torch.nn.functional.cosine_similarity( itememb[ind], itememb, dim=0)
# y_true = similarity.argsort(descending=False)-1
# print(similarity)
# print(y_true)
most_sim = news[~news.ind.isna()].iloc[( similarity.argsort(descending=True).numpy()-1)]

most_sim = most_sim[most_sim['category'] == SelectedNews]
most_sim.head(100)

movies


Unnamed: 0,id,category,subcategory,title,abstract,url,title_entities,abstract_entities,author,date,ind,n_click_training
29,N55582,movies,movies-celebrity,"Halle Berry, 53, flaunts chiseled abs on Insta...",Halle Berry is trying to motivate her social m...,https://assets.msn.com/labs/mind/BBWtN08.html,[],[],Jack Williams,"June 13, 2019",30.0,393.0
31,N24272,movies,movies-celebrity,Actress Accuses Roman Polanski of Raping Her i...,'Devil Fish' actor Valentine Monnier accuses t...,https://assets.msn.com/labs/mind/BBWu461.html,"[{""Label"": ""Roman Polanski"", ""Type"": ""P"", ""Wik...","[{""Label"": ""Le Parisien"", ""Type"": ""M"", ""Wikida...",Ivy Johnson,"October 10, 2019",32.0,509.0
