In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch.nn as nn
import pytorch_lightning as pl
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch
import psycopg2
from collections import Counter

from typing import List

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Establish a connection to the PostgreSQL database
conn = psycopg2.connect(
    dbname="recome",
    user="postgres",
    password="postgres",
    host="localhost",
    port="5432"
)

# Create a cursor object
cur = conn.cursor()

# Close the connection
# cur.close()
# conn.close()

In [3]:
# Execute a query for behaviours
cur.execute("SELECT * FROM behaviours1")
# Fetch all the rows for news
rows = cur.fetchall()
column_names = [desc[0] for desc in cur.description] # Get the column names 
raw_behaviour = pd.DataFrame(rows, columns=column_names) # Create a DataFrame from the rows, with the column names

print(f"The dataset originally consists of {len(raw_behaviour)} number of interactions.")
raw_behaviour.head()

The dataset originally consists of 100 number of interactions.


Unnamed: 0,id,userid,timestamp,click_history,impressions
0,0,U1,12/11/2023 03:06:31 AM,N2216 N1796 N1336 N494 N809 N142 N1804,N1617-1 N1389-0
1,1,U2,12/12/2023 07:55:10 AM,N2152,N1737-1 N1822-1 N1862-1 N1886-0 N1051-1 N958-1...
2,2,U3,12/10/2023 02:47:58 AM,N2302 N1464 N10 N1722 N902 N1597 N2144 N978 N6...,N313-1 N2095-1 N1783-1 N2277-1 N1668-1 N737-1 ...
3,3,U4,12/12/2023 08:39:42 PM,N1028 N2390 N1158 N2373 N1002 N1113,N1780-1 N524-1 N908-0
4,4,U5,12/10/2023 08:21:08 AM,N1936 N1793 N198 N965,N124-0 N2010-0 N920-0 N365-0


In [4]:
## Indexize users
unique_userIds = raw_behaviour['userid'].unique()
# Allocate a unique index for each user, but let the zeroth index be a UNK index:
ind2user = {idx +1: itemid for idx, itemid in enumerate(unique_userIds)}
user2ind = {itemid : idx for idx, itemid in ind2user.items()}
print(f"We have {len(user2ind)} unique users in the dataset")

# Create a new column with userIdx:
raw_behaviour['userIdx'] = raw_behaviour['userid'].map(lambda x: user2ind.get(x,0))

We have 100 unique users in the dataset


In [5]:
raw_behaviour.head()

Unnamed: 0,id,userid,timestamp,click_history,impressions,userIdx
0,0,U1,12/11/2023 03:06:31 AM,N2216 N1796 N1336 N494 N809 N142 N1804,N1617-1 N1389-0,1
1,1,U2,12/12/2023 07:55:10 AM,N2152,N1737-1 N1822-1 N1862-1 N1886-0 N1051-1 N958-1...,2
2,2,U3,12/10/2023 02:47:58 AM,N2302 N1464 N10 N1722 N902 N1597 N2144 N978 N6...,N313-1 N2095-1 N1783-1 N2277-1 N1668-1 N737-1 ...,3
3,3,U4,12/12/2023 08:39:42 PM,N1028 N2390 N1158 N2373 N1002 N1113,N1780-1 N524-1 N908-0,4
4,4,U5,12/10/2023 08:21:08 AM,N1936 N1793 N198 N965,N124-0 N2010-0 N920-0 N365-0,5


In [6]:
# Execute a query for news
cur.execute("SELECT * FROM news5")
# Fetch all the rows for news
rows = cur.fetchall()
column_names = [desc[0] for desc in cur.description] # Get the column names 
news = pd.DataFrame(rows, columns=column_names) # Create a DataFrame from the rows, with the column names
print(f"The news data consist in total of {len(news)} number of news.")

# Build index of items
ind2item = {idx +1: itemid for idx, itemid in enumerate(news['id'].values)}
item2ind = {itemid : idx for idx, itemid in ind2item.items()}

news.head()

The news data consist in total of 2422 number of news.


Unnamed: 0,id,category,title,img_url,abstract,url,author,date
0,N1,news,Stephen Fry reveals full hell of his O2 stage ...,https://i.dailymail.co.uk/1s/2023/12/09/12/787...,He was later treated at Queen Elizabeth Hospit...,https://www.dailymail.co.uk/news/article-12844...,Gemma Parry,2023-12-09 20:47:02+08
1,N2,tvshowbiz,BBC Breakfast's Naga Munchetty leaves viewers ...,https://i.dailymail.co.uk/1s/2023/12/09/12/787...,BBC Breakfast host Naga Munchetty stunned view...,https://www.dailymail.co.uk/tvshowbiz/article-...,Laura Fox,2023-12-09 20:48:15+08
2,N3,home,CAROLINE WEST-MEADS: I'm worried my stepdaught...,https://i.dailymail.co.uk/1s/2023/12/05/17/786...,"Although my husband loves his daughter, I don’...",https://www.dailymail.co.uk/home/you/article-1...,Caroline West-Meads,2023-12-09 20:01:23+08
3,N4,news,Meredith Kercher's killer raped me: Ex-girlfri...,https://i.dailymail.co.uk/1s/2023/12/09/12/786...,The former girlfriend of Meredith Kercher's ki...,https://www.dailymail.co.uk/news/article-12844...,Andrew Young,2023-12-09 22:23:09+08
4,N5,news,British Airways bosses apologise for telling c...,https://i.dailymail.co.uk/1s/2023/12/09/11/761...,British Airways bosses have apologised for tel...,https://www.dailymail.co.uk/news/article-12844...,Gemma Parry,2023-12-09 19:26:03+08


In [7]:
# Indexize click history field
def process_click_history(s):
    list_of_strings = str(s).split(" ")
    return [item2ind.get(l, 0) for l in list_of_strings]
        
raw_behaviour['click_history_idx'] = raw_behaviour.click_history.map(lambda s:  process_click_history(s))
raw_behaviour.head()

Unnamed: 0,id,userid,timestamp,click_history,impressions,userIdx,click_history_idx
0,0,U1,12/11/2023 03:06:31 AM,N2216 N1796 N1336 N494 N809 N142 N1804,N1617-1 N1389-0,1,"[2218, 1798, 1340, 495, 811, 144, 1806]"
1,1,U2,12/12/2023 07:55:10 AM,N2152,N1737-1 N1822-1 N1862-1 N1886-0 N1051-1 N958-1...,2,[2154]
2,2,U3,12/10/2023 02:47:58 AM,N2302 N1464 N10 N1722 N902 N1597 N2144 N978 N6...,N313-1 N2095-1 N1783-1 N2277-1 N1668-1 N737-1 ...,3,"[2303, 1468, 10, 1725, 904, 1599, 2145, 982, 6..."
3,3,U4,12/12/2023 08:39:42 PM,N1028 N2390 N1158 N2373 N1002 N1113,N1780-1 N524-1 N908-0,4,"[1030, 2390, 1163, 2373, 1004, 1118]"
4,4,U5,12/10/2023 08:21:08 AM,N1936 N1793 N198 N965,N124-0 N2010-0 N920-0 N365-0,5,"[1940, 1795, 199, 969]"


In [8]:
def process_impression(s):
    list_of_strings = s.split(" ")
    itemid_rel_tuple = [l.split("-") for l in list_of_strings]
    noclicks = []
    click = None  # Initialize click
    for entry in itemid_rel_tuple:
        if entry[1] =='0':
            noclicks.append(entry[0])
        if entry[1] =='1':
            click = entry[0]
    return noclicks, click

raw_behaviour['noclicks'], raw_behaviour['click'] = zip(*raw_behaviour['impressions'].map(process_impression))
# We can then indexize these two new columns:
raw_behaviour['noclicks'] = raw_behaviour['noclicks'].map(lambda list_of_strings: [item2ind.get(l, 0) for l in list_of_strings])
raw_behaviour['click'] = raw_behaviour['click'].map(lambda x: item2ind.get(x,0))

In [9]:
raw_behaviour.head()

Unnamed: 0,id,userid,timestamp,click_history,impressions,userIdx,click_history_idx,noclicks,click
0,0,U1,12/11/2023 03:06:31 AM,N2216 N1796 N1336 N494 N809 N142 N1804,N1617-1 N1389-0,1,"[2218, 1798, 1340, 495, 811, 144, 1806]",[1393],1621
1,1,U2,12/12/2023 07:55:10 AM,N2152,N1737-1 N1822-1 N1862-1 N1886-0 N1051-1 N958-1...,2,[2154],"[1889, 2328, 1181]",1985
2,2,U3,12/10/2023 02:47:58 AM,N2302 N1464 N10 N1722 N902 N1597 N2144 N978 N6...,N313-1 N2095-1 N1783-1 N2277-1 N1668-1 N737-1 ...,3,"[2303, 1468, 10, 1725, 904, 1599, 2145, 982, 6...",[],320
3,3,U4,12/12/2023 08:39:42 PM,N1028 N2390 N1158 N2373 N1002 N1113,N1780-1 N524-1 N908-0,4,"[1030, 2390, 1163, 2373, 1004, 1118]",[910],526
4,4,U5,12/10/2023 08:21:08 AM,N1936 N1793 N198 N965,N124-0 N2010-0 N920-0 N365-0,5,"[1940, 1795, 199, 969]","[125, 2013, 923, 367]",0


In [10]:
# convert timestamp value to hours since epoch
raw_behaviour['epochhrs'] = pd.to_datetime(raw_behaviour['timestamp']).values.astype(np.int64)/(1e6)/1000/3600
raw_behaviour['epochhrs'] = raw_behaviour['epochhrs'].round()

## find first publish date
#raw_behaviour[['click','epochhrs']].groupby("click").min("epochhrs").reset_index()

In [11]:
## Select the columns that we now want to use for further analysis
behaviour = raw_behaviour[['epochhrs','userIdx','click_history_idx','noclicks','click']]
behaviour.head()

Unnamed: 0,epochhrs,userIdx,click_history_idx,noclicks,click
0,472851.0,1,"[2218, 1798, 1340, 495, 811, 144, 1806]",[1393],1621
1,472880.0,2,[2154],"[1889, 2328, 1181]",1985
2,472827.0,3,"[2303, 1468, 10, 1725, 904, 1599, 2145, 982, 6...",[],320
3,472893.0,4,"[1030, 2390, 1163, 2373, 1004, 1118]",[910],526
4,472832.0,5,"[1940, 1795, 199, 969]","[125, 2013, 923, 367]",0


In [12]:
behaviour.loc[:,'noclick'] = behaviour['noclicks'].map(lambda x : x[0] if len(x) > 0 else 0)
behaviour.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


Unnamed: 0,epochhrs,userIdx,click_history_idx,noclicks,click,noclick
0,472851.0,1,"[2218, 1798, 1340, 495, 811, 144, 1806]",[1393],1621,1393
1,472880.0,2,[2154],"[1889, 2328, 1181]",1985,1889
2,472827.0,3,"[2303, 1468, 10, 1725, 904, 1599, 2145, 982, 6...",[],320,0
3,472893.0,4,"[1030, 2390, 1163, 2373, 1004, 1118]",[910],526,910
4,472832.0,5,"[1940, 1795, 199, 969]","[125, 2013, 923, 367]",0,125


In [13]:
# Let us use the last 10pct of the data as our validation data:
test_time_th = behaviour['epochhrs'].quantile(0.9)
train = behaviour[behaviour['epochhrs']< test_time_th]
valid =  behaviour[behaviour['epochhrs']>= test_time_th]

In [14]:
class MindDataset(Dataset):
    # A fairly simple torch dataset module that can take a pandas dataframe (as above), 
    # and convert the relevant fields into a dictionary of arrays that can be used in a dataloader
    def __init__(self, df):
        # Create a dictionary of tensors out of the dataframe
        self.data = {
            'userIdx' : torch.tensor(df.userIdx.values),
            'click' : torch.tensor(df.click.values),
            'noclick' : torch.tensor(df.noclick.values)
        }
    def __len__(self):
        return len(self.data['userIdx'])
    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.data.items()}

In [15]:
# Build datasets and dataloaders of train and validation dataframes:
bs = 1024
ds_train = MindDataset(df=train)
train_loader = DataLoader(ds_train, batch_size=bs, shuffle=True)
ds_valid = MindDataset(df=valid)
valid_loader = DataLoader(ds_valid, batch_size=bs, shuffle=False)

batch = next(iter(train_loader))

In [16]:
# Build a matrix factorization model
class NewsMF(pl.LightningModule):
    def __init__(self, num_users, num_items, dim = 10):
        super().__init__()
        self.dim=dim
        self.useremb = nn.Embedding(num_embeddings=num_users, embedding_dim=dim)
        self.itememb = nn.Embedding(num_embeddings=num_items, embedding_dim=dim)

    def step(self, batch, batch_idx, phase="train"):
        batch_size = batch['userIdx'].size(0)
        score_click = self.forward(batch["userIdx"], batch["click"])
        score_noclick = self.forward(batch["userIdx"], batch["noclick"])         
        scores_all = torch.concat((score_click, score_noclick), dim=1)
        loss = F.cross_entropy(input=scores_all, target=torch.zeros(batch_size, device=scores_all.device).long())
        return loss
    
    def forward(self, users, items):
        uservec =  self.useremb(users)
        itemvec = self.itememb(items)
        score = (uservec*itemvec).sum(-1).unsqueeze(-1)
        return score
               
        
    def predict_single_user(self, user_idx):
        items = torch.arange(0, len(ind2item))
        user = torch.zeros_like(items) + user_idx
        scores = self.forward(user, items)
        recommendations = [item.item() for item in torch.topk(scores, 500, dim=0)[1]]
        return recommendations
    
    def predict(self, users):
        recommendations = []
        for user in users:
            recommendation = self.predict_single_user(user)
            recommendations.append(recommendation) 
        return recommendations        
        
    def training_step(self, batch, batch_idx):
        return self.step(batch, batch_idx, "train")
    
    def validation_step(self, batch, batch_idx):
        # for now, just do the same computation as during training
        return self.step(batch, batch_idx, "val")

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer
    

In [17]:
mf_model = NewsMF(num_users=len(ind2user)+1, num_items = len(ind2item)+1, dim=15)

trainer = pl.Trainer(max_epochs=50, accelerator="gpu")
trainer.fit(model=mf_model, train_dataloaders=train_loader, val_dataloaders=valid_loader)

# Save the model
trainer.save_checkpoint("model_user1.ckpt")

# Load the trained model
mf_model = NewsMF.load_from_checkpoint(checkpoint_path="model_user1.ckpt", num_users=len(ind2user)+1, num_items = len(ind2item)+1, dim=15)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  "Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning`"
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type      | Params
--------------------------------------
0 | useremb | Embedding | 1.5 K 
1 | itememb | Embedding | 36.3 K
--------------------------------------
37.9 K    Trainable params
0         Non-trainable params
37.9 K    Total params
0.151     Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s]



                                                                           



Epoch 49: 100%|██████████| 2/2 [00:00<00:00, 44.28it/s, loss=1.77, v_num=7] 

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 2/2 [00:00<00:00, 37.05it/s, loss=1.77, v_num=7]


In [18]:
valid_batch = next(iter(valid_loader))
predictions = mf_model.predict(valid_batch["userIdx"])
true_values = [item.item() for item in valid_batch["click"]]

In [19]:
def accuracy_at_k(predictions: List[List], true_values: List):
    hits = 0
    for preds, true in zip(predictions, true_values):
        if true in preds:
            hits += 1
    return hits / len(true_values)

accuracy_at_k(predictions, true_values)

0.3

In [20]:
user_idx = 3
print(user_idx)
items = torch.arange(0, len(ind2item))
user = torch.zeros_like(items) + user_idx
recommendations = mf_model.predict_single_user(user)
# print(recommendations)

3


In [21]:
# read news article
article_id = behaviour[behaviour.userIdx == user_idx]["click"].values[0]
print(article_id + 1)
news[news.id == ind2item[article_id + 1]]

321


Unnamed: 0,id,category,title,img_url,abstract,url,author,date
320,N320,money,Chemring profits from defence boom as Ukraine ...,https://i.dailymail.co.uk/1s/2023/12/09/17/787...,Chemring is set to record its highest profit i...,https://www.thisismoney.co.uk/money/markets/ar...,Francesca Washtell,2023-12-10 05:50:32+08


In [22]:
reconews = news[news.id.isin([(ind2item[item + 1])  for item in recommendations])]
reconews.head()

Unnamed: 0,id,category,title,img_url,abstract,url,author,date
10,N11,home,Cost-saving ideas and time-saving tips make th...,https://i.dailymail.co.uk/1s/2023/12/05/17/786...,Roast Fillet of BeefFillet is a beautifully te...,https://www.dailymail.co.uk/home/you/article-1...,Eleanor Maidment,2023-12-09 20:01:08+08
12,N13,health,I assumed my stomach cramps were down to my pe...,https://i.dailymail.co.uk/1s/2023/12/05/17/786...,A woman who assumed her stomach cramps were do...,https://www.dailymail.co.uk/health/article-128...,Emily Craig,2023-12-09 20:44:25+08
13,N14,news,Oliver Dowden says Bristol University should n...,https://i.dailymail.co.uk/1s/2023/12/09/11/787...,Oliver Dowden said Bristol University should n...,https://www.dailymail.co.uk/news/article-12844...,Miriam Kuepper,2023-12-09 19:32:35+08
21,N22,news,Gripping moment Israeli soldiers battle Hamas ...,https://i.dailymail.co.uk/1s/2023/12/09/12/787...,Israeli Defence Forces have revealed footage o...,https://www.dailymail.co.uk/news/article-12844...,Elizabeth Haigh,2023-12-09 20:28:38+08
33,N34,news,Savvy mother installs £200 DIY laundry chute a...,https://i.dailymail.co.uk/1s/2023/12/08/15/787...,A savvy mum from Merseyside has saved 'hundred...,https://www.dailymail.co.uk/news/article-12842...,Sophie Watson,2023-12-09 20:29:07+08


In [23]:
## Add more information to the article data 
# The item index
news["ind"] = news["id"].map(item2ind)
news = news.sort_values("ind").reset_index(drop=True)
# Number of clicks in training data per article, investigate the cold start issue
news["n_click_training"] = news["ind"].map(dict(Counter(train.click)))
# 5 most clicked articles
# news.sort_values("n_click_training",ascending=False).head()
trendingnews = news.sort_values("n_click_training",ascending=False)
trendingnews.head()

Unnamed: 0,id,category,title,img_url,abstract,url,author,date,ind,n_click_training
1142,N1138,sport,Champion Aussie athlete reveals she didn't kno...,https://i.dailymail.co.uk/1s/2023/12/11/09/788...,Taneille Crase has opened up on the sobering r...,https://www.dailymail.co.uk/sport/othersports/...,Ollie Lewis,2023-12-11 17:29:56+08,1143,2.0
0,N1,news,Stephen Fry reveals full hell of his O2 stage ...,https://i.dailymail.co.uk/1s/2023/12/09/12/787...,He was later treated at Queen Elizabeth Hospit...,https://www.dailymail.co.uk/news/article-12844...,Gemma Parry,2023-12-09 20:47:02+08,1,1.0
1694,N1692,tvshowbiz,Bridgerton gives glimpse at Penelope Featherin...,https://i.dailymail.co.uk/1s/2023/12/12/15/788...,Bridgerton has sent fans into a frenzy by shar...,https://www.dailymail.co.uk/tvshowbiz/article-...,Kate Dennett,2023-12-12 23:56:30+08,1695,1.0
2024,N2022,money,Eyeing up a gift on Facebook? Santander will b...,https://i.dailymail.co.uk/1s/2023/12/12/23/788...,"In some cases, items listed on Facebook Market...",https://www.thisismoney.co.uk/money/beatthesca...,Jessica Beard,2023-12-13 07:58:04+08,2025,1.0
1990,N1988,tvshowbiz,Love Island's Tasha Ghouri cuts a glamorous fi...,https://i.dailymail.co.uk/1s/2023/12/12/20/788...,Love Island star Tasha Ghouri and Millie Macki...,https://www.dailymail.co.uk/tvshowbiz/article-...,Laura Fox,2023-12-13 04:38:14+08,1991,1.0
