In [26]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch.nn as nn
import pytorch_lightning as pl
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch
import psycopg2
from collections import Counter

from typing import List

In [2]:
# Establish a connection to the PostgreSQL database
conn = psycopg2.connect(
    dbname="recome",
    user="postgres",
    password="postgres",
    host="localhost",
    port="5432"
)

# Create a cursor object
cur = conn.cursor()

# Close the connection
# cur.close()
# conn.close()

In [3]:
# Execute a query for behaviours
cur.execute("SELECT * FROM behaviours")
# Fetch all the rows for news
rows = cur.fetchall()
column_names = [desc[0] for desc in cur.description] # Get the column names 
raw_behaviour = pd.DataFrame(rows, columns=column_names) # Create a DataFrame from the rows, with the column names

print(f"The dataset originally consists of {len(raw_behaviour)} number of interactions.")
raw_behaviour.head()

The dataset originally consists of 156966 number of interactions.


Unnamed: 0,id,userid,timestamp,click_history,impressions
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...


In [4]:
## Indexize users
unique_userIds = raw_behaviour['userid'].unique()
# Allocate a unique index for each user, but let the zeroth index be a UNK index:
ind2user = {idx +1: itemid for idx, itemid in enumerate(unique_userIds)}
user2ind = {itemid : idx for idx, itemid in ind2user.items()}
print(f"We have {len(user2ind)} unique users in the dataset")

# Create a new column with userIdx:
raw_behaviour['userIdx'] = raw_behaviour['userid'].map(lambda x: user2ind.get(x,0))

We have 50001 unique users in the dataset


In [5]:
raw_behaviour.head()

Unnamed: 0,id,userid,timestamp,click_history,impressions,userIdx
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0,1
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...,2
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...,3
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0,4
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...,5


In [6]:
# Execute a query for news
cur.execute("SELECT * FROM news1")
# Fetch all the rows for news
rows = cur.fetchall()
column_names = [desc[0] for desc in cur.description] # Get the column names 
news = pd.DataFrame(rows, columns=column_names) # Create a DataFrame from the rows, with the column names
print(f"The news data consist in total of {len(news)} number of news.")

# Build index of items
ind2item = {idx +1: itemid for idx, itemid in enumerate(news['id'].values)}
item2ind = {itemid : idx for idx, itemid in ind2item.items()}

news.head()

The news data consist in total of 51282 number of news.


Unnamed: 0,id,category,subcategory,title,abstract,url,title_entities,abstract_entities,author,date
0,N2073,sports,football_nfl,Should NFL be able to fine players for critici...,Several fines came down against NFL players fo...,https://assets.msn.com/labs/mind/AAJ4lap.html,"[{""Label"": ""National Football League"", ""Type"":...","[{""Label"": ""National Football League"", ""Type"":...",Alice Brown,"April 22, 2019"
1,N49186,weather,weathertopstories,It's been Orlando's hottest October ever so fa...,There won't be a chill down to your bones this...,https://assets.msn.com/labs/mind/AAJwoxD.html,"[{""Label"": ""Orlando, Florida"", ""Type"": ""G"", ""W...","[{""Label"": ""Orlando, Florida"", ""Type"": ""G"", ""W...",Ivy Moore,"July 15, 2019"
2,N59295,news,newsworld,Chile: Three die in supermarket fire amid prot...,Three people have died in a supermarket fire a...,https://assets.msn.com/labs/mind/AAJ43pw.html,"[{""Label"": ""Chile"", ""Type"": ""G"", ""WikidataId"":...","[{""Label"": ""Santiago"", ""Type"": ""G"", ""WikidataI...",Ivy Brown,"January 26, 2019"
3,N24510,entertainment,gaming,Best PS5 games: top PlayStation 5 titles to lo...,Every confirmed or expected PS5 game we can't ...,https://assets.msn.com/labs/mind/AACHUn8.html,"[{""Label"": ""PlayStation"", ""Type"": ""J"", ""Wikida...",[],Alice Davis,"April 6, 2019"
4,N39237,news,newsscienceandtechnology,"How to report weather-related closings, delays","When there are active closings, view them here...",https://assets.msn.com/labs/mind/AAlErhA.html,[],"[{""Label"": ""WXII-TV"", ""Type"": ""M"", ""WikidataId...",Bob Wilson,"June 12, 2019"


In [7]:
# Indexize click history field
def process_click_history(s):
    list_of_strings = str(s).split(" ")
    return [item2ind.get(l, 0) for l in list_of_strings]
        
raw_behaviour['click_history_idx'] = raw_behaviour.click_history.map(lambda s:  process_click_history(s))
raw_behaviour.head()

Unnamed: 0,id,userid,timestamp,click_history,impressions,userIdx,click_history_idx
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0,1,"[22043, 8947, 13784, 18716, 22678, 4511, 44472..."
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...,2,"[22225, 8952, 27070, 27060, 11774, 12624, 1129..."
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...,3,"[4980, 3833, 10366, 22671, 7281, 18815, 19914,..."
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0,4,"[28768, 16832, 39854, 6297, 19610, 14655, 1381..."
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...,5,"[19785, 4416, 18687, 26618]"


In [8]:
# collect one click and one no-click from impressions:
def process_impression(s):
    list_of_strings = s.split(" ")
    itemid_rel_tuple = [l.split("-") for l in list_of_strings]
    noclicks = []
    for entry in itemid_rel_tuple:
        if entry[1] =='0':
            noclicks.append(entry[0])
        if entry[1] =='1':
            click = entry[0]
    return noclicks, click

raw_behaviour['noclicks'], raw_behaviour['click'] = zip(*raw_behaviour['impressions'].map(process_impression))
# We can then indexize these two new columns:
raw_behaviour['noclicks'] = raw_behaviour['noclicks'].map(lambda list_of_strings: [item2ind.get(l, 0) for l in list_of_strings])
raw_behaviour['click'] = raw_behaviour['click'].map(lambda x: item2ind.get(x,0))

In [9]:
raw_behaviour.head()

Unnamed: 0,id,userid,timestamp,click_history,impressions,userIdx,click_history_idx,noclicks,click
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0,1,"[22043, 8947, 13784, 18716, 22678, 4511, 44472...",[50911],34229
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...,2,"[22225, 8952, 27070, 27060, 11774, 12624, 1129...","[31553, 36854, 29576, 29884, 37835, 31399, 366...",27365
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...,3,"[4980, 3833, 10366, 22671, 7281, 18815, 19914,...","[33180, 28318, 32427, 36305, 728, 32091, 45696...",5227
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0,4,"[28768, 16832, 39854, 6297, 19610, 14655, 1381...","[50911, 28166, 50559]",50928
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...,5,"[19785, 4416, 18687, 26618]","[1844, 28240, 32948, 31399, 38370, 50636, 2979...",43748


In [10]:
# convert timestamp value to hours since epoch
raw_behaviour['epochhrs'] = pd.to_datetime(raw_behaviour['timestamp']).values.astype(np.int64)/(1e6)/1000/3600
raw_behaviour['epochhrs'] = raw_behaviour['epochhrs'].round()

## find first publish date
#raw_behaviour[['click','epochhrs']].groupby("click").min("epochhrs").reset_index()

In [11]:
## Select the columns that we now want to use for further analysis
behaviour = raw_behaviour[['epochhrs','userIdx','click_history_idx','noclicks','click']]
behaviour.head()

Unnamed: 0,epochhrs,userIdx,click_history_idx,noclicks,click
0,437073.0,1,"[22043, 8947, 13784, 18716, 22678, 4511, 44472...",[50911],34229
1,437106.0,2,"[22225, 8952, 27070, 27060, 11774, 12624, 1129...","[31553, 36854, 29576, 29884, 37835, 31399, 366...",27365
2,437143.0,3,"[4980, 3833, 10366, 22671, 7281, 18815, 19914,...","[33180, 28318, 32427, 36305, 728, 32091, 45696...",5227
3,437069.0,4,"[28768, 16832, 39854, 6297, 19610, 14655, 1381...","[50911, 28166, 50559]",50928
4,437104.0,5,"[19785, 4416, 18687, 26618]","[1844, 28240, 32948, 31399, 38370, 50636, 2979...",43748


In [12]:
behaviour.loc[:,'noclick'] = behaviour['noclicks'].map(lambda x : x[0])
behaviour.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


Unnamed: 0,epochhrs,userIdx,click_history_idx,noclicks,click,noclick
0,437073.0,1,"[22043, 8947, 13784, 18716, 22678, 4511, 44472...",[50911],34229,50911
1,437106.0,2,"[22225, 8952, 27070, 27060, 11774, 12624, 1129...","[31553, 36854, 29576, 29884, 37835, 31399, 366...",27365,31553
2,437143.0,3,"[4980, 3833, 10366, 22671, 7281, 18815, 19914,...","[33180, 28318, 32427, 36305, 728, 32091, 45696...",5227,33180
3,437069.0,4,"[28768, 16832, 39854, 6297, 19610, 14655, 1381...","[50911, 28166, 50559]",50928,50911
4,437104.0,5,"[19785, 4416, 18687, 26618]","[1844, 28240, 32948, 31399, 38370, 50636, 2979...",43748,1844


In [13]:
# Let us use the last 10pct of the data as our validation data:
test_time_th = behaviour['epochhrs'].quantile(0.9)
train = behaviour[behaviour['epochhrs']< test_time_th]
valid =  behaviour[behaviour['epochhrs']>= test_time_th]

In [14]:
class MindDataset(Dataset):
    # A fairly simple torch dataset module that can take a pandas dataframe (as above), 
    # and convert the relevant fields into a dictionary of arrays that can be used in a dataloader
    def __init__(self, df):
        # Create a dictionary of tensors out of the dataframe
        self.data = {
            'userIdx' : torch.tensor(df.userIdx.values),
            'click' : torch.tensor(df.click.values),
            'noclick' : torch.tensor(df.noclick.values)
        }
    def __len__(self):
        return len(self.data['userIdx'])
    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.data.items()}

In [15]:
# Build datasets and dataloaders of train and validation dataframes:
bs = 1024
ds_train = MindDataset(df=train)
train_loader = DataLoader(ds_train, batch_size=bs, shuffle=True)
ds_valid = MindDataset(df=valid)
valid_loader = DataLoader(ds_valid, batch_size=bs, shuffle=False)

batch = next(iter(train_loader))

In [16]:
# Build a matrix factorization model
class NewsMF(pl.LightningModule):
    def __init__(self, num_users, num_items, dim = 10):
        super().__init__()
        self.dim=dim
        self.useremb = nn.Embedding(num_embeddings=num_users, embedding_dim=dim)
        self.itememb = nn.Embedding(num_embeddings=num_items, embedding_dim=dim)

    def step(self, batch, batch_idx, phase="train"):
        batch_size = batch['userIdx'].size(0)
        score_click = self.forward(batch["userIdx"], batch["click"])
        score_noclick = self.forward(batch["userIdx"], batch["noclick"])         
        scores_all = torch.concat((score_click, score_noclick), dim=1)
        loss = F.cross_entropy(input=scores_all, target=torch.zeros(batch_size, device=scores_all.device).long())
        return loss
    
    def forward(self, users, items):
        uservec =  self.useremb(users)
        itemvec = self.itememb(items)
        score = (uservec*itemvec).sum(-1).unsqueeze(-1)
        return score
               
        
    def predict_single_user(self, user_idx):
        items = torch.arange(0, len(ind2item))
        user = torch.zeros_like(items) + user_idx
        scores = self.forward(user, items)
        recommendations = [item.item() for item in torch.topk(scores, 500, dim=0)[1]]
        return recommendations
    
    def predict(self, users):
        recommendations = []
        for user in users:
            recommendation = self.predict_single_user(user)
            recommendations.append(recommendation) 
        return recommendations        
        
    def training_step(self, batch, batch_idx):
        return self.step(batch, batch_idx, "train")
    
    def validation_step(self, batch, batch_idx):
        # for now, just do the same computation as during training
        return self.step(batch, batch_idx, "val")

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer
    

In [17]:
# mf_model = NewsMF(num_users=len(ind2user)+1, num_items = len(ind2item)+1, dim=15)

# trainer = pl.Trainer(max_epochs=10, accelerator="gpu")
# trainer.fit(model=mf_model, train_dataloaders=train_loader, val_dataloaders=valid_loader)

# Save the model
# trainer.save_checkpoint("model_user1.ckpt")

# Load the trained model
mf_model = NewsMF.load_from_checkpoint(checkpoint_path="model_user1.ckpt", num_users=len(ind2user)+1, num_items = len(ind2item)+1, dim=15)

In [18]:
valid_batch = next(iter(valid_loader))
predictions = mf_model.predict(valid_batch["userIdx"])
true_values = [item.item() for item in valid_batch["click"]]

In [19]:
def accuracy_at_k(predictions: List[List], true_values: List):
    hits = 0
    for preds, true in zip(predictions, true_values):
        if true in preds:
            hits += 1
    return hits / len(true_values)

accuracy_at_k(predictions, true_values)

0.009765625

In [20]:
user_idx = raw_behaviour[raw_behaviour['userid'] == 'U13740'].userIdx[0]
print(user_idx)
items = torch.arange(0, len(ind2item))
user = torch.zeros_like(items) + user_idx
recommendations = mf_model.predict_single_user(user)
# print(recommendations)

1


In [21]:
# read news article
article_id = behaviour[behaviour.userIdx == user_idx]["click"].values[0]
news[news.id == ind2item[article_id]]

Unnamed: 0,id,category,subcategory,title,abstract,url,title_entities,abstract_entities,author,date
34228,N55689,sports,football_nfl,"Charles Rogers, former Michigan State football...","Charles Rogers, the former Michigan State foot...",https://assets.msn.com/labs/mind/BBWAPO6.html,"[{""Label"": ""Charles Rogers (American football)...","[{""Label"": ""2003 NFL Draft"", ""Type"": ""U"", ""Wik...",Charlie Davis,"February 9, 2019"


In [22]:
news[news.id.isin([(ind2item[item + 1])  for item in recommendations])]

Unnamed: 0,id,category,subcategory,title,abstract,url,title_entities,abstract_entities,author,date
50,N48239,autos,autosenthusiasts,"2020 Toyota Supra GT4 details announced, makes...",It goes on sale next year,https://assets.msn.com/labs/mind/AAJwpE1.html,[],[],Diana Taylor,"February 17, 2019"
269,N27822,news,newsus,"The plan to close Rikers Island, explained","The review process is at a crucial juncture, h...",https://assets.msn.com/labs/mind/AAE5mif.html,"[{""Label"": ""Rikers Island"", ""Type"": ""L"", ""Wiki...",[],Alice Smith,"January 13, 2019"
462,N44438,sports,football_nfl,Ramsey promises Rams no holdout,The Los Angeles Rams' blockbuster acquisition ...,https://assets.msn.com/labs/mind/AAJ4nHP.html,"[{""Label"": ""Los Angeles Rams"", ""Type"": ""O"", ""W...","[{""Label"": ""Los Angeles Rams"", ""Type"": ""O"", ""W...",Charlie Brown,"February 12, 2019"
507,N36357,sports,football_nfl,Jalen Ramsey expresses desire to stay with Ram...,New Rams cornerback Jalen Ramsey has yet to sp...,https://assets.msn.com/labs/mind/AAISy8L.html,"[{""Label"": ""Jalen Ramsey"", ""Type"": ""P"", ""Wikid...","[{""Label"": ""Jalen Ramsey"", ""Type"": ""P"", ""Wikid...",Bob Smith,"January 22, 2019"
514,N30816,autos,autossuvs,2021 Ford Bronco: What We Know So Far,Ford's rugged SUV will make its triumphant ret...,https://assets.msn.com/labs/mind/AAIp8W9.html,"[{""Label"": ""Ford Bronco"", ""Type"": ""V"", ""Wikida...","[{""Label"": ""Ford Bronco"", ""Type"": ""V"", ""Wikida...",Felix Miller,"February 11, 2019"
...,...,...,...,...,...,...,...,...,...,...
51051,N63997,travel,travelarticle,Orweiler Road bridge replacement project behin...,The road was closed to traffic on July 22 and ...,https://assets.msn.com/labs/mind/BBWyqxH.html,[],"[{""Label"": ""Clear Fork Reservoir"", ""Type"": ""L""...",Alice Jones,"February 16, 2019"
51064,N58369,travel,travelnews,Here's what's open and closed on Veterans Day ...,"City of Pittsburgh, Allegheny County, Commonwe...",https://assets.msn.com/labs/mind/BBWyrcW.html,"[{""Label"": ""Veterans Day"", ""Type"": ""H"", ""Wikid...","[{""Label"": ""Veterans Day"", ""Type"": ""H"", ""Wikid...",Bob Jones,"April 11, 2019"
51073,N32240,news,newsworld,Turkey should scrap Russian missile system or ...,Turkey should scrap Russian missile system or ...,https://assets.msn.com/labs/mind/BBWysCy.html,"[{""Label"": ""Turkey"", ""Type"": ""G"", ""WikidataId""...","[{""Label"": ""Turkey"", ""Type"": ""G"", ""WikidataId""...",Alice Brown,"March 26, 2019"
51085,N38607,sports,icehockey_nhl,What We've Learned About Each Player On Bruins...,https://www.dailymotion.com/embed/video/x nnj ...,https://assets.msn.com/labs/mind/BBWyscn.html,"[{""Label"": ""Boston Bruins"", ""Type"": ""O"", ""Wiki...","[{""Label"": ""Boston Bruins"", ""Type"": ""O"", ""Wiki...",Ivy Taylor,"October 15, 2019"


In [28]:
## Add more information to the article data 
# The item index
news["ind"] = news["id"].map(item2ind)
news = news.sort_values("ind").reset_index(drop=True)
# Number of clicks in training data per article, investigate the cold start issue
news["n_click_training"] = news["ind"].map(dict(Counter(train.click)))
# 5 most clicked articles
# news.sort_values("n_click_training",ascending=False).head()
trendingnews = news.sort_values("n_click_training",ascending=False)
trendingnews.head()

Unnamed: 0,id,category,subcategory,title,abstract,url,title_entities,abstract_entities,author,date,ind,n_click_training
34228,N55689,sports,football_nfl,"Charles Rogers, former Michigan State football...","Charles Rogers, the former Michigan State foot...",https://assets.msn.com/labs/mind/BBWAPO6.html,"[{""Label"": ""Charles Rogers (American football)...","[{""Label"": ""2003 NFL Draft"", ""Type"": ""U"", ""Wik...",Charlie Davis,"February 9, 2019",34229,3662.0
50910,N35729,news,newsus,Porsche launches into second story of New Jers...,The Porsche went airborne off a median in Toms...,https://assets.msn.com/labs/mind/BBWyjM9.html,"[{""Label"": ""Porsche"", ""Type"": ""O"", ""WikidataId...","[{""Label"": ""Porsche"", ""Type"": ""O"", ""WikidataId...",Charlie Moore,"September 1, 2019",50911,2638.0
34776,N33619,news,newsus,College gymnast dies following training accide...,"Melanie Coleman, 20, of Milford, was practicin...",https://assets.msn.com/labs/mind/BBWBKRg.html,"[{""Label"": ""Connecticut"", ""Type"": ""G"", ""Wikida...",[],Ivy Johnson,"October 29, 2019",34777,2464.0
38249,N53585,tv,tvnews,"Rip Taylor's Cause of Death Revealed, Memorial...",The comedian died at the age of 84 last month.,https://assets.msn.com/labs/mind/BBWBgRz.html,"[{""Label"": ""Rip Taylor"", ""Type"": ""P"", ""Wikidat...",[],Alice Johnson,"May 13, 2019",38250,2360.0
34773,N63970,finance,finance-companies,Dean Foods files for bankruptcy,"Dean Foods, America's largest milk producer, i...",https://assets.msn.com/labs/mind/BBWEnS3.html,"[{""Label"": ""Dean Foods"", ""Type"": ""O"", ""Wikidat...","[{""Label"": ""Dean Foods"", ""Type"": ""O"", ""Wikidat...",Henry Jones,"July 23, 2019",34774,2021.0
