# Set Up

In [1]:
# Import relevant libraries
import pandas as pd
from google.colab import drive

import urllib.request
import os
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

from datetime import timedelta

In [2]:
# Function to select multiple user IDs & timestamps for evaluation
def select_user_ids_timestamps(minimum_history=5, minimum_impressions=1, k=5):
  """Pick the last ``k`` eligible (user ID, timestamp) pairs from ``behaviors_df``.

  A row of the module-level ``behaviors_df`` is eligible when its
  whitespace-separated 'History' holds at least ``minimum_history`` entries and
  its whitespace-separated 'Impressions' holds at least ``minimum_impressions``
  entries.

  The shared frame is only read: the timestamps are rendered as strings on a
  temporary copy of the column, so the 'Timestamp' column keeps its dtype.

  Args:
    minimum_history: minimum number of entries required in 'History'.
    minimum_impressions: minimum number of entries required in 'Impressions'.
    k: how many rows to take from the end of the eligible rows.

  Returns:
    A list of ``(user_id, timestamp_as_string)`` tuples, in frame order.
  """
  # Number of whitespace-separated entries per row (missing for missing values)
  history_sizes = behaviors_df['History'].str.split().str.len()
  impression_sizes = behaviors_df['Impressions'].str.split().str.len()

  # A comparison against a missing length is False, so such rows are never eligible
  eligible = (history_sizes >= minimum_history) & (impression_sizes >= minimum_impressions)

  # Stringify the whole column so the text format does not depend on which rows
  # end up selected, but do not write the result back into the shared frame
  timestamps_as_text = behaviors_df['Timestamp'].astype(str)

  # Keep the last k eligible rows
  user_ids = behaviors_df.loc[eligible, 'User ID'].tail(k)
  timestamps = timestamps_as_text[eligible].tail(k)

  return list(zip(user_ids, timestamps))

# Import Data

In [3]:
# Locations used by this notebook; edit them here instead of in every read call
DRIVE_MOUNT_POINT = '/content/drive'
TRAIN_DATA_DIR = f"{DRIVE_MOUNT_POINT}/MyDrive/Group_19/01.Dataset/Small/Clean/Train"

# Mount Google Drive at the mount point
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

# NOTE(review): unpickling can execute arbitrary code -- only read pickle files
# that come from a trusted source.

# import behaviors df
behaviors_df = pd.read_pickle(f"{TRAIN_DATA_DIR}/behaviors.pkl")

# import news df
news_df = pd.read_pickle(f"{TRAIN_DATA_DIR}/news.pkl")

Mounted at /content/drive


# Frequency & Categorical Recommender

In [4]:
def single_user_recomendations_frequency(user_id, timestamp, categories=None, k=5):
    """Recommend up to ``k`` of the most-read articles from the two weeks before ``timestamp``.

    Articles are ranked by how often they occur in the 'History' of the
    ``behaviors_df`` rows whose 'Timestamp' lies strictly between
    ``timestamp - 2 weeks`` and ``timestamp``.

    * If the user has rows in ``behaviors_df`` with a non-empty history, the
      ranking is restricted to the user's three most frequent categories
      (counted over the distinct articles in all of the user's rows, regardless
      of their timestamp) and articles the user has already read are removed.
    * Otherwise (unknown user, or a user without any history) the ranking is
      restricted to ``categories``; when ``categories`` is None the overall
      most-read articles are returned.

    Side effect: the 'Timestamp' column of the module-level ``behaviors_df`` is
    converted to datetime in place.

    Args:
        user_id: value to look up in the 'User ID' column of ``behaviors_df``.
        timestamp: reference time; anything ``pd.to_datetime`` accepts.
        categories: category name or list-like of category names used for users
            without history; None means no category filter.
        k: maximum number of article IDs to return.

    Returns:
        A list of at most ``k`` News IDs, most read first.
    """
    # Settings to filter for the relevant time period
    behaviors_df["Timestamp"] = pd.to_datetime(behaviors_df["Timestamp"])
    timestamp_threshold = pd.to_datetime(timestamp)
    max_old_date = timestamp_threshold - timedelta(weeks=2)

    # Build the News ID -> Category lookup once; both rankings below need it
    category_by_news_id = news_df.set_index("News ID")["Category"]

    # Count how often each article appears in the histories of the time window
    filtered_behaviors = behaviors_df[(behaviors_df["Timestamp"] < timestamp_threshold) & (behaviors_df["Timestamp"] > max_old_date)]
    articles_df = filtered_behaviors["History"].str.split().explode().to_frame()
    articles_df["lectures"] = 1
    articles_most_read = articles_df.groupby("History").sum().sort_values(by="lectures", ascending=False)
    articles_most_read["Category"] = articles_most_read.index.map(category_by_news_id.get)

    # Users that exist in the df AND actually have a history
    if behaviors_df["User ID"].isin([user_id]).any():
        user_history = (
            behaviors_df.loc[behaviors_df["User ID"] == user_id, "History"]
            .str.split()
            .explode()
            .dropna()  # rows without history contribute nothing
        )

        if not user_history.empty:
            # Favorite categories, counted over the distinct articles the user has read
            article_counts = user_history.value_counts()
            top_categories = article_counts.index.map(category_by_news_id).value_counts().index[:3]

            # Keep the favorite categories and drop articles already read by the user
            filtered_articles = articles_most_read[articles_most_read["Category"].isin(top_categories)]
            not_yet_read = ~filtered_articles.index.isin(user_history)
            return filtered_articles.index[not_yet_read].tolist()[:k]

    # Users with no history: fall back to the requested categories, if any
    if categories is None:
        # No preference given -- isin(None) would raise, so rank over all articles
        candidate_articles = articles_most_read
    else:
        if isinstance(categories, str):
            # A single category name is treated as a one-element list
            categories = [categories]
        candidate_articles = articles_most_read[articles_most_read["Category"].isin(categories)]

    return candidate_articles.index.tolist()[:k]


In [5]:
def multiple_user_recomendations_frequency(user_ids_timestamps, categories=None, k=5):
  """Run the single-user frequency recommender for every (user ID, timestamp) pair.

  Prints a 1-based progress number before each pair is processed.

  Args:
    user_ids_timestamps: iterable of ``(user_id, timestamp)`` pairs.
    categories: passed through to ``single_user_recomendations_frequency``.
    k: passed through to ``single_user_recomendations_frequency``.

  Returns:
    A dict mapping each ``(user_id, timestamp)`` pair to the value returned by
    ``single_user_recomendations_frequency`` for it.
  """
  recommendations_by_request = {}

  for progress, (user_id, timestamp) in enumerate(user_ids_timestamps, start=1):
    # Report how many pairs have been started so far
    print(progress)
    recommendations_by_request[(user_id, timestamp)] = single_user_recomendations_frequency(
        user_id,
        timestamp,
        categories=categories,
        k=k,
    )

  return recommendations_by_request

# Frequency & Categorical Predictions

### Single User

In [7]:
# Recommend 5 articles for one user at one point in time (no category preference given)
final_recommended_ids = single_user_recomendations_frequency(
    user_id='U13740',
    timestamp='2019-11-13 15:27:40',
    categories=None,
    k=5,
)

In [8]:
# Show the article IDs recommended for the single user above
final_recommended_ids

['N43142', 'N871', 'N29177', 'N16715', 'N51706']

### Multiple Users

In [9]:
# Select a subset of k (user ID, timestamp) pairs to test on
user_ids_timestamps = select_user_ids_timestamps(k=5)

# Make recommendations for every selected pair
final_recommended_ids_multiple = multiple_user_recomendations_frequency(
    user_ids_timestamps,
    categories=None,
    k=5,
)

1
2
3
4
5


In [10]:
# View recommendations for multiple users, keyed by (user ID, timestamp)
final_recommended_ids_multiple

{('U17467', '2019-11-12 17:19:50'): ['N42620',
  'N871',
  'N29177',
  'N55189',
  'N52551'],
 ('U21593', '2019-11-14 22:24:05'): ['N42620',
  'N31801',
  'N45794',
  'N16715',
  'N46392'],
 ('U10123', '2019-11-13 06:57:04'): ['N31801',
  'N45794',
  'N16715',
  'N51706',
  'N54827'],
 ('U75630', '2019-11-14 10:58:13'): ['N31801',
  'N45794',
  'N16715',
  'N46392',
  'N54827'],
 ('U44625', '2019-11-13 14:57:02'): ['N42620',
  'N31801',
  'N45794',
  'N16715',
  'N46392']}