# README

This notebook contains all model iterations created for this project, starting from a simple random recommender and ending with our final model, a combined collaborative and content-based filtering recommender system.

To use any of the models to generate recommendations for a given user ID at a given timestamp (e.g. "2019-11-13 15:27:40"), or for multiple user IDs at different timestamps, follow the steps below:

1. Run notebook from start to end of 'Model 3: Collaborative & Content Based Recommender'

2. Open 'Make Predictions' section
  - If testing on multiple users, select the number of test users

3. Open subheading of model you want to use

4. Select single user or multi user recommendations

5. Run chosen model

6. Optionally uncomment code to export results to json file

# Set Up

## Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
import datetime
import scipy.sparse as sp
from scipy.sparse import csc_matrix
from scipy.sparse import load_npz
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
import nltk
from nltk.corpus import stopwords
import heapq
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
pd.set_option('display.max_colwidth', None)
import json
import ast
import warnings
warnings.filterwarnings("ignore")
import urllib.request
import os
from datetime import timedelta
from scipy.spatial.distance import cosine
import itertools
import random

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Import Data



In [8]:
# Colab-only step: mount Google Drive so the dataset files are reachable under /content/drive
from google.colab import drive
drive.mount('/content/drive')

# Load the behaviors dataframe (user interactions; a sample is displayed in the next cell).
# NOTE(review): read_pickle can execute arbitrary code stored in the file - only load pickles you trust.
behaviors_df = pd.read_pickle("/content/drive/MyDrive/Group_19/01.Dataset/Small/Clean/Train/behaviors.pkl")

# Load the news dataframe (article data read by the recommenders defined below)
news_df = pd.read_pickle("/content/drive/MyDrive/Group_19/01.Dataset/Small/Clean/Train/news.pkl")

# NOTE(review): `glove_model` is read by single_user_recommendations_pure_content_glove, but the only
# code that defines it is the commented-out GloVe loader below - uncomment it before using that recommender.
# # Load pre-trained Google News Word2Vec model
# model_path = "/content/drive/MyDrive/Group_19/01.Dataset/GoogleNews-vectors-negative300.bin"
# google_model = KeyedVectors.load_word2vec_format(model_path, binary=True)

# # Load pre-trained Glove Word2Vec model
# model_path = "/content/drive/MyDrive/Group_19/01.Dataset/glove.6B.300d.txt"
# glove_model = {}

# with open(model_path, 'r', encoding='utf8') as file:
#     for line in file:
#         parts = line.split()
#         word = parts[0]
#         embedding = np.array([float(val) for val in parts[1:]])
#         glove_model[word] = embedding

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
# Sanity check: display every logged interaction of one example user
behaviors_df.loc[behaviors_df['User ID']=='U13740']

Unnamed: 0,User ID,Timestamp,History,Impressions,History & Impressions,Average Vector
0,U13740,2019-11-11 09:05:58,N42782 N18445 N55189 N19347 N45794 N10414 N63302 N31801 N34694,N55689,N42782 N18445 N55189 N19347 N45794 N10414 N63302 N31801 N34694 N55689,"[-0.014606062493754818, -0.02841637492969335, 0.02371981330357809, 0.009556050663624305, -0.0422950266549711, -0.012634033702098712, 0.025006863912623013, -0.002610287599750112, 0.007933776577744732, 0.0057190535123700665, 0.0491532812601282, 0.05749945489872381, 0.026925346099389746, -0.014982412822799274, -0.025055876820228112, -0.023746189567295, 0.005080339706172858, 0.008010757221064476, -0.04396915791593488, 0.002112705376362015, -0.06642412956471191, -0.012746648982221035, -0.0022455996219859233, 0.0006853153529116356, -0.0012351444187181488, -0.016091391157134573, 0.002439112567173429, -0.006423315420792992, 0.01211253476254524, 0.009400782066094678, -0.014237181864117415, 0.019026322500717258, 0.04758785196220639, -0.04587253953831038, 0.024527962524958157, -0.03714662646084465, -0.05322331354597981, -0.035100688131982125, -0.004473486433301354, 0.037951936227451084, -0.025963429143521833, 0.03638469596056955, -0.009913495753708366, 0.01663233791264424, -0.018381362213831222, -0.029326694539909253, -0.006303412268038785, 0.039395503069485215, -0.00180327118308573, 0.0072839031204378485, -0.0231982012384921, -0.01576067722869785, 0.035454009785580574, 0.05028668191649095, 0.014613864029587953, 0.03431599349521232, -0.0028338826792617153, -0.012185012552739315, -0.02298737854135876, 0.009832405998956071, -0.004609509098362196, 0.03469953616090543, 0.011528941747851795, 0.01476928494028599, 0.016092007446671122, 0.026100055105888604, -0.022858431022648507, -0.03895301730120605, 0.027643572927700626, -0.014615578402528157, -0.045407649509863146, -0.0055726218631066025, 0.04967636279998334, 0.0429824294916797, -0.036944973509377414, -0.00439484496218269, -0.009726584098703457, 0.044619129220500875, -0.00026324803551897415, 0.04958850295594698, 0.022808497740811864, -0.023784744676203402, 
-0.0053312269170054435, 0.04307054397984106, 0.046043928934022996, -0.000992646750708694, 0.013062171327589151, -0.021695856995925966, -0.011250172181620416, -0.009029016311564381, -0.004436389195224033, 0.0404441875799851, 0.022881511630580217, -0.020203295700649174, 0.013582978203417778, -0.05494556589388144, 0.018376246195553475, -0.035659490832620135, -0.002349582923667594, 0.03061314961582341]"
35262,U13740,2019-11-09 05:59:43,N42782 N18445 N55189 N19347 N45794 N10414 N63302 N31801 N34694,N28910,N42782 N18445 N55189 N19347 N45794 N10414 N63302 N31801 N34694 N28910,"[-0.010981766104865925, -0.029624159991421744, 0.027244411359133643, 0.007351641373500849, -0.03657101727225505, -0.01646845947370365, 0.020065892245956347, -0.004404096982466162, 0.008678827195028683, 0.016270772709900932, 0.04940365916136276, 0.05628198906539047, 0.028854384617908263, -0.01480641461292273, -0.024724894165907126, -0.022788770740134502, -0.0004018167753086242, 0.010598921572916328, -0.03958575461346573, -0.0005292904878355153, -0.06257701552150205, -0.014289750587159306, -0.00433521780099827, 0.005783365352911636, -0.0026956031532860505, -0.016415801527504943, 0.00023985124001293516, -0.00896253193313867, 0.0066187410279773375, 0.013842476047576158, -0.017869161061648278, 0.011976404506890095, 0.043247099739984175, -0.04065141793337211, 0.017933288265698896, -0.03066350670775823, -0.04755645397807858, -0.041275204767784596, -0.014060182544412467, 0.03876759048671034, -0.029651643742287258, 0.03554753592970535, -0.014748052975930587, 0.012888706215113371, -0.022387584374325052, -0.03326139083620555, -0.009666625570507922, 0.037445462822571635, -0.0064088256892585686, 0.010258421299450194, -0.022624137966887163, -0.011647145068204023, 0.028387276205333666, 0.045820876330071196, 0.012354245819711408, 0.026114436519903684, -0.006008908543459245, -0.014299091688541788, -0.02168033903518592, -0.0005057054825254092, -0.011283417339102935, 0.0435747490930042, 0.0117384829206913, 0.021689618736582282, 0.019088498033090876, 0.0161309826367528, -0.0264907162078337, -0.034088050325897415, 0.020029997649922852, -0.01773368750746643, -0.0489901375654187, -0.005341304208785616, 0.048350196874057415, 0.04895404801019822, -0.03381068810814285, -0.008061452801688863, -0.014573398820925676, 0.04416542101062433, 0.0044459332607773225, 0.05293511236952723, 0.023569952864268655, 
-0.021495212886079945, -0.003073252626881987, 0.04583455756008797, 0.04075015325501065, -0.0066736016889803, 0.014034017932527426, -0.01796681258234572, -0.0128409573668056, -0.014999166836255743, -0.007889771448310452, 0.0348101114071456, 0.02089525169230861, -0.02864211443521707, 0.007089298481195554, -0.0550092576840049, 0.023571314559751005, -0.04006091178941026, -0.0015419003310750022, 0.02903490115903329]"
154836,U13740,2019-11-13 15:27:40,N42782 N18445 N55189 N19347 N45794 N10414 N63302 N31801 N34694,N58133,N42782 N18445 N55189 N19347 N45794 N10414 N63302 N31801 N34694 N58133,"[-0.013295162493754818, -0.03683597492969335, 0.027015733303578086, 0.008463010663624305, -0.03511638665497109, -0.016911013702098715, 0.020322843912623012, -0.008159407599750111, 0.01571723657774473, 0.013127013512370066, 0.05795892126012819, 0.06206919489872381, 0.033033226099389745, -0.013415112822799271, -0.030497736820228116, -0.018950689567295002, 0.005552539706172858, 0.016516357221064478, -0.03863017791593487, 0.0008932453763620153, -0.06501572956471191, -0.018877328982221032, 0.0006657603780140769, 0.0015387953529116358, -0.006922904418718148, -0.018790971157134576, 0.001702832567173429, -0.005637895420792993, 0.00877015476254524, 0.01246232206609468, -0.021662841864117415, 0.01861642250071726, 0.048091811962206396, -0.04038821953831038, 0.019236662524958155, -0.033857706460844655, -0.05362673354597981, -0.04045744813198213, -0.013833066433301355, 0.03666595622745108, -0.03623120914352183, 0.036570475960569546, -0.017925435753708367, 0.015333177912644236, -0.025425042213831222, -0.031651554539909255, -0.016167652268038783, 0.040379603069485216, -0.0047002311830857315, 0.006582243120437847, -0.024645901238492102, -0.01519541722869785, 0.03293450978558058, 0.050069841916490956, 0.019057964029587954, 0.03126317349521232, -0.006587762679261716, -0.014045792552739318, -0.02929505854135876, 0.004411325998956071, -0.007994829098362194, 0.03854827616090543, 0.009039741747851795, 0.025962924940285986, 0.018530867446671123, 0.011351295105888604, -0.02661867102264851, -0.034776197301206055, 0.01157397292770063, -0.016911378402528158, -0.04996402950986314, -0.012486421863106603, 0.049396682799983334, 0.0448499694916797, -0.030522153509377415, -0.0032535649621826894, -0.013438504098703456, 0.05246936922050087, -0.0006849080355189735, 0.05603364295594698, 0.026106577740811866, -0.0202511046762034, 
-0.000903206917005444, 0.04516004397984105, 0.044757908934023, -0.005157186750708695, 0.01629351132758915, -0.02104193699592597, -0.021222812181620414, -0.009010796311564382, -0.009542429195224035, 0.035533747579985106, 0.020696491630580217, -0.02716111570064917, 0.0062255582034177765, -0.054727845893881445, 0.014750446195553477, -0.04270347083262013, -0.0009777229236675937, 0.026759389615823415]"


## Create Function for Evaluation on Multiple Users

In [None]:
# Function to select multiple user IDs & timestamps for evaluation
def select_user_ids_timestamps(minimum_history=5, minimum_impressions=1, k=5):
  """Pick the last `k` interactions that have enough history and impressions.

  Args:
    minimum_history: minimum number of article IDs required in 'History'.
    minimum_impressions: minimum number of article IDs required in 'Impressions'.
    k: number of qualifying rows to keep, taken from the end of behaviors_df.

  Returns:
    List of (user ID, timestamp string) tuples, in dataframe order.

  Side effect:
    The 'Timestamp' column of the global behaviors_df is converted to strings in place.
  """
  # Store timestamps as strings (this rewrites the column of the global dataframe)
  behaviors_df['Timestamp'] = behaviors_df['Timestamp'].astype(str)

  # Number of whitespace-separated article IDs per row (NaN for missing values)
  history_sizes = behaviors_df['History'].str.split().str.len()
  impression_sizes = behaviors_df['Impressions'].str.split().str.len()

  # Rows with a missing value compare as False and are therefore dropped
  has_enough_data = (history_sizes >= minimum_history) & (impression_sizes >= minimum_impressions)

  # Keep the last k qualifying rows
  selected_rows = behaviors_df.loc[has_enough_data].tail(k)

  return list(zip(selected_rows['User ID'], selected_rows['Timestamp']))

# Baseline Model: Random Recommender

In [None]:
def single_user_recommendations_random(user_id, timestamp, k=5):
  """Baseline recommender: pick random articles released shortly before `timestamp`.

  Args:
    user_id: unused by this baseline; kept so all recommenders share one signature.
    timestamp: moment of the recommendation (anything pd.to_datetime accepts).
    k: maximum number of article IDs to return.

  Returns:
    List of at most `k` distinct article IDs released within the two weeks before
    `timestamp`. Fewer than `k` IDs (possibly none) are returned when the window
    does not contain enough articles.
  """
  # Convert input timestamp to datetime
  timestamp = pd.to_datetime(timestamp)

  # Oldest release date that is still eligible
  max_old_date = timestamp - timedelta(weeks=2)

  # Articles released strictly inside the two-week window before the timestamp
  release_dates = news_df["Release Date"]
  released_in_window = (release_dates < timestamp) & (release_dates > max_old_date)
  article_ids = news_df.loc[released_in_window, "News ID"].tolist()

  # random.sample raises ValueError when asked for more items than exist,
  # so cap the sample size at the number of eligible articles
  return random.sample(article_ids, min(k, len(article_ids)))

In [None]:
def multiple_user_recomendations_random(user_ids_timestamps, k=5):
  """Run the random baseline for several (user ID, timestamp) pairs.

  Args:
    user_ids_timestamps: iterable of (user ID, timestamp) pairs.
    k: number of articles requested per pair.

  Returns:
    Dictionary keyed by (user ID, timestamp) holding each pair's recommendations.
  """
  recommendations_by_pair = {}

  # The 1-based position is printed before each pair is processed as a progress indicator
  for progress, (user_id, timestamp) in enumerate(user_ids_timestamps, start=1):
    print(progress)
    recommendations_by_pair[(user_id, timestamp)] = single_user_recommendations_random(user_id, timestamp, k=k)

  return recommendations_by_pair

# Model 1: Frequency & Category Recommender

In [None]:
def _latest_user_history(user_id):
    """Return the article IDs in the history of the user's most recent interaction ([] if none)."""
    user_rows = behaviors_df.loc[behaviors_df["User ID"] == user_id]
    if user_rows.empty:
        return []

    # Position of the row with the latest timestamp among this user's interactions
    latest_position = pd.to_datetime(user_rows["Timestamp"]).to_numpy().argmax()
    history = user_rows["History"].iloc[latest_position]

    # A missing history is stored as NaN rather than a string
    return history.split() if isinstance(history, str) else []


def _rank_articles_by_recent_reads(timestamp, category_by_id, weeks=2):
    """Rank articles by how often they occur in histories logged in the `weeks` before `timestamp`."""
    cutoff = pd.to_datetime(timestamp)
    oldest_allowed = cutoff - timedelta(weeks=weeks)

    # Parse into a local Series: converting behaviors_df["Timestamp"] in place would silently
    # change the dtype seen by every other function that reads the shared dataframe.
    interaction_times = pd.to_datetime(behaviors_df["Timestamp"])
    in_window = (interaction_times < cutoff) & (interaction_times > oldest_allowed)

    # One entry per (interaction, article in its history)
    read_ids = behaviors_df.loc[in_window, "History"].dropna().str.split().explode().dropna()

    # Most-read first; ties are ordered by article ID so the ranking is reproducible
    read_counts = read_ids.value_counts().sort_index().sort_values(ascending=False, kind="stable")

    ranked = read_counts.rename("lectures").to_frame()
    ranked["Category"] = [category_by_id.get(article_id) for article_id in ranked.index]
    return ranked


def single_user_recomendations_frequency(user_id, timestamp, categories=None, k=5):
    """Recommend the most-read recent articles, filtered by category.

    Articles are ranked by how many interaction histories logged in the two weeks
    before `timestamp` contain them.

    * Known user with a history: only articles from the (up to) three categories that
      are most frequent in the user's most recent history are kept, and articles the
      user has already read are removed.
    * Unknown user, or user without a history: articles are restricted to `categories`
      when it is given; with `categories=None` no category filter is applied.

    Args:
        user_id: ID of the user to recommend for.
        timestamp: moment of the recommendation (anything pd.to_datetime accepts).
        categories: optional category name or list of names for the cold-start case.
        k: maximum number of article IDs to return.

    Returns:
        List of at most `k` article IDs, most-read first.
    """
    category_by_id = dict(zip(news_df["News ID"], news_df["Category"]))
    ranked_articles = _rank_articles_by_recent_reads(timestamp, category_by_id)

    user_history = _latest_user_history(user_id)

    if user_history:
        # Count each read article once, then keep the three most frequent categories
        unique_history = pd.Series(list(dict.fromkeys(user_history)))
        top_categories = unique_history.map(category_by_id).value_counts().index[:3]

        eligible = ranked_articles[ranked_articles["Category"].isin(top_categories)]
        # Remove articles that the user has already read
        eligible = eligible[~eligible.index.isin(user_history)]
    else:
        eligible = ranked_articles
        if categories is not None:
            # Accept a single category name as well as a list of names
            wanted = [categories] if isinstance(categories, str) else list(categories)
            eligible = eligible[eligible["Category"].isin(wanted)]

    return eligible.index[:k].to_list()

In [None]:
def multiple_user_recomendations_frequency(user_ids_timestamps, categories=None, k=5):
  """Run the frequency & category recommender for several (user ID, timestamp) pairs.

  Args:
    user_ids_timestamps: iterable of (user ID, timestamp) pairs.
    categories: forwarded unchanged to single_user_recomendations_frequency.
    k: number of articles requested per pair.

  Returns:
    Dictionary keyed by (user ID, timestamp) holding each pair's recommendations.
  """
  recommendations_by_pair = {}

  # The 1-based position is printed before each pair is processed as a progress indicator
  for progress, (user_id, timestamp) in enumerate(user_ids_timestamps, start=1):
    print(progress)
    recommendations_by_pair[(user_id, timestamp)] = single_user_recomendations_frequency(
        user_id, timestamp, categories=categories, k=k
    )

  return recommendations_by_pair

# Model 2: Content Based Recommender

## Define Functions for Content Based Filtering



### Word2Vec (Glove Model)

In [None]:
def create_previously_read_content(user_id, timestamp):
  '''Collect the words of every article in a user's history.

  Inputs:
  user_id: ID of the user
  timestamp: timestamp of the interaction whose 'History' should be used

  Reads the global behaviors_df and news_df dataframes.

  Outputs:
  previously_read_content: list of words in all of the articles that were previously read by the user

  Raises IndexError when behaviors_df has no row for the given user ID and timestamp.
  '''
  # Locate the requested interaction and split its history into article IDs
  is_requested_interaction = (behaviors_df['User ID'] == user_id) & (behaviors_df['Timestamp'] == timestamp)
  history_lists = behaviors_df.loc[is_requested_interaction, 'History'].str.split()
  previously_read_article_ids = history_lists.tolist()[0]

  # Content of the articles the user has already read
  was_read = news_df['News ID'].isin(previously_read_article_ids)
  read_contents = news_df.loc[was_read, ['News ID', 'Content']]['Content']

  # Glue all contents together, then break the text into single words
  return ' '.join(read_contents).split()

In [None]:
def create_unread_content(user_id, timestamp):
  '''Collect the words of every article that is not in a user's history.

  Inputs:
  user_id: ID of the user
  timestamp: timestamp of the interaction whose 'History' should be used

  Reads the global behaviors_df and news_df dataframes.

  Outputs:
  unread_content: dictionary with unread article IDs as keys and the list of words in the article content as values

  Raises IndexError when behaviors_df has no row for the given user ID and timestamp.
  '''
  # Locate the requested interaction and split its history into article IDs
  is_requested_interaction = (behaviors_df['User ID'] == user_id) & (behaviors_df['Timestamp'] == timestamp)
  history_lists = behaviors_df.loc[is_requested_interaction, 'History'].str.split()
  previously_read_article_ids = history_lists.tolist()[0]

  # Every article that does not appear in the user's history
  is_unread = ~news_df['News ID'].isin(previously_read_article_ids)
  unread_articles = news_df.loc[is_unread, ['News ID', 'Content']]

  # Map each unread article ID to the words of its content
  return {
      news_id: content.split()
      for news_id, content in zip(unread_articles['News ID'], unread_articles['Content'])
  }

In [None]:
def single_user_recommendations_pure_content_glove(previously_read_content, unread_content, k=5):
    """
    Rank unread articles by GloVe similarity to the content a user has already read.

    previously_read_content: list of content words in a user's previously read articles obtained from create_previously_read_content function
    unread_content: dictionary with article_ids as keys and list of words in article content as values obtained from create_unread_content function
    k: maximum number of articles to return

    Returns a list of (article ID, similarity score) tuples, most similar first.
    Words are lower-cased before the lookup in the global `glove_model`. Articles
    without any known word are skipped, and an empty list is returned when the
    user's content contains no known word.
    """
    # Keep only words known to the GloVe model. The lower-cased form is both tested AND looked up,
    # so a capitalised word can no longer pass the filter and then fail the embedding lookup.
    interest_words = [word.lower() for word in previously_read_content if word.lower() in glove_model]
    if not interest_words:
        return []

    # The user embedding does not depend on the article, so compute it once
    interest_embedding = np.mean([glove_model[word] for word in interest_words], axis=0)

    article_ids = []
    article_embeddings = []
    for news_id, content in unread_content.items():
        known_words = [word.lower() for word in content if word.lower() in glove_model]
        if not known_words:
            # The mean of zero vectors is NaN, which cosine_similarity rejects
            continue
        article_ids.append(news_id)
        article_embeddings.append(np.mean([glove_model[word] for word in known_words], axis=0))

    if not article_ids:
        return []

    # One vectorised similarity computation for all candidate articles
    similarity_scores = cosine_similarity([interest_embedding], np.vstack(article_embeddings))[0]

    # Sort the (article ID, score) pairs by similarity score (descending order)
    key_value_pairs = sorted(zip(article_ids, similarity_scores), key=lambda pair: pair[1], reverse=True)

    # Get the top k pairs
    return key_value_pairs[:k]

In [None]:
def multiple_user_recommendations_pure_content_glove(user_ids_timestamps, articles_k=5):
  """Run the GloVe content recommender for several (user ID, timestamp) pairs.

  Args:
    user_ids_timestamps: iterable of (user ID, timestamp) pairs.
    articles_k: number of articles requested per pair.

  Returns:
    Dictionary keyed by (user ID, timestamp) holding each pair's recommendations.
  """
  recommendations_by_pair = {}

  # The 1-based position is printed before each pair is processed as a progress indicator
  for progress, (user_id, timestamp) in enumerate(user_ids_timestamps, start=1):
    print(progress)

    # Words the user has read, and the words of every unread article
    read_words = create_previously_read_content(user_id, timestamp)
    unread_words_by_article = create_unread_content(user_id, timestamp)

    # Rank the unread articles against the read content
    recommendations_by_pair[(user_id, timestamp)] = single_user_recommendations_pure_content_glove(
        read_words, unread_words_by_article, k=articles_k
    )

  return recommendations_by_pair

### Embeddings

In [None]:
def single_user_recommendations_pure_content_embeddings(user_id, timestamp, articles_k=5):
  """Recommend unread articles whose embedding is closest to the user's average history embedding.

  Args:
    user_id: ID of the user to recommend for.
    timestamp: timestamp of the interaction whose 'History' is used; also the latest
      release date an article may have to be recommended.
    articles_k: maximum number of articles to return.

  Returns:
    List of (article ID, cosine similarity) tuples, most similar first. Empty when none
    of the history articles has a vector or when no candidate article is left.

  Raises:
    IndexError: when behaviors_df has no row for the given user ID and timestamp.
  """
  # Get IDs in user's history
  is_requested_interaction = (behaviors_df['User ID'] == user_id) & (behaviors_df['Timestamp'] == timestamp)
  previously_read_article_ids = behaviors_df.loc[is_requested_interaction, 'History'].str.split().iloc[0]

  # Average the vectors of the articles in the user's history
  # (assumes every 'Average Vector' cell is a numeric sequence of one common length - TODO confirm)
  was_read = news_df['News ID'].isin(previously_read_article_ids)
  history_vectors = news_df.loc[was_read, 'Average Vector'].dropna()
  if history_vectors.empty:
    return []
  average_news_vector = np.mean(np.vstack(history_vectors.to_list()), axis=0)

  # Candidates: unread articles released no later than the interaction
  released_in_time = news_df['Release Date'] <= pd.to_datetime(timestamp)
  candidates = news_df.loc[~was_read & released_in_time].dropna(subset=['Average Vector'])
  if candidates.empty:
    return []

  # One vectorised similarity computation instead of one cosine_similarity call per row
  candidate_matrix = np.vstack(candidates['Average Vector'].to_list())
  similarity_scores = cosine_similarity([average_news_vector], candidate_matrix)[0]

  # assign() builds a new dataframe, so no column is written onto a slice of news_df
  top_articles = (
      candidates.assign(Similarity=similarity_scores)
      .sort_values(by='Similarity', ascending=False)
      .head(articles_k)
  )

  return list(zip(top_articles['News ID'], top_articles['Similarity']))


In [None]:
def multiple_user_recommendations_pure_content_embeddings(user_ids_timestamps, articles_k=5):
  '''Run the embeddings content recommender for several (user ID, timestamp) pairs.

  Inputs:
  user_ids_timestamps: iterable of (user_id, timestamp) pairs
  articles_k: number of articles requested per pair

  Outputs:
  dictionary keyed by (user_id, timestamp) holding each pair's recommendations
  '''
  recommendations_by_pair = {}

  for progress, (user_id, timestamp) in enumerate(user_ids_timestamps, start=1):
    # recommend for this single user and store the result under its pair
    recommendations_by_pair[(user_id, timestamp)] = single_user_recommendations_pure_content_embeddings(
        user_id, timestamp, articles_k=articles_k
    )

    # report how many pairs have been completed so far
    print(progress)

  return recommendations_by_pair

### TFIDF

In [None]:
def create_tfidf_features(news_df):
  """Build the TF-IDF feature matrix for the 'Content' column of `news_df`.

  Args:
    news_df: object whose 'Content' entry holds one text per article.

  Returns:
    Whatever TfidfVectorizer.fit_transform returns for those texts (one row per article).
  """
  # Vectorizer settings, including the preprocessing options
  vectorizer_options = {
      'strip_accents': None,
      'lowercase': True,
      'tokenizer': word_tokenize,
      'use_idf': True,
      'norm': 'l2',
      'smooth_idf': True,
      'stop_words': 'english',
      'max_df': 0.5,
      'sublinear_tf': True,
  }
  vectorizer = TfidfVectorizer(**vectorizer_options)

  # Learn the vocabulary and transform the article contents in one step
  return vectorizer.fit_transform(news_df['Content'])

In [None]:
def recommendations_pure_content_tfidf(user_id, timestamp, features, articles_k=5):
  """Recommend unread articles whose TF-IDF vector is closest to the user's average history vector.

  Args:
    user_id: ID of the user to recommend for.
    timestamp: timestamp of the interaction whose 'History' is used; articles released
      after it are never recommended.
    features: feature matrix with one row per row of news_df, in the same order
      (as produced by create_tfidf_features(news_df)).
    articles_k: maximum number of articles to return.

  Returns:
    List of (article ID, cosine similarity) tuples, most similar first. Empty when no
    history article is present in news_df or when no candidate article is left.

  Raises:
    IndexError: when behaviors_df has no row for the given user ID and timestamp.
    ValueError: when `features` and news_df do not have the same number of rows.
  """
  if features.shape[0] != len(news_df):
    raise ValueError("features must contain exactly one row per row of news_df")

  # Get IDs in user's history
  is_requested_interaction = (behaviors_df['User ID'] == user_id) & (behaviors_df['Timestamp'] == timestamp)
  previously_read_article_ids = behaviors_df.loc[is_requested_interaction, 'History'].str.split().iloc[0]

  timestamp = pd.to_datetime(timestamp)

  # Boolean masks over the rows of news_df. Row POSITIONS (not index labels) are used below,
  # because the rows of `features` are positional and news_df may not have a 0..n-1 index.
  is_read = news_df['News ID'].isin(previously_read_article_ids).to_numpy()
  is_future = (news_df['Release Date'] > timestamp).to_numpy()

  previously_read_positions = np.flatnonzero(is_read)
  candidate_positions = np.flatnonzero(~is_read & ~is_future)
  if previously_read_positions.size == 0 or candidate_positions.size == 0:
    return []

  # User profile: mean feature vector of the read articles
  user_profile = (
      np.asarray(features[previously_read_positions].sum(axis=0)).reshape(1, -1)
      / previously_read_positions.size
  )

  # Similarity between the user profile and every candidate article
  similarity_scores = cosine_similarity(user_profile, features[candidate_positions]).flatten()

  # Positions (within the candidates) of the articles_k highest scores, best first
  top_indices = similarity_scores.argsort()[::-1][:articles_k]

  # Translate back to article IDs and pair them with their scores
  top_news_ids = news_df['News ID'].to_numpy()[candidate_positions[top_indices]]
  return list(zip(top_news_ids, similarity_scores[top_indices]))

In [None]:
def single_user_recommendations_pure_content_tfidf(user_id, timestamp, articles_k=5):
  """Build TF-IDF features for the global news_df and recommend articles for one user.

  Args:
    user_id: ID of the user to recommend for.
    timestamp: timestamp of the user's interaction.
    articles_k: number of articles requested.

  Returns:
    The result of recommendations_pure_content_tfidf for this user.
  """
  # The feature matrix is rebuilt on every call
  tfidf_features = create_tfidf_features(news_df)

  return recommendations_pure_content_tfidf(user_id, timestamp, tfidf_features, articles_k=articles_k)

In [None]:
def multiple_user_recommendations_pure_content_tfidf(user_ids_timestamps, articles_k=5):
  """Run the TF-IDF content recommender for several (user ID, timestamp) pairs.

  The TF-IDF features of the global news_df are built once and shared by all pairs.

  Args:
    user_ids_timestamps: iterable of (user ID, timestamp) pairs.
    articles_k: number of articles requested per pair.

  Returns:
    Dictionary keyed by (user ID, timestamp) holding each pair's recommendations.
  """
  # Build the feature matrix a single time, before looping over the users
  tfidf_features = create_tfidf_features(news_df)

  recommendations_by_pair = {}

  for progress, (user_id, timestamp) in enumerate(user_ids_timestamps, start=1):
    # recommend for this single user and store the result under its pair
    recommendations_by_pair[(user_id, timestamp)] = recommendations_pure_content_tfidf(
        user_id, timestamp, tfidf_features, articles_k=articles_k
    )

    # report how many pairs have been completed so far
    print(progress)

  return recommendations_by_pair



## Define Functions that consolidate Pipeline

### Single User

In [None]:
def single_user_recommendations_pure_content(user_id, timestamp, method='embeddings', articles_k=10):
  """Recommend articles for one user with the chosen content-based method.

  Args:
    user_id: ID of the user to recommend for.
    timestamp: timestamp of the user's interaction.
    method: 'word2vec_glove', 'embeddings' or 'tfidf'.
    articles_k: number of articles requested.

  Returns:
    The recommendations produced by the selected single-user recommender.

  Raises:
    ValueError: when `method` is not one of the supported names.
  """
  if method == 'word2vec_glove':
    # create previously read content list
    previously_read_content = create_previously_read_content(user_id, timestamp)
    # create unread content dictionary
    unread_content = create_unread_content(user_id, timestamp)
    return single_user_recommendations_pure_content_glove(previously_read_content, unread_content, k=articles_k)

  if method == 'embeddings':
    return single_user_recommendations_pure_content_embeddings(user_id, timestamp, articles_k=articles_k)

  if method == 'tfidf':
    return single_user_recommendations_pure_content_tfidf(user_id, timestamp, articles_k=articles_k)

  # Without this, an unknown method name ended in an opaque UnboundLocalError
  raise ValueError(
      f"Unknown method {method!r}: expected 'word2vec_glove', 'embeddings' or 'tfidf'"
  )

### Multiple Users

In [None]:
def multiple_user_recommendations_pure_content(user_ids_timestamps, method='embeddings', articles_k=10):
  """Recommend articles for several users with the chosen content-based method.

  Args:
    user_ids_timestamps: iterable of (user ID, timestamp) pairs.
    method: 'word2vec_glove', 'embeddings' or 'tfidf'.
    articles_k: number of articles requested per pair.

  Returns:
    Dictionary of recommendations from the selected multi-user recommender;
    an empty dictionary when `method` is none of the supported names.
  """
  if method == 'word2vec_glove':
    return multiple_user_recommendations_pure_content_glove(user_ids_timestamps, articles_k=articles_k)

  if method == 'embeddings':
    return multiple_user_recommendations_pure_content_embeddings(user_ids_timestamps, articles_k=articles_k)

  if method == 'tfidf':
    return multiple_user_recommendations_pure_content_tfidf(user_ids_timestamps, articles_k=articles_k)

  # Unrecognised method names yield no recommendations
  return {}

# Model 3: Collaborative & Content Based Recommender

## Define Functions for User to User Collaborative Filtering

In [None]:
def fetch_similar_users(user_id, timestamp, k=5):
  """Find the interactions of other users that are most similar to a user's reading history.

  The user is represented by the mean 'Average Vector' of the articles in their history;
  every other interaction is represented by its own 'Average Vector' in behaviors_df.

  Args:
    user_id: ID of the user to find neighbours for.
    timestamp: timestamp of the interaction whose 'History' is used.
    k: maximum number of similar interactions to return.

  Returns:
    List of (user ID, timestamp) tuples, most similar first. The same user can occur more
    than once when several of their interactions rank in the top `k`. Empty when none of
    the history articles has a vector or when no other complete interaction exists.

  Raises:
    IndexError: when behaviors_df has no row for the given user ID and timestamp.
  """
  # Get IDs in user's history
  is_requested_interaction = (behaviors_df['User ID'] == user_id) & (behaviors_df['Timestamp'] == timestamp)
  previously_read_article_ids = behaviors_df.loc[is_requested_interaction, 'History'].str.split().iloc[0]

  # Average the vectors of the articles in the user's history
  # (assumes every 'Average Vector' cell is a numeric sequence of one common length - TODO confirm)
  history_vectors = news_df.loc[news_df['News ID'].isin(previously_read_article_ids), 'Average Vector'].dropna()
  if history_vectors.empty:
    return []
  average_user_vector = np.mean(np.vstack(history_vectors.to_list()), axis=0)

  # Other users' interactions: drop rows with any missing value, then repeated
  # (user, history & impressions) combinations. Filtering first avoids copying the whole dataframe.
  candidates = (
      behaviors_df.loc[behaviors_df['User ID'] != user_id]
      .dropna()
      .drop_duplicates(subset=['User ID', 'History & Impressions'])
  )
  if candidates.empty:
    return []

  # One vectorised similarity computation instead of one cosine_similarity call per row
  candidate_matrix = np.vstack(candidates['Average Vector'].to_list())
  similarity_scores = cosine_similarity([average_user_vector], candidate_matrix)[0]

  # Keep the k most similar interactions
  most_similar = (
      candidates.assign(Similarity=similarity_scores)
      .sort_values(by='Similarity', ascending=False)
      .head(k)
  )

  return list(zip(most_similar['User ID'], most_similar['Timestamp']))

In [None]:
def recommend_articles_collaborative(user_id, timestamp, similar_users_timestamps):
  """Collect the articles seen by similar users that the given user has not read yet.

  Args:
    user_id: ID of the user to recommend for.
    timestamp: timestamp of the interaction whose 'History' lists the already-read articles.
    similar_users_timestamps: iterable of (user ID, timestamp) pairs identifying the
      interactions of similar users (as returned by fetch_similar_users).

  Returns:
    Sorted list of distinct article IDs taken from the 'History & Impressions' of the
    similar interactions, excluding the articles in the user's own history.

  Raises:
    IndexError: when behaviors_df has no row for the given user ID and timestamp.
  """
  similar_pairs = {tuple(pair) for pair in similar_users_timestamps}
  similar_user_ids = list({pair[0] for pair in similar_pairs})

  # Narrow down to the similar users with a vectorised filter first, so the exact
  # (user ID, timestamp) comparison only runs on a handful of rows.
  candidate_rows = behaviors_df.loc[
      behaviors_df['User ID'].isin(similar_user_ids),
      ['User ID', 'Timestamp', 'History & Impressions']
  ]

  # Gather every article ID of the matching interactions
  recommended_article_ids = set()
  for row_user, row_timestamp, articles in zip(candidate_rows['User ID'],
                                               candidate_rows['Timestamp'],
                                               candidate_rows['History & Impressions']):
    # A missing 'History & Impressions' value is NaN rather than a string
    if (row_user, row_timestamp) in similar_pairs and isinstance(articles, str):
      recommended_article_ids.update(articles.split())

  # Get IDs in user's history
  is_requested_interaction = (behaviors_df['User ID'] == user_id) & (behaviors_df['Timestamp'] == timestamp)
  previously_read_article_ids = behaviors_df.loc[is_requested_interaction, 'History'].str.split().iloc[0]

  # Remove already read articles; sorting makes the result identical from run to run
  return sorted(recommended_article_ids.difference(previously_read_article_ids))

## Define Functions for Content Based Filtering - Post Collaborative Filtering

### Word2Vec

In [None]:
def create_previously_read_content(user_id, timestamp):
  '''Inputs:
  user_id: ID of the user whose reading history is looked up in behaviors_df
  timestamp: timestamp of that user's interaction row in behaviors_df

  Outputs:
  previously_read_content: list of words in all of the articles that were previously read by a given user

  Reads the global behaviors_df ('User ID', 'Timestamp', 'History') and news_df ('News ID', 'Content').
  '''
  # Locate the interaction row and split its space-separated 'History' into article IDs
  is_target_interaction = (behaviors_df['User ID'] == user_id) & (behaviors_df['Timestamp'] == timestamp)
  history_per_row = behaviors_df.loc[is_target_interaction, 'History'].str.split()
  previously_read_article_ids = history_per_row.tolist()[0]

  # Content of every article in that history (kept in news_df row order)
  was_read = news_df['News ID'].isin(previously_read_article_ids)
  read_content = news_df.loc[was_read, 'Content']

  # Concatenate all article texts and split the result into individual words
  previously_read_content = ' '.join(read_content).split()

  return previously_read_content

In [None]:
def create_recommended_content(recommended_article_ids):
    '''Inputs:
    recommended_article_ids: list of article IDs recommended by the recommend_articles_collaborative function

    Outputs:
    recommended_content: dictionary with recommended article_ids as keys and list of words in article content as values

    Reads the global news_df ('News ID', 'Content'); IDs that are absent from news_df get no entry.
    '''
    # Keep only the rows of news_df that belong to recommended articles
    is_recommended = news_df['News ID'].isin(recommended_article_ids)
    candidates = news_df.loc[is_recommended, ['News ID', 'Content']]

    # Map each article ID to the whitespace-separated words of its content
    recommended_content = {
        article_id: text.split()
        for article_id, text in zip(candidates['News ID'], candidates['Content'])
    }

    return recommended_content

#### Word2vec (Google Model)

In [None]:
# function to process inputs and identify most relevant article (based on cosine similarity)
def recommend_articles_content_google(previously_read_content, recommended_content, k=5):
    '''
    Rank candidate articles by similarity between their words and the words of the user's history.

    previously_read_content: list of content words in a user's previously read articles obtained from create_previously_read_content function
    recommended_content: dictionary with article_ids as keys and list of words in article content as values obtained from create_recommended_content function
    k: maximum number of articles to return

    Returns a list of at most k (news_id, similarity_score) tuples sorted by descending similarity.
    Articles with no in-vocabulary words are left out, and the list is empty when none of the
    user's words are in the vocabulary.
    '''
    # Membership is tested directly on the vocabulary mapping; building a list of
    # all keys for every word made each lookup linear in the vocabulary size
    vocabulary = google_model.key_to_index

    # Remove words not used in model training from interests & articles.
    # The exact token that is later passed to the model is the one checked, so a
    # word whose lower-case form is known but whose own spelling is not cannot
    # slip through and fail inside n_similarity
    previously_read_content = [word for word in previously_read_content if word in vocabulary]
    if not previously_read_content:
        # n_similarity cannot compare against an empty word list
        return []

    # Create empty list to store key-value pairs
    key_value_pairs = []

    # iterate through articles dictionary
    for news_id, content in recommended_content.items():
        known_words = [word for word in content if word in vocabulary]
        if not known_words:
            continue

        # calculate cosine similarity between the list of keywords of an article and the list of user interests
        similarity_score = google_model.n_similarity(previously_read_content, known_words)

        key_value_pairs.append((news_id, similarity_score))

    # Sort the key-value pairs based on the similarity score (descending order)
    key_value_pairs.sort(key=lambda x: x[1], reverse=True)

    # Get the top k key-value pairs
    final_recommended_article_ids = key_value_pairs[:k]

    return final_recommended_article_ids

#### Word2vec (Glove Model)

In [None]:
# function to process inputs and identify most relevant article (based on cosine similarity)
def recommend_articles_content_glove(previously_read_content, recommended_content, k=5):
    """
    Rank candidate articles by cosine similarity between mean GloVe word vectors.

    previously_read_content: list of content words in a user's previously read articles obtained from create_previously_read_content function
    recommended_content: dictionary with article_ids as keys and list of words in article content as values obtained from create_recommended_content function
    k: maximum number of articles to return

    Returns a list of at most k (news_id, similarity_score) tuples sorted by descending similarity.
    Articles with no words in the GloVe model are left out, and the list is empty when none of the
    user's words are in the model.
    """
    # Remove words not present in the GloVe model from interests & articles.
    # The token that is looked up is the token that is checked, so a word whose
    # lower-case form is known but whose own spelling is not is dropped instead
    # of failing on glove_model[word]
    previously_read_content = [word for word in previously_read_content if word in glove_model]
    if not previously_read_content:
        # Without any known word there is no interest vector to compare against
        return []

    # The interest embedding does not depend on the article, so compute it once
    interest_embedding = np.mean([glove_model[word] for word in previously_read_content], axis=0)

    # Create empty list to store key-value pairs
    key_value_pairs = []

    # Iterate through articles dictionary
    for news_id, content in recommended_content.items():
        known_words = [word for word in content if word in glove_model]
        if not known_words:
            # The mean of zero vectors is NaN, which cannot be scored
            continue

        # Calculate cosine similarity between the list of keywords of an article and the list of user interests
        article_embedding = np.mean([glove_model[word] for word in known_words], axis=0)
        similarity_score = cosine_similarity([interest_embedding], [article_embedding])[0][0]

        key_value_pairs.append((news_id, similarity_score))

    # Sort the key-value pairs based on the similarity score (descending order)
    key_value_pairs.sort(key=lambda x: x[1], reverse=True)

    # Get the top k key-value pairs
    final_recommended_article_ids = key_value_pairs[:k]

    return final_recommended_article_ids

### Embeddings

In [None]:
def get_top_k_recommended_article_ids_avgvec(user_id, timestamp, recommended_article_ids, k=5):
  '''Rank candidate articles by cosine similarity to the mean vector of the user's history.

  Inputs:
  user_id: ID of the user whose history is looked up in behaviors_df
  timestamp: timestamp of that user's interaction row in behaviors_df
  recommended_article_ids: candidate article IDs from collaborative based filtering
  k: maximum number of articles to return

  Outputs:
  top_k_recommended_article_ids: list of at most k (news_id, similarity) tuples sorted by
  descending similarity; empty when the history or the candidates have no usable vectors

  Raises:
  IndexError: if behaviors_df has no row matching (user_id, timestamp)
  '''
  # Get IDs in user's history
  previously_read_article_ids = (
      list(
          behaviors_df.loc[
              ((behaviors_df['User ID'] == user_id) & (behaviors_df['Timestamp'] == timestamp)),
              'History'
          ].str.split()
      )[0]
  )

  is_read = news_df['News ID'].isin(previously_read_article_ids)
  # Rows with a missing 'Average Vector' cannot take part in a similarity computation
  has_vector = news_df['Average Vector'].notna()

  # Get average vector of user's history news IDs
  history_vectors = news_df.loc[is_read & has_vector, 'Average Vector']
  if history_vectors.empty:
    return []
  average_news_vector = np.mean(np.stack(history_vectors.to_list()), axis=0)

  # Candidates: recommended by collaborative filtering, not in user history, with a vector
  is_candidate = news_df['News ID'].isin(recommended_article_ids) & ~is_read & has_vector
  candidates_df = news_df.loc[is_candidate, ['News ID', 'Average Vector']]
  if candidates_df.empty:
    return []

  # Compute cosine similarity between average_news_vector and every candidate in one call
  candidate_vectors = np.stack(candidates_df['Average Vector'].to_list())
  similarities = cosine_similarity([average_news_vector], candidate_vectors)[0]

  # assign() builds a new frame, so nothing is written onto a filtered view of news_df;
  # then sort in descending order and keep the top k
  top_k_df = (
      candidates_df
      .assign(Similarity=similarities)
      .sort_values(by='Similarity', ascending=False)
      .head(k)
  )

  # Return as a list of tuples
  top_k_recommended_article_ids = list(zip(top_k_df['News ID'], top_k_df['Similarity']))

  return top_k_recommended_article_ids

### TFIDF

In [None]:
def create_tfidf_features(news_df):
  '''Inputs:
  news_df: news dataframe with a 'Content' column holding the text of each article

  Outputs:
  features: result of fitting a TfidfVectorizer on the 'Content' column and transforming it
  '''
  # Preprocessing and weighting options passed to the TF-IDF vectorizer
  vectorizer_options = {
      'strip_accents': None,
      'lowercase': True,
      'tokenizer': word_tokenize,
      'use_idf': True,
      'norm': 'l2',
      'smooth_idf': True,
      'stop_words': 'english',
      'max_df': 0.5,
      'sublinear_tf': True,
  }
  vectorizer = TfidfVectorizer(**vectorizer_options)

  # Learn the vocabulary from the article content and vectorize it in one step
  return vectorizer.fit_transform(news_df['Content'])

In [None]:
def recommend_articles_content_tfidf(user_id, timestamp, recommended_articles_ids, features, k=5):
  '''Rank candidate articles by TF-IDF cosine similarity to the user's reading history.

  Inputs:
  user_id: ID of the user whose history is looked up in behaviors_df
  timestamp: timestamp of that user's interaction row in behaviors_df
  recommended_articles_ids: candidate article IDs from collaborative based filtering
  features: TF-IDF matrix whose i-th row belongs to the i-th row of news_df
            (as produced by create_tfidf_features(news_df))
  k: maximum number of articles to return

  Outputs:
  final_recommended_article_ids: list of at most k (news_id, similarity) tuples sorted by
  descending similarity; empty when k <= 0, when none of the user's history is in news_df,
  or when there are no unread candidates

  Raises:
  IndexError: if behaviors_df has no row matching (user_id, timestamp)
  '''
  # Get IDs in user's history
  previously_read_article_ids = (
      list(
          behaviors_df.loc[
              ((behaviors_df['User ID'] == user_id) & (behaviors_df['Timestamp'] == timestamp)),
              'History'
          ].str.split()
      )[0]
    )

  # Row POSITIONS (not index labels) are what line up with the rows of `features`,
  # so derive them from boolean masks instead of from news_df.index
  is_read = news_df['News ID'].isin(previously_read_article_ids).to_numpy()
  is_recommended = news_df['News ID'].isin(recommended_articles_ids).to_numpy()

  # Positions of the articles the user has read
  read_positions = np.flatnonzero(is_read)

  # Positions of the recommended articles, removing those read already
  candidate_positions = np.flatnonzero(is_recommended & ~is_read)

  # k <= 0 must be handled explicitly: argsort()[-0:] would select every candidate
  if k <= 0 or len(read_positions) == 0 or len(candidate_positions) == 0:
    return []

  # Aggregate the feature vectors of the read articles
  user_profile = np.asarray(features[read_positions].sum(axis=0) / len(read_positions))

  # Calculate the similarity scores between the user profile and the candidate articles
  similarity_scores = cosine_similarity(user_profile.reshape(1, -1), features[candidate_positions]).flatten()

  # Find the indices of the top k recommendations
  top_indices = similarity_scores.argsort()[-k:][::-1]

  # Get the top recommended news articles as a list of tuples
  top_news_ids = news_df['News ID'].to_numpy()[candidate_positions[top_indices]].tolist()
  final_recommended_article_ids = list(zip(top_news_ids, similarity_scores[top_indices]))

  return final_recommended_article_ids

## Define Functions that consolidate Pipeline

### Single User

In [None]:
def single_user_recommendations_combined(user_id, timestamp, method='word2vec', similar_user_k=5, articles_k=5):
  '''Collaborative + content based recommendations for one user interaction.

  Inputs:
  user_id: ID of the user to generate recommendations for
  timestamp: timestamp of that user's interaction
  method: content based ranking to apply after collaborative filtering; one of
          'word2vec_google', 'word2vec_glove', 'embeddings', 'tfidf'.
          NOTE: the default 'word2vec' is not one of these, so a supported method must be passed
  similar_user_k: number of similar user interactions to fetch
  articles_k: number of articles to recommend

  Outputs:
  final_recommended_article_ids: result of the selected content based ranking function

  Raises:
  ValueError: if method is not one of the supported names
  '''
  # Validate before doing any work: an unrecognised method previously ran the whole
  # collaborative step and then failed on an unassigned variable
  supported_methods = ('word2vec_google', 'word2vec_glove', 'embeddings', 'tfidf')
  if method not in supported_methods:
    raise ValueError(f"Unsupported method {method!r}; expected one of {supported_methods}")

  # Get user ID & time stamp of similar user interactions
  similar_users_timestamps = fetch_similar_users(user_id, timestamp, k=similar_user_k)

  # Get article IDs read by similar user interactions
  recommended_article_ids = recommend_articles_collaborative(user_id, timestamp, similar_users_timestamps)

  # Apply word2vec, embeddings or TFIDF methodology
  if method in ('word2vec_google', 'word2vec_glove'):
    # create previously read content list
    previously_read_content = create_previously_read_content(user_id, timestamp)
    # create recommended content dictionary
    recommended_content = create_recommended_content(recommended_article_ids)
    # find final similar articles; both models receive articles_k
    rank_by_content = recommend_articles_content_google if method == 'word2vec_google' else recommend_articles_content_glove
    final_recommended_article_ids = rank_by_content(previously_read_content, recommended_content, k=articles_k)

  elif method == 'embeddings':
    # find final similar articles
    final_recommended_article_ids = get_top_k_recommended_article_ids_avgvec(user_id, timestamp, recommended_article_ids, k=articles_k)

  else:  # method == 'tfidf'
    # create features
    features = create_tfidf_features(news_df)
    # find final similar articles
    final_recommended_article_ids = recommend_articles_content_tfidf(user_id, timestamp, recommended_article_ids, features, k=articles_k)

  return final_recommended_article_ids

### Multiple Users

In [None]:
def multiple_user_recommendations_combined(user_ids_timestamps, method='tfidf', similar_user_k=5, articles_k=5):
  '''Collaborative + content based recommendations for several user interactions.

  Inputs:
  user_ids_timestamps: iterable of (user ID, timestamp) pairs to generate recommendations for
  method: content based ranking to apply after collaborative filtering; one of
          'word2vec_google', 'word2vec_glove', 'embeddings', 'tfidf'
  similar_user_k: number of similar user interactions to fetch per user
  articles_k: number of articles to recommend per user

  Outputs:
  user_recommendations_dict: dictionary with (user ID, timestamp) tuples as keys and the result
  of the selected content based ranking function as values

  Prints a running count of processed users.

  Raises:
  ValueError: if method is not one of the supported names
  '''
  # Validate before doing any work: an unrecognised method previously ran the
  # collaborative step for every user and silently returned an empty dictionary
  supported_methods = ('word2vec_google', 'word2vec_glove', 'embeddings', 'tfidf')
  if method not in supported_methods:
    raise ValueError(f"Unsupported method {method!r}; expected one of {supported_methods}")

  # Create empty dictionary to store recommendations
  user_recommendations_dict = {}

  # If method is tfidf create features once, outside the loop, since they only depend on news_df
  if method == 'tfidf':
    features = create_tfidf_features(news_df)

  # Iterate over users & timestamps, keeping track of how many iterations have run
  for counter, (user_id, timestamp) in enumerate(user_ids_timestamps, start=1):
    print(counter)

    # Get user ID & time stamp of similar user interactions
    similar_users_timestamps = fetch_similar_users(user_id, timestamp, k=similar_user_k)

    # Get article IDs read by similar user interactions
    recommended_article_ids = recommend_articles_collaborative(user_id, timestamp, similar_users_timestamps)

    # Apply word2vec, embeddings or TFIDF methodology
    if method in ('word2vec_google', 'word2vec_glove'):
      # create previously read content list
      previously_read_content = create_previously_read_content(user_id, timestamp)
      # create recommended content dictionary
      recommended_content = create_recommended_content(recommended_article_ids)
      # find final similar articles; both models receive articles_k
      rank_by_content = recommend_articles_content_google if method == 'word2vec_google' else recommend_articles_content_glove
      final_recommended_ids = rank_by_content(previously_read_content, recommended_content, k=articles_k)

    elif method == 'embeddings':
      final_recommended_ids = get_top_k_recommended_article_ids_avgvec(user_id, timestamp, recommended_article_ids, k=articles_k)

    else:  # method == 'tfidf'
      final_recommended_ids = recommend_articles_content_tfidf(user_id, timestamp, recommended_article_ids, features, k=articles_k)

    user_recommendations_dict[(user_id, timestamp)] = final_recommended_ids

  return user_recommendations_dict

# Make Predictions

In [None]:
# Select number of users to evaluate on
number_test_users = 5
# Select a subset of users of size k to test on; the result is the input
# passed to every multi-user model run in the sections below
user_ids_timestamps = select_user_ids_timestamps(minimum_history=15, minimum_impressions=8, k=number_test_users)

## Baseline Model: Random Recommender

### Single User

In [None]:
# Run model: random baseline for a single (user ID, timestamp) interaction, requesting k=10 articles
final_recommended_ids = single_user_recommendations_random("U8500", "2019-11-13 15:27:40", k=10)
# Last expression of the cell, so the result is shown as the cell output
final_recommended_ids

['N25120',
 'N15540',
 'N59374',
 'N47392',
 'N913',
 'N9244',
 'N22818',
 'N43368',
 'N12397',
 'N40421']

### Multi User

In [None]:
# Run model: random baseline for every entry of user_ids_timestamps, requesting k=10 articles each
final_recommended_ids_multiple = multiple_user_recomendations_random(user_ids_timestamps, k=10)
# Last expression of the cell, so the result is shown as the cell output
final_recommended_ids_multiple

1
2
3
4
5


{'U11241': ['N31801',
  'N16715',
  'N4607',
  'N46392',
  'N59704',
  'N619',
  'N32004',
  'N1150',
  'N13138',
  'N16233'],
 'U46273': ['N306',
  'N31801',
  'N45794',
  'N16715',
  'N46392',
  'N59704',
  'N619',
  'N1150',
  'N28088',
  'N16233'],
 'U50429': ['N31801',
  'N45794',
  'N43142',
  'N16715',
  'N51706',
  'N54827',
  'N46392',
  'N59704',
  'N41375',
  'N27448'],
 'U85170': ['N42620',
  'N871',
  'N29177',
  'N55189',
  'N18870',
  'N33276',
  'N4020',
  'N60702',
  'N35022',
  'N35671'],
 'U77421': ['N42620',
  'N31801',
  'N45794',
  'N16715',
  'N46392',
  'N54827',
  'N59704',
  'N18870',
  'N55743',
  'N41375']}

In [None]:
# # Convert to exportable format
# final_recommended_ids_multiple = {json.dumps(key): value for key, value in final_recommended_ids_multiple.items()}

# # Export results
# with open(f'/content/drive/MyDrive/Group_19/01.Dataset/Predictions/predictions_baseline_model_{number_test_users}_users.json', 'w') as json_file:
#     json.dump(final_recommended_ids_multiple, json_file)

## Model 1: Frequency & Category Recommender

### Single User

In [None]:
# Run model: frequency & category recommender for a single (user ID, timestamp) interaction, requesting k=10 articles
# NOTE(review): the list argument is presumably the set of news categories to draw from —
# confirm against single_user_recomendations_frequency
final_recommended_ids = single_user_recomendations_frequency("U8500", "2019-11-13 15:27:40", ["news", "weather", "lifestyle"], k=10)
# Last expression of the cell, so the result is shown as the cell output
final_recommended_ids

['N42620',
 'N31801',
 'N45794',
 'N16715',
 'N46392',
 'N54827',
 'N59704',
 'N18870',
 'N55743',
 'N32004']

### Multi User

In [None]:
# Run model: frequency & category recommender for every entry of user_ids_timestamps, requesting k=10 articles each
# NOTE(review): categories=None presumably means no category restriction —
# confirm against multiple_user_recomendations_frequency
final_recommended_ids_multiple = multiple_user_recomendations_frequency(user_ids_timestamps, categories=None, k=10)
# Last expression of the cell, so the result is shown as the cell output
final_recommended_ids_multiple

1
2
3
4
5


{'U11241': ['N31801',
  'N16715',
  'N4607',
  'N46392',
  'N59704',
  'N619',
  'N32004',
  'N1150',
  'N13138',
  'N16233'],
 'U46273': ['N306',
  'N31801',
  'N45794',
  'N16715',
  'N46392',
  'N59704',
  'N619',
  'N1150',
  'N28088',
  'N16233'],
 'U50429': ['N31801',
  'N45794',
  'N43142',
  'N16715',
  'N51706',
  'N54827',
  'N46392',
  'N59704',
  'N41375',
  'N27448'],
 'U85170': ['N42620',
  'N871',
  'N29177',
  'N55189',
  'N18870',
  'N33276',
  'N4020',
  'N60702',
  'N35022',
  'N35671'],
 'U77421': ['N42620',
  'N31801',
  'N45794',
  'N16715',
  'N46392',
  'N54827',
  'N59704',
  'N18870',
  'N55743',
  'N41375']}

In [None]:
# # Convert to exportable format
# final_recommended_ids_multiple = {json.dumps(key): value for key, value in final_recommended_ids_multiple.items()}

# # Export results
# with open(f'/content/drive/MyDrive/Group_19/01.Dataset/Predictions/predictions_model1_{number_test_users}_users.json', 'w') as json_file:
#     json.dump(final_recommended_ids_multiple, json_file)

## Model 2: Content Based Recommender
Methods:
- tfidf
- embeddings
- word2vec_glove

Note: `tfidf` is the least computationally expensive method and `word2vec_glove` is the most computationally expensive.

### Single User

In [None]:
# Define method to use: one of the method names listed under this model's heading
method = 'tfidf'

# Run model: pure content based recommender for a single (user ID, timestamp) interaction, requesting 10 articles
final_recommended_ids = single_user_recommendations_pure_content(user_id='U13740', timestamp='2019-11-13 15:27:40', method=method, articles_k=10)
# Last expression of the cell, so the result is shown as the cell output
final_recommended_ids

[('N59426', 0.2389679796572606),
 ('N34069', 0.2357753342049917),
 ('N61980', 0.212007250112408),
 ('N59336', 0.21097985822568008),
 ('N628', 0.20479455592138335),
 ('N19522', 0.20223114224583788),
 ('N28476', 0.19642963415927053),
 ('N54172', 0.19642963415927053),
 ('N5035', 0.1951850328229925),
 ('N9080', 0.19028199938135584)]

### Multi User

In [None]:
# Define method to use: one of the method names listed under this model's heading
method = 'tfidf'

# Run model: pure content based recommender for every entry of user_ids_timestamps, requesting 10 articles each
final_recommended_ids_multiple = multiple_user_recommendations_pure_content(user_ids_timestamps, method=method, articles_k=10)
# Last expression of the cell, so the result is shown as the cell output
final_recommended_ids_multiple

1
2
3
4
5


{('U11241', '2019-11-13 06:56:56'): [('N3736', 0.1832456463717506),
  ('N11196', 0.18276597206082024),
  ('N44506', 0.16504580014337983),
  ('N39235', 0.1599936156264622),
  ('N20930', 0.15899050062827763),
  ('N58637', 0.15798458863277243),
  ('N62353', 0.1576325960752563),
  ('N26123', 0.15709720017976284),
  ('N18741', 0.15665689684980832),
  ('N39380', 0.1559073751709623)],
 ('U46273', '2019-11-13 09:56:13'): [('N27230', 0.35617495130014276),
  ('N22066', 0.34850869259284145),
  ('N11368', 0.34809167108044603),
  ('N51998', 0.34799238971405944),
  ('N41462', 0.3449104729664711),
  ('N10597', 0.33811927004359327),
  ('N51278', 0.3265380619397497),
  ('N45385', 0.326427077471173),
  ('N49867', 0.32053660115859645),
  ('N5844', 0.31248449107864407)],
 ('U50429', '2019-11-12 19:51:39'): [('N53960', 0.3005274877075451),
  ('N48134', 0.29561359903961365),
  ('N14031', 0.29186127864644795),
  ('N5580', 0.2833622708675733),
  ('N15443', 0.28190545375493825),
  ('N13105', 0.2792421720634918

In [None]:
# # Convert to exportable format
# final_recommended_ids_multiple = {json.dumps(key): value for key, value in final_recommended_ids_multiple.items()}

# # Export results
# with open(f'/content/drive/MyDrive/Group_19/01.Dataset/Predictions/predictions_model2_{method}_{number_test_users}_users.json', 'w') as json_file:
#     json.dump(final_recommended_ids_multiple, json_file)

## Model 3: Collaborative & Content Based Recommender
Methods:
- tfidf
- embeddings
- word2vec_glove
- word2vec_google

Note: `tfidf` is the least computationally expensive method and `word2vec_google` is the most computationally expensive.



### Single User

In [None]:
# Define method to use: one of the method names listed under this model's heading
method = 'tfidf'

# Run model: collaborative + content based recommender for a single (user ID, timestamp) interaction,
# using 5 similar user interactions and requesting 10 articles
final_recommended_ids = single_user_recommendations_combined('U13740', '2019-11-13 15:27:40', method=method, similar_user_k=5, articles_k=10)
# Last expression of the cell, so the result is shown as the cell output
final_recommended_ids

[('N62646', 0.13573152549658354),
 ('N28678', 0.1302081059461651),
 ('N4378', 0.11881006371425713),
 ('N23571', 0.07628988282376056),
 ('N33998', 0.07600726928910853),
 ('N17799', 0.07299124648972674),
 ('N36150', 0.06675080281851244),
 ('N21547', 0.06073790473438189),
 ('N2203', 0.05876959033254194),
 ('N41777', 0.0586725991082965)]

### Multi User

In [None]:
# Define method to use: one of the method names listed under this model's heading
method = 'tfidf'

# Run model: collaborative + content based recommender for every entry of user_ids_timestamps,
# using 5 similar user interactions and requesting 10 articles per entry
final_recommended_ids_multiple = multiple_user_recommendations_combined(user_ids_timestamps, method=method, similar_user_k=5, articles_k=10)
# Last expression of the cell, so the result is shown as the cell output
final_recommended_ids_multiple

1
2
3
4
5


{('U11241', '2019-11-13 06:56:56'): [('N39235', 0.1599936156264622),
  ('N44021', 0.1268015529185742),
  ('N38998', 0.12598292144696782),
  ('N4607', 0.10506280986184377),
  ('N19620', 0.10187923754169145),
  ('N11323', 0.09987245918044703),
  ('N18893', 0.09938526931447786),
  ('N63302', 0.09807044375344426),
  ('N21448', 0.09716616835629371),
  ('N24591', 0.09613721468410572)],
 ('U46273', '2019-11-13 09:56:13'): [('N39960', 0.2910661724553247),
  ('N46128', 0.27630906269721384),
  ('N831', 0.2722299425189194),
  ('N59704', 0.25707938073856795),
  ('N44676', 0.2406771078474222),
  ('N30241', 0.20842701709171604),
  ('N59173', 0.19972123729520147),
  ('N61943', 0.1970088343186164),
  ('N25898', 0.18870565180244828),
  ('N46392', 0.18640181345992035)],
 ('U50429', '2019-11-12 19:51:39'): [('N33131', 0.25626322115709976),
  ('N6951', 0.24859033240718564),
  ('N12948', 0.21530844830944126),
  ('N26683', 0.20004414409516394),
  ('N51058', 0.19787473995666338),
  ('N53428', 0.1904062508245

In [None]:
# # Convert to exportable format
# final_recommended_ids_multiple = {json.dumps(key): value for key, value in final_recommended_ids_multiple.items()}

# # Export results
# with open(f'/content/drive/MyDrive/Group_19/01.Dataset/Predictions/predictions_model3_{method}_{number_test_users}_users.json', 'w') as json_file:
#     json.dump(final_recommended_ids_multiple, json_file)