## Set Up

In [None]:
import pandas as pd
import numpy as np
import datetime
from scipy.sparse import csc_matrix
from scipy.sparse import load_npz
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
import nltk
from nltk.corpus import stopwords
import heapq
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
pd.set_option('display.max_colwidth', None)
import json
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Function to select multiple user IDs & timestamps for evaluation
def select_user_ids_timestamps(k=5):
  """Return (User ID, Timestamp) pairs for the last ``k`` rows of ``behaviors_df``.

  Args:
    k: Number of rows to take from the end of ``behaviors_df``.

  Returns:
    List of (User ID, Timestamp) tuples in the same order as the rows appear
    in ``behaviors_df``. Fewer than ``k`` pairs are returned when the frame
    has fewer than ``k`` rows.
  """
  # tail(k) selects the LAST k rows of the behaviors frame
  evaluation_rows = behaviors_df.tail(k)

  # Pair the two columns directly instead of materialising a Series per row
  return list(zip(evaluation_rows['User ID'], evaluation_rows['Timestamp']))

## Import Data



In [None]:
# Mount Google Drive so the pickled datasets under /content/drive can be read
# (google.colab is only available when running on Colab).
from google.colab import drive
drive.mount('/content/drive')

# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
# only load pickle files that come from a trusted source.

# import news df
# Later cells read its 'News ID' and 'Average Vector' columns.
news_df = pd.read_pickle("/content/drive/MyDrive/Group_19/01.Dataset/Small/Clean/Train/news.pkl")

# import behaviors df
# Later cells read its 'User ID', 'Timestamp', 'History',
# 'History & Impressions' and 'Average Vector' columns.
behaviors_df = pd.read_pickle("/content/drive/MyDrive/Group_19/01.Dataset/Small/Clean/Train/behaviors.pkl")

Mounted at /content/drive


# Collaborative Model


## Define Functions for User to User Collaborative Filtering

In [None]:
def fetch_similar_users(user_id, timestamp, k=5):
  """Find the ``k`` interactions of other users most similar to one user's history.

  The target profile is the mean 'Average Vector' of the ``news_df`` articles
  listed in the 'History' of the (user_id, timestamp) row of ``behaviors_df``.
  Every remaining ``behaviors_df`` row that belongs to another user is scored by
  the cosine similarity between that profile and the row's own 'Average Vector'.

  Args:
    user_id: Value matched against ``behaviors_df['User ID']``.
    timestamp: Value matched against ``behaviors_df['Timestamp']``.
    k: Maximum number of similar interactions to return.

  Returns:
    List of (User ID, Timestamp) tuples, most similar first. The same user may
    appear more than once, because rows are only de-duplicated on
    ('User ID', 'History & Impressions').

  Raises:
    IndexError: No ``behaviors_df`` row matches (user_id, timestamp).
    ValueError: The matched row has no history article with a vector in
      ``news_df``, so no profile can be built.
  """
  # Get IDs in user's history (first matching row, as before)
  interaction_mask = (behaviors_df['User ID'] == user_id) & (behaviors_df['Timestamp'] == timestamp)
  history_values = behaviors_df.loc[interaction_mask, 'History']
  if history_values.empty:
    raise IndexError(f"No interaction found for user {user_id!r} at timestamp {timestamp!r}")
  history_text = history_values.iloc[0]
  # A missing history (NaN/None) is treated as "no articles read"
  previously_read_article_ids = history_text.split() if isinstance(history_text, str) else []

  # Get average vector of user's history news IDs
  history_vectors = news_df.loc[news_df['News ID'].isin(previously_read_article_ids), 'Average Vector']
  if history_vectors.dropna().empty:
    raise ValueError(
        f"Cannot build a profile for user {user_id!r} at {timestamp!r}: "
        "no history article has a vector in news_df"
    )
  average_user_vector = history_vectors.mean()

  # Candidate rows: other users only, complete rows only, one row per
  # (user, history & impressions) combination. Filtering before dropna()
  # avoids copying the whole behaviors frame.
  candidates_df = behaviors_df.loc[behaviors_df['User ID'] != user_id].dropna()
  candidates_df = candidates_df.drop_duplicates(subset=['User ID', 'History & Impressions'])
  if candidates_df.empty:
    return []

  # Compute cosine similarity between the user's profile vector and every
  # candidate row's vector in one call (instead of one call per row).
  # NOTE(review): assumes all 'Average Vector' entries have the same length.
  candidate_matrix = np.vstack(candidates_df['Average Vector'].tolist())
  similarities = cosine_similarity([average_user_vector], candidate_matrix)[0]

  # Sort in descending order of similarity and keep the top k rows
  top_candidates_df = (
      candidates_df
      .assign(Similarity=similarities)
      .sort_values(by='Similarity', ascending=False)
      .head(k)
  )

  # Get similar users as (User ID, Timestamp) pairs
  return list(zip(top_candidates_df['User ID'], top_candidates_df['Timestamp']))

In [None]:
def recommend_articles_collaborative(user_id, timestamp, similar_users_timestamps, k=5):
  """Recommend up to ``k`` unread article IDs taken from similar users' interactions.

  Articles are collected from the 'History & Impressions' of every
  ``behaviors_df`` row whose (User ID, Timestamp) appears in
  ``similar_users_timestamps``. Rows are visited in the order of that list, so
  articles from earlier (more similar) interactions come first. Articles already
  in the target interaction's 'History' are skipped and duplicates are removed
  while keeping first-seen order, which makes the result deterministic.

  Args:
    user_id: Value matched against ``behaviors_df['User ID']`` for the target user.
    timestamp: Value matched against ``behaviors_df['Timestamp']`` for the target user.
    similar_users_timestamps: Iterable of (User ID, Timestamp) pairs, best match first.
    k: Maximum number of article IDs to return.

  Returns:
    List of at most ``k`` article IDs.

  Raises:
    IndexError: No ``behaviors_df`` row matches (user_id, timestamp).
  """
  # Rank of each similar interaction = its position in the supplied list
  # (the first occurrence wins if a pair is repeated).
  rank_by_interaction = {}
  for position, pair in enumerate(similar_users_timestamps):
    rank_by_interaction.setdefault(tuple(pair), position)

  # Filter behaviors df for similar users & timestamps, remembering each row's rank
  ranked_article_texts = []
  for row_user_id, row_timestamp, article_text in zip(
      behaviors_df['User ID'], behaviors_df['Timestamp'], behaviors_df['History & Impressions']
  ):
    position = rank_by_interaction.get((row_user_id, row_timestamp))
    # Rows without a text value (NaN/None) have nothing to contribute
    if position is not None and isinstance(article_text, str):
      ranked_article_texts.append((position, article_text))

  # Stable sort: most similar interaction first, frame order kept within a rank
  ranked_article_texts.sort(key=lambda item: item[0])

  # Get IDs in user's history
  interaction_mask = (behaviors_df['User ID'] == user_id) & (behaviors_df['Timestamp'] == timestamp)
  history_values = behaviors_df.loc[interaction_mask, 'History']
  if history_values.empty:
    raise IndexError(f"No interaction found for user {user_id!r} at timestamp {timestamp!r}")
  history_text = history_values.iloc[0]
  # A missing history (NaN/None) means nothing has been read yet
  previously_read_article_ids = set(history_text.split()) if isinstance(history_text, str) else set()

  # Remove already read articles and duplicates; dict.fromkeys keeps first-seen
  # order, unlike set(), whose order for strings can change between runs.
  recommended_article_ids = list(dict.fromkeys(
      article_id
      for _, article_text in ranked_article_texts
      for article_id in article_text.split()
      if article_id not in previously_read_article_ids
  ))

  return recommended_article_ids[:k]

## Define Functions that Consolidate the Pipeline

### Single User

In [None]:
def single_user_recommendations_collaborative(user_id, timestamp, similar_user_k=5, articles_k=5):
  """Run the full collaborative pipeline for one (user_id, timestamp) interaction.

  First looks up the ``similar_user_k`` most similar interactions via
  ``fetch_similar_users``, then passes them to
  ``recommend_articles_collaborative`` to obtain up to ``articles_k`` article IDs.

  Returns:
    Whatever ``recommend_articles_collaborative`` returns for this interaction.
  """
  # Step 1: (User ID, Timestamp) pairs of the nearest interactions
  neighbour_interactions = fetch_similar_users(user_id, timestamp, k=similar_user_k)

  # Step 2: article IDs drawn from those interactions
  return recommend_articles_collaborative(user_id, timestamp, neighbour_interactions, k=articles_k)

### Multiple Users

In [None]:
def multiple_user_recommendations_collaborative(user_ids_timestamps, similar_user_k=5, articles_k=5):
  """Run the collaborative pipeline for several (user_id, timestamp) interactions.

  Args:
    user_ids_timestamps: Iterable of (user_id, timestamp) pairs.
    similar_user_k: Number of similar interactions to use per user.
    articles_k: Maximum number of article IDs to recommend per user.

  Returns:
    Dict mapping each user_id to the recommendations computed for that user.
    If a user_id occurs more than once, the entry for its last pair wins.
  """
  # Create empty dictionary to store recommendations
  user_recommendations_dict = {}

  # Iterate over users & timestamps
  for user_id, timestamp in user_ids_timestamps:
    # Store THIS user's result; previously the return value was discarded and a
    # notebook-level variable from the sample-user cell was stored instead.
    user_recommendations_dict[user_id] = single_user_recommendations_collaborative(
        user_id, timestamp, similar_user_k=similar_user_k, articles_k=articles_k
    )

  return user_recommendations_dict

## Test on Sample User

In [None]:
 # Run recommender system for one sample interaction: user 'U13740' at
 # '2019-11-13 15:27:40', using the 5 most similar interactions and asking
 # for up to 5 article IDs.
final_recommended_ids = single_user_recommendations_collaborative('U13740', '2019-11-13 15:27:40', similar_user_k=5, articles_k=5)

In [None]:
# View the article IDs recommended for the sample user
# (similarity is computed on the 'Average Vector' columns)
final_recommended_ids

['N27919', 'N48076', 'N47765', 'N9317', 'N38367']

## Test on Multiple Users

In [None]:
# Select a subset of users of size k to test on:
# (User ID, Timestamp) pairs taken from the last k rows of behaviors_df
user_ids_timestamps = select_user_ids_timestamps(k=5)

# Run recommender system for every selected pair; the result is a dict keyed by User ID
final_recommended_ids_multiple = multiple_user_recommendations_collaborative(user_ids_timestamps, similar_user_k=5, articles_k=5)

In [None]:
# Display the recommendations dict (keyed by User ID) for the selected users
final_recommended_ids_multiple

{'U21593': ['N27919', 'N48076', 'N47765', 'N9317', 'N38367'],
 'U10123': ['N27919', 'N48076', 'N47765', 'N9317', 'N38367'],
 'U75630': ['N27919', 'N48076', 'N47765', 'N9317', 'N38367'],
 'U44625': ['N27919', 'N48076', 'N47765', 'N9317', 'N38367'],
 'U64800': ['N27919', 'N48076', 'N47765', 'N9317', 'N38367']}

### Export recommendations for evaluation

In [None]:
# Write the per-user recommendations to a JSON file on the mounted Drive
# so they can be used for evaluation
predictions_path = '/content/drive/MyDrive/Group_19/01.Dataset/Predictions/predictions_collaborative.json'
with open(predictions_path, 'w') as predictions_file:
    json.dump(final_recommended_ids_multiple, predictions_file)