## Set Up

In [None]:
import pandas as pd
import numpy as np
import json
from scipy.sparse import load_npz
from sklearn.metrics.pairwise import cosine_similarity
import scipy.sparse as sp
from sklearn.feature_extraction.text import TfidfVectorizer
import ast
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Function to select multiple user IDs & timestamps for evaluation
def select_user_ids_timestamps(k=5):
  """Return (user ID, timestamp) pairs for the last k rows of behaviors_df.

  Args:
    k: number of trailing rows of the global behaviors_df to sample.

  Returns:
    List of (User ID, Timestamp) tuples, in the same order as the rows.
  """
  # tail(k) keeps the LAST k rows of the behaviors table
  recent_behaviors = behaviors_df.tail(k)

  # Pair the two columns directly; no need to materialise every row with iterrows
  return list(zip(recent_behaviors['User ID'], recent_behaviors['Timestamp']))

## Import Data

In [None]:
# Mount Google Drive so the dataset paths under /content/drive resolve (Colab-only dependency).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# import behaviors df (bound as a notebook-level global)
behaviors_df = pd.read_csv("/content/drive/MyDrive/Group_19/01.Dataset/Small/Clean/Train/behaviors.csv")

# import news df (bound as a notebook-level global)
# NOTE(review): unpickling can execute arbitrary code - only load this file from a trusted location.
news_df = pd.read_pickle("/content/drive/MyDrive/Group_19/01.Dataset/Small/Clean/Train/news.pkl")
# Release-date parsing is currently disabled:
#news_df['Release Date'] = pd.to_datetime(news_df['Release Date'])


Mounted at /content/drive


# Content Based Model

## Define Functions for Content Based Filtering



### Embeddings

In [None]:
def single_user_recommendations_pure_content_embeddings(user_id, timestamp, articles_k=5):
  """Recommend unread articles whose embeddings are closest to the user's history.

  The user profile is the element-wise mean of the 'Average Vector' of every
  news_df article listed in the 'History' of the behaviors_df row matching
  (user_id, timestamp). Every article outside that history is scored by cosine
  similarity against the profile.

  Args:
    user_id: value matched against behaviors_df['User ID'].
    timestamp: value matched as-is against behaviors_df['Timestamp'].
    articles_k: maximum number of article IDs to return.

  Returns:
    List of 'News ID' values, most similar first. Empty when the row has no
    history, none of the history IDs exist in news_df, or nothing is unread.

  Raises:
    IndexError: if behaviors_df has no row for (user_id, timestamp).
  """
  # Get IDs in user's history
  row_mask = (behaviors_df['User ID'] == user_id) & (behaviors_df['Timestamp'] == timestamp)
  history_rows = behaviors_df.loc[row_mask, 'History']
  if history_rows.empty:
    raise IndexError(f"No behaviors row for user {user_id!r} at timestamp {timestamp!r}")

  history_value = history_rows.iloc[0]
  if not isinstance(history_value, str):
    # Missing history (NaN): there is nothing to build a profile from
    return []
  previously_read_article_ids = history_value.split()

  # Split news_df into the articles already read and the candidates to rank
  read_mask = news_df['News ID'].isin(previously_read_article_ids)
  read_vectors = news_df.loc[read_mask, 'Average Vector']
  candidates = news_df.loc[~read_mask]
  if read_vectors.empty or candidates.empty:
    return []

  # Get average vector of user's history news IDs
  # assumes every 'Average Vector' entry has the same length - TODO confirm
  average_news_vector = np.vstack(read_vectors.tolist()).mean(axis=0)

  # TODO: the release-date filter (exclude articles published after the
  # interaction timestamp) is disabled until 'Release Date' is available.

  # Score all candidates in a single cosine_similarity call instead of one call per row
  candidate_matrix = np.vstack(candidates['Average Vector'].tolist())
  similarity = cosine_similarity(average_news_vector.reshape(1, -1), candidate_matrix)[0]

  # assign() builds a new frame, so no column is written onto a slice of news_df;
  # the stable sort keeps tied articles in their news_df order
  ranked = (
      candidates[['News ID']]
      .assign(Similarity=similarity)
      .sort_values(by='Similarity', ascending=False, kind='stable')
  )

  # select top k articles
  return ranked.head(articles_k)['News ID'].tolist()


In [None]:
def multiple_user_recommendations_pure_content_embeddings(user_ids_timestamps, articles_k=5):
  '''Run the embeddings recommender for several (user, timestamp) pairs.

  Inputs:
  user_ids_timestamps: iterable of (user_id, timestamp) tuples
  articles_k: number of articles requested per user

  Returns a dict mapping each user_id to its list of recommended article IDs.
  If a user_id appears more than once, the last pair processed wins.
  '''
  return {
      uid: single_user_recommendations_pure_content_embeddings(uid, ts, articles_k=articles_k)
      for uid, ts in user_ids_timestamps
  }

### TFIDF

In [None]:
def create_tfidf_features(news_df):
  """Fit a TF-IDF vectorizer on news_df['Content'] and return the transformed matrix.

  The vectorizer lowercases text, tokenizes with word_tokenize, drops English
  stop words and terms present in more than half of the documents, applies
  sublinear term-frequency scaling with smoothed IDF, and L2-normalises rows.
  Only the feature matrix is returned; the fitted vectorizer is discarded.
  """
  vectorizer_options = dict(
      strip_accents=None,
      lowercase=True,
      tokenizer=word_tokenize,
      use_idf=True,
      norm='l2',
      smooth_idf=True,
      stop_words='english',
      max_df=0.5,
      sublinear_tf=True,
  )

  # One row per article, in the same order as news_df
  return TfidfVectorizer(**vectorizer_options).fit_transform(news_df['Content'])

In [None]:
def recommendations_pure_content_tfidf(user_id, timestamp, features, articles_k=5):
  """Recommend unread articles whose TF-IDF vectors are closest to the user's history.

  The user profile is the mean of the `features` rows of the articles listed in
  the 'History' of the behaviors_df row matching (user_id, timestamp). All other
  rows are scored by cosine similarity against that profile.

  Args:
    user_id: value matched against behaviors_df['User ID'].
    timestamp: value matched as-is against behaviors_df['Timestamp'].
    features: matrix with one row per news_df row, in the same order
      (row i of `features` must describe row i of news_df).
    articles_k: maximum number of article IDs to return.

  Returns:
    List of 'News ID' values, most similar first (ties keep news_df order).
    Empty when articles_k <= 0, the row has no history, none of the history
    IDs exist in news_df, or nothing is unread.

  Raises:
    IndexError: if behaviors_df has no row for (user_id, timestamp).
  """
  # Get IDs in user's history
  row_mask = (behaviors_df['User ID'] == user_id) & (behaviors_df['Timestamp'] == timestamp)
  history_rows = behaviors_df.loc[row_mask, 'History']
  if history_rows.empty:
    raise IndexError(f"No behaviors row for user {user_id!r} at timestamp {timestamp!r}")

  history_value = history_rows.iloc[0]
  if articles_k <= 0 or not isinstance(history_value, str):
    # Nothing requested, or missing history (NaN): no profile can be built
    return []
  previously_read_article_ids = history_value.split()

  # Work with row POSITIONS (not index labels) so the lookup into `features`
  # stays correct even when news_df does not carry a default RangeIndex
  read_mask = news_df['News ID'].isin(previously_read_article_ids).to_numpy()
  previously_read_positions = np.flatnonzero(read_mask)
  not_previously_read_positions = np.flatnonzero(~read_mask)
  if previously_read_positions.size == 0 or not_previously_read_positions.size == 0:
    return []

  # TODO: the release-date filter (exclude articles published after the
  # interaction timestamp) is disabled until 'Release Date' is available.

  # Aggregate the feature vectors of the read articles
  user_profile = np.asarray(features[previously_read_positions].mean(axis=0)).reshape(1, -1)

  # Calculate the similarity scores between the user profile and other articles
  similarity_scores = cosine_similarity(user_profile, features[not_previously_read_positions]).flatten()

  # Indices of the top articles_k scores, best first
  top_order = np.argsort(-similarity_scores, kind='stable')[:articles_k]

  # Map back to article IDs by column name rather than assuming 'News ID' is column 0
  return news_df['News ID'].iloc[not_previously_read_positions[top_order]].tolist()

In [None]:
def single_user_recommendations_pure_content_tfidf(user_id, timestamp, articles_k=5):
  """Build TF-IDF features from the global news_df and recommend for one user.

  The feature matrix is rebuilt on every call and then handed, together with
  the (user_id, timestamp) pair, to recommendations_pure_content_tfidf.
  Returns the list of recommended article IDs produced by that function.
  """
  tfidf_matrix = create_tfidf_features(news_df)
  return recommendations_pure_content_tfidf(user_id, timestamp, tfidf_matrix, articles_k=articles_k)

In [None]:
def multiple_user_recommendations_pure_content_tfidf(user_ids_timestamps, articles_k=5):
  """Run the TF-IDF recommender for several (user, timestamp) pairs.

  The TF-IDF matrix is built once from the global news_df and shared by every
  user. Returns a dict mapping each user_id to its list of recommended article
  IDs; if a user_id appears more than once, the last pair processed wins.
  """
  # Built once up front, before any user is processed
  tfidf_matrix = create_tfidf_features(news_df)

  return {
      uid: recommendations_pure_content_tfidf(uid, ts, tfidf_matrix, articles_k=articles_k)
      for uid, ts in user_ids_timestamps
  }



## Define Functions that consolidate Pipeline

### Single User

In [None]:
def single_user_recommendations_pure_content(user_id, timestamp, method='embeddings', articles_k=10):
  """Recommend articles for one (user, timestamp) pair with the chosen content method.

  Args:
    user_id: user to recommend for.
    timestamp: timestamp identifying the user's behaviors row.
    method: 'embeddings' or 'tfidf'.
    articles_k: number of articles requested.

  Returns:
    List of recommended article IDs from the selected recommender.

  Raises:
    ValueError: if method is neither 'embeddings' nor 'tfidf'.
  """
  if method == 'embeddings':
    return single_user_recommendations_pure_content_embeddings(user_id, timestamp, articles_k=articles_k)
  if method == 'tfidf':
    return single_user_recommendations_pure_content_tfidf(user_id, timestamp, articles_k=articles_k)

  # Fail with a clear message instead of an UnboundLocalError on the return value
  raise ValueError(f"Unknown method {method!r}; expected 'embeddings' or 'tfidf'")

### Multiple Users

In [None]:
def multiple_user_recommendations_pure_content(user_ids_timestamps, method='embeddings', articles_k=10):
  """Recommend articles for several (user, timestamp) pairs with the chosen content method.

  Args:
    user_ids_timestamps: iterable of (user_id, timestamp) tuples.
    method: 'embeddings' or 'tfidf'.
    articles_k: number of articles requested per user.

  Returns:
    Dict mapping each user_id to its list of recommended article IDs.

  Raises:
    ValueError: if method is neither 'embeddings' nor 'tfidf'.
  """
  if method == 'embeddings':
    return multiple_user_recommendations_pure_content_embeddings(user_ids_timestamps, articles_k=articles_k)
  if method == 'tfidf':
    # Forward the caller's articles_k rather than a fixed value
    return multiple_user_recommendations_pure_content_tfidf(user_ids_timestamps, articles_k=articles_k)

  # Fail with a clear message instead of an UnboundLocalError on the return value
  raise ValueError(f"Unknown method {method!r}; expected 'embeddings' or 'tfidf'")

# Content Based Predictions

## Test on Sample User

In [None]:
# Smoke test: recommend 5 articles for one (user, timestamp) pair using the TF-IDF method.
final_recommended_ids = single_user_recommendations_pure_content(user_id='U13740', timestamp='2019-11-13 15:27:40', method='tfidf', articles_k=5)

In [None]:
# Display the recommended article IDs for the sample user
final_recommended_ids

['N59426', 'N34069', 'N61980', 'N59336', 'N628']

## Test on Multiple Users

In [None]:
# Select the (user ID, timestamp) pairs from the last k rows of behaviors_df to test on
user_ids_timestamps = select_user_ids_timestamps(k=5)

# Make recommendations for multiple users with the TF-IDF method, requesting 10 articles each
final_recommended_ids_multiple = multiple_user_recommendations_pure_content(user_ids_timestamps, method='tfidf', articles_k=10)

In [None]:
# View recommendations for multiple users (dict keyed by user ID)
final_recommended_ids_multiple

{'U21593': ['N62353', 'N32663', 'N46293', 'N55619', 'N2224'],
 'U10123': ['N13700', 'N42846', 'N16148', 'N30022', 'N39112'],
 'U75630': ['N21336', 'N15209', 'N47120', 'N63515', 'N47615'],
 'U44625': ['N1410', 'N33038', 'N6163', 'N35827', 'N61178'],
 'U64800': ['N45575', 'N34904', 'N42252', 'N36327', 'N48603']}

In [None]:
# Export file: write the {user ID: [article IDs]} dict to Drive as JSON.
# Opening with 'w' overwrites any existing prediction_content.json.
with open('/content/drive/MyDrive/Group_19/01.Dataset/prediction_content.json', 'w') as json_file:
    json.dump(final_recommended_ids_multiple, json_file)