To do:
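- Remove the blanket `dropna()` applied to `user_similarity_df` in `fetch_similar_users()` (it currently discards every behaviour row that has a missing value in any column)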

## Set Up

In [14]:
import pandas as pd
import numpy as np
import datetime
from scipy.sparse import csc_matrix
from scipy.sparse import load_npz
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
import nltk
from nltk.corpus import stopwords
import heapq
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
pd.set_option('display.max_colwidth', None)
import json
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [39]:
def select_user_ids_timestamps(k=5):
  """Pick the last ``k`` behaviour rows as (user ID, timestamp) pairs for evaluation.

  Reads the notebook-level ``behaviors_df``.

  Args:
    k: number of trailing rows of ``behaviors_df`` to use.

  Returns:
    List of ``(User ID, Timestamp)`` tuples, one per selected row, in row order.
  """
  # The evaluation sample is simply the final k rows of the behaviours table
  evaluation_rows = behaviors_df.tail(k)

  # Emit one plain tuple per row from the two identifying columns
  id_and_time = evaluation_rows[['User ID', 'Timestamp']]
  return list(id_and_time.itertuples(index=False, name=None))

## Import Data



In [51]:
# Mount Google Drive so the dataset files under /content/drive can be read.
# google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# # import unique behaviours df
# unique_user_behaviors = pd.read_csv("/content/drive/MyDrive/Group_19/01.Dataset/Small/Clean/Train/unique_user_behaviors_train.csv")

# # Keep all columns in a separate df
# unique_user_behaviors_with_time = unique_user_behaviors.copy()

# import news df
# The functions below read its 'News ID', 'Content' and 'Average Vector' columns.
# NOTE(review): unpickling can execute arbitrary code - only load pickles from a trusted source.
news_df = pd.read_pickle("/content/drive/MyDrive/Group_19/01.Dataset/Small/Clean/Train/news.pkl")

# # import user article sparse matrix
# sparse_user_matrix = load_npz("/content/drive/MyDrive/Group_19/01.Dataset/Small/Clean/Train/unique_user_sparse.npz")

# Load pre-trained Google News Word2Vec model
# (binary word2vec-format file; used below through key_to_index and n_similarity)
model_path = "/content/drive/MyDrive/Group_19/01.Dataset/GoogleNews-vectors-negative300.bin"
google_model = KeyedVectors.load_word2vec_format(model_path, binary=True)

###########
# import behaviors df
# The functions below read its 'User ID', 'Timestamp', 'History',
# 'History & Impressions' and 'Average Vector' columns.
behaviors_df = pd.read_pickle("/content/drive/MyDrive/Group_19/01.Dataset/Small/Clean/Train/behaviors.pkl")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Collaborative + Content Model

## Define Functions for User to User Collaborative Filtering

In [16]:
def fetch_similar_users(user_id, timestamp, k=5):
  """Return the k behaviour rows of other users closest to one user interaction.

  The target user is represented by the mean 'Average Vector' (from ``news_df``)
  of the articles listed in the 'History' of their (user_id, timestamp) row in
  ``behaviors_df``. Every remaining behaviour row is scored by the cosine
  similarity between its own 'Average Vector' and that profile.

  Reads the notebook-level ``behaviors_df`` and ``news_df``.

  Args:
    user_id: value matched against ``behaviors_df['User ID']``.
    timestamp: value matched against ``behaviors_df['Timestamp']``.
    k: maximum number of similar interactions to return.

  Returns:
    List of up to k ``(User ID, Timestamp)`` tuples, most similar first.
    Empty when no candidate rows remain after filtering.

  Raises:
    IndexError: no row of ``behaviors_df`` matches (user_id, timestamp).
    ValueError: none of the history articles has an 'Average Vector' in ``news_df``.
  """
  # Get IDs in user's history
  history_rows = behaviors_df.loc[
      (behaviors_df['User ID'] == user_id) & (behaviors_df['Timestamp'] == timestamp),
      'History'
  ]
  if history_rows.empty:
    raise IndexError(f"No behaviors_df row for user {user_id!r} at timestamp {timestamp!r}")
  previously_read_article_ids = history_rows.str.split().iloc[0]

  # Candidate rows: complete rows only, other users only, one row per
  # (user, history & impressions). Each step returns a new frame, so the
  # global behaviors_df is never modified and no explicit copy is needed.
  # NOTE(review): the blanket dropna() is kept on purpose - the similarity step
  # below needs a non-null 'Average Vector' on every candidate row.
  candidates_df = (
      behaviors_df
      .dropna()
      .loc[lambda df: df['User ID'] != user_id]
      .drop_duplicates(subset=['User ID', 'History & Impressions'])
  )
  if candidates_df.empty:
    return []

  # Get average vector of user's history news IDs
  history_vectors = news_df.loc[
      news_df['News ID'].isin(previously_read_article_ids), 'Average Vector'
  ].dropna()
  if history_vectors.empty:
    raise ValueError(
        f"Cannot build a profile for user {user_id!r}: no history article has an 'Average Vector'"
    )
  average_user_vector = history_vectors.mean()

  # Compute cosine similarity between the user's profile and every candidate
  # row in one call instead of one sklearn call per row
  candidate_matrix = np.vstack(candidates_df['Average Vector'].tolist())
  similarities = cosine_similarity(
      np.asarray(average_user_vector).reshape(1, -1), candidate_matrix
  )[0]

  # Keep the k most similar interactions, best first
  top_candidates_df = (
      candidates_df
      .assign(Similarity=similarities)
      .sort_values(by='Similarity', ascending=False)
      .head(k)
  )

  return list(zip(top_candidates_df['User ID'], top_candidates_df['Timestamp']))

In [17]:
def recommend_articles_collaborative(user_id, timestamp, similar_users_timestamps):
  """Collect unread candidate articles from the interactions of similar users.

  Gathers every news ID in the 'History & Impressions' of the given similar
  interactions and removes the IDs already in the target user's 'History'.

  Reads the notebook-level ``behaviors_df``.

  Args:
    user_id: value matched against ``behaviors_df['User ID']`` for the target user.
    timestamp: value matched against ``behaviors_df['Timestamp']`` for the target user.
    similar_users_timestamps: iterable of ``(User ID, Timestamp)`` pairs
      identifying the similar interactions.

  Returns:
    List of unique news IDs in first-seen order (deterministic across runs).

  Raises:
    IndexError: no row of ``behaviors_df`` matches (user_id, timestamp).
  """
  # Filter behaviors df for similar users & timestamps; a set lookup over the
  # two zipped columns avoids building a tuple per row with apply(axis=1)
  wanted_pairs = {tuple(pair) for pair in similar_users_timestamps}
  row_is_similar = np.array(
      [pair in wanted_pairs for pair in zip(behaviors_df['User ID'], behaviors_df['Timestamp'])],
      dtype=bool,
  )
  similar_users_df = behaviors_df[row_is_similar]

  # Every article ID the similar users have in their history or impressions
  candidate_article_ids = []
  for history_and_impressions in similar_users_df['History & Impressions']:
    candidate_article_ids.extend(history_and_impressions.split())

  # Get IDs in user's history
  history_rows = behaviors_df.loc[
      (behaviors_df['User ID'] == user_id) & (behaviors_df['Timestamp'] == timestamp),
      'History'
  ]
  if history_rows.empty:
    raise IndexError(f"No behaviors_df row for user {user_id!r} at timestamp {timestamp!r}")
  previously_read_article_ids = set(history_rows.str.split().iloc[0])

  # Remove already read articles; dict.fromkeys de-duplicates while keeping
  # first-seen order, so the result no longer depends on set iteration order
  recommended_article_ids = [
      article_id
      for article_id in dict.fromkeys(candidate_article_ids)
      if article_id not in previously_read_article_ids
  ]

  return recommended_article_ids

## Define Functions for Content Based Filtering - Post Collaborative Filtering

### Word2Vec

In [46]:
def create_previously_read_content(user_id, timestamp):
  '''Collect the content words of every article in one user interaction's history.

  Inputs:
  user_id: value matched against the 'User ID' column of behaviors_df
  timestamp: value matched against the 'Timestamp' column of behaviors_df
  (behaviors_df and news_df are read from the notebook's global scope)

  Outputs:
  previously_read_content: list of words in all of the articles that were previously read by a given user
  '''
  # Locate the interaction and split its space-separated 'History' into news IDs
  is_requested_row = behaviors_df['User ID'].eq(user_id) & behaviors_df['Timestamp'].eq(timestamp)
  previously_read_article_ids = behaviors_df.loc[is_requested_row, 'History'].str.split().tolist()[0]

  # Content strings of the read articles, in news_df row order
  is_read_article = news_df['News ID'].isin(previously_read_article_ids)
  read_contents = news_df.loc[is_read_article, 'Content']

  # Glue the contents together and break the result on whitespace
  return ' '.join(read_contents).split()

In [47]:
def create_recommended_content(recommended_article_ids):
    '''Map each recommended article found in news_df to the words of its content.

    Inputs:
    recommended_article_ids: list of article IDs recommended by the recommend_articles_collaborative function
    (news_df is read from the notebook's global scope)

    Outputs:
    recommended_content: dictionary with recommended article_ids as keys and list of words in article content as values
    '''
    # Rows of news_df that belong to the recommended articles
    is_recommended = news_df['News ID'].isin(recommended_article_ids)
    recommended_rows = news_df.loc[is_recommended]

    # One entry per row: news ID -> whitespace-separated content words
    return {
        news_id: content.split()
        for news_id, content in zip(recommended_rows['News ID'], recommended_rows['Content'])
    }

In [48]:
# function to process inputs and identify most relevant article (based on cosine similarity)
def recommend_articles_content_w2v(previously_read_content, recommended_content, k=5):
    '''Rank candidate articles by Word2Vec similarity to the user's read content.

    Inputs:
    previously_read_content: list of content words in a user's previously read articles obtained from create_previously_read_content function
    recommended_content: dictionary with article_ids as keys and list of words in article content as values obtained from create_recommended_content function
    k: maximum number of article IDs to return
    (google_model is read from the notebook's global scope)

    Outputs:
    final_recommended_article_ids: up to k article IDs, highest similarity first.
    Articles without a single in-vocabulary word are skipped; the result is
    empty when no article can be scored.

    Raises:
    ValueError: there are articles to score but none of the user's words is in the model vocabulary
    '''
    # Dict membership is O(1); never materialise the vocabulary as a list
    vocabulary = google_model.key_to_index

    # Remove words not used in model training from interests & articles.
    # The exact token handed to the model is the one that is checked, so an
    # out-of-vocabulary word can never reach n_similarity.
    user_words = [word for word in previously_read_content if word in vocabulary]

    scorable_articles = {}
    for news_id, content in recommended_content.items():
        known_words = [word for word in content if word in vocabulary]
        # n_similarity cannot score an empty word list, so skip such articles
        if known_words:
            scorable_articles[news_id] = known_words

    if not scorable_articles:
        return []
    if not user_words:
        raise ValueError("None of the previously read words is in the Word2Vec vocabulary")

    # cosine similarity between the mean vectors of the two word lists
    similarity_scores = {
        news_id: google_model.n_similarity(user_words, article_words)
        for news_id, article_words in scorable_articles.items()
    }

    # Get the top k 'News ID' with the highest values
    final_recommended_article_ids = heapq.nlargest(k, similarity_scores, key=similarity_scores.get)

    return final_recommended_article_ids

### Embeddings

In [18]:
def get_top_k_recommended_article_ids_avgvec(user_id, timestamp, recommended_article_ids, k=5):
  """Rank candidate articles by similarity of their 'Average Vector' to the user's history.

  The user profile is the mean 'Average Vector' of the articles in the
  'History' of the (user_id, timestamp) row of ``behaviors_df``. Candidates are
  the rows of ``news_df`` whose 'News ID' is in ``recommended_article_ids`` and
  not in that history.

  Reads the notebook-level ``behaviors_df`` and ``news_df`` (neither is modified).

  Args:
    user_id: value matched against ``behaviors_df['User ID']``.
    timestamp: value matched against ``behaviors_df['Timestamp']``.
    recommended_article_ids: candidate news IDs from collaborative filtering.
    k: maximum number of article IDs to return.

  Returns:
    List of up to k news IDs, most similar first. Empty when there is no
    unread candidate.

  Raises:
    IndexError: no row of ``behaviors_df`` matches (user_id, timestamp).
    ValueError: none of the history articles has an 'Average Vector' in ``news_df``.
  """
  # Get IDs in user's history
  history_rows = behaviors_df.loc[
      (behaviors_df['User ID'] == user_id) & (behaviors_df['Timestamp'] == timestamp),
      'History'
  ]
  if history_rows.empty:
    raise IndexError(f"No behaviors_df row for user {user_id!r} at timestamp {timestamp!r}")
  previously_read_article_ids = history_rows.str.split().iloc[0]

  # Candidates: recommended by collaborative filtering and not already read
  news_ids = news_df['News ID']
  is_read = news_ids.isin(previously_read_article_ids)
  candidates_df = news_df.loc[news_ids.isin(recommended_article_ids) & ~is_read]
  if candidates_df.empty:
    return []

  # Get average vector of user's history news IDs
  history_vectors = news_df.loc[is_read, 'Average Vector'].dropna()
  if history_vectors.empty:
    raise ValueError(
        f"Cannot build a profile for user {user_id!r}: no history article has an 'Average Vector'"
    )
  average_news_vector = history_vectors.mean()

  # Compute cosine similarity between the profile and all unread candidates in
  # a single call (one row of scores) instead of one sklearn call per article
  candidate_matrix = np.vstack(candidates_df['Average Vector'].tolist())
  similarities = cosine_similarity(
      np.asarray(average_news_vector).reshape(1, -1), candidate_matrix
  )[0]

  # assign() builds a new frame, so no column is written onto a slice of news_df
  ranked_df = candidates_df.assign(Similarity=similarities).sort_values(by='Similarity', ascending=False)

  # select top k articles
  top_k_recommended_article_ids = ranked_df.head(k)['News ID'].tolist()

  return top_k_recommended_article_ids

### TFIDF

In [19]:
def create_tfidf_features(news_df):
  """Fit a TF-IDF vectorizer on the 'Content' column and return the transformed documents.

  Args:
    news_df: DataFrame with a 'Content' column holding one text per article.

  Returns:
    The result of ``TfidfVectorizer.fit_transform`` on ``news_df['Content']``
    (one row per row of ``news_df``).
  """
  # Vectorizer settings: lower-cased NLTK tokens, English stop words removed,
  # terms in more than half of the documents ignored, sublinear tf with
  # smoothed idf, and L2-normalised rows
  vectorizer_options = dict(
      strip_accents=None,
      lowercase=True,
      tokenizer=word_tokenize,
      use_idf=True,
      norm='l2',
      smooth_idf=True,
      stop_words='english',
      max_df=0.5,
      sublinear_tf=True,
  )
  vectorizer = TfidfVectorizer(**vectorizer_options)

  # Learn the vocabulary and weight the documents in one pass
  return vectorizer.fit_transform(news_df['Content'])

In [20]:
def recommend_articles_content_tfidf(user_id, timestamp, recommended_articles_ids, features, k=5):
  """Rank candidate articles by TF-IDF cosine similarity to the user's read articles.

  The user profile is the mean TF-IDF row of the articles in the 'History' of
  the (user_id, timestamp) row of ``behaviors_df``. Candidates are the
  recommended articles that are not already in that history.

  Reads the notebook-level ``behaviors_df`` and ``news_df``.

  Args:
    user_id: value matched against ``behaviors_df['User ID']``.
    timestamp: value matched against ``behaviors_df['Timestamp']``.
    recommended_articles_ids: candidate news IDs from collaborative filtering.
    features: TF-IDF matrix whose row i describes row i (by position) of
      ``news_df``, e.g. the output of ``create_tfidf_features(news_df)``.
    k: maximum number of article IDs to return.

  Returns:
    List of up to k news IDs, most similar first. Empty when there is no
    unread candidate or when k is 0.

  Raises:
    IndexError: no row of ``behaviors_df`` matches (user_id, timestamp).
    ValueError: none of the history articles is present in ``news_df``.
  """
  # Get IDs in user's history
  history_rows = behaviors_df.loc[
      (behaviors_df['User ID'] == user_id) & (behaviors_df['Timestamp'] == timestamp),
      'History'
  ]
  if history_rows.empty:
    raise IndexError(f"No behaviors_df row for user {user_id!r} at timestamp {timestamp!r}")
  previously_read_article_ids = history_rows.str.split().iloc[0]

  # Work with row positions, not index labels: `features` is addressed by
  # position, so this stays correct when news_df does not have a 0..n-1 index
  news_ids = news_df['News ID']
  is_read = news_ids.isin(previously_read_article_ids).to_numpy()
  is_recommended = news_ids.isin(recommended_articles_ids).to_numpy()

  # Positions of the read articles and of the unread recommended articles
  read_positions = np.flatnonzero(is_read)
  candidate_positions = np.flatnonzero(is_recommended & ~is_read)

  if candidate_positions.size == 0:
    return []
  if read_positions.size == 0:
    raise ValueError(
        f"Cannot build a profile for user {user_id!r}: no history article is present in news_df"
    )

  # Aggregate the feature vectors of the read articles
  user_profile = np.asarray(features[read_positions].sum(axis=0) / read_positions.size)

  # Calculate the similarity scores between the user profile and the candidates
  similarity_scores = cosine_similarity(user_profile.reshape(1, -1), features[candidate_positions]).flatten()

  # Best k candidates, highest score first ([::-1][:k] is also correct for k == 0,
  # where [-k:] would have selected every candidate)
  top_indices = similarity_scores.argsort()[::-1][:k]

  # Look the IDs up by column name rather than assuming 'News ID' is column 0
  final_recommended_article_ids = news_ids.iloc[candidate_positions[top_indices]].tolist()

  return final_recommended_article_ids

## Define Functions that consolidate Pipeline

### Single User

In [49]:
def single_user_recommendations_combined(user_id, timestamp, method='word2vec', similar_user_k=5, articles_k=5):
  """Run the collaborative + content pipeline for one user interaction.

  Step 1 finds the most similar user interactions, step 2 collects the unread
  articles of those interactions, step 3 ranks them with the chosen content
  method.

  Args:
    user_id: user ID of the interaction to recommend for.
    timestamp: timestamp of the interaction to recommend for.
    method: content ranker - 'word2vec', 'embeddings' or 'tfidf'.
    similar_user_k: number of similar user interactions to draw candidates from.
    articles_k: number of article IDs to return.

  Returns:
    List of recommended news IDs as produced by the chosen ranker.

  Raises:
    ValueError: ``method`` is not one of the supported names.
  """
  # Reject an unknown method before doing any of the expensive work
  supported_methods = ('word2vec', 'embeddings', 'tfidf')
  if method not in supported_methods:
    raise ValueError(f"Unknown method {method!r}; expected one of {supported_methods}")

  # Get user ID & time stamp of similar user interactions
  similar_users_timestamps = fetch_similar_users(user_id, timestamp, k=similar_user_k)

  # Get article IDs read by similar user interactions
  recommended_article_ids = recommend_articles_collaborative(user_id, timestamp, similar_users_timestamps)

  # Apply Word2Vec, embeddings or TFIDF methodology
  if method == 'word2vec':
    # create previously read content list
    previously_read_content = create_previously_read_content(user_id, timestamp)
    # create recommended content dictionary
    recommended_content = create_recommended_content(recommended_article_ids)
    # find final similar articles
    return recommend_articles_content_w2v(previously_read_content, recommended_content, k=articles_k)

  if method == 'embeddings':
    # find final similar articles
    return get_top_k_recommended_article_ids_avgvec(user_id, timestamp, recommended_article_ids, k=articles_k)

  # method == 'tfidf': create features, then find final similar articles
  features = create_tfidf_features(news_df)
  return recommend_articles_content_tfidf(user_id, timestamp, recommended_article_ids, features, k=articles_k)

### Multiple Users

In [54]:
def multiple_user_recommendations_combined(user_ids_timestamps, method='tfidf', similar_user_k=5, articles_k=5):
  """Run the collaborative + content pipeline for several user interactions.

  Args:
    user_ids_timestamps: iterable of ``(user ID, timestamp)`` pairs.
    method: content ranker - 'word2vec', 'embeddings' or 'tfidf'.
    similar_user_k: number of similar user interactions to draw candidates from.
    articles_k: number of article IDs to recommend per user.

  Returns:
    Dict mapping user ID to its list of recommended news IDs. The key is the
    user ID only, so if the same user appears more than once the last
    interaction wins.

  Raises:
    ValueError: ``method`` is not one of the supported names.
  """
  # Reject an unknown method up front instead of silently returning nothing
  supported_methods = ('word2vec', 'embeddings', 'tfidf')
  if method not in supported_methods:
    raise ValueError(f"Unknown method {method!r}; expected one of {supported_methods}")

  # Create empty dictionary to store recommendations
  user_recommendations_dict = {}

  # TF-IDF features depend only on news_df, so build them once for all users
  features = create_tfidf_features(news_df) if method == 'tfidf' else None

  # Iterate over users & timestamps, counting iterations from 1
  for counter, (user_id, timestamp) in enumerate(user_ids_timestamps, start=1):
    # Get user ID & time stamp of similar user interactions
    similar_users_timestamps = fetch_similar_users(user_id, timestamp, k=similar_user_k)

    # Get article IDs read by similar user interactions
    recommended_article_ids = recommend_articles_collaborative(user_id, timestamp, similar_users_timestamps)

    # Apply Word2Vec, embeddings or TFIDF methodology
    if method == 'word2vec':
      # create previously read content list
      previously_read_content = create_previously_read_content(user_id, timestamp)
      # create recommended content dictionary
      recommended_content = create_recommended_content(recommended_article_ids)
      # find final similar articles
      final_recommended_ids = recommend_articles_content_w2v(previously_read_content, recommended_content, k=articles_k)

    elif method == 'embeddings':
      final_recommended_ids = get_top_k_recommended_article_ids_avgvec(user_id, timestamp, recommended_article_ids, k=articles_k)

    else:  # 'tfidf'
      final_recommended_ids = recommend_articles_content_tfidf(user_id, timestamp, recommended_article_ids, features, k=articles_k)
      # Progress output, emitted for the tfidf method only (as before)
      print(counter)

    # Store the result for every method (word2vec results used to be discarded)
    user_recommendations_dict[user_id] = final_recommended_ids

  return user_recommendations_dict

## Test on Sample User

In [52]:
 # Run recommender system
# Full pipeline for one hard-coded sample interaction: candidates come from the
# 5 most similar user interactions and the Word2Vec ranker keeps 5 article IDs
final_recommended_ids = single_user_recommendations_combined('U13740', '2019-11-13 15:27:40', method='word2vec', similar_user_k=5, articles_k=5)

In [53]:
# view recommendations for the sample user (the run above used method='word2vec')
final_recommended_ids

['N9674', 'N24691', 'N25635', 'N64273', 'N21547']

## Test on Multiple Users

In [None]:
# Select a subset of users of size k to test on
# (the last k rows of behaviors_df, as (User ID, Timestamp) pairs)
user_ids_timestamps = select_user_ids_timestamps(k=5)

# Run recommender system for every selected pair; the result is a dict keyed by user ID
final_recommended_ids_multiple = multiple_user_recommendations_combined(user_ids_timestamps, method='word2vec', similar_user_k=5, articles_k=5)

In [41]:
# view the {user ID: [recommended news IDs]} mapping
final_recommended_ids_multiple

{'U21593': ['N42781', 'N59704', 'N30665', 'N16655', 'N46039'],
 'U10123': ['N8448', 'N16344', 'N27612', 'N41172', 'N10843'],
 'U75630': ['N16384', 'N37304', 'N61352', 'N64305', 'N52236'],
 'U44625': ['N10928', 'N4255', 'N58860', 'N63302', 'N11523'],
 'U64800': ['N17303', 'N27951', 'N287', 'N44021', 'N35170']}

### Export recommendations for evaluation

In [None]:
# Write the {user ID: [recommended news IDs]} dict to a JSON file on the mounted
# Drive so the evaluation step can load it; the `with` block closes the file
with open('/content/drive/MyDrive/Group_19/01.Dataset/final_recommended_ids_multiple.json', 'w') as json_file:
    json.dump(final_recommended_ids_multiple, json_file)