To do:
- Remove dropping NAs from user_similarity_df in fetch_similar_users()

## Set Up

In [None]:
import pandas as pd
import numpy as np
import datetime
from scipy.sparse import csc_matrix
from scipy.sparse import load_npz
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
import nltk
from nltk.corpus import stopwords
import heapq
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
pd.set_option('display.max_colwidth', None)
import json
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Function to select multiple user IDs & timestamps for evaluation
def select_user_ids_timestamps(k=5):
    """
    Pick the last `k` (user, timestamp) pairs of `behaviors_df` for evaluation.

    Parameters:
    - k: Number of trailing rows of the module-level `behaviors_df` to use.

    Returns:
    - user_ids_timestamps: List of (User ID, Timestamp) tuples, in row order.
    """
    # tail(k) keeps the final k rows of the behaviors table
    evaluation_rows = behaviors_df.tail(k)

    # Pair every user ID with the timestamp found on the same row
    user_ids_timestamps = list(zip(evaluation_rows['User ID'], evaluation_rows['Timestamp']))

    return user_ids_timestamps

## Import Data



In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Load the cleaned news table.
# NOTE(review): this unpickles a file whose name ends in ".csv". Unpickling executes
# arbitrary code, so only load files you produced yourself, and confirm that the file
# really is a pickle (the preprocessing cell reads the same path with pd.read_csv).
news_df = pd.read_pickle("/content/news.csv")

# Load pre-trained Google News Word2Vec model
model_path = "/content/drive/MyDrive/GoogleNews-vectors-negative300.bin"
google_model = KeyedVectors.load_word2vec_format(model_path, binary=True)

# import behaviors df
behaviors_df = pd.read_csv("/content/behaviors.csv")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Data Preprocessing

## Install Necessary Libraries
!pip install wordninja --quiet
import pandas as pd
import numpy as np
import random
import string
import ast
import wordninja
import warnings
from scipy import sparse
from scipy.sparse import csc_matrix, lil_matrix, save_npz
from google.colab import drive
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
pd.set_option('display.max_colwidth', None)


# Dataset characteristics chosen for this run
size, version = 'Small', 'Train'

## Clean News Dataset

# Read the raw, header-less, tab-separated news table.
# Eight columns are named, but the URL column is left out of `usecols` and so is not loaded.
news = pd.read_csv(
    "/content/news.csv",
    sep='\t',
    header=None,
    names=[
        'News ID', 'Category', 'SubCategory', 'Title',
        'Abstract', 'URL', 'Title Entities', 'Abstract Entities',
    ],
    usecols=[
        'News ID', 'Category', 'SubCategory', 'Title',
        'Abstract', 'Title Entities', 'Abstract Entities',
    ],
)

# Transformations
def get_clean_news(news, embeddings):
    """
    Clean the raw news table in place and attach a content string and an entity vector to each article.

    Parameters:
    - news: Raw news DataFrame. The columns 'Category', 'SubCategory', 'Title', 'Abstract',
      'Title Entities' and 'Abstract Entities' are read. The frame is modified in place.
    - embeddings: Mapping from Wikidata ID to embedding vector (must support `in` and `.get`).

    Returns:
    - news: The same DataFrame object that was passed in, with 'Content', 'Content_WC' and
      'Average Vector' added and the two entity columns removed. 'Average Vector' is None when
      neither the article, its (category, subcategory) pair nor its category has a known vector.
    """
    # Remove rows with missing values, then exact duplicate rows
    news.dropna(inplace=True)
    news.drop_duplicates(inplace=True)

    # Split run-together category labels into separate words
    for column in ('Category', 'SubCategory'):
        news[column] = news[column].apply(lambda label: ' '.join(wordninja.split(label)))

    # Lowercase the text columns (non-string values are left untouched)
    text_columns = ['Category', 'SubCategory', 'Title', 'Abstract']
    for column in text_columns:
        news[column] = news[column].apply(
            lambda value: value.lower() if isinstance(value, str) else value
        )

    # Create content column
    news['Content'] = (
        news['Category'] + ' ' + news['SubCategory'] + ' ' + news['Title'] + ' ' + news['Abstract']
    )

    # Remove punctuation and stopwords from content.
    # A translation table removes every character of string.punctuation; building a regex
    # character class from the unescaped punctuation string misses the backslash.
    punctuation_table = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words('english'))
    news['Content'] = [
        ' '.join(
            word
            for word in content.translate(punctuation_table).split()
            if word.lower() not in stop_words
        )
        for content in news['Content']
    ]

    # Calculate word count
    news['Content_WC'] = news['Content'].str.split().str.len()

    def entity_ids(serialized_entities):
        # The entity columns hold Python-literal lists of dicts stored as text
        return [entity['WikidataId'] for entity in ast.literal_eval(serialized_entities)]

    def average_entity_vector(title_entities, abstract_entities):
        wikidata_ids = entity_ids(title_entities) + entity_ids(abstract_entities)
        vectors = [
            embeddings.get(wikidata_id)
            for wikidata_id in wikidata_ids
            if wikidata_id in embeddings
        ]
        return np.mean(vectors, axis=0) if vectors else None

    categories = news['Category'].tolist()
    subcategories = news['SubCategory'].tolist()
    own_vectors = [
        average_entity_vector(title_entities, abstract_entities)
        for title_entities, abstract_entities in zip(news['Title Entities'], news['Abstract Entities'])
    ]

    # Group the known vectors so that missing ones can be imputed.
    # This is done with plain dicts: the vectors are arrays, so pd.isna() on a cell returns an
    # array (unusable in an `if`), and groupby().mean() on an object column is not dependable.
    pair_groups = {}
    category_groups = {}
    for category, subcategory, vector in zip(categories, subcategories, own_vectors):
        if vector is not None:
            pair_groups.setdefault((category, subcategory), []).append(vector)
            category_groups.setdefault(category, []).append(vector)
    pair_means = {key: np.mean(group, axis=0) for key, group in pair_groups.items()}
    category_means = {key: np.mean(group, axis=0) for key, group in category_groups.items()}

    # Impute missing values: first from the (category, subcategory) mean, then from the category mean
    resolved_vectors = []
    for category, subcategory, vector in zip(categories, subcategories, own_vectors):
        if vector is None:
            vector = pair_means.get((category, subcategory))
        if vector is None:
            vector = category_means.get(category)
        resolved_vectors.append(vector)
    news['Average Vector'] = pd.Series(resolved_vectors, index=news.index, dtype=object)

    news.drop(columns=['Title Entities', 'Abstract Entities'], inplace=True)
    return news

# NOTE(review): `embeddings` is not defined in the visible cells of this notebook; it must be
# a mapping from Wikidata ID to vector that is loaded before this line runs — TODO confirm.
news_df = get_clean_news(news, embeddings)


## Clean Behaviors Dataset

# Read the raw, header-less, tab-separated behaviors log.
# Five columns are named, but 'Impression ID' is left out of `usecols` and so is not loaded.
behaviors = pd.read_csv(
    "/content/behaviors.tsv",
    sep='\t',
    header=None,
    names=['Impression ID', 'User ID', 'Timestamp', 'History', 'Impressions'],
    usecols=['User ID', 'Timestamp', 'History', 'Impressions'],
)

# Transformations
def get_clean_behaviors(behaviors, news):
    """
    Clean the raw behaviors log in place and attach an average article vector to each row.

    Parameters:
    - behaviors: Raw behaviors DataFrame with the columns 'User ID', 'Timestamp', 'History'
      and 'Impressions'. The frame is modified in place.
    - news: Cleaned news DataFrame with the columns 'News ID' and 'Average Vector'.

    Returns:
    - behaviors: The same DataFrame object that was passed in, with parsed timestamps,
      de-duplicated 'History', clicked-only 'Impressions', a combined 'History & Impressions'
      column and an 'Average Vector' column (None when no article of the row has a vector).
    """
    behaviors['Timestamp'] = pd.to_datetime(behaviors['Timestamp'])

    # Keep only clicked impressions: drop every "<id>-0" token, then strip the "-1" suffix
    behaviors['Impressions'] = (
        behaviors['Impressions']
        .fillna('')
        .str.replace(r'\w+-0', '', regex=True)
        .str.strip()
        .str.replace('-1', '', regex=False)
    )

    # De-duplicate the history while keeping first-seen order. A set() would give a
    # hash-dependent order, which makes the drop_duplicates() below unreliable.
    # Missing histories become an empty string instead of the literal text "nan".
    behaviors['History'] = behaviors['History'].fillna('').apply(
        lambda history: ' '.join(dict.fromkeys(history.split()))
    )
    behaviors.drop_duplicates(inplace=True)

    # Discard article IDs that are not present in the news table
    valid_news_ids = set(news['News ID'])

    def keep_known_articles(article_ids):
        return ' '.join(news_id for news_id in article_ids.split() if news_id in valid_news_ids)

    behaviors['History'] = behaviors['History'].apply(keep_known_articles)
    behaviors['Impressions'] = behaviors['Impressions'].apply(keep_known_articles)
    behaviors['History & Impressions'] = behaviors['History'] + ' ' + behaviors['Impressions']

    # Build the article -> vector lookup by hand: the vectors are arrays stored in an object
    # column. Scalar placeholders (None / NaN) mark articles without a vector and are skipped.
    vectors_by_article = {}
    for news_id, vector in zip(news['News ID'], news['Average Vector']):
        if np.ndim(vector) == 0:
            continue
        vectors_by_article.setdefault(news_id, []).append(vector)
    avg_vectors_dict = {
        news_id: np.mean(vectors, axis=0) for news_id, vectors in vectors_by_article.items()
    }

    def calculate_average_vector(article_ids):
        # Each distinct article counts once, even if it is in both history and impressions
        vectors = [
            avg_vectors_dict[article_id]
            for article_id in dict.fromkeys(article_ids.split())
            if article_id in avg_vectors_dict
        ]
        return np.mean(vectors, axis=0) if vectors else None

    behaviors['Average Vector'] = behaviors['History & Impressions'].apply(calculate_average_vector)
    return behaviors

# `news` is passed here rather than `news_df`: get_clean_news modifies its input in place and
# returns that same frame, so `news` already carries the 'Average Vector' column used below.
behaviors_df = get_clean_behaviors(behaviors, news)





# Collaborative + Content Model

## Define Functions for User to User Collaborative Filtering

In [None]:
def recommend_articles_collaborative(user_id, timestamp, similar_users_timestamps):
    """
    Collect candidate articles from the interactions of similar users.

    Parameters:
    - user_id: ID of the target user.
    - timestamp: Timestamp identifying the target user's row in `behaviors_df`.
    - similar_users_timestamps: Collection of (User ID, Timestamp) tuples for the similar users.

    Returns:
    - recommended_article_ids: List of distinct article IDs found in the similar users'
      'History & Impressions', excluding the target user's 'History', in first-seen order.
      If the target user's row is not found, nothing is excluded.
    """
    # Extract rows corresponding to similar users and their respective timestamps
    row_keys = behaviors_df[['User ID', 'Timestamp']].apply(tuple, axis=1)
    similar_users_df = behaviors_df[row_keys.isin(similar_users_timestamps)]

    # Retrieve the articles the target user has already interacted with.
    # A missing row is treated as an empty history instead of raising IndexError.
    history_rows = behaviors_df.loc[
        (behaviors_df['User ID'] == user_id) & (behaviors_df['Timestamp'] == timestamp),
        'History'
    ]
    previously_read_article_ids = (
        set(history_rows.iloc[0].split()) if not history_rows.empty else set()
    )

    # Collect unread articles; dict keys de-duplicate while keeping a deterministic order
    unread_candidates = {}
    for interactions in similar_users_df['History & Impressions']:
        for article_id in interactions.split():
            if article_id not in previously_read_article_ids:
                unread_candidates[article_id] = None

    recommended_article_ids = list(unread_candidates)

    return recommended_article_ids


## Define Functions for Content Based Filtering - Post Collaborative Filtering

### Word2Vec

### Explanation: `create_previously_read_content`

The `create_previously_read_content` function extracts and returns a list of words from all articles previously read by a specific user. Below is a detailed explanation of the function:

---

#### **Purpose**
This function is designed to collect content from articles a user has read. The aggregated content can later be used in similarity-based recommendation systems.

---

#### **Inputs**
- **`user_id`**: A string representing the unique identifier of the target user.
- **`timestamp`**: A string representing the specific interaction time of the user.

---

#### **Outputs**
- **`previously_read_content`**: A list of all words extracted from the content of articles the user has read.

---

#### **Steps**
1. **Retrieve Previously Read Articles**:
   - The function filters the `behaviors_df` dataframe to locate the row corresponding to the specified `user_id` and `timestamp`.
   - From the `History` column, it extracts a list of article IDs representing the user's previously read articles.

2. **Filter News Content**:
   - Using the list of previously read article IDs, the function filters the `news_df` dataframe.
   - Only the rows with matching article IDs are selected, and their `Content` is retrieved.

3. **Aggregate Words**:
   - The content of all relevant articles is concatenated into a single string.
   - The string is then split into individual words to create a list of all words from the articles.

4. **Return Content**:
   - The final list of words is returned for use in content-based recommendation systems.

---

#### **Example Usage**
```python
# Example
previously_read_words = create_previously_read_content(user_id='U13740', timestamp='2019-11-13 15:27:40')
print(previously_read_words)
```


In [None]:
def create_previously_read_content(user_id, timestamp):
    """
    Extracts all content words from articles previously read by a given user.

    Parameters:
    - user_id: ID of the target user.
    - timestamp: Specific timestamp of the user's interaction.

    Returns:
    - previously_read_content: List of words from all articles the user has previously read.
      Empty when no row of `behaviors_df` matches the given user and timestamp.
    """
    # Locate the user's interaction row
    history_rows = behaviors_df.loc[
        (behaviors_df['User ID'] == user_id) & (behaviors_df['Timestamp'] == timestamp),
        'History'
    ]

    # No matching row means there is no history to describe (previously an IndexError)
    if history_rows.empty:
        return []

    previously_read_article_ids = history_rows.iloc[0].split()

    # Keep only the content of the articles the user has interacted with
    read_contents = news_df.loc[news_df['News ID'].isin(previously_read_article_ids), 'Content']

    # Combine the content of all previously read articles and split into individual words
    previously_read_content = ' '.join(read_contents).split()

    return previously_read_content


# Explanation of the `create_recommended_content` Function

## **Purpose**
The function generates a dictionary where each key is a `News ID` from the recommended articles, and the value is a list of words in the content of the respective article.

---

## **Inputs**
1. **`recommended_article_ids`**:  
   A list of `News ID`s recommended by the collaborative filtering algorithm.

2. **`news_df`** (read from the notebook's global scope, not passed as a parameter):  
   The cleaned news dataset, which contains columns like `News ID` and `Content`.

---

## **Output**
- **`recommended_content`**:  
  A dictionary with:
  - **Keys**: `News ID`s of the recommended articles.
  - **Values**: Lists of words present in the article content.

---

## **Steps in the Function**
1. **Filter the DataFrame**:
   - Use `query()` to filter `news_df` based on the `recommended_article_ids`.
   - Select only the columns `News ID` and `Content` for further processing.

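   ```python
   filtered_df = news_df.query("`News ID` in @recommended_article_ids")[['News ID', 'Content']]
   ```

2. **Build the Dictionary**:
   - Map each `News ID` to the list of words obtained by splitting its `Content` on whitespace.

3. **Return the Result**:
   - Return the `recommended_content` dictionary.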


In [None]:
def create_recommended_content(recommended_article_ids):
    '''
    Build a lookup from recommended article ID to the words of that article.

    Inputs:
    recommended_article_ids: List of article IDs recommended by the recommend_articles_collaborative function

    The cleaned news DataFrame `news_df` is read from the notebook's global scope.

    Outputs:
    recommended_content: Dictionary with recommended article IDs as keys and list of words in article content as values
    '''
    # Rows of the news table whose ID is among the recommendations
    is_recommended = news_df['News ID'].isin(recommended_article_ids)
    matching_articles = news_df.loc[is_recommended, ['News ID', 'Content']]

    # Split each article's content on whitespace and store it under its ID
    recommended_content = {}
    for news_id, content in zip(matching_articles['News ID'], matching_articles['Content']):
        recommended_content[news_id] = content.split()

    return recommended_content



## **Overview**
The `recommend_articles_content_w2v` function recommends articles to a user based on the similarity of their content to articles the user has previously read. It uses Word2Vec embeddings to calculate cosine similarity between the word vectors of the previously read content and the content of recommended articles.

---

## **Inputs**
1. **`previously_read_content`**:  
   - A list of words representing the content of articles the user has already read.

2. **`recommended_content`**:  
   - A dictionary where:
     - **Keys**: Article IDs of recommended articles.
     - **Values**: Lists of words representing the content of those articles.

3. **`k`** (optional):  
   - Number of top recommendations to return.  
   - Default value: `5`.

---

## **Output**
- **`final_recommended_article_ids`**:  
   A list of the top `k` recommended article IDs based on cosine similarity.

---

## **Steps**

### **1. Filter Words Using Word2Vec Vocabulary**
   - Ensure that only words present in the Word2Vec vocabulary are used.  
   - This prevents errors when calculating similarities with words not known by the model.

---

### **2. Compute Similarity Scores**
   - Use the Word2Vec model's `n_similarity` function to calculate cosine similarity between:
     - **Previously read content**.
     - **Each recommended article’s content**.
   - Store the results in a dictionary where:
     - **Keys**: Article IDs.
     - **Values**: Similarity scores.

---

### **3. Retrieve Top `k` Articles**
   - Use `heapq.nlargest` to efficiently select the top `k` article IDs with the highest similarity scores.

---

### **4. Return Recommendations**
   - Return the `final_recommended_article_ids` list as the final output.

---



In [None]:
# Function to recommend articles based on word2vec similarity
def recommend_articles_content_w2v(previously_read_content, recommended_content, k=5):
    """
    Rank candidate articles by word2vec similarity to the user's previously read content.

    Inputs:
    previously_read_content: List of words in the content of articles the user has previously read.
    recommended_content: Dictionary where keys are article IDs and values are lists of words in article content.
    k: Maximum number of article IDs to return.

    Output:
    final_recommended_article_ids: List of up to `k` article IDs with the highest similarity scores.
    Empty when none of the user's words are in the model vocabulary. Articles without any
    in-vocabulary word are not scored.
    """
    vocabulary = google_model.key_to_index

    # Filter words present in the word2vec model vocabulary
    filtered_user_content = [word for word in previously_read_content if word in vocabulary]

    # n_similarity raises ZeroDivisionError on an empty word list, so stop early
    if not filtered_user_content:
        return []

    # Compute similarity scores, skipping articles left with no usable words
    similarity_scores = {}
    for article_id, content in recommended_content.items():
        article_words = [word for word in content if word in vocabulary]
        if article_words:
            similarity_scores[article_id] = google_model.n_similarity(
                filtered_user_content, article_words
            )

    # Retrieve the top `k` articles with the highest similarity scores
    final_recommended_article_ids = heapq.nlargest(k, similarity_scores, key=similarity_scores.get)

    return final_recommended_article_ids


### Embeddings

### Explanation of `get_top_k_recommended_article_ids_avgvec`

This function identifies the top-k recommended articles for a user by computing the cosine similarity between the average vector of the articles the user has previously read and the vectors of recommended articles. Here's a step-by-step explanation:

---

#### Inputs:
- **`user_id`**: The ID of the user for whom the recommendations are being generated.
- **`timestamp`**: The timestamp of the user's current interaction.
- **`recommended_article_ids`**: A list of article IDs obtained from collaborative filtering recommendations.
- **`k`**: The number of top recommendations to return.

---

#### Steps:
1. **Filter News DataFrame for Recommended Articles**:
   - Extracts only the rows in `news_df` that correspond to the `recommended_article_ids`.

2. **Fetch User's Previously Read Article IDs**:
   - Retrieves the history of articles read by the user based on their `user_id` and `timestamp`.

3. **Calculate Average Vector for Previously Read Articles**:
   - Combines the vectors of all previously read articles into a single "average vector" that represents the user's interests.

4. **Exclude Previously Read Articles**:
   - Filters out articles from the `filtered_news_df` that the user has already read to avoid recommending duplicates.

5. **Compute Cosine Similarity**:
   - Calculates the similarity between the user's average vector and the vectors of unread articles in `filtered_news_df`.

6. **Sort Articles by Similarity**:
   - Ranks the articles in descending order of similarity scores.

7. **Select Top-k Articles**:
   - Extracts the IDs of the top `k` articles with the highest similarity scores.

8. **Handle Edge Cases**:
   - Ensures the function can handle scenarios like empty user history or an empty list of recommendations.

---

#### Outputs:
- **`top_k_recommended_article_ids`**:
  A list containing the IDs of the top `k` recommended articles based on similarity.

---

This function ensures personalized recommendations by leveraging collaborative filtering results and refining them with content-based similarity using article vectors.


In [None]:
def get_top_k_recommended_article_ids_avgvec(user_id, timestamp, recommended_article_ids, k=5):
    """
    Rank candidate articles by cosine similarity to the mean vector of the user's read articles.

    Parameters:
    - user_id: ID of the target user.
    - timestamp: Timestamp identifying the user's row in `behaviors_df`.
    - recommended_article_ids: Candidate article IDs from collaborative filtering.
    - k: Maximum number of article IDs to return.

    Returns:
    - top_k_recommended_article_ids: Up to `k` unread candidate IDs with the highest similarity.
      When the user has no usable history vector, every candidate scores 0 and the first `k`
      candidates (in `news_df` order) are returned.
    """
    # Extract user's previously read article IDs (empty when the row is not found)
    user_history_row = behaviors_df.loc[
        (behaviors_df['User ID'] == user_id) & (behaviors_df['Timestamp'] == timestamp),
        'History'
    ]
    previously_read_article_ids = user_history_row.iloc[0].split() if not user_history_row.empty else []

    # Calculate the average vector of previously read articles.
    # Scalar cells (None / NaN) mark articles without a vector and are ignored.
    # The "no history" case is handled explicitly: Series.mean() on an empty selection
    # yields NaN rather than None, which would slip past an `is not None` check.
    read_mask = news_df['News ID'].isin(previously_read_article_ids)
    read_vectors = [
        vector for vector in news_df.loc[read_mask, 'Average Vector'] if np.ndim(vector) > 0
    ]
    average_news_vector = np.mean(read_vectors, axis=0) if read_vectors else None

    # Keep recommended articles the user has not already read
    candidate_mask = news_df['News ID'].isin(recommended_article_ids) & ~read_mask
    filtered_news_df = news_df[candidate_mask].copy()
    if filtered_news_df.empty:
        return []

    def similarity_to_user(article_vector):
        # Without a user vector or an article vector there is nothing to compare
        if average_news_vector is None or np.ndim(article_vector) == 0:
            return 0.0
        return cosine_similarity([average_news_vector], [article_vector])[0][0]

    # Calculate similarity scores
    filtered_news_df['Similarity'] = filtered_news_df['Average Vector'].apply(similarity_to_user)

    # Select top-k articles by similarity
    top_k_recommended_article_ids = filtered_news_df.nlargest(k, 'Similarity')['News ID'].tolist()

    return top_k_recommended_article_ids


### TFIDF

### Explanation of `recommend_articles_content_tfidf`

This function refines the list of recommended articles by utilizing TF-IDF-based content similarity. It calculates the similarity between the TF-IDF features of articles read by a user and the articles recommended by collaborative filtering.

---

#### Inputs:
- **`user_id`**: The ID of the user for whom recommendations are being generated.
- **`timestamp`**: The timestamp of the user's current interaction.
- **`recommended_article_ids`**: A list of article IDs recommended by collaborative filtering.
- **`features`**: A TF-IDF feature matrix, where rows correspond to articles and columns represent term importance.
- **`k`**: The number of top articles to recommend.

---

#### Steps:
1. **Retrieve User History**:
   - Fetches the list of articles previously read by the user based on their `user_id` and `timestamp`.

2. **Identify Relevant Indices**:
   - **For Read Articles**: Finds the indices of the articles in `news_df` that match the IDs in the user's history.
   - **For Recommended Articles**: Identifies indices of articles from `recommended_article_ids` in `news_df`.

3. **Filter Unread Articles**:
   - Excludes articles already read by the user from the list of potential recommendations.

4. **Create a User Profile**:
   - Aggregates the TF-IDF feature vectors of articles in the user's history to form a "user profile" vector. This is calculated as the average feature vector of the read articles.

5. **Calculate Similarity Scores**:
   - Computes the cosine similarity between the user profile vector and the feature vectors of the recommended (but unread) articles.

6. **Sort and Select Top-k Articles**:
   - Sorts the recommended articles by their similarity scores in descending order.
   - Extracts the IDs of the top `k` articles with the highest similarity scores.

7. **Output the Recommendations**:
   - Returns a list of article IDs corresponding to the top `k` recommendations.

---

#### Outputs:
- **`final_recommended_article_ids`**:
  A list of the top `k` recommended article IDs based on TF-IDF similarity.

---

This approach enhances personalized recommendations by aligning the TF-IDF-based content of unread articles with the user’s preferences derived from their reading history.


In [None]:
def create_tfidf_features(news_df):
    """
    Fit a TF-IDF vectorizer on the 'Content' column and return the resulting feature matrix.

    Parameters:
    - news_df: DataFrame with a 'Content' column of article text.

    Returns:
    - features: TF-IDF matrix with one row per row of `news_df`, in the same order.
    """
    # Vectorizer settings, gathered in one place
    vectorizer_options = {
        'strip_accents': None,
        'lowercase': True,
        'tokenizer': word_tokenize,
        'use_idf': True,
        'norm': 'l2',
        'smooth_idf': True,
        'stop_words': 'english',
        'max_df': 0.5,
        'sublinear_tf': True,
    }
    vectorizer = TfidfVectorizer(**vectorizer_options)

    # Learn the vocabulary and transform the content in a single pass
    return vectorizer.fit_transform(news_df['Content'])

In [None]:
def recommend_articles_content_tfidf(user_id, timestamp, recommended_article_ids, features, k=5):
    """
    Recommend articles based on TF-IDF similarity.

    Parameters:
    - user_id: ID of the target user.
    - timestamp: Timestamp identifying the user's row in `behaviors_df`.
    - recommended_article_ids: Candidate article IDs from collaborative filtering.
    - features: TF-IDF matrix whose i-th row belongs to the i-th row of `news_df`
      (as produced by `create_tfidf_features(news_df)`).
    - k: Maximum number of article IDs to return.

    Returns:
    - top_article_ids: Up to `k` unread candidate IDs, most similar first. Empty when the
      user's row is missing, the history matches no article, there are no unread candidates,
      or `k` is not positive.
    """
    # Fetch previously read article IDs for the user
    history_rows = behaviors_df.loc[
        (behaviors_df['User ID'] == user_id) & (behaviors_df['Timestamp'] == timestamp),
        'History'
    ]
    if history_rows.empty or k <= 0:
        return []
    previously_read_article_ids = history_rows.iloc[0].split()

    # Work with row POSITIONS, not index labels: `features` is addressed by position, and
    # the labels of `news_df` stop matching positions once rows have been dropped.
    news_ids = news_df['News ID']
    read_mask = news_ids.isin(previously_read_article_ids).to_numpy()
    candidate_mask = news_ids.isin(recommended_article_ids).to_numpy() & ~read_mask

    user_read_positions = np.flatnonzero(read_mask)
    candidate_positions = np.flatnonzero(candidate_mask)

    # Without a profile or without candidates there is nothing to rank
    if user_read_positions.size == 0 or candidate_positions.size == 0:
        return []

    # Aggregate TF-IDF vectors for the articles read by the user.
    # np.asarray(...).ravel() flattens the mean for both sparse-matrix and sparse-array inputs.
    user_profile_vector = np.asarray(features[user_read_positions].mean(axis=0)).ravel()

    # Compute cosine similarity between the user profile and recommended articles
    similarity_scores = cosine_similarity(
        [user_profile_vector], features[candidate_positions]
    ).flatten()

    # Retrieve the positions of the top-k articles, highest similarity first
    top_order = np.argsort(similarity_scores)[-k:][::-1]

    # Map back to article IDs and return the top recommendations
    top_article_ids = news_ids.iloc[candidate_positions[top_order]].tolist()

    return top_article_ids


### Single User

In [None]:
def single_user_recommendations_combined(user_id, timestamp, method='word2vec', similar_user_k=5, articles_k=5):
    """
    Recommend articles to one user: collaborative candidates re-ranked by a content method.

    Args:
    - user_id (str): User ID for whom recommendations are generated.
    - timestamp (str): Timestamp of the interaction.
    - method (str): Content re-ranking method ('word2vec', 'embeddings', or 'tfidf').
    - similar_user_k (int): Number of similar users to consider.
    - articles_k (int): Number of articles to recommend.

    Returns:
    - list: Recommended article IDs.

    Raises:
    - ValueError: If `method` is not one of the supported names. The check happens after
      the collaborative step has run.
    """
    # Collaborative step: find similar users, then gather the articles they interacted with
    neighbour_keys = fetch_similar_users(user_id, timestamp, k=similar_user_k)
    candidate_ids = recommend_articles_collaborative(user_id, timestamp, neighbour_keys)

    # Content step: each branch returns its own ranking
    if method == 'word2vec':
        # Compare the words of read articles with the words of each candidate
        read_words = create_previously_read_content(user_id, timestamp)
        candidate_words = create_recommended_content(candidate_ids)
        return recommend_articles_content_w2v(read_words, candidate_words, k=articles_k)

    if method == 'embeddings':
        # Compare average article vectors
        return get_top_k_recommended_article_ids_avgvec(
            user_id, timestamp, candidate_ids, k=articles_k
        )

    if method == 'tfidf':
        # The TF-IDF matrix is rebuilt from the news data on every call
        tfidf_matrix = create_tfidf_features(news_df)
        return recommend_articles_content_tfidf(
            user_id, timestamp, candidate_ids, tfidf_matrix, k=articles_k
        )

    raise ValueError(f"Unsupported method: {method}")


### Multiple Users

In [None]:
def multiple_user_recommendations_combined(user_ids_timestamps, method='tfidf', similar_user_k=5, articles_k=5):
    """
    Recommend articles to several users: collaborative candidates re-ranked by a content method.

    Args:
    - user_ids_timestamps (list): List of tuples containing user IDs and their timestamps.
    - method (str): Content re-ranking method ('word2vec', 'embeddings', or 'tfidf').
    - similar_user_k (int): Number of similar users to consider for recommendations.
    - articles_k (int): Number of articles to recommend for each user.

    Returns:
    - dict: A dictionary where keys are user IDs and values are lists of recommended article IDs.
      A user ID that appears more than once keeps only the result of its last entry.

    Raises:
    - ValueError: If `method` is unsupported. This is raised while processing the first user.
    """
    # The TF-IDF matrix is the same for every user, so build it once up front
    tfidf_matrix = create_tfidf_features(news_df) if method == 'tfidf' else None

    def rank_candidates(user_id, timestamp, candidate_ids):
        # Apply the chosen content method to one user's candidates
        if method == 'word2vec':
            read_words = create_previously_read_content(user_id, timestamp)
            candidate_words = create_recommended_content(candidate_ids)
            return recommend_articles_content_w2v(read_words, candidate_words, k=articles_k)
        if method == 'embeddings':
            return get_top_k_recommended_article_ids_avgvec(
                user_id, timestamp, candidate_ids, k=articles_k
            )
        if method == 'tfidf':
            return recommend_articles_content_tfidf(
                user_id, timestamp, candidate_ids, tfidf_matrix, k=articles_k
            )
        raise ValueError(f"Unsupported method: {method}")

    user_recommendations_dict = {}
    counter = 0
    for user_id, timestamp in user_ids_timestamps:
        counter += 1

        # Collaborative step: similar users, then the articles they interacted with
        neighbour_keys = fetch_similar_users(user_id, timestamp, k=similar_user_k)
        candidate_ids = recommend_articles_collaborative(user_id, timestamp, neighbour_keys)

        # Content step, stored under the user's ID
        user_recommendations_dict[user_id] = rank_candidates(user_id, timestamp, candidate_ids)

        # Progress tracking
        print(f"Processed user {counter}/{len(user_ids_timestamps)}")

    return user_recommendations_dict


## Test on Sample User

In [None]:
 # Run recommender system
final_recommended_ids = single_user_recommendations_combined('U13740', '2019-11-13 15:27:40', method='word2vec', similar_user_k=5, articles_k=5)

In [None]:
# view recommendations - word2vec (the method used in the run above)
final_recommended_ids

['N9674', 'N24691', 'N25635', 'N64273', 'N21547']

## Test on Multiple Users

In [None]:
# Select a subset of users of size k to test on
# (the helper takes the last k rows of behaviors_df)
user_ids_timestamps = select_user_ids_timestamps(k=5)

# Run recommender system
# The result is a dict keyed by user ID only, so if the same user appears in several of the
# selected rows, only the recommendations of that user's last row are kept.
final_recommended_ids_multiple = multiple_user_recommendations_combined(user_ids_timestamps, method='word2vec', similar_user_k=5, articles_k=5)

In [None]:
final_recommended_ids_multiple

{'U21593': ['N42781', 'N59704', 'N30665', 'N16655', 'N46039'],
 'U10123': ['N8448', 'N16344', 'N27612', 'N41172', 'N10843'],
 'U75630': ['N16384', 'N37304', 'N61352', 'N64305', 'N52236'],
 'U44625': ['N10928', 'N4255', 'N58860', 'N63302', 'N11523'],
 'U64800': ['N17303', 'N27951', 'N287', 'N44021', 'N35170']}