In [1]:
import pandas as pd


In [13]:
def preprocess_data(user_tsv_path, article_tsv_path):
    # Read the TSVs
    user_data = pd.read_csv(user_tsv_path, sep='\t', header=None)
    article_data = pd.read_csv(article_tsv_path, sep='\t', header=None)
    
    print("Initial user_data shape:", user_data.shape)
    print("Initial article_data shape:", article_data.shape)
    
    user_ids_col = user_data[1]
    consumption_times_col = user_data[2].apply(lambda x: int(x.split(' ')[0].split('/')[1]))  # Extract day of the month
    article_ids_col = user_data[4]
    
    article_id_col = article_data[0]
    headlines_col = article_data[3]
    
    article_ids_col = article_ids_col.apply(lambda x: [i[:-2] for i in str(x).split(' ') if i.endswith('-1')])
        
    users_with_minus_one = article_ids_col[article_ids_col.apply(len) > 0]
    print("Number of users with at least one article with -1 suffix:", len(users_with_minus_one))
    
    user_data_processed = pd.DataFrame({
        'User_ID': user_ids_col,
        'Date_of_Consumption': consumption_times_col,
        'Article_IDs': article_ids_col
    })
    
    print("User data after processing columns shape:", user_data_processed.shape)
    
    # Filter out users with less than two articles
    user_data_processed = user_data_processed[user_data_processed['Article_IDs'].apply(len) >= 2]
    
    print("User data after filtering shape:", user_data_processed.shape)
    
    # Create a dataframe for articles
    article_data_processed = pd.DataFrame({
        'Article_ID': article_id_col,
        'Headline': headlines_col
    })
    
    merged_data = user_data_processed.explode('Article_IDs').merge(article_data_processed, left_on='Article_IDs', right_on='Article_ID')
    
    print("Merged data shape:", merged_data.shape)
    
    grouped_data = merged_data.groupby(['User_ID', 'Date_of_Consumption'])['Headline'].apply(list).reset_index()
    
    print("Grouped data shape:", grouped_data.shape)
    
    user_ids = grouped_data['User_ID'].tolist()
    consumption_days = grouped_data['Date_of_Consumption'].tolist()
    article_headlines = grouped_data['Headline'].tolist()
    
    return user_ids, consumption_days, article_headlines


user_tsv_path = 'originalData/behaviors.tsv'
article_tsv_path = 'originalData/news.tsv'
user_ids, consumption_times, article_headlines = preprocess_data(user_tsv_path, article_tsv_path)

Initial user_data shape: (2232748, 5)
Initial article_data shape: (101527, 8)
0    [N94157, N78699, N71090, N31174]
1                    [N25587, N36266]
2                            [N47925]
3                           [N114935]
4                            [N86258]
5                            [N98178]
6                            [N94572]
7                            [N98178]
8                           [N123077]
9                            [N28902]
Name: 4, dtype: object
Number of users with at least one article with -1 suffix: 2232748
User data after processing columns shape: (2232748, 3)
User data after filtering shape: (618930, 3)
Merged data shape: (1769838, 5)
Grouped data shape: (512971, 3)


In [30]:
print (user_ids[9])

U100003


In [27]:
print(article_headlines[10])

["Stephen Curry calls out Michael Jordan for being a 'hater'", 'U.S. Drones Appear to Show Turkish- Backed Forces Targeting Civilians']


In [28]:
print(consumption_times[10])

12


In [31]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [34]:
class UserEncoder(nn.Module):
    def __init__(self, embedding_dim, lstm_hidden_dim, mlp_hidden_dim, lambda_val, alpha, beta):
        super(UserEncoder, self).__init__()
        
        self.sentence_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
        
        self.lstm = nn.LSTM(embedding_dim, lstm_hidden_dim, batch_first=True)
        
        self.mlp = nn.Sequential(
            nn.Linear(embedding_dim, mlp_hidden_dim),
            nn.ReLU(),
            nn.Linear(mlp_hidden_dim, 2*embedding_dim)
        )
        
        self.lambda_val = lambda_val
        self.alpha = alpha
        self.beta = beta

    def forward(self, user_ids, article_headlines, consumption_times):
        article_embeddings = self.sentence_model.encode(article_headlines)
        article_embeddings = torch.tensor(article_embeddings)
        
        current_time = torch.tensor(15) # day after entries end 
        consumption_days_tensor = torch.tensor(consumption_days, dtype=torch.float32)  
        day_diff = current_day - consumption_days_tensor
        weights = torch.exp(-self.lambda_val * day_diff)
        
        weighted_sum = torch.sum(weights.unsqueeze(-1) * article_embeddings, dim=1)
        
        mlp_output = self.mlp(weighted_sum)
        
        # Split into consistent and transient user interests
        p_consistent = mlp_output[:, :mlp_output.shape[1]//2]
        p_transient = mlp_output[:, mlp_output.shape[1]//2:]
        
        #Okay so one thing that confuses me here is what the point of the LSTM is if you only have one time entry 
        #One idea was to filter out entries that only appear multiple times (there are a few users who appear multiple times)
        #Not sure if this would provide enough training data and would also require pretty big modifications to the code. 
        
        lstm_out, _ = self.lstm(p_consistent.unsqueeze(1))
        updated_p_consistent = lstm_out.squeeze(1)
        
        subsampled_headlines = [random.sample(headlines, len(headlines) // 2) for headlines in article_headlines]
        subsampled_embeddings = self.sentence_model.encode(subsampled_headlines)
        subsampled_embeddings = torch.tensor(subsampled_embeddings)
        
        positive_weighted_sum = torch.sum(weights.unsqueeze(-1) * subsampled_embeddings, dim=1)
        positive_mlp_output = self.mlp(positive_weighted_sum)
        p_positive_consistent = positive_mlp_output[:, :positive_mlp_output.shape[1]//2]
        
        return updated_p_consistent, p_transient
    # ngl needed chatGPT/stack overflow to explain a lot of the loss function 
    def compute_loss(self, p, p_positive, p_negative):
        contrastive_loss = torch.sum(F.relu(torch.norm(p - p_positive, dim=1)**2 - torch.norm(p - p_negative, dim=1)**2 + self.alpha))
        consistency_reg = self.beta * torch.norm(p[:, :p.shape[1]//2] - p_positive[:, :p_positive.shape[1]//2], dim=1)**2
        total_loss = contrastive_loss + consistency_reg
        return total_loss

In [35]:
model = UserEncoder(embedding_dim=768, lstm_hidden_dim=256, mlp_hidden_dim=512, lambda_val=0.5, alpha=0.5, beta=0.1)


In [36]:
p_consistent, p_transient = model(user_ids, article_headlines, consumption_times)

TypeError: unsupported operand type(s) for -: 'Tensor' and 'list'