In [1]:
import pandas as pd


In [2]:
def preprocess_data(user_tsv_path, article_tsv_path):
    # Read the TSVs
    user_data = pd.read_csv(user_tsv_path, sep='\t', header=None)
    article_data = pd.read_csv(article_tsv_path, sep='\t', header=None)
    
    print("Initial user_data shape:", user_data.shape)
    print("Initial article_data shape:", article_data.shape)
    
    user_ids_col = user_data[1]
    consumption_times_col = user_data[2].apply(lambda x: int(x.split(' ')[0].split('/')[1]))  # Extract day of the month
    article_ids_col = user_data[4]
    
    article_id_col = article_data[0]
    headlines_col = article_data[3]
    
    article_ids_col = article_ids_col.apply(lambda x: [i[:-2] for i in str(x).split(' ') if i.endswith('-1')])
        
    users_with_minus_one = article_ids_col[article_ids_col.apply(len) > 0]
    print("Number of users with at least one article with -1 suffix:", len(users_with_minus_one))
    
    user_data_processed = pd.DataFrame({
        'User_ID': user_ids_col,
        'Date_of_Consumption': consumption_times_col,
        'Article_IDs': article_ids_col
    })
    
    print("User data after processing columns shape:", user_data_processed.shape)
    
    # Filter out users with less than two articles
    user_data_processed = user_data_processed[user_data_processed['Article_IDs'].apply(len) >= 2]
    
    print("User data after filtering shape:", user_data_processed.shape)
    
    # Create a dataframe for articles
    article_data_processed = pd.DataFrame({
        'Article_ID': article_id_col,
        'Headline': headlines_col
    })
    
    merged_data = user_data_processed.explode('Article_IDs').merge(article_data_processed, left_on='Article_IDs', right_on='Article_ID')
    
    print("Merged data shape:", merged_data.shape)
    
    grouped_data = merged_data.groupby(['User_ID', 'Date_of_Consumption'])['Headline'].apply(list).reset_index()
    
    print("Grouped data shape:", grouped_data.shape)
    
    user_ids = grouped_data['User_ID'].tolist()
    consumption_days = grouped_data['Date_of_Consumption'].tolist()
    article_headlines = grouped_data['Headline'].tolist()
    
    
    #user_counts = user_data_processed['User_ID'].value_counts()
    #valid_users = user_counts[user_counts > 1].index.tolist()
    #user_data_processed = user_data_processed[user_data_processed['User_ID'].isin(valid_users)]
    
    print("User data after filtering users appearing multiple times shape:", user_data_processed.shape)
    
    # Filter out corresponding article headlines and consumption times
    #valid_indices = user_data_processed.index.tolist()
    #user_ids = [user_ids[i] for i in valid_indices]
    #consumption_days = [consumption_days[i] for i in valid_indices]
    #article_headlines = [article_headlines[i] for i in valid_indices]
    
    
    
    return user_ids, consumption_days, article_headlines
    

user_tsv_path = 'originalData/behaviors.tsv'
article_tsv_path = 'originalData/news.tsv'
user_ids, consumption_times, article_headlines = preprocess_data(user_tsv_path, article_tsv_path)

Initial user_data shape: (2232748, 5)
Initial article_data shape: (101527, 8)
Number of users with at least one article with -1 suffix: 2232748
User data after processing columns shape: (2232748, 3)
User data after filtering shape: (618930, 3)
Merged data shape: (1769838, 5)
Grouped data shape: (512971, 3)
User data after filtering users appearing multiple times shape: (618930, 3)


In [3]:
print (user_ids[1000])

U101166


In [4]:
print(article_headlines[10])

["Stephen Curry calls out Michael Jordan for being a 'hater'", 'U.S. Drones Appear to Show Turkish- Backed Forces Targeting Civilians']


In [5]:
print(consumption_times[10])

12


In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sentence_transformers import SentenceTransformer
import random

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
unique_user_ids = []
for user_id in user_ids:
    if user_id not in unique_user_ids:
        unique_user_ids.append(user_id)
    if len(unique_user_ids) == 10000:
        break

user_ids = [user_id for user_id in user_ids if user_id in unique_user_ids]
article_headlines = [article_headlines[i] for i, user_id in enumerate(user_ids) if user_id in unique_user_ids]
consumption_times = [consumption_times[i] for i, user_id in enumerate(user_ids) if user_id in unique_user_ids]

from collections import Counter
user_counts = Counter(user_ids)
multiple_occurrence_users = [user for user, count in user_counts.items() if count > 1]

combined_article_headlines = {user: [] for user in multiple_occurrence_users}
combined_consumption_times = {user: [] for user in multiple_occurrence_users}

for user_id, headlines, times in zip(user_ids, article_headlines, consumption_times):
    if user_id in multiple_occurrence_users:
        combined_article_headlines[user_id].append(headlines)
        combined_consumption_times[user_id].append(times)

In [8]:
import torch
from torch.nn.functional import pad

def pad_and_stack(tensor_list):
    max_rows = max(tensor.size(0) for tensor in tensor_list)
    max_cols = max(tensor.size(1) for tensor in tensor_list)

    padded_tensors = []
    for tensor in tensor_list:
        row_padding = max_rows - tensor.size(0)
        col_padding = max_cols - tensor.size(1)

        padded_tensor = pad(tensor, (0, col_padding, 0, row_padding))

        padded_tensors.append(padded_tensor)

    stacked_tensor = torch.stack(padded_tensors)

    return stacked_tensor

def pad_to_max(*tensors):
        max_rows = max(tensor.size(0) for tensor in tensors)
        max_cols = max(tensor.size(1) for tensor in tensors)
        max_depth = max(tensor.size(2) for tensor in tensors)  # Add this line

        # Pad each tensor and store them in a new list
        padded_tensors = []
        for tensor in tensors:
            row_padding = max_rows - tensor.size(0)
            col_padding = max_cols - tensor.size(1)
            depth_padding = max_depth - tensor.size(2)  # Add this line

            padded_tensor = F.pad(tensor, (0, depth_padding, 0, col_padding, 0, row_padding))

            padded_tensors.append(padded_tensor)

        return tuple(padded_tensors)

In [9]:
class UserEncoder(nn.Module):
    def __init__(self, embedding_dim, lstm_hidden_dim, mlp_hidden_dim, lambda_val, alpha, beta):
        super(UserEncoder, self).__init__()
        
        self.sentence_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
        self.lstm = nn.LSTM(embedding_dim // 2, lstm_hidden_dim // 2, batch_first=True)
        
        self.mlp = nn.Sequential(
            nn.Linear(embedding_dim, mlp_hidden_dim),
            nn.ReLU(),
            nn.Linear(mlp_hidden_dim, embedding_dim)
        )
        
        self.lambda_val = lambda_val
        self.alpha = alpha
        self.beta = beta

    def forward(self, user_ids, article_headlines, consumption_times):
        all_embeddings = [torch.tensor(self.sentence_model.encode(headlines)) for headlines in article_headlines]
        max_articles = max([emb.shape[0] for emb in all_embeddings])
    
        # Pad the embeddings to have the same size
        padded_embeddings = []
        for emb in all_embeddings:
            pad_size = max_articles - emb.shape[0]
            if pad_size > 0:
                pad = torch.zeros(pad_size, emb.shape[1])
                padded_emb = torch.cat([emb, pad], dim=0)
            else:
                padded_emb = emb
            padded_embeddings.append(padded_emb)

        # Stack the padded embeddings
        
        all_embeddings = torch.stack(padded_embeddings)

        # Compute weights for each timestamp in the sequence
        current_time = torch.tensor(15)  # day after entries end 
        all_weights = [torch.exp(-self.lambda_val * (current_time - times)) for times in consumption_times]

        # Compute weighted sums for each timestamp
        weighted_sums = [weights * embeddings for weights, embeddings in zip(all_weights, all_embeddings)]
        weighted_sums = torch.stack(weighted_sums)
        
        # Pass each weighted sum through the MLP to get p_consistent and p_transient
        mlp_outputs = [self.mlp(weighted_sum) for weighted_sum in weighted_sums]
        p_consistents = [output[:, :output.shape[1]//2] for output in mlp_outputs]
        p_transients = [output[:, output.shape[1]//2:] for output in mlp_outputs]
        #print (p_consistents)
        
        p_consistents = torch.stack(p_consistents)
        p_transients = torch.stack(p_transients)       
        
        
        lstm_out_consistent, _ = self.lstm(p_consistents)
        lstm_out_transient, _ = self.lstm(p_transients)

        updated_p_consistent = lstm_out_consistent[-1]
        updated_p_transient = lstm_out_transient[-1]

        return updated_p_consistent, updated_p_transient

    def generate_positive_embeddings(self, user_ids, headlines, consumption_times):
        random_idx = random.randint(0, len(headlines) - 1)
        selected_headlines = headlines[random_idx]
        selected_times = consumption_times[random_idx]

        # Randomly subsample half of the articles for the positive embeddings
        subsampled_headlines = random.sample(selected_headlines, len(selected_headlines) // 2)
        if len(subsampled_headlines) > 1:
            p_positive_consistent, p_positive_transient = self.forward(user_id, [subsampled_headlines], [selected_times])
        else:
            p_positive_consistent, p_positive_transient = self.forward(user_id, [selected_headlines], [selected_times])

        return torch.cat((p_positive_consistent, p_positive_transient), dim=0)

    def generate_negative_embeddings(self, user_embedding, batch_embeddings):
        distances = torch.norm(batch_embeddings - user_embedding.unsqueeze(0), dim=1)
        furthest_idx = distances.argmax().item()  # pick the furthest embedding
        furthest_idx = furthest_idx % batch_embeddings.shape[0]
        return batch_embeddings[furthest_idx]

    #def compute_loss(self, p, p_positive, p_negative):
        #contrastive_loss = torch.sum(F.relu(torch.norm(p - p_positive, dim=1)**2 - torch.norm(p - p_negative, dim=1)**2 + self.alpha))
        #consistency_reg = self.beta * torch.norm(p[:, :p.shape[1]//2] - p_positive[:, :p_positive.shape[1]//2], dim=1)**2
        #total_loss = contrastive_loss + torch.sum(consistency_reg)
    #    return total_loss
    
    def compute_loss(self, p, p_positive, p_negative):
        p, p_positive, p_negative = pad_to_max(p, p_positive, p_negative)
        contrastive_loss = torch.sum(F.relu(torch.norm(p[:, :p.shape[1]//2] - p_positive[:, :p_positive.shape[1]//2], dim=1)**2 - 
                                         torch.norm(p[:, :p.shape[1]//2] - p_negative[:, :p_negative.shape[1]//2], dim=1)**2 + self.alpha))
        consistency_reg = self.beta * torch.norm(p[:, :p.shape[1]//2] - p_positive[:, :p_positive.shape[1]//2], dim=1)**2
        total_loss = contrastive_loss + torch.sum(consistency_reg)
        return total_loss

    def training_step(self, multiple_occurrence_users, combined_article_headlines, combined_consumption_times):

        batch_p_consistent = []
        batch_p_positive_consistent = []
        batch_p_negative = []
        
        #max_seq_len = max([len(p) for user_id in multiple_occurrence_users for p in combined_article_headlines[user_id]])

        for user_id in multiple_occurrence_users:
            headlines_sequence = combined_article_headlines[user_id]
            times_sequence = combined_consumption_times[user_id]

            p_consistent, p_transient = self.forward(user_id, headlines_sequence, times_sequence)
            p_pos = self.generate_positive_embeddings(user_id, headlines_sequence, times_sequence)
            p = torch.cat((p_consistent, p_transient), dim=0)
            
            #pad_size = max_seq_len - p.size(0)
            #if pad_size > 0:
            #    p = F.pad(p, (0, 0, 0, pad_size))

            batch_p_consistent.append(p)
            
            batch_p_positive_consistent.append(p_pos)

        batch_p_consistent = pad_and_stack(batch_p_consistent)
        batch_p_positive_consistent = pad_and_stack(batch_p_positive_consistent)

        for p_consistent in batch_p_consistent:
            p_negative = self.generate_negative_embeddings(p_consistent, batch_p_consistent)
            batch_p_negative.append(p_negative)
        
        batch_p_negative = pad_and_stack(batch_p_negative)

        loss = self.compute_loss(batch_p_consistent, batch_p_positive_consistent, batch_p_negative)
        return loss

In [23]:
from torch.optim import lr_scheduler

def train(model, user_ids, consumption_times, article_headlines, optimizer, num_epochs=8, batch_size=128):
    optimizer = torch.optim.Adam(model.parameters(), lr=0.05)
    model.train()
    
    user_counts = Counter(user_ids)
    all_batch_user_ids = multiple_occurrence_users
    all_batch_consumption_times = combined_consumption_times
    all_batch_article_headlines = combined_article_headlines
    
    batch_user_ids = all_batch_user_ids[:3000]
    batch_consumption_times = all_batch_consumption_times
    batch_article_headlines = all_batch_article_headlines
    
    scheduler = lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.9)
    
    for epoch in range(num_epochs):
        total_loss = 0
        num_batches = 0
        
        for i in range(0, len(batch_user_ids), batch_size):
            mini_batch_user_ids = batch_user_ids[i:i+batch_size]
            mini_batch_consumption_times = batch_consumption_times
            mini_batch_article_headlines = batch_article_headlines
            
            optimizer.zero_grad()
            loss = model.training_step(mini_batch_user_ids, mini_batch_article_headlines, mini_batch_consumption_times)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            num_batches += 1
            torch.cuda.empty_cache()
        
        scheduler.step()
        # Calculate and print the average loss per batch
        avg_loss = total_loss / num_batches
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss}, Learning Rate: {scheduler.get_last_lr()[0]}")

embedding_dim = 768
lstm_hidden_dim = 512
mlp_hidden_dim = 256
lambda_val = 0.5
alpha = 0.5
beta = 0.5

model = UserEncoder(embedding_dim, lstm_hidden_dim, mlp_hidden_dim, lambda_val, alpha, beta)
optimizer = optim.Adam(model.parameters(), lr=0.005)

In [24]:
train(model, user_ids, consumption_times, article_headlines, optimizer)

Epoch 1/8, Loss: 21881.178080240887, Learning Rate: 0.05
Epoch 2/8, Loss: 16422.940348307293, Learning Rate: 0.045000000000000005
Epoch 3/8, Loss: 16315.850016276041, Learning Rate: 0.045000000000000005
Epoch 4/8, Loss: 16237.884338378906, Learning Rate: 0.04050000000000001
Epoch 5/8, Loss: 16235.413635253906, Learning Rate: 0.04050000000000001
Epoch 6/8, Loss: 16236.9609375, Learning Rate: 0.03645000000000001
Epoch 7/8, Loss: 16236.731018066406, Learning Rate: 0.03645000000000001
Epoch 8/8, Loss: 16239.646219889322, Learning Rate: 0.03280500000000001


In [53]:
torch.save(model.state_dict(), 'encoderFinal2.pth')

In [26]:
headlines_sequence = []
times_sequence = []
user_ids_list = ['U102704']
user_ids2_list = ['U207812']
headlines_sequence_2 = [['Luxury store sets minimum spend for Santa encounters', 'University athlete meets tragic fate in practice session', 'Dairy giant Dean Foods goes bankrupt', 'Lawsuit against firearm maker gets green light from Supreme Court', 'Child succumbs to gunshot wound', 'College freshman dies after incident at fraternity house', 'Evidence of Turkish Forces targeting civilians captured by U.S. Drones', 'Seahawks triumph in nail-biting overtime against 49ers'], ['Parent brings baby to drug transaction, child gets injured', 'Prominent Solar Panel Manufacturer Shuts Down Production', 'For Sale: Deserted Missile Complex in Arizona listed at $400k', 'Firefighter loses life in line of duty in Massachusetts', 'Valiant Officer Pulls Driver From Flaming Wreckage', 'McLaren Reveals the Elva: A Windshield-less Hypercar Priced at $1.7 Million', 'Universities in Hong Kong under siege as police crackdown intensifies']]


headlines_sequence = combined_article_headlines['U102704']
print (headlines_sequence)
print (headlines_sequence_2)
times_sequence = combined_consumption_times[user_ids_list[0]]

[['Harrods accused of ruining the spirit of Christmas after limiting Santa visits to customers who spend over $2,500', 'College gymnast dies following training accident in Connecticut', 'Dean Foods files for bankruptcy', 'Supreme Court refuses to block lawsuit against gun manufacturer brought by Sandy Hook families', '11-year-old boy dies after being shot in chest', 'San Diego State University freshman hurt in fraternity incident dies', 'U.S. Drones Appear to Show Turkish- Backed Forces Targeting Civilians', "4 takeaways from the Seahawks' wild, overtime win over the 49ers"], ["Dad took 11-month-old to drug deal as 'human shield'. The baby was shot.", "One of America's biggest solar panel makers quits manufacturing", 'A decommissioned nuclear missile complex in Arizona that was abandoned for decades is now on sale for $400,000', 'Massachusetts fire lieutenant dies battling house fire', 'Hero Police Officer Saves Driver From Fiery Crash', 'The McLaren Elva Is a $1.7 Million Topless, Win

In [27]:
with torch.no_grad():  # Disable gradient computation during inference for efficiency
    predictions = model(user_ids_list, headlines_sequence, times_sequence)
    predictions2 = model(user_ids2_list, headlines_sequence_2, times_sequence)

In [19]:
print(len(predictions2))

2


In [None]:
#print (multiple_occurrence_users[3501])
print (predictions[0][7])


In [None]:
difference = predictions2 - predictions

In [28]:
difference_tuple = tuple(tensor2 - tensor1 for tensor1, tensor2 in zip(predictions2, predictions))

In [35]:
print(difference_tuple[1][6])

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., -0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,

In [None]:
predict_consumption_times = [combined_consumption_times[user_id] for user_id in predict_user_ids]
predict_article_headlines = [combined_article_headlines[user_id] for user_id in predict_user_ids]

In [None]:
print(predict_consumption_times)

In [30]:
are_identical = all(torch.equal(tensor1, tensor2) for tensor1, tensor2 in zip(predictions, predictions2))

print("Tuples are identical:", are_identical)

Tuples are identical: False


In [36]:
total_difference = sum(torch.abs(tensor2 - tensor1).sum() for tensor1, tensor2 in zip(predictions2, predictions))

In [37]:
print(total_difference)

tensor(2.4125e-21)


In [38]:
user_ids_list = ['U102704']
user_ids2_list = ['U207812']
headlines_sequence_2 = [['Luxury store sets minimum spend for Santa encounters', 'University athlete meets tragic fate in practice session', 'Dairy giant Dean Foods goes bankrupt', 'Lawsuit against firearm maker gets green light from Supreme Court', 'Child succumbs to gunshot wound', 'College freshman dies after incident at fraternity house', 'Evidence of Turkish Forces targeting civilians captured by U.S. Drones', 'Seahawks triumph in nail-biting overtime against 49ers'], ['Parent brings baby to drug transaction, child gets injured', 'Prominent Solar Panel Manufacturer Shuts Down Production', 'For Sale: Deserted Missile Complex in Arizona listed at $400k', 'Firefighter loses life in line of duty in Massachusetts', 'Valiant Officer Pulls Driver From Flaming Wreckage', 'McLaren Reveals the Elva: A Windshield-less Hypercar Priced at $1.7 Million', 'Universities in Hong Kong under siege as police crackdown intensifies']]


headlines_sequence = combined_article_headlines['U102704']
#print (headlines_sequence)
#print (headlines_sequence_2)
times_sequence = combined_consumption_times[user_ids_list[0]]

with torch.no_grad():  
    predictions = model(user_ids_list, headlines_sequence, times_sequence)
    predictions2 = model(user_ids2_list, headlines_sequence, times_sequence)

    are_identical = all(torch.equal(tensor1, tensor2) for tensor1, tensor2 in zip(predictions, predictions2))

print("Tuples are identical:", are_identical)

Tuples are identical: True


In [42]:
headlines_sequence_3 = [['Local bakery introduces new line of vegan pastries', 'Researchers discover new species of deep-sea creatures', 'City plans to open new public library next month', 'Scientists announce breakthrough in renewable energy technology', 'Young pianist wins international music competition', 'University unveils plans for environmentally-friendly campus renovations', 'Rare bird species spotted in local wildlife reserve', '5 highlights from the latest tech expo'], ['Gardener discovers ancient artifact in backyard', 'New planet discovered in our solar system', 'Local artist transforms abandoned building into public art space', 'Veterinarian volunteers to help injured wildlife in rainforest', 'Firefighter adopts dog he rescued from burning building', 'The latest electric car model breaks records for speed and efficiency', 'International Food Festival attracts visitors from around the globe']]

with torch.no_grad():  
    predictions = model(user_ids_list, headlines_sequence, times_sequence)
    predictions2 = model(user_ids2_list, headlines_sequence_3, times_sequence)

    are_identical = all(torch.equal(tensor1, tensor2) for tensor1, tensor2 in zip(predictions, predictions2))

print("Tuples are identical:", are_identical)

total_difference = sum(torch.abs(tensor2 - tensor1).sum() for tensor1, tensor2 in zip(predictions2, predictions))

print(total_difference)

Tuples are identical: False
tensor(1.0065e-08)


In [52]:
times_sequence_2 = [10,11]

with torch.no_grad():
    predictions = model(user_ids_list, headlines_sequence, times_sequence)
    predictions2 = model(user_ids2_list, headlines_sequence, times_sequence)

    are_identical = all(torch.equal(tensor1, tensor2) for tensor1, tensor2 in zip(predictions, predictions2))
    
total_difference = sum(torch.abs(tensor2 - tensor1).sum() for tensor1, tensor2 in zip(predictions2, predictions))

print(total_difference)

tensor(0.)
