In [1]:
import pandas as pd


In [7]:
import csv

def read_tsv_to_data(file_name):
    """Read a tab-separated interaction log into three parallel lists.

    The first three fields of every row are taken as user id, article
    headline and date, in that order; any further fields are ignored.
    Rows with fewer than three fields (for example a blank trailing line)
    are skipped instead of raising ``IndexError``.

    Args:
        file_name: Path of the UTF-8 encoded TSV file to read.

    Returns:
        A tuple ``(userIDs, articleHeadlines, dates)`` of equally long lists
        of strings, in file order.
    """
    userIDs = []
    articleHeadlines = []
    dates = []

    # newline='' lets the csv module do its own newline handling, as the csv
    # documentation requires for files passed to csv.reader.
    with open(file_name, mode='r', encoding='utf-8', newline='') as file:
        for row in csv.reader(file, delimiter='\t'):
            if len(row) < 3:
                # Blank or truncated line: nothing usable to record.
                continue
            userID, headline, date = row[:3]
            userIDs.append(userID)
            articleHeadlines.append(headline)
            dates.append(date)

    return userIDs, articleHeadlines, dates

def split_data_by_year(userIDs, articleHeadlines, dates,
                       max_train_users=40000,
                       max_validation_users=20000,
                       max_test_users=20000):
    """Split interaction rows into train/validation/test by calendar period.

    Rows dated 2017 go to train, rows dated January-June 2018 go to
    validation and rows dated July-December 2018 go to test. Rows from any
    other year are dropped. Each split admits at most a fixed number of
    distinct users; rows of users that were already admitted are always
    kept, so reaching the cap no longer truncates the history of the users
    that are in the split.

    Args:
        userIDs: Iterable of user ids, one per row.
        articleHeadlines: Iterable of headlines, parallel to ``userIDs``.
        dates: Iterable of ``YYYY-MM-DD`` strings, parallel to ``userIDs``.
        max_train_users: Maximum number of distinct users in the train split.
        max_validation_users: Maximum number of distinct users in validation.
        max_test_users: Maximum number of distinct users in the test split.

    Returns:
        A tuple ``(train, validation, test)`` of lists of
        ``(userID, headline, date)`` tuples, in input order.

    Raises:
        ValueError: If a row dated 2018 has a non-numeric month field.
    """
    train, validation, test = [], [], []
    train_users, validation_users, test_users = set(), set(), set()

    for userID, headline, date in zip(userIDs, articleHeadlines, dates):
        year = date[:4]
        if year == "2017":
            rows, users, cap = train, train_users, max_train_users
        elif year == "2018":
            # Characters 5-6 are the month in a YYYY-MM-DD string.
            if int(date[5:7]) < 7:
                rows, users, cap = validation, validation_users, max_validation_users
            else:
                rows, users, cap = test, test_users, max_test_users
        else:
            continue

        # Admit new users only while below the cap, but never drop rows of a
        # user who is already part of the split.
        if userID in users or len(users) < cap:
            rows.append((userID, headline, date))
            users.add(userID)

    return train, validation, test

# Load the combined interaction log, then carve it into the three
# calendar-based splits.
combined_filename = 'combined_data_80k.tsv'
(
    userIDs,
    articleHeadlines,
    dates,
) = read_tsv_to_data(combined_filename)

(
    filtered_data_train,
    filtered_data_validation,
    filtered_data_test,
) = split_data_by_year(userIDs, articleHeadlines, dates)


In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sentence_transformers import SentenceTransformer
import random

In [39]:
from datetime import datetime, timedelta

def days_between_dates(date_str, end_date_str):
    """Return the whole number of days from ``date_str`` to ``end_date_str``.

    Both arguments are ``YYYY-MM-DD`` strings. The result is negative when
    ``end_date_str`` lies before ``date_str``.

    Raises:
        ValueError: If either string does not match ``%Y-%m-%d``.
    """
    # The generator is consumed left to right, so the start date is parsed
    # (and validated) before the end date.
    begin, finish = (
        datetime.strptime(text, "%Y-%m-%d") for text in (date_str, end_date_str)
    )
    elapsed = finish - begin
    return elapsed.days

Number of unique users in training data: 40000


In [11]:


def process_data(filtered_data, sentence_model):
    """Encode every headline and group the encodings per user and per date.

    Args:
        filtered_data: Sequence of ``(user_id, headline, time)`` tuples.
        sentence_model: Object exposing ``encode(sentences, batch_size=...)``
            that returns one encoding per input sentence, in order.

    Returns:
        A tuple ``(user_ids, combined_article_headlines,
        combined_consumption_times)`` where

        * ``user_ids`` has one entry per input row (so a user appears once
          for every row they have, not once overall),
        * ``combined_consumption_times[user_id]`` lists that user's distinct
          ``time`` values in order of first appearance,
        * ``combined_article_headlines[user_id][i]`` is the list of encodings
          consumed at ``combined_consumption_times[user_id][i]``.

    Raises:
        ValueError: If the model returns a different number of encodings
            than there are rows.
    """
    user_ids = [user_id for user_id, _, _ in filtered_data]

    headlines_to_encode = [headline for _, headline, _ in filtered_data]
    encoded_headlines = sentence_model.encode(headlines_to_encode, batch_size=1500)
    if len(encoded_headlines) != len(headlines_to_encode):
        raise ValueError(
            "sentence_model.encode returned a different number of encodings "
            "than there are headlines"
        )

    combined_article_headlines = {}
    combined_consumption_times = {}
    # (user_id, time) -> position of that date in the user's lists. Replaces
    # the linear `in` / `.index()` scans with a constant-time lookup.
    slot_of = {}

    for (user_id, _, time), encoded_headline in zip(filtered_data, encoded_headlines):
        key = (user_id, time)
        slot = slot_of.get(key)
        if slot is None:
            user_times = combined_consumption_times.setdefault(user_id, [])
            user_groups = combined_article_headlines.setdefault(user_id, [])
            slot_of[key] = len(user_times)
            user_times.append(time)
            user_groups.append([encoded_headline])
        else:
            combined_article_headlines[user_id][slot].append(encoded_headline)

    return user_ids, combined_article_headlines, combined_consumption_times

# Sentence encoder shared by all three splits.
sentence_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Encode and group each split: train first, then validation, then test.
(
    user_ids_train,
    combined_article_headlines_train,
    combined_consumption_times_train,
) = process_data(filtered_data_train, sentence_model)

(
    user_ids_validation,
    combined_article_headlines_validation,
    combined_consumption_times_validation,
) = process_data(filtered_data_validation, sentence_model)

(
    user_ids_test,
    combined_article_headlines_test,
    combined_consumption_times_test,
) = process_data(filtered_data_test, sentence_model)


In [40]:
class UserEncoder(nn.Module):
    """Encode a user's dated reading history into two half-width embeddings.

    ``forward`` sums a user's headline embeddings with an exponential time
    decay, passes the sum through an MLP and splits the output into a
    "consistent" first half and a "transient" second half. The remaining
    methods build the positive/negative examples and the loss that
    ``training_step`` accumulates over every day of a split's period.
    """

    # Inclusive (first day, last day) evaluated for each split. These mirror
    # the calendar windows used by split_data_by_year: validation is the
    # first half of 2018 and test is the second half.
    _SPLIT_PERIODS = {
        "train": ("2017-01-01", "2017-12-31"),
        "validation": ("2018-01-01", "2018-06-30"),
        "test": ("2018-07-01", "2018-12-31"),
    }

    def __init__(self, embedding_dim, lstm_hidden_dim, mlp_hidden_dim, lambda_val, alpha, beta):
        """Build the LSTM and MLP and store the loss hyperparameters.

        Args:
            embedding_dim: Width of the headline embeddings and of the MLP
                input/output.
            lstm_hidden_dim: Half of the LSTM input and hidden size.
            mlp_hidden_dim: Width of the MLP's hidden layer.
            lambda_val: Decay rate of the per-day weight ``exp(-lambda*days)``.
            alpha: Margin added inside the contrastive (triplet) term.
            beta: Scale of the consistency term.
        """
        super().__init__()

        self.embedding_dim = embedding_dim

        # use_lstm feeds the LSTM the concatenated consistent+transient
        # halves, whose combined width is embedding_dim, so
        # lstm_hidden_dim * 2 must equal embedding_dim for that call to work.
        self.lstm = nn.LSTM(lstm_hidden_dim*2, lstm_hidden_dim*2, batch_first=True)

        self.mlp = nn.Sequential(
            nn.Linear(embedding_dim, mlp_hidden_dim),
            nn.ReLU(),
            nn.Linear(mlp_hidden_dim, embedding_dim)
        )

        self.lambda_val = lambda_val
        self.alpha = alpha
        self.beta = beta

    def forward(self, user_ids, all_embeddings, consumption_times, curr_time):
        """Compute the consistent/transient embeddings as of ``curr_time``.

        Args:
            user_ids: Unused; kept for interface compatibility.
            all_embeddings: Sequence parallel to ``consumption_times``; each
                item is a non-empty sequence of vectors (NumPy arrays,
                tensors or anything ``torch.as_tensor`` accepts) of length
                ``embedding_dim``.
            consumption_times: Sequence of ``YYYY-MM-DD`` strings.
            curr_time: ``YYYY-MM-DD`` string for the reference day.

        Returns:
            ``(p_consistent, p_transient)``: the first and second half of the
            MLP output along dimension 1, each with one row.
        """
        # Allocate the accumulator next to the MLP weights so dtype and
        # device match, and size it from embedding_dim rather than a constant.
        weighted_sum = self.mlp[0].weight.new_zeros((1, self.embedding_dim))

        for consumed_on, day_embeddings in zip(consumption_times, all_embeddings):
            elapsed_days = days_between_dates(consumed_on, curr_time)
            if elapsed_days < 0:
                # Article is consumed after curr_time. Including it would
                # give it a weight of exp(+lambda * days), which grows without
                # bound and overflows to inf for far-future dates.
                continue

            vectors = [
                torch.as_tensor(vec, dtype=weighted_sum.dtype, device=weighted_sum.device)
                for vec in day_embeddings
            ]
            day_total = torch.sum(torch.stack(vectors), dim=0)
            weight = torch.exp(-self.lambda_val * torch.tensor(elapsed_days))
            weighted_sum = weighted_sum + weight * day_total

        output = self.mlp(weighted_sum)
        half = output.shape[1] // 2
        p_consistent = output[:, :half]
        p_transients = output[:, half:]

        return p_consistent, p_transients

    def use_lstm(self, con, tran):
        """Run the concatenated halves through the LSTM.

        Args:
            con: Consistent half, shape ``(rows, width)``.
            tran: Transient half, same shape as ``con``.

        Returns:
            The first half (along dimension 1) of the LSTM output.
        """
        # unsqueeze(0) adds a leading batch dimension for the batch_first LSTM.
        combined = torch.cat((con, tran), dim=1).unsqueeze(0)
        lstm_output, _ = self.lstm(combined)
        lstm_output = lstm_output.squeeze(0)

        return lstm_output[:, :lstm_output.shape[1] // 2]

    def generate_positive_embeddings(self, user_ids, headlines, consumption_times, curr_time):
        """Encode a random sub-sample of one randomly chosen day as the positive.

        Half of the chosen day's headlines (rounded down) are sampled; when
        that leaves fewer than two, the full day is encoded instead.

        Args:
            user_ids: Passed through to ``forward``.
            headlines: Non-empty list of per-day lists of embeddings.
            consumption_times: Dates parallel to ``headlines``.
            curr_time: ``YYYY-MM-DD`` reference day.

        Returns:
            The consistent and transient halves concatenated along dim 1.
        """
        random_idx = random.randint(0, len(headlines) - 1)
        selected_headlines = headlines[random_idx]
        selected_times = consumption_times[random_idx]

        subsampled_headlines = random.sample(selected_headlines, len(selected_headlines) // 2)
        chosen = subsampled_headlines if len(subsampled_headlines) > 1 else selected_headlines
        p_u_con, p_u_tra = self.forward(user_ids, [chosen], [selected_times], curr_time)

        return torch.cat((p_u_con, p_u_tra), dim=1)

    def generate_negative_embeddings(self, target_embedding, all_embeddings):
        """Return the embedding in ``all_embeddings`` farthest from the target.

        Entries equal to ``target_embedding`` are skipped. When every entry
        is skipped, the first entry is returned.
        """
        max_distance = -1
        negative_embedding_idx = -1

        for idx, emb in enumerate(all_embeddings):
            if torch.equal(target_embedding, emb):
                continue

            distance = torch.norm(target_embedding - emb, p=2)

            if distance > max_distance:
                max_distance = distance
                negative_embedding_idx = idx

        return all_embeddings[negative_embedding_idx] if negative_embedding_idx >= 0 else all_embeddings[0]

    def compute_loss(self, p_u, p_positive, p_negative, p_u_con, p_u_tra):
        """Triplet-margin term plus an LSTM consistency term for one user.

        Args:
            p_u: Full user embedding (anchor).
            p_positive: Positive example embedding.
            p_negative: Negative example embedding.
            p_u_con: Consistent half of the anchor.
            p_u_tra: Transient half of the anchor.

        Returns:
            A scalar tensor. When the margin is already satisfied the result
            is a zero tensor that is still attached to the autograd graph, so
            callers can always call ``.backward()`` / ``.item()`` on it.
        """
        contrastive_loss = torch.relu(torch.norm(p_u - p_positive, p=2)**2 -
                                      torch.norm(p_u - p_negative, p=2)**2 +
                                      self.alpha)

        if torch.all(contrastive_loss <= 0):
            # Margin satisfied: the consistency term is skipped as before, but
            # a tensor is returned instead of the Python int 0.
            return torch.mean(contrastive_loss)

        lstmVal = self.use_lstm(p_u_con, p_u_tra)
        consistency_loss = self.beta * torch.norm(lstmVal - p_u_con, p=2, dim=1)**2

        return torch.mean(contrastive_loss + consistency_loss)

    def training_step(self, users, combined_article_headlines, combined_consumption_times, ttv):
        """Sum the per-user loss over every day of the split's period.

        Args:
            users: Iterable of user ids to evaluate on each day.
            combined_article_headlines: Mapping user id -> per-day lists of
                headline embeddings.
            combined_consumption_times: Mapping user id -> list of dates
                parallel to the headlines mapping.
            ttv: One of ``"train"``, ``"validation"`` or ``"test"``.

        Returns:
            A scalar tensor holding the summed loss. Every day's graph stays
            attached to it until the caller calls backward or drops it.

        Raises:
            ValueError: If ``ttv`` is not a known split name.
        """
        try:
            first_day, last_day = self._SPLIT_PERIODS[ttv]
        except KeyError:
            raise ValueError(
                f"ttv must be one of {sorted(self._SPLIT_PERIODS)}, got {ttv!r}"
            ) from None

        current_day = datetime.strptime(first_day, "%Y-%m-%d")
        final_day = datetime.strptime(last_day, "%Y-%m-%d")

        # Start from a tensor so the return type does not depend on whether
        # any user was processed.
        loss = self.mlp[0].weight.new_zeros(())

        while current_day <= final_day:
            currDate = current_day.strftime("%Y-%m-%d")

            batch_p = []
            batch_p_cons = []
            batch_p_transient = []
            batch_p_pos = []

            for user_id in users:
                headlines_sequence = combined_article_headlines[user_id]
                times_sequence = combined_consumption_times[user_id]

                p_consistent, p_transient = self.forward(user_id, headlines_sequence, times_sequence, currDate)
                p_pos = self.generate_positive_embeddings(user_id, headlines_sequence, times_sequence, currDate)

                batch_p_cons.append(p_consistent)
                batch_p_transient.append(p_transient)
                batch_p.append(torch.cat((p_consistent, p_transient), dim=1))
                batch_p_pos.append(p_pos)

            # The negative for each user is picked from this day's batch.
            batch_p_negatives = [
                self.generate_negative_embeddings(value, batch_p) for value in batch_p
            ]

            for p_u, p_positive, p_negative, p_u_con, p_u_tra in zip(
                    batch_p, batch_p_pos, batch_p_negatives, batch_p_cons, batch_p_transient):
                loss = loss + self.compute_loss(p_u, p_positive, p_negative, p_u_con, p_u_tra)

            current_day = current_day + timedelta(days=1)
            print("Done")

        return loss

In [41]:
from torch.optim import lr_scheduler

def train(model, train_user_ids, train_article_headlines, train_consumption_times, 
          val_user_ids, val_article_headlines, val_consumption_times, 
          optimizer, num_epochs=8, batch_size=1000):
    """Train ``model`` for ``num_epochs`` and report one validation loss.

    Each epoch walks ``train_user_ids`` in slices of ``batch_size`` and calls
    ``model.training_step(batch, ..., "train")`` once per slice. After the
    last epoch the model is switched to eval mode and
    ``model.training_step(..., "validation")`` is evaluated once without
    gradients.

    Args:
        model: Module exposing ``training_step(users, headlines, times, ttv)``.
        train_user_ids: Sequence of user ids to train on.
        train_article_headlines: Mapping passed through to ``training_step``.
        train_consumption_times: Mapping passed through to ``training_step``.
        val_user_ids: Sequence of user ids for validation.
        val_article_headlines: Mapping passed through to ``training_step``.
        val_consumption_times: Mapping passed through to ``training_step``.
        optimizer: Optimizer to use. It is now honoured instead of being
            replaced; pass ``None`` to get the previous built-in
            ``Adam(lr=0.05)``.
        num_epochs: Number of passes over ``train_user_ids``.
        batch_size: Number of user ids per ``training_step`` call.

    Returns:
        None. Progress and losses are printed.
    """
    if optimizer is None:
        optimizer = torch.optim.Adam(model.parameters(), lr=0.05)
    model.train()

    # Multiply the learning rate by 0.9 every second epoch.
    scheduler = lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.9)

    for epoch in range(num_epochs):
        total_loss = 0.0
        num_batches = 0

        for i in range(0, len(train_user_ids), batch_size):
            batch_user_ids = train_user_ids[i:i+batch_size]

            optimizer.zero_grad()
            loss = model.training_step(batch_user_ids, train_article_headlines, train_consumption_times, "train")
            # training_step may hand back a plain number or a tensor with no
            # graph (e.g. nothing contributed a loss); there is nothing to
            # back-propagate in that case.
            if torch.is_tensor(loss) and loss.requires_grad:
                loss.backward()
                optimizer.step()

            total_loss += float(loss)
            num_batches += 1
            torch.cuda.empty_cache()
            print(f"Batch {num_batches} done")

        scheduler.step()
        # Guard against an empty training set.
        avg_loss = total_loss / num_batches if num_batches else 0.0
        print(f"Epoch {epoch+1}/{num_epochs}, Average Training Loss: {avg_loss}, Learning Rate: {scheduler.get_last_lr()[0]}")

    model.eval()

    with torch.no_grad():
        val_loss = model.training_step(val_user_ids, val_article_headlines, val_consumption_times, "validation")
    # Validation runs once after the final epoch, so the epoch shown is
    # num_epochs; this also avoids a NameError when num_epochs is 0.
    print(f"Epoch {num_epochs}/{num_epochs}, Validation Loss: {float(val_loss)}")

# Hyperparameters for the user encoder.
embedding_dim = 768
lstm_hidden_dim = mlp_hidden_dim = 384
lambda_val = alpha = beta = 0.5

# Build the encoder and the optimizer over its parameters.
model = UserEncoder(
    embedding_dim=embedding_dim,
    lstm_hidden_dim=lstm_hidden_dim,
    mlp_hidden_dim=mlp_hidden_dim,
    lambda_val=lambda_val,
    alpha=alpha,
    beta=beta,
)
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [None]:
# process_data returns one user id per interaction row, so the raw lists
# repeat a user once for every row they have. Collapse the repeats (keeping
# first-seen order) so each user is processed once per batch rather than once
# per row.
mu_users = list(dict.fromkeys(user_ids_train))
val_users = list(dict.fromkeys(user_ids_validation))
train(model, mu_users, combined_article_headlines_train, combined_consumption_times_train,
      val_users, combined_article_headlines_validation, combined_consumption_times_validation, optimizer)

In [None]:
# Persist the trained encoder's weights to disk.
encoder_state = model.state_dict()
torch.save(encoder_state, 'encoderFinal2.pth')