In [370]:
import torch
import pandas as pd
import pickle
import numpy as np
from tqdm import tqdm
from transformers import BertTokenizer, BertModel
train_data_path = 'data/MINDsmall_train/'
valid_data_path = 'data/MINDsmall_dev/'

In [263]:
with open(train_data_path + 'news_id_map.pkl', 'rb') as f:
    train_news_id_map = pickle.load(f)

In [264]:
valid_news_df = pd.read_csv(valid_data_path + 'news.tsv', sep='\t', header=None)
valid_news_ids = valid_news_df[0].values

In [265]:
news_ids = list(set([nid for nid in valid_news_ids if nid not in train_news_id_map]))
print(f'Số bài báo mới không có trong tập train: {len(news_ids)}')

Số bài báo mới không có trong tập train: 13956


In [266]:
valid_news_id_map = train_news_id_map.copy()

In [267]:
current_max = max(train_news_id_map.values())
for i, nid in enumerate(news_ids):
    valid_news_id_map[nid] = current_max + i + 1

with open(valid_data_path + 'news_id_map.pkl', 'wb') as f:
    pickle.dump(valid_news_id_map, f)
    
print(len(valid_news_id_map))
print(len(train_news_id_map))

65239
51283


In [268]:
with open(train_data_path + 'user_id_map.pkl', 'rb') as f:
    train_user_id_map = pickle.load(f)

In [269]:
valid_behavior_df = pd.read_csv(valid_data_path + 'behaviors.tsv', sep='\t', header=None)
valid_user_ids = valid_behavior_df[1].values

In [270]:
new_users = list(set([uid for uid in valid_user_ids if uid not in train_user_id_map]))
print(f'Số user mới không có trong tập train: {len(new_users)}')

Số user mới không có trong tập train: 44057


In [271]:
valid_user_id_map = train_user_id_map.copy()
current_max = max(train_user_id_map.values())

for i, uid in enumerate(new_users):
    valid_user_id_map[uid] = current_max + i + 1

with open(valid_data_path + 'user_id_map.pkl', 'wb') as f:
    pickle.dump(valid_user_id_map, f)

In [272]:
with open(valid_data_path + 'user_id_map.pkl', 'rb') as f:
    val_user_id_map = pickle.load(f)
    
with open(train_data_path + 'user_id_map.pkl', 'rb') as f:
    train_user_id_map = pickle.load(f)
print(len(val_user_id_map))
print(len(train_user_id_map))

94058
50001


In [273]:
train_edge_index = torch.load(train_data_path + 'edge_index.pt')

In [274]:
valid_behavior_df = pd.read_csv(valid_data_path + 'behaviors.tsv', sep='\t', header=None)
history_col = valid_behavior_df[3].values

In [275]:
new_edges_src = []
new_edges_dst = []

In [276]:
for history in tqdm(history_col):
	if pd.isna(history): 
		continue
		
	news_ids = history.split()
	if len(news_ids) < 2: 
		continue
	idx_list = [valid_news_id_map[nid] for nid in news_ids if nid in valid_news_id_map]
	
	for i in range(len(idx_list) - 1):
		src = idx_list[i]
		dst = idx_list[i+1]
		
		new_edges_src.append(src)
		new_edges_dst.append(dst)

new_edge_index = torch.tensor([new_edges_src, new_edges_dst], dtype=torch.long)
valid_edge_index = torch.cat([train_edge_index, new_edge_index], dim=1)
torch.save(valid_edge_index, valid_data_path + 'edge_index.pt')

100%|██████████| 73152/73152 [00:01<00:00, 45635.38it/s]


In [None]:

tokenizer = BertTokenizer.from_pretrained('huawei-noah/TinyBERT_General_4L_312D')

train_news = pd.read_csv(train_data_path + 'news.tsv', sep='\t', header=None)
valid_news = pd.read_csv(valid_data_path + 'news.tsv', sep='\t', header=None)

all_news_df = pd.concat([train_news, valid_news]).drop_duplicates(subset=[0])

raw_data = {
	row[0]: str(row[3]) + " " + tokenizer.sep_token + " " + str(row[4]) 
	for _, row in all_news_df.iterrows()
}

num_news = len(valid_news_id_map)
sorted_data = np.zeros((num_news, 64), dtype=np.int32)


for nid, idx in tqdm(valid_news_id_map.items(), desc="Global Tokenizing"):
	if nid == 'PADD': 
		continue

	text = raw_data.get(nid, "") 

	tokens = tokenizer.encode(
		text, 
		add_special_tokens=True,
		max_length=64,
		padding='max_length',
		truncation=True
	)
	sorted_data[idx] = tokens

np.save(valid_data_path + 'news_token.npy', sorted_data)

  from .autonotebook import tqdm as notebook_tqdm
Global Tokenizing: 100%|██████████| 65239/65239 [01:32<00:00, 708.44it/s]


In [371]:
def generate_global_bert_features(token_path, model_name, save_path, device='cuda'):
    tokens = np.load(token_path) # [Total_News, 64]
    model = BertModel.from_pretrained(model_name).to(device).half().eval() # Dùng FP16
    
    num_news = len(tokens)
    batch_size = 32
    all_features = []
    
    with torch.no_grad():
        for i in tqdm(range(0, num_news, batch_size)):
            batch_tokens = torch.from_numpy(tokens[i : i + batch_size]).to(device).long()
            mask = (batch_tokens != 0).to(device) 
            
            outputs = model(batch_tokens, attention_mask=mask)
            
            all_features.append(outputs.last_hidden_state.cpu().half())

    full_features = torch.cat(all_features, dim=0)
    torch.save(full_features, save_path)
generate_global_bert_features(
    valid_data_path + 'news_token.npy', 
    'huawei-noah/TinyBERT_General_4L_312D', 
    valid_data_path + 'bert_features.pt'
)

100%|██████████| 2039/2039 [00:24<00:00, 83.80it/s]


In [375]:
def build_behaviors_valid(behaviors_path, user_id_map, news_id_map, save_path, max_hist=32):
    # Đọc đủ các cột cần thiết
    df = pd.read_csv(behaviors_path, sep='\t', header=None, usecols=[1, 3, 4])
    df.columns = ['user_id', 'history', 'impressions']
    
    # Map User sang ID số
    user_indices = df['user_id'].map(user_id_map).fillna(0).astype(np.int32).values

    def process_history(h_str):
        if pd.isna(h_str) or h_str == '':
            return np.zeros(max_hist, dtype=np.int32)
        h_list = [news_id_map.get(nid, 0) for nid in h_str.split()]
        if len(h_list) >= max_hist:
            return np.array(h_list[-max_hist:], dtype=np.int32)
        return np.pad(h_list, (max_hist - len(h_list), 0), 'constant')
    
    histories = np.array([process_history(h) for h in df['history']], dtype=np.int32)

    converted_data = []

    for u_idx, hist, imp in tqdm(zip(user_indices, histories, df['impressions']), total=len(df)):
        candidate_ids = []
        labels = []
        
        for item in imp.split():
            nid, label = item.split('-')
            n_idx = news_id_map.get(nid, 0)
            
            candidate_ids.append(n_idx)
            labels.append(int(label)) 
        
        converted_data.append({
            'user_idx': u_idx,
            'history': hist,
            'candidate_ids': np.array(candidate_ids, dtype=np.int32),
            'labels': np.array(labels, dtype=np.int32) 
        })
    
    with open(save_path, 'wb') as f:
        pickle.dump(converted_data, f)
    print(f"✅ Đã build xong {len(converted_data)} impressions cho tập Validation!")

In [379]:
with open(valid_data_path + 'user_id_map.pkl', 'rb') as f:
    valid_user_id_map = pickle.load(f)
build_behaviors_valid(valid_data_path + 'behaviors.tsv', 
                valid_user_id_map,
                valid_news_id_map,
                valid_data_path + 'behaviors.pkl',
                32)


100%|██████████| 73152/73152 [00:01<00:00, 40169.65it/s]


✅ Đã build xong 73152 impressions cho tập Validation!


In [380]:
with open(valid_data_path + 'behaviors.pkl', 'rb') as f:
    valid_behaviors = pickle.load(f)

In [None]:
valid_behaviors[0]

{'user_idx': np.int32(66995),
 'history': array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,  6893,
        30444, 14246, 29199, 28699, 16461,  1610, 27049, 31726, 27554,
        11723, 23872, 27081,  1571, 32954], dtype=int32),
 'candidate_ids': array([59482, 44282, 43150, 38866, 31176, 55979, 34830, 56143, 32895,
        56804, 36789, 37784, 34901, 30051, 39289, 38491, 62551, 37184,
         6444, 61507, 52774, 21739], dtype=int32),
 'labels': array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       dtype=int32)}