In [1]:
import pandas as pd
import numpy as np
from typing import Union

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
file_nm = '/content/drive/MyDrive/likes_data/train'

In [5]:
likes = []

with open(file_nm , 'r') as f:
    for user_ids, line in enumerate(f.readlines()):
        track_ids = [int(n) for n in line.split()]
        rank = [int(n) for n in range(len(track_ids))]
        user_likes = np.zeros((len(track_ids), 3), dtype=int)
        user_likes[:, 0] = user_ids
        user_likes[:, 1] = track_ids[::-1]
        user_likes[:, 2] = rank[::-1]
        likes.append(user_likes)
all_data = np.vstack(likes)

In [6]:
data = pd.DataFrame(all_data, columns=['user', 'item', 'order'])

In [7]:
data

Unnamed: 0,user,item,order
0,0,388242,53
1,0,278503,52
2,0,102795,51
3,0,470957,50
4,0,159637,49
...,...,...,...
94188629,1160083,19120,4
94188630,1160083,326821,3
94188631,1160083,214132,2
94188632,1160083,352098,1


In [8]:
class UsersKFoldPOut():
    def __init__(self, n_folds, p, random_seed=23, user_column='user', order='order'):
        self.n_folds = n_folds
        self.p = p
        self.random_seed = random_seed
        self.user_column = user_column
        self.order_column = order
        
    
    def split(self, df):
        users = df[self.user_column].unique()
        users_count = len(users)
        
        np.random.seed(self.random_seed)
        np.random.shuffle(users)
        
        fold_sizes = np.full(self.n_folds, users_count // self.n_folds, dtype=int)
        fold_sizes[: users_count % self.n_folds] += 1
        current = 0

        for fold_size in fold_sizes:
            start, stop = current, current + fold_size
            test_fold_users = users[start:stop]
            test_mask = df[self.user_column].isin(test_fold_users) & (df[self.order_column] < self.p)
            train_mask = ~df[self.user_column].isin(test_fold_users)
            
            yield train_mask, test_mask

In [9]:
p = 5
n_folds = 3
cv = UsersKFoldPOut(n_folds=n_folds, p=p)

for i, (train_mask, test_mask) in enumerate(cv.split(data)):
    train = data[train_mask]
    test = data[test_mask]
    print(f'Fold#{i} | Train: {train.shape[0]}, Test: {test.shape[0]}')

Fold#0 | Train: 62769950, Test: 1933475
Fold#1 | Train: 62769950, Test: 1933475
Fold#2 | Train: 62770016, Test: 1933470


In [10]:
intersect = set(train['user'].unique()).intersection(test['user'].unique())
intersect

set()