How it works:
- Execute run.py to get the initial train, validation, and test splits (saved in the train_val_test folder).
- Move the resulting files to the folder train_val_test_before_remove_train.
- This notebook loads those splits and preprocesses them further.
- We save the further-preprocessed datasets as "benchmark files" for input to RecBole. These are the final datasets that we use for training, validation, and testing.
- Execute run_new.py.

In [1]:
import pandas as pd
import pickle
import os

In [2]:
def stat_df(df, dataset_name):
    """Print and return basic interaction statistics for one dataset.

    The first column of ``df`` is treated as the user identifier and the
    second column as the item identifier, whatever the columns are named.

    Args:
        df: Interaction table with at least two columns (user, item).
        dataset_name: Key under which the statistics are returned.

    Returns:
        ``{dataset_name: [num_user, num_item, num_inter, sparsity]}`` where
        ``sparsity = 1 - num_inter / (num_user * num_item)``. For a table
        without users or items the sparsity is undefined and reported as
        ``nan`` instead of raising ``ZeroDivisionError``.
    """
    print("Getting stats from these columns: ", df.columns[0:2])
    num_user = df.iloc[:, 0].unique().shape[0]
    num_item = df.iloc[:, 1].unique().shape[0]
    num_inter = len(df)

    # Size of the full user x item matrix; zero only for an empty table.
    num_cells = num_user * num_item
    sparsity = 1 - num_inter / num_cells if num_cells else float("nan")

    print("Statistics: ")
    print(f"Number of users: {num_user}")
    print(f"Number of items: {num_item}")
    print(f"Number of interactions: {num_inter}")
    print(f"Sparsity: {sparsity}")

    return {dataset_name: [num_user, num_item, num_inter, sparsity]}

In [5]:
def load_data(dataset_name):
    """Read the raw interaction file of ``dataset_name`` into a DataFrame.

    The file is expected at ``../dataset/<name>/<name>.inter`` (relative to
    the current working directory) and is parsed as tab-separated text.
    """
    print(f"Loading {dataset_name}")
    inter_path = f"../dataset/{dataset_name}/{dataset_name}.inter"
    return pd.read_csv(inter_path, sep="\t")

def load_preprocessed_data(dataset, path="train_val_test"):
    """Load the pickled train/valid/test splits of ``dataset`` as DataFrames.

    Files are read from ``../<path>/<dataset>_{train,valid,test}.pickle``
    relative to the current working directory.

    Returns:
        A ``(train, val, test)`` tuple of DataFrames.
    """

    def _unpickle_frame(pickle_path):
        # NOTE: pickle can execute arbitrary code while loading; only point
        # this at split files produced by this project.
        with open(pickle_path, "rb") as handle:
            return pd.DataFrame(pickle.load(handle))

    train = _unpickle_frame(f"../{path}/{dataset}_train.pickle")
    val = _unpickle_frame(f"../{path}/{dataset}_valid.pickle")
    test = _unpickle_frame(f"../{path}/{dataset}_test.pickle")

    return train, val, test

def concat(train, val, test):
    """Stack the three splits row-wise, in train/val/test order.

    Each split keeps its original index labels in the result.
    """
    splits = [train, val, test]
    return pd.concat(splits)

# Further preprocess data

In [4]:
# Names of the datasets to preprocess.
list_dataset = [
    "Amazon-lb",
    "Lastfm",
    "ML-10M",
    "QK-video",
]

In [5]:
def further_preprocess(dataset_name, min_interactions=5):
    """Drop users with too few training interactions from all three splits.

    The splits are loaded from ``train_val_test_before_remove_train``. Users
    with fewer than ``min_interactions`` rows in the train split are removed
    from train, and the same users are removed from validation and test.

    Args:
        dataset_name: Dataset whose pickled splits are loaded.
        min_interactions: Minimum number of train rows a user must have to
            be kept (default 5).

    Returns:
        A ``(new_train, new_val, new_test)`` tuple of filtered DataFrames.
    """
    train, val, test = load_preprocessed_data(
        dataset_name, "train_val_test_before_remove_train"
    )

    # size() counts rows per user. count() would count non-null cells of
    # whichever column comes first, which under-counts when that column has
    # missing values and fails when user_id is the only column.
    train_counts = train.groupby("user_id").size()
    kept_users = train_counts[train_counts >= min_interactions].index

    new_train = train[train.user_id.isin(kept_users)]

    # Completely remove the dropped users from val and test as well.
    new_val = val[val.user_id.isin(kept_users)]
    new_test = test[test.user_id.isin(kept_users)]

    # Sanity checks: every val/test user is still in train ...
    assert new_val.user_id.isin(kept_users).all()
    assert new_test.user_id.isin(kept_users).all()

    # ... and every remaining train user meets the threshold.
    assert (new_train.groupby("user_id").size() >= min_interactions).all()

    return new_train, new_val, new_test

In [6]:
def convert_df_to_inter(df: pd.DataFrame, col_name_dict: dict, file_name: str, dataset_name: str) -> pd.DataFrame:
    """Rename columns and write the frame as a tab-separated ``.inter`` file.

    Args:
        df: Frame to export; it is not modified.
        col_name_dict: Mapping from current column names to the names to
            write (columns missing from ``df`` are ignored by ``rename``).
        file_name: Name of the output file.
        dataset_name: Used to build the output folder
            ``../preproc_data/new_<dataset_name>/``, which is created when
            missing.

    Returns:
        The renamed copy of ``df`` that was written to disk.
    """
    # rename() without inplace returns a new frame, so the caller's df stays
    # untouched without a separate copy().
    inter = df.rename(columns=col_name_dict)

    path = f"../preproc_data/new_{dataset_name}/"

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(path, exist_ok=True)
    inter.to_csv(path + file_name, index=False, sep="\t")
    return inter

def create_file(dataset_name):
    """Filter the splits of ``dataset_name`` and export them as ``.inter`` files.

    The train, validation, and test frames returned by
    ``further_preprocess`` are each written through ``convert_df_to_inter``
    with their columns renamed to the typed ``name:type`` form.
    """
    train, val, test = further_preprocess(dataset_name)

    # Plain column name -> typed column header written to the .inter file.
    col_name_dict = {
        "user_id": "user_id:token",
        "item_id": "item_id:token",
        "artist_id": "artist_id:token",
        "label": "label:float",
        "timestamp": "timestamp:float",
    }

    # One output file per split, written in train / valid / test order.
    exports = (
        (train, f"new_{dataset_name}.train.inter"),
        (val, f"new_{dataset_name}.valid.inter"),
        (test, f"new_{dataset_name}.test.inter"),
    )
    for split_frame, file_name in exports:
        convert_df_to_inter(split_frame, col_name_dict, file_name, dataset_name)

The cell below is commented out to avoid running it accidentally.

In [7]:
# for data in list_dataset:
#     create_file(data)

# Dataset Stats
Count the number of users, items, and interactions.

Run this section only after all the steps listed at the beginning of the notebook have been completed (including run_new.py).

In [6]:
# Names of the datasets to compute statistics for.
list_dataset = [
    "Amazon-lb",
    "Lastfm",
    "ML-10M",
    "QK-video",
]

In [None]:
# Statistics keyed by dataset name; each value is
# [num_user, num_item, num_inter, sparsity] as returned by stat_df.
result = {}                 # raw .inter files
old_preproc_result = {}     # splits before removing low-activity train users
preproc_result = {}         # final ("new_") splits
train_val_test_result = {}  # split position (0=train, 1=val, 2=test) -> stats
test_df = {}                # final test split per dataset

for data in list_dataset:
    # Raw dataset.
    df = load_data(data)
    stat_data = stat_df(df, data)
    result.update(stat_data)

    # Old splits, concatenated back into one table.
    old_train, old_val, old_test = load_preprocessed_data(
        data, "train_val_test_before_remove_train"
    )
    old_preproc_data = concat(old_train, old_val, old_test)
    old_preproc_stat_data = stat_df(old_preproc_data, data)
    old_preproc_result.update(old_preproc_stat_data)

    # New splits, concatenated back into one table.
    train, val, test = load_preprocessed_data("new_" + data, "train_val_test")
    preproc_data = concat(train, val, test)
    preproc_stat_data = stat_df(preproc_data, "new_" + data)
    preproc_result.update(preproc_stat_data)
    test_df[data] = test

    # Per-split statistics, old and new side by side.
    old_splits = [old_train, old_val, old_test]
    new_splits = [train, val, test]
    for i, (old_split, new_split) in enumerate(zip(old_splits, new_splits)):
        old_stat = stat_df(old_split, data)
        new_stat = stat_df(new_split, "new_" + data)
        # Create the per-split dict on first use, then merge both entries.
        split_stats = train_val_test_result.setdefault(i, {})
        split_stats.update(old_stat)
        split_stats.update(new_stat)


In [None]:
# One row per dataset, one column per statistic returned by stat_df.
df_result = pd.DataFrame(result).transpose().set_axis(
    ["num_user", "num_item", "num_inter", "sparsity"], axis="columns"
)
df_result

In [10]:
# to_excel does not create missing folders, so make sure "stats" exists first.
os.makedirs("stats", exist_ok=True)
# Export the raw-dataset statistics ordered by number of interactions.
df_result.sort_values("num_inter").to_excel("stats/dataset_statistics.xlsx")

In [11]:
# Dataset names ordered by ascending number of interactions.
sorted_index = df_result.sort_values("num_inter").index

In [None]:
# NOTE: this rebinds preproc_result from the dict filled in the stats loop to
# a DataFrame, so re-run the stats loop before re-running this cell.
preproc_result = pd.DataFrame(preproc_result).T
preproc_result.columns = ["num_user", "num_item", "num_inter", "sparsity"]

# to_excel does not create missing folders, so make sure "stats" exists first.
os.makedirs("stats", exist_ok=True)
# Rows are keyed "new_<dataset>"; order them by sorted_index.
preproc_result.loc["new_"+sorted_index].to_excel("stats/new_dataset_statistics_preprocessed.xlsx")