### test/train split generator + preprocessing for the recipes dataset

author: Thibaut Van Goethem

In [2]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import KFold

In [3]:
# Load the raw user-recipe interactions and keep a single row per
# (user_id, recipe_id) pair.
# drop_duplicates() and reset_index() both return a NEW frame, so the result has
# to be assigned back; calling them as bare statements leaves `df` untouched.
# reset_index(drop=True) renumbers the surviving rows without adding an
# extra "index" column.
df = pd.read_csv('./data/RAW_interactions.csv')
df = df.drop_duplicates(subset=['user_id', 'recipe_id']).reset_index(drop=True)
print("amount of interactions in the full dataset: ",len(df))
print("amount of recipes in the full dataset: ",len(df.recipe_id.unique()))
print("amount of users in the full dataset: ",len(df.user_id.unique()))

amount of interactions in the full dataset:  1132367
amount of recipes in the full dataset:  231637
amount of users in the full dataset:  226570


In [24]:
# Randomly keep 30% of the recipes (i.e. remove 70%) and restrict `df` to the
# interactions of the kept recipes.
# A dedicated, seeded generator makes the sampled subset identical on every
# Restart & Run All instead of changing with each execution.
SUBSAMPLE_SEED = 42
KEEP_FRACTION = 0.3

all_recipes = df.recipe_id.unique()
print(len(all_recipes))
rng = np.random.default_rng(SUBSAMPLE_SEED)
# permutation() returns a shuffled copy, so the array of unique ids is not mutated.
recipes = rng.permutation(all_recipes)[:int(len(all_recipes) * KEEP_FRACTION)]
df = df[df["recipe_id"].isin(recipes)]
print(len(df.recipe_id.unique()))

231637
69491
69491


In [4]:
# Iteratively drop users and recipes with too few interactions.
# A user is dropped when it has fewer than MIN_USER_INTERACTIONS rows and a
# recipe when it has fewer than MIN_ITEM_INTERACTIONS rows (both 3, i.e. the
# original `count <= 2` test).
# NOTE(review): an earlier comment mentioned a user threshold of 4, but the code
# has always used 3 for both; confirm which value is intended.
# Removing recipes can push a user below the threshold (and vice versa), so the
# two filters are repeated until a full pass removes no rows.
MIN_USER_INTERACTIONS = 3
MIN_ITEM_INTERACTIONS = 3

rows_after = 0
rows_before = len(df)
while rows_before != rows_after:
    rows_before = len(df)

    # assign() builds a new frame, which avoids SettingWithCopyWarning when `df`
    # is itself a filtered slice of an earlier frame.
    df = df.assign(count_user=df.groupby('user_id')['user_id'].transform('size'))
    df = df[~(df['count_user'] < MIN_USER_INTERACTIONS)]

    # Recipe counts are computed after the user filter so they reflect the
    # rows that are still present.
    df = df.assign(count_item=df.groupby('recipe_id')['recipe_id'].transform('size'))
    df = df[~(df['count_item'] < MIN_ITEM_INTERACTIONS)]

    df = df.reset_index(drop=True)
    rows_after = len(df)

In [5]:
# Report the size of the current `df`: number of interaction rows, distinct
# recipes and distinct users.
# NOTE(review): the messages say "full dataset", but `df` may already have been
# reduced by earlier cells.
interaction_total = len(df)
recipe_total = len(df["recipe_id"].unique())
user_total = len(df["user_id"].unique())

print("amount of interactions in the full dataset: ", interaction_total)
print("amount of recipes in the full dataset: ", recipe_total)
print("amount of users in the full dataset: ", user_total)

amount of interactions in the full dataset:  733951
amount of recipes in the full dataset:  80511
amount of users in the full dataset:  32635


In [21]:
# Split the users into k disjoint groups and write k train/validate/test
# splits to ./smallfolds/fold_<i>/.
#
# For split i:
#   * test     = fold i
#   * validate = folds (i+1) % k and (i+2) % k  (wrapping around)
#   * train    = the remaining k-3 folds
# Folds are built from users, so all interactions of a given user end up in
# exactly one of train / validate / test within a split.
# Requires k >= 4, otherwise no fold is left for training.
FOLD_SEED = 42
OUT_DIR = "smallfolds"

user_list = df['user_id'].unique()

k = 10
# A fixed random_state makes the user-to-fold assignment reproducible.
kf = KFold(n_splits=k, shuffle=True, random_state=FOLD_SEED)

# One frame per fold, holding every interaction of the users in that fold.
folds = [
    df[df['user_id'].isin(user_list[test_index])]
    for _, test_index in kf.split(user_list)
]

for i in range(len(folds)):
    test = i
    # Modulo k (not a hard-coded 10) so the wrap-around stays correct if k changes.
    validate1 = (i + 1) % k
    validate2 = (i + 2) % k
    held_out = {test, validate1, validate2}

    # exist_ok=True lets the cell be re-run without raising FileExistsError.
    fold_dir = os.path.join(OUT_DIR, "fold_%s" % str(i))
    os.makedirs(fold_dir, exist_ok=True)

    folds[test].to_csv(os.path.join(fold_dir, "test.csv"), index=False)

    validate = pd.concat([folds[validate1], folds[validate2]])
    validate.to_csv(os.path.join(fold_dir, "validate.csv"), index=False)

    train = pd.concat([folds[j] for j in range(len(folds)) if j not in held_out])
    train.to_csv(os.path.join(fold_dir, "train.csv"), index=False)

In [22]:
# Read back the validation split of fold 0 and (re)compute, within that split
# only, how many rows each recipe and each user has.
recipes = pd.read_csv('./smallfolds/fold_0/validate.csv')
recipes = recipes.assign(
    count_item=lambda frame: frame.groupby('recipe_id')['recipe_id'].transform('size'),
    count_user=lambda frame: frame.groupby('user_id')['user_id'].transform('size'),
)