In [1]:
import pandas as pd
import numpy as np
import os
from typing import Tuple

In [23]:
# Directory holding the CSV inputs, resolved against the current working directory.
DATA_PATH = os.path.abspath(os.path.join(os.curdir, 'Data'))

# Training interactions (columns: UserID, ItemID).
train_df = pd.read_csv(os.path.join(DATA_PATH, 'Train.csv'), index_col=False)

# The two test sets (columns: UserID, Item1, Item2).
test_rand_df = pd.read_csv(os.path.join(DATA_PATH, 'RandomTest.csv'))
test_pop_df = pd.read_csv(os.path.join(DATA_PATH, 'PopularityTest.csv'))

In [3]:
# (rows, columns) of the training interactions.
train_df.shape

(988129, 2)

In [24]:
# Peek at five randomly chosen training rows.
train_df.sample(5)

Unnamed: 0,UserID,ItemID
626353,3800,2764
539169,5592,1485
332904,5763,3564
931665,6011,583
792694,4712,2455


In [25]:
# NOTE(review): `validation_df` is only created further down, by the call to
# get_train_random_validation_data, so this cell fails on a fresh top-to-bottom run.
validation_df

Unnamed: 0,UserID,Item1,Item2
27,1963,2241,3283
159,5951,1939,161
216,1254,507,1444
242,2738,2793,1967
361,3527,557,1621
...,...,...,...
986635,4410,1878,2142
987605,2766,517,3490
987652,2380,1438,1078
987717,2465,1506,423


In [19]:
# Peek at five random rows of the test set loaded from PopularityTest.csv.
test_pop_df.sample(5)

Unnamed: 0,UserID,Item1,Item2
5615,4106,127,3354
2783,5699,1896,3373
1751,794,3238,2796
2672,1859,1807,1658
4130,2923,2183,1743


In [20]:
# Peek at five random rows of the test set loaded from RandomTest.csv.
test_rand_df.sample(5)

Unnamed: 0,UserID,Item1,Item2
2324,3038,406,2790
5354,4644,3627,1017
178,4026,3435,3145
4935,4321,1461,1710
4952,2061,762,3009


In [4]:
# Report how many distinct user and item IDs occur in the training data.
print(
    f'Number of Users: {train_df.UserID.nunique(dropna=False)}; '
    f'Number of Items: {train_df.ItemID.nunique(dropna=False)}'
)
print('One-Based (not starts from zero)')

Number of Users: 6040; Number of Items: 3705
One-Based (not starts from zero)


In [5]:
# UserIDs must be exactly 1..N with no gaps, where N is the number of distinct users.
assert sorted(train_df.UserID.unique().tolist()) == list(
    range(1, train_df.UserID.nunique(dropna=False) + 1)
)

ItemIDs are not contiguous: item 1750 is missing (3705 distinct items, but the largest ItemID is 3706).

In [6]:
# Largest UserID and largest ItemID present in the training data.
train_df.UserID.max(), train_df.ItemID.max()

(6040, 3706)

In [7]:
def get_num_users_items(df):
    """
    Return the dimensions needed to index a matrix directly by UserID / ItemID.

    :param df: frame with ``UserID`` and ``ItemID`` columns
    :return: ``(max UserID + 1, max ItemID + 1)``
    """
    user_dim = df.UserID.max() + 1
    item_dim = df.ItemID.max() + 1
    return user_dim, item_dim

In [8]:
# Largest IDs seen in the training data. These are maximum IDs, not counts:
# they coincide with the number of distinct IDs only when the IDs have no gaps.
NUM_USERS = train_df.UserID.max()
NUM_ITEMS = train_df.ItemID.max()

### UserID

In [9]:
# Interactions per user, most active first (missing IDs, if any, are counted as well).
train_df.UserID.value_counts(dropna=False, sort=True)

3772    2312
2736    1848
5719    1741
2741    1593
4084    1519
        ... 
3542      18
1063      18
1128      18
4046      18
4183      18
Name: UserID, Length: 6040, dtype: int64

### ItemID

In [10]:
# Interactions per item, most popular first (missing IDs, if any, are counted as well).
train_df.ItemID.value_counts(dropna=False, sort=True)

406     3342
814     2938
127     2922
405     2821
3523    2616
        ... 
984        1
430        1
1497       1
2523       1
1382       1
Name: ItemID, Length: 3705, dtype: int64

### Create validation set - Negative sampling

#### Random sampling

In [11]:
def get_train_random_validation_data(train_df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split the interactions into a training set and a random validation set.

    For each user exactly one interaction is held out for validation and paired with a
    randomly drawn "negative" item: an item ID in 1..max(ItemID) that the user has no
    interaction with in ``train_df``.
    The item the user liked is in the first item column ("Item1"), the negative one in "Item2".

    Randomness comes from NumPy's global random state, so call ``np.random.seed`` beforehand
    for a reproducible split.

    :param train_df: interactions with the columns ``UserID`` and ``ItemID`` and unique index
        labels; the frame passed in is left unmodified
    :return: ``(remaining_train_df, validation_df)`` - the interactions that were not held out,
        re-indexed 0..n-1, and one ``UserID, Item1, Item2`` row per user that keeps the index
        label of the held-out interaction
    :raises ValueError: if a user has interacted with every candidate item, so that no negative
        item can be drawn
    """
    validation_columns = ['UserID', 'Item1', 'Item2']
    if train_df.empty:
        return train_df.reset_index(drop=True), pd.DataFrame(columns=validation_columns)

    # Candidate negatives are all IDs in 1..max(ItemID), *including* the largest ID itself.
    all_item_ids = np.arange(1, train_df.ItemID.max() + 1)

    validation_indices = []
    negative_items = []
    # One groupby pass replaces a full-frame query per user; sort=False keeps the users in
    # order of first appearance.
    for user_id, user_data in train_df.groupby('UserID', sort=False):
        candidates = np.setdiff1d(all_item_ids, user_data.ItemID.to_numpy())
        if candidates.size == 0:
            raise ValueError(
                f'User {user_id} has interacted with every item; cannot sample a negative item'
            )
        validation_indices.append(user_data.sample(1).index[0])
        negative_items.append(np.random.choice(candidates))

    # Build the validation frame in one go instead of appending row by row
    # (DataFrame.append is quadratic in a loop and no longer exists in pandas 2.x).
    validation_df = (
        train_df.loc[validation_indices, ['UserID', 'ItemID']]
        .rename(columns={'ItemID': 'Item1'})
        .assign(Item2=negative_items)
    )

    # drop=True: do not keep the old row labels as an extra "index" column, which would
    # otherwise pile up (and eventually raise) when the split is re-run on its own output.
    remaining_train_df = train_df.drop(index=validation_indices).reset_index(drop=True)
    return remaining_train_df, validation_df

In [12]:
# Hold out one liked item per user for validation; `train_df` is rebound to the rows that remain.
train_df, validation_df = get_train_random_validation_data(train_df)

In [13]:
def create_user_items_preferences_matrix(train_df: pd.DataFrame) -> np.ndarray:
    """
    Create a dense numpy array of preferences (if user liked item - value of "1", otherwise "0").

    Rows and columns are addressed directly by UserID and ItemID, so the shape is
    ``(max(UserID) + 1, max(ItemID) + 1)`` - the same dimensions ``get_num_users_items``
    yields. With one-based IDs, row 0 and column 0 therefore stay all-zero.

    :param train_df: interactions with integer ``UserID`` and ``ItemID`` columns
    :return: float array of zeros and ones; a ``(0, 0)`` array when ``train_df`` has no rows
    """
    if train_df.empty:
        # max() of an empty column is NaN, which cannot be used as an array dimension
        return np.zeros((0, 0))

    user_ids = train_df.UserID.to_numpy()
    item_ids = train_df.ItemID.to_numpy()
    preferences_matrix = np.zeros((user_ids.max() + 1, item_ids.max() + 1))
    # One vectorised assignment instead of one full-frame query per user.
    preferences_matrix[user_ids, item_ids] = 1
    return preferences_matrix

In [14]:
# Dense user x item 0/1 matrix built from the current `train_df`.
train_preferences_matrix = create_user_items_preferences_matrix(train_df)

In [15]:
# Total number of ones in the matrix, i.e. distinct (user, item) pairs marked as liked.
# Uses the matrix built above; `preferences_matrix` only exists inside the builder function.
train_preferences_matrix.sum()

982089.0

In [17]:
# (rows, columns) of the matrix built above; `preferences_matrix` only exists inside the
# builder function, so the notebook-level name is `train_preferences_matrix`.
train_preferences_matrix.shape

(6041, 3707)