In [None]:
import os

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import Dataset

# Data Load

In [None]:
# Load Data
def _read_ml1m_table(path, column_names, **extra_options):
    """Read one '::'-delimited .dat file that has no header row.

    The multi-character separator requires the python parsing engine.
    Supplying `names` already implies `header=None`; it is spelled out here
    so all three tables are read the same way.
    """
    return pd.read_csv(path, sep='::', header=None, names=column_names,
                       engine='python', **extra_options)


user_data = _read_ml1m_table(
    './ml-1m/users.dat',
    ['userId', 'gender' ,'age','occupation', 'zipcode'],
)

movie_data = _read_ml1m_table(
    './ml-1m/movies.dat',
    ['movieId','title','category'],
    encoding='latin-1',
)

ratings = _read_ml1m_table(
    'ml-1m/ratings.dat',
    ['userId','movieId','rating','timestamps'],
)

In [None]:
# For User
# Keep only the id and the two demographic attributes, then replace each
# remaining column by its integer category code.
user_data = user_data.drop(columns=['occupation', 'zipcode'])

for user_column in ('userId', 'gender', 'age'):
    user_data[user_column] = user_data[user_column].astype('category').cat.codes

print(user_data)

In [None]:
# For Movie
# Drop the free-text title; only the id and the genre string are kept.
movie_data = movie_data.drop(columns=['title'])

# Shift the raw id to be zero-based, exactly as the ratings table does
# (`ratings['movieId'] - 1`). Dense category codes must NOT be used for the
# id: they only coincide with `id - 1` when the raw ids have no gaps, and any
# gap makes the later join on 'movieId' attach the wrong category (or NaN).
movie_data['movieId'] = movie_data['movieId'] - 1

# The genre string is an opaque label, so dense integer codes are fine here.
movie_data['category'] = movie_data['category'].astype('category').cat.codes

print(movie_data)

In [None]:
# For Rating
# Make both id columns zero-based.
for id_column in ('userId', 'movieId'):
    ratings[id_column] = ratings[id_column] - 1

# Attach the movie attributes first, then the user attributes; both are
# joins of the ratings rows against a lookup table keyed by the id column.
movie_lookup = movie_data.set_index('movieId')
user_lookup = user_data.set_index('userId')
ratings = ratings.join(movie_lookup, on='movieId').join(user_lookup, on='userId')

# Binarise the label: ratings of 3 or lower become 0, ratings above 3 become 1.
# Both masks are taken before any value is overwritten.
low_rating = ratings['rating'].le(3)
high_rating = ratings['rating'].gt(3)
ratings.loc[low_rating, 'rating'] = 0
ratings.loc[high_rating, 'rating'] = 1

print(ratings[:10])

In [None]:
# For feature preparation
# Every feature gets its own contiguous block of ids inside one shared index
# space, so each block must be wide enough for the largest id in its column.
# `max + 1` is used instead of the number of distinct values: a distinct count
# is too small when ids have gaps (blocks would overlap) and is inflated by
# one when the column contains NaN.
features_sizes = {
    name: int(ratings[name].max()) + 1
    for name in ('userId', 'movieId', 'category', 'gender', 'age')
}

# Start position of each feature's block, in the order listed above.
offset = 0
features_offsets = {}
for k, v in features_sizes.items():
    features_offsets[k] = offset
    # The original incremented an undefined `next_offset`, which raises
    # NameError on a fresh kernel; the running total lives in `offset`.
    offset += v

In [None]:
feature_columns = ['userId','movieId','category', 'gender', 'age']

# Shift every feature column into its own id range with one vectorised add
# per column (no per-element lambda needed).
# NOTE: this cell is not idempotent - running it twice adds the offsets twice.
for column in feature_columns:
    ratings[column] = ratings[column] + features_offsets[column]

print(ratings.max()) # To check out the embedding size

In [None]:
# Separate train/val/test set.
# Per user, in chronological order: the first 80% of the rows go to train,
# the next 10% to validation and the remainder to test.

ratings = ratings.sort_values(by=['userId', 'timestamps'])

train_set, val_set, test_set = [], [], []

# groupby(sort=False) hands over each user's rows once, in the order fixed by
# the sort above, instead of re-scanning the whole frame for every user id.
for _, user_datas in ratings.groupby('userId', sort=False):
    total_records = len(user_datas)
    train_end = int(total_records * 0.8)
    val_end = int(total_records * 0.9)
    train_set.append(user_datas.iloc[:train_end])
    val_set.append(user_datas.iloc[train_end:val_end])
    test_set.append(user_datas.iloc[val_end:])

train_set = pd.concat(train_set)
val_set = pd.concat(val_set)
test_set = pd.concat(test_set)
print(len(train_set), len(test_set), len(val_set))

# Remove rows with missing values (e.g. rows for which an earlier join found
# no match) from every split; the second print shows how many rows survive.
train_set = train_set.dropna()
val_set = val_set.dropna()
test_set = test_set.dropna()

print(len(train_set), len(test_set), len(val_set))



In [None]:
# BPRLoss Data Preparation

def polarity_separation(dataset, neg_window=3, verbose=False):
    """Build (positive, negative) movie pairs per user for BPR-style training.

    For every user, each row with ``rating == 1`` is paired with every row
    with ``rating == 0`` found among that user's first ``neg_window`` rows,
    in the order the rows appear in ``dataset``. Negatives that appear later
    than the window are never used.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns 'userId', 'gender', 'age', 'movieId',
        'category' and 'rating'.
    neg_window : int, default 3
        Number of leading rows per user that are searched for negatives
        (capped at the user's row count). The default matches the previously
        hard-coded value.
    verbose : bool, default False
        If true, print one progress line per user.

    Returns
    -------
    pandas.DataFrame
        One row per (positive, negative) pair with the columns 'userId',
        'gender', 'age', 'movie1Id', 'category1' (positive item) and
        'movie2Id', 'category2' (negative item). Empty, but with the same
        columns, when no pair exists.
    """
    pair_columns = ['userId', 'gender', 'age',
                    'movie1Id', 'category1', 'movie2Id', 'category2']

    # Collect plain records and build the frame once at the end; growing a
    # DataFrame with pd.concat inside the loop copies it on every append.
    records = []

    for user_id, user_rows in dataset.groupby('userId'):
        labels = user_rows['rating'].tolist()
        movie_ids = user_rows['movieId'].to_numpy()
        categories = user_rows['category'].to_numpy()
        # User attributes are taken from the user's first row.
        gender = user_rows['gender'].iloc[0]
        age = user_rows['age'].iloc[0]

        # Positions of the negatives inside the leading window; this list is
        # the same for every positive of this user, so compute it once.
        window = min(neg_window, len(labels))
        negative_positions = [pos for pos in range(window) if labels[pos] == 0]

        if negative_positions:
            for pos_idx, label in enumerate(labels):
                if label != 1:
                    continue
                for neg_idx in negative_positions:
                    records.append({
                        'userId': user_id,
                        'gender': gender,
                        'age': age,
                        'movie1Id': movie_ids[pos_idx],
                        'category1': categories[pos_idx],
                        'movie2Id': movie_ids[neg_idx],
                        'category2': categories[neg_idx],
                    })

        if verbose:
            print("index = ", user_id)

    return pd.DataFrame(records, columns=pair_columns)

In [None]:
# Build the BPR pair tables for the train and validation splits, in that
# order. The equivalent call for the test split is currently disabled.
bpr_train_set, bpr_val_set = [
    polarity_separation(split) for split in (train_set, val_set)
]
# bpr_test_set = polarity_separation(test_set)

In [None]:
# Save Data
# to_csv does not create missing directories, so make sure the target exists
# before the first write.
os.makedirs('./data', exist_ok=True)

# One row per distinct user / movie with its (offset) feature ids.
user_unique_feature = ratings[['userId', 'gender','age']].drop_duplicates()
movie_unique_feature = ratings[['movieId', 'category']].drop_duplicates()

# Target path -> frame, written in the order listed. The BPR pairs for the
# test split are not produced, so there is no file for them.
csv_outputs = {
    './data/revised_bpr_sampled_val_set.csv': bpr_val_set,
    './data/revised_bpr_sampled_train_set.csv': bpr_train_set,
    './data/revised_user_features.csv': user_unique_feature,
    './data/revised_item_features.csv': movie_unique_feature,
    './data/revised_train_set.csv': train_set,
    './data/revised_val_set.csv': val_set,
    './data/revised_test_set.csv': test_set,
}
for csv_path, frame in csv_outputs.items():
    frame.to_csv(csv_path, index=False)