In [1]:
import numpy as np
import csv
import pandas as pd
from sklearn.model_selection import train_test_split

# Data Load

In [2]:
# Load Data
user_data = pd.read_csv('./ml-1m/users.dat',sep='::', header=None, 
names=['userId', 'gender' ,'age','occupation', 'zipcode'], engine='python')

movie_data = pd.read_csv('./ml-1m/movies.dat', sep='::',
names=['movieId','title','category'], encoding='latin-1',engine='python')

ratings=pd.read_csv('ml-1m/ratings.dat',sep='::', names=['userId','movieId','rating','timestamps'],engine='python')

In [3]:
# For User

user_data.drop(columns = ['occupation', 'zipcode'], inplace=True)

user_data['userId'] = user_data['userId'].astype('category').cat.codes
user_data['gender'] = user_data['gender'].astype('category').cat.codes
user_data['age'] = user_data['age'].astype('category').cat.codes

print(user_data)

      userId  gender  age
0          0       0    0
1          1       1    6
2          2       1    2
3          3       1    4
4          4       1    2
...      ...     ...  ...
6035    6035       0    2
6036    6036       0    4
6037    6037       0    6
6038    6038       0    4
6039    6039       1    2

[6040 rows x 3 columns]


In [4]:
# For Movie

movie_data.drop(columns = ['title'], inplace=True)
movie_data['movieId'] = movie_data['movieId'].astype('category').cat.codes
movie_data['category'] = movie_data['category'].astype('category').cat.codes

print(movie_data)

      movieId  category
0           0       145
1           1       115
2           2       207
3           3       185
4           4       176
...       ...       ...
3878     3878       176
3879     3879       239
3880     3880       239
3881     3881       239
3882     3882       260

[3883 rows x 2 columns]


In [5]:
# For Rating
ratings['userId'] = ratings['userId'] -1
ratings['movieId'] = ratings['movieId'] -1

ratings=ratings.join(movie_data.set_index('movieId'),on='movieId')
ratings=ratings.join(user_data.set_index('userId'), on='userId')

ratings.loc[ratings['rating'] <= 3, 'rating'] = 0
ratings.loc[ratings['rating'] > 3, 'rating'] = 1

print(ratings[:10])

   userId  movieId  rating  timestamps  category  gender  age
0       0     1192       1   978300760      30.0       0    0
1       0      660       0   978302109     239.0       0    0
2       0      913       0   978301968     136.0       0    0
3       0     3407       1   978300275     276.0       0    0
4       0     2354       1   978824291     176.0       0    0
5       0     1196       0   978302268      77.0       0    0
6       0     1286       1   978302039     258.0       0    0
7       0     2803       1   978300719      65.0       0    0
8       0      593       1   978302268     207.0       0    0
9       0      918       1   978301368     270.0       0    0


In [6]:
# Seperate train\val\test set.

train_set, val_set, test_set = [], [], []

ratings.sort_values(by=['userId', 'timestamps'], inplace=True)
# ratings.drop(columns = 'timestamps', inplace = True)

for user_id in ratings['userId'].unique():
    user_datas = ratings[ratings['userId'] == user_id]
    # Nothing Change : (
    # train, temp = train_test_split(user_data, train_size=0.8, random_state=10)
    # val, test = train_test_split(temp, train_size=0.5, random_state=10)
    
    total_records = len(user_datas)
    train_end = int(total_records * 0.8)
    val_end = int(total_records * 0.9)
    train_set.append(user_datas[:train_end])
    val_set.append(user_datas[train_end:val_end])
    test_set.append(user_datas[val_end:])
    
train_set = pd.concat(train_set)
val_set = pd.concat(val_set)
test_set = pd.concat(test_set)
print(len(train_set), len(test_set), len(val_set))

train_set = train_set.dropna()
val_set = val_set.dropna()
test_set = test_set.dropna()

print(len(train_set), len(test_set), len(val_set))



797758 102759 99692
791662 100931 98791


In [7]:
# Save Data

train_set.to_csv('./data/train_set.csv', index=False)  
val_set.to_csv('./data/val_set.csv', index=False)  
test_set.to_csv('./data/test_set.csv', index=False)  
user_data.to_csv('./data/user_features.csv', index=False)  
movie_data.to_csv('./data/item_features.csv', index=False)  