In [1]:
import numpy as np
import csv
import pandas as pd

# Data Load & Preprocessing

In [2]:
# Load Data
# Every file is read with a tab delimiter and quoting disabled; the cells
# below then split the text found in column 0 on "::" themselves.
_raw_line_options = dict(header=None, delimiter="\t", quoting=csv.QUOTE_NONE)

user_data = pd.read_csv('./ml-1m/users.dat', encoding='utf-8', **_raw_line_options)
# movies.dat is decoded as latin-1 rather than utf-8.
movie_data = pd.read_csv('./ml-1m/movies.dat', encoding='latin-1', **_raw_line_options)
rating_data = pd.read_csv('./ml-1m/ratings.dat', encoding='utf-8', **_raw_line_options)

In [3]:
# For User
# Split every raw line once on "::"; field 0 is the id, field 1 the gender
# and field 2 the age.
user_fields = [line.split("::") for line in user_data[0]]
user_id = [fields[0] for fields in user_fields]
user_age = [fields[2] for fields in user_fields]
user_gender = [fields[1] for fields in user_fields]

# One [id, age, gender] row per user.
user_simplified_data = list(map(list, zip(user_id, user_age, user_gender)))
print(user_simplified_data)

[['1', '1', 'F'], ['2', '56', 'M'], ['3', '25', 'M'], ['4', '45', 'M'], ['5', '25', 'M'], ['6', '50', 'F'], ['7', '35', 'M'], ['8', '25', 'M'], ['9', '25', 'M'], ['10', '35', 'F'], ['11', '25', 'F'], ['12', '25', 'M'], ['13', '45', 'M'], ['14', '35', 'M'], ['15', '25', 'M'], ['16', '35', 'F'], ['17', '50', 'M'], ['18', '18', 'F'], ['19', '1', 'M'], ['20', '25', 'M'], ['21', '18', 'M'], ['22', '18', 'M'], ['23', '35', 'M'], ['24', '25', 'F'], ['25', '18', 'M'], ['26', '25', 'M'], ['27', '25', 'M'], ['28', '25', 'F'], ['29', '35', 'M'], ['30', '35', 'F'], ['31', '56', 'M'], ['32', '25', 'F'], ['33', '45', 'M'], ['34', '18', 'F'], ['35', '45', 'M'], ['36', '25', 'M'], ['37', '25', 'F'], ['38', '18', 'F'], ['39', '18', 'M'], ['40', '45', 'M'], ['41', '18', 'F'], ['42', '25', 'M'], ['43', '25', 'M'], ['44', '45', 'M'], ['45', '45', 'F'], ['46', '18', 'M'], ['47', '18', 'M'], ['48', '25', 'M'], ['49', '18', 'M'], ['50', '25', 'F'], ['51', '1', 'F'], ['52', '18', 'M'], ['53', '25', 'M'], ['54

In [4]:
# For Movie
# Walk the raw lines once, keeping field 0 (movie id) and field 2 (the
# "|"-joined category string).
movie_id, movie_category = [], []
for movie_line in movie_data[0]:
    movie_fields = movie_line.split("::")
    movie_id.append(movie_fields[0])
    movie_category.append(movie_fields[2])

# One [id, category] row per movie.
movie_simplified_data = list(map(list, zip(movie_id, movie_category)))

print(movie_simplified_data)

[['1', "Animation|Children's|Comedy"], ['2', "Adventure|Children's|Fantasy"], ['3', 'Comedy|Romance'], ['4', 'Comedy|Drama'], ['5', 'Comedy'], ['6', 'Action|Crime|Thriller'], ['7', 'Comedy|Romance'], ['8', "Adventure|Children's"], ['9', 'Action'], ['10', 'Action|Adventure|Thriller'], ['11', 'Comedy|Drama|Romance'], ['12', 'Comedy|Horror'], ['13', "Animation|Children's"], ['14', 'Drama'], ['15', 'Action|Adventure|Romance'], ['16', 'Drama|Thriller'], ['17', 'Drama|Romance'], ['18', 'Thriller'], ['19', 'Comedy'], ['20', 'Action'], ['21', 'Action|Comedy|Drama'], ['22', 'Crime|Drama|Thriller'], ['23', 'Thriller'], ['24', 'Drama|Sci-Fi'], ['25', 'Drama|Romance'], ['26', 'Drama'], ['27', 'Drama'], ['28', 'Romance'], ['29', 'Adventure|Sci-Fi'], ['30', 'Drama'], ['31', 'Drama'], ['32', 'Drama|Sci-Fi'], ['33', 'Adventure|Romance'], ['34', "Children's|Comedy|Drama"], ['35', 'Drama|Romance'], ['36', 'Drama'], ['37', 'Documentary'], ['38', 'Comedy'], ['39', 'Comedy|Romance'], ['40', 'Drama'], ['41'

In [5]:
# For Rating
# Single pass over the raw lines: fields are user id, movie id, rating value
# and timestamp, in that order.
rating_user_id, rating_movie_id, rating_label, rating_timestamp = [], [], [], []
for rating_line in rating_data[0]:
    rating_fields = rating_line.split("::")
    rating_user_id.append(rating_fields[0])
    rating_movie_id.append(rating_fields[1])
    # Binarise the rating: values above 3 become label 1, everything else 0.
    rating_label.append(int(int(rating_fields[2]) > 3))
    rating_timestamp.append(rating_fields[3])

# One [user_id, movie_id, label, timestamp] row per rating.
rating_simplified_data = list(
    map(list, zip(rating_user_id, rating_movie_id, rating_label, rating_timestamp))
)
print(rating_simplified_data[0])

['1', '1193', 1, '978300760']


In [6]:
# Separate train/val/test sets.
# Each user's ratings are ordered by timestamp; the earliest part goes to
# train, the next slice to validation and the most recent slice to test.
TRAIN_END_FRACTION = 0.8  # records before this cut-off go to train
VAL_END_FRACTION = 0.9    # records between the two cut-offs go to validation

# Group the rating rows by user id (record[0]), preserving input order.
user_records = {}
for record in rating_simplified_data:
    user_records.setdefault(record[0], []).append(record)

train_set, val_set, test_set = [], [], []

# The loop names deliberately avoid `user_id`, which is the list of user ids
# built in the user-preprocessing cell and must not be overwritten here.
for records_of_user in user_records.values():
    # record[3] is the timestamp kept as a string. Compare it numerically:
    # as text, '1000000000' would sort before '978300760' and scramble the
    # chronological order whenever timestamps differ in digit count.
    sorted_user_records = sorted(records_of_user, key=lambda rec: int(rec[3]))
    total_records = len(sorted_user_records)
    train_end = int(total_records * TRAIN_END_FRACTION)
    val_end = int(total_records * VAL_END_FRACTION)

    train_set.extend(sorted_user_records[:train_end])
    val_set.extend(sorted_user_records[train_end:val_end])
    test_set.extend(sorted_user_records[val_end:])


In [7]:
# Write
def write_to_csv(data, filename):
    """Write rows to ``filename`` as a CSV file, replacing any existing file.

    Args:
        data: Iterable of rows; each row is an iterable of values that
            ``csv.writer`` can serialise.
        filename: Destination path. Missing parent directories are created
            so the export works even when the output folder does not exist.
    """
    import os  # local import: the notebook's import cell does not load os

    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # newline='' leaves line-ending handling to the csv module.
    with open(filename, 'w', newline='') as file:
        csv.writer(file).writerows(data)


# Persist the three rating splits.
write_to_csv(train_set, './data/train_set.csv')
write_to_csv(val_set, './data/val_set.csv')
write_to_csv(test_set, './data/test_set.csv')
# Persist the feature tables: [id, age, gender] rows for users and
# [id, category] rows for movies (the items).
write_to_csv(user_simplified_data, './data/user_features.csv')
write_to_csv(movie_simplified_data, './data/item_features.csv')