## Project Step 1: Data Preparation

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import dump_svmlight_file
from scipy.sparse import coo_matrix
import json

In [2]:
# Step 1: Load all datasets
# All three files are read as ';'-separated, latin-1 encoded CSVs, and
# malformed lines are skipped (on_bad_lines='skip') instead of raising.
ratings = pd.read_csv(
    'Ratings.csv',
    sep=';',
    encoding='latin-1',
    on_bad_lines='skip',
)
users = pd.read_csv(
    'Users.csv',
    sep=';',
    encoding='latin-1',
    on_bad_lines='skip',
)
books = pd.read_csv(
    'Books.csv',
    sep=';',
    encoding='latin-1',
    on_bad_lines='skip',
    quotechar='"',
    low_memory=False,
)

# Rename columns
# Labels are applied by position, so each list has to match the number and
# order of the columns in the corresponding file.
ratings = ratings.set_axis(['UserID', 'ISBN', 'Rating'], axis=1)
books = books.set_axis(
    ['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher'],
    axis=1,
)
users = users.set_axis(['UserID', 'Age'], axis=1)

  users = pd.read_csv('Users.csv', sep=';', encoding='latin-1', on_bad_lines='skip')


In [None]:
# Step 2: Clean users data

# NOTE(review): the recorded output of the load cell shows that pandas emitted a
# warning while reading Users.csv. If that was a mixed-dtype warning (TODO confirm),
# a column holds both numbers and strings; for UserID that would make the later
# `isin` match against the ratings silently miss every user stored as a string.
# Coercing both columns to numeric makes their dtypes uniform; values that cannot
# be parsed become NaN and are removed below. This assumes user ids are numeric.
users = users.assign(
    UserID=pd.to_numeric(users['UserID'], errors='coerce'),
    Age=pd.to_numeric(users['Age'], errors='coerce'),
)

# Drop missing values (including anything that failed the numeric coercion)
users = users.dropna(subset=['UserID', 'Age'])

# After dropping NaN the ids can be stored as integers again.
users = users.astype({'UserID': 'int64'})

print(f"Users after cleaning: {users.shape}")

Users after cleaning: (168627, 2)


In [None]:
# Step 3: Clean books data
# Keep only the rows in which every column is populated; a book with any
# missing field is removed.
books = books[books.notna().all(axis=1)]

In [5]:
# Step 4: Clean ratings data
# Each stage narrows the result of the previous one: ratings greater than
# zero, rows without missing values, then rows whose user and book are still
# present in the cleaned `users` and `books` frames.
ratings = (
    ratings
    .loc[lambda frame: frame['Rating'] > 0]
    .dropna()
    # Only keep ratings with valid users and books
    .loc[lambda frame: frame['UserID'].isin(users['UserID'])]
    .loc[lambda frame: frame['ISBN'].isin(books['ISBN'])]
)

print(f"Final ratings count: {ratings.shape[0]}")

Final ratings count: 255745


In [6]:
from scipy.sparse import coo_matrix
from sklearn.datasets import dump_svmlight_file
import numpy as np

# Step 5: Create mappings starting from 1

# A (user, book) pair must appear at most once. coo_matrix stores repeated
# coordinates as separate entries, and they are summed when the matrix is
# converted to CSR, which would write inflated ratings that no user gave.
# The last row seen for a pair is kept. The explicit copy makes `ratings` an
# independent frame, so adding columns below cannot trigger
# SettingWithCopyWarning.
ratings = ratings.drop_duplicates(subset=['UserID', 'ISBN'], keep='last').copy()

# Indices are assigned in order of first appearance, starting at 1.
user_map = {user_id: idx for idx, user_id in enumerate(ratings['UserID'].unique(), start=1)}
book_map = {isbn: idx for idx, isbn in enumerate(ratings['ISBN'].unique(), start=1)}

ratings['user_index'] = ratings['UserID'].map(user_map)
ratings['book_index'] = ratings['ISBN'].map(book_map)

# Step 6: Build sparse matrix
row = ratings['user_index'].values
col = ratings['book_index'].values
data = ratings['Rating'].values

# The matrix has exactly one row per distinct user and one column per distinct
# book, so the sizes are the mapping lengths (no +1).
n_users = len(user_map)
n_books = len(book_map)

# The mapped indices are 1-based; shift them to the 0-based coordinates that
# coo_matrix expects.
sparse_matrix = coo_matrix((data, (row - 1, col - 1)), shape=(n_users, n_books))

# Step 7: Save LIBSVM file
# dump_svmlight_file needs one label per row; a constant placeholder of ones
# is used. zero_based=False writes the column indices starting at 1.
dummy_target = np.ones(n_users)
dump_svmlight_file(sparse_matrix, dummy_target, 'user_book_ratings.libsvm', zero_based=False)