In [30]:
import os
import pandas as pd
import numpy as np

In [31]:
def load_data(rating_path, movie_path):
    """
    Read the ratings and movies CSV files into DataFrames.

    Only the listed columns are loaded from each file:
      - ratings: userId, movieId, rating, timestamp
      - movies:  movieId, title, genres

    Returns a tuple (ratings, movies).
    """
    rating_columns = ['userId', 'movieId', 'rating', 'timestamp']
    movie_columns = ['movieId', 'title', 'genres']

    ratings_frame = pd.read_csv(rating_path, usecols=rating_columns)
    movies_frame = pd.read_csv(movie_path, usecols=movie_columns)
    return ratings_frame, movies_frame

In [32]:

def filter_ratings(ratings, min_rating=3.0):
    """
    Keep only the rows whose 'rating' is at least min_rating.

    The result is an independent copy, so later edits to it do not
    affect the DataFrame that was passed in.
    """
    keep_mask = ratings['rating'].ge(min_rating)
    return ratings.loc[keep_mask].copy()

In [33]:
def build_item_metadata(movies):
    """
    Turn each movie row into a one-line text description.

    Each description has the form "<title> - Genres: <g1>, <g2>, ...", e.g.
    "Toy Story (1995) - Genres: Animation, Children, Comedy".
    A missing title becomes "Unknown Title", missing genres become "Unknown",
    and the literal "(no genres listed)" becomes "N/A".

    Returns (item_metadata_dict, meta_df): the {movieId: description} dict
    and a two-column DataFrame ('movieId', 'metadata') suitable for saving.
    """
    def describe(title, genres):
        # Substitute placeholders for missing fields before formatting.
        shown_title = "Unknown Title" if pd.isnull(title) else title
        raw_genres = "Unknown" if pd.isnull(genres) else genres
        if raw_genres == "(no genres listed)":
            genre_text = "N/A"
        else:
            # Genres arrive pipe-separated; present them comma-separated.
            genre_text = ", ".join(raw_genres.split('|'))
        return f"{shown_title} - Genres: {genre_text}"

    item_metadata_dict = {
        record.movieId: describe(record.title, record.genres)
        for record in movies.itertuples()
    }

    # Tabular copy of the same mapping, one row per movie.
    meta_df = pd.DataFrame({
        'movieId': list(item_metadata_dict.keys()),
        'metadata': list(item_metadata_dict.values()),
    })
    return item_metadata_dict, meta_df


In [34]:
def create_user_sequences(ratings, max_seq_len=150):
    """
    Build each user's chronological list of movie IDs.

    Ratings are ordered by (userId, timestamp); each user's movieId values
    are then collected in that order. A history longer than max_seq_len is
    cut down to its most recent max_seq_len entries.

    Return a dict of {user_id: [list_of_item_ids_in_order]}.
    """
    chronological = ratings.sort_values(by=['userId', 'timestamp'])

    sequences = {}
    for uid, user_rows in chronological.groupby('userId'):
        # Rows are already time-ordered thanks to the sort above.
        history = user_rows['movieId'].tolist()
        too_long = len(history) > max_seq_len
        sequences[uid] = history[-max_seq_len:] if too_long else history

    return sequences

In [35]:
def leave_one_out_split(user_sequences):
    """
    Leave-one-out split of each user's chronologically ordered sequence.

    For a sequence [i1, ..., iN] with N >= 3:
      - train: [i1, ..., i(N-2)]   (neither held-out target is included)
      - val:   [i1, ..., i(N-1)]   (last element is the validation target)
      - test:  [i1, ..., iN]       (last element is the test target)

    Validation and test sequences carry "history + target" because
    convert_and_pad_splits splits each of them into an input (all but the
    last element) and a label (the last element) and skips any sequence
    shorter than two items.

    Users with fewer than 3 interactions cannot supply a training history
    plus two held-out targets; their whole sequence goes to train and their
    val/test entries are empty lists.

    Return dictionaries: train_seqs, val_seqs, test_seqs (all keyed by user).
    """
    train_seqs = {}
    val_seqs = {}
    test_seqs = {}

    for user, seq in user_sequences.items():
        if len(seq) < 3:
            # Not enough interactions to hold out both a val and a test target.
            train_seqs[user] = list(seq)
            val_seqs[user] = []
            test_seqs[user] = []
            continue

        # Drop both held-out targets from training to avoid label leakage.
        train_seqs[user] = seq[:-2]
        # History up to and including the second-last item (the val target).
        val_seqs[user] = seq[:-1]
        # Full history; the last item is the test target.
        test_seqs[user] = seq[:]

    return train_seqs, val_seqs, test_seqs

In [36]:
def pad_sequence(seq, max_len=150, pad_val=0):
    """
    Left-pad or left-truncate a list so it has exactly max_len items.

    Shorter lists get pad_val prepended; longer lists keep only their last
    max_len items. A new list is returned; `seq` is not modified.

    Note: this helper is defined again in a later cell of this notebook;
    after a top-to-bottom run that later definition is the one in effect.

    Raises:
        ValueError: if max_len is negative.
    """
    if max_len < 0:
        raise ValueError(f"max_len must be non-negative, got {max_len}")
    if max_len == 0:
        # seq[-0:] is seq[0:] (the whole list), so zero needs its own branch.
        return []

    shortfall = max_len - len(seq)
    if shortfall > 0:
        return [pad_val] * shortfall + seq
    return seq[-max_len:]

In [37]:
def build_id_mappings(user_sequences, item_metadata_dict, item_start_idx=1):
    """
    Build contiguous ID mappings for users and items.

    Users are numbered from 0 in sorted order of their original IDs.
    Items are numbered from item_start_idx in sorted order of their original
    IDs. The default of 1 keeps index 0 free, because sequences in this
    notebook are padded with pad_val=0 and an item mapped to 0 would be
    indistinguishable from padding.

    Args:
        user_sequences: dict keyed by original user ID.
        item_metadata_dict: dict keyed by original item ID.
        item_start_idx: first index assigned to an item. Pass 0 to get
            zero-based item indices (only safe if padding uses another value).

    Return:
      - user2idx, item2idx
      - idx2user, idx2item

    Note: with the default, item indices run from 1 to len(item2idx), so a
    lookup table indexed by item index needs len(item2idx) + 1 rows.
    """
    unique_users = sorted(user_sequences)
    unique_items = sorted(item_metadata_dict)

    user2idx = {u: i for i, u in enumerate(unique_users)}
    # Start above 0 (by default) so the padding value never aliases an item.
    item2idx = {m: i for i, m in enumerate(unique_items, start=item_start_idx)}

    idx2user = {i: u for u, i in user2idx.items()}
    idx2item = {i: m for m, i in item2idx.items()}

    return user2idx, item2idx, idx2user, idx2item

In [38]:
def pad_sequence(seq, max_len=150, pad_val=0):
    """
    Pad or truncate a sequence to a fixed length.

    Pads on the left (like typical NLP settings) and, when the sequence is
    too long, keeps its last max_len items. Always returns a new list.

    Note: this redefines a helper of the same name from an earlier cell;
    this definition is the one in effect after a top-to-bottom run.

    Raises:
        ValueError: if max_len is negative.
    """
    if max_len < 0:
        raise ValueError(f"max_len must be non-negative, got {max_len}")
    if max_len == 0:
        # Guard explicitly: seq[-0:] would return the entire sequence.
        return []
    if len(seq) >= max_len:
        return seq[-max_len:]
    return [pad_val] * (max_len - len(seq)) + seq


def _encode_known_items(items, item2idx):
    """Map original item IDs to indices, dropping IDs that are not in item2idx."""
    return [item2idx[item] for item in items if item in item2idx]


def _build_eval_rows(split_seqs, user2idx, item2idx, max_seq_len):
    """Build [user_idx] + padded_input + [label] rows for a val or test split."""
    rows = []
    for user, items in split_seqs.items():
        if user not in user2idx:
            continue
        encoded = _encode_known_items(items, item2idx)
        # Need at least one input item in addition to the label.
        if len(encoded) < 2:
            continue
        # Everything but the last item is the model input; the last is the target.
        padded_input = pad_sequence(encoded[:-1], max_len=max_seq_len, pad_val=0)
        rows.append([user2idx[user]] + padded_input + [encoded[-1]])
    return rows


def convert_and_pad_splits(train_seqs, val_seqs, test_seqs, user2idx, item2idx, max_seq_len=150):
    """
    Convert user/item IDs to indices, pad sequences, and return DataFrames.

    Row format:
      - Train: [user_idx] + padded_seq
      - Val/Test: [user_idx] + padded_seq + [label]

    For val/test, each incoming sequence is treated as "history + target":
    its last encoded item becomes the label and the rest is the padded input.

    Rows are skipped when:
      - the user is missing from user2idx,
      - a train sequence has no items known to item2idx,
      - a val/test sequence has fewer than two items known to item2idx.
    Item IDs missing from item2idx are dropped from a sequence before these
    length checks are applied.

    Sequences are left-padded with 0 (or truncated) to max_seq_len.

    Returns (train_df, val_df, test_df), headerless DataFrames with one row
    per kept user.
    """
    # Train rows have no label column.
    train_data = []
    for user, train_items in train_seqs.items():
        if user not in user2idx:
            continue
        encoded = _encode_known_items(train_items, item2idx)
        if not encoded:
            continue
        padded = pad_sequence(encoded, max_len=max_seq_len, pad_val=0)
        train_data.append([user2idx[user]] + padded)

    # Validation and test share the same input/label layout.
    val_data = _build_eval_rows(val_seqs, user2idx, item2idx, max_seq_len)
    test_data = _build_eval_rows(test_seqs, user2idx, item2idx, max_seq_len)

    # Build DataFrames
    train_df = pd.DataFrame(train_data)
    val_df = pd.DataFrame(val_data)
    test_df = pd.DataFrame(test_data)

    return train_df, val_df, test_df

In [39]:
def main(rating_path="../data/rawdata/rating.csv",
         movie_path="../data/rawdata/movie.csv",
         processed_dir="../data/processed",
         min_rating=3.0,
         top_n_users=10000,
         max_seq_len=150):
    """
    Run the full preprocessing pipeline and write the results to disk.

    Steps: load the CSVs, keep ratings >= min_rating, keep only the
    top_n_users users with the most remaining ratings, build item metadata,
    build per-user sequences, split them, map IDs to indices, pad, and save.

    Args:
        rating_path: path to the ratings CSV.
        movie_path: path to the movies CSV.
        processed_dir: output directory (created if it does not exist).
        min_rating: minimum rating for an interaction to be kept.
        top_n_users: number of most-active users to keep; None keeps all users.
        max_seq_len: maximum (and padded) sequence length.

    Files written to processed_dir:
        train_sequences.csv, val_sequences.csv, test_sequences.csv
        (no header, no index), item_metadata.csv, user_mapping.csv,
        item_mapping.csv.

    Calling main() with no arguments uses the original hard-coded settings.
    """
    os.makedirs(processed_dir, exist_ok=True)

    # 1. Load Data
    ratings, movies = load_data(rating_path, movie_path)

    # 2. Filter Ratings
    ratings = filter_ratings(ratings, min_rating=min_rating)

    # Restrict to the most active users (value_counts sorts by count, descending).
    if top_n_users is not None:
        user_count = ratings['userId'].value_counts()
        top_users = user_count.head(top_n_users).index
        ratings = ratings[ratings['userId'].isin(top_users)]

    # 3. Build Item Metadata
    item_metadata_dict, meta_df = build_item_metadata(movies)

    # 4. Create User Sequences
    user_sequences = create_user_sequences(ratings, max_seq_len=max_seq_len)

    # 5. Leave-One-Out Split
    train_seqs, val_seqs, test_seqs = leave_one_out_split(user_sequences)

    # 6. Build ID Mappings (the reverse mappings are not needed here)
    user2idx, item2idx, _, _ = build_id_mappings(user_sequences, item_metadata_dict)

    # 7. Convert & Pad Splits
    train_df, val_df, test_df = convert_and_pad_splits(train_seqs, val_seqs, test_seqs,
                                                       user2idx, item2idx, max_seq_len=max_seq_len)

    # 8. Save Outputs
    #   8a. Sequences
    train_df.to_csv(os.path.join(processed_dir, 'train_sequences.csv'), index=False, header=False)
    val_df.to_csv(os.path.join(processed_dir, 'val_sequences.csv'), index=False, header=False)
    test_df.to_csv(os.path.join(processed_dir, 'test_sequences.csv'), index=False, header=False)

    #   8b. Metadata
    meta_df.to_csv(os.path.join(processed_dir, 'item_metadata.csv'), index=False)

    #   8c. Mappings
    #       Store user2idx and item2idx as two-column tables.
    user_mapping_df = pd.DataFrame(list(user2idx.items()), columns=['original_user_id', 'mapped_id'])
    item_mapping_df = pd.DataFrame(list(item2idx.items()), columns=['original_item_id', 'mapped_id'])

    user_mapping_df.to_csv(os.path.join(processed_dir, 'user_mapping.csv'), index=False)
    item_mapping_df.to_csv(os.path.join(processed_dir, 'item_mapping.csv'), index=False)

    print("Data preprocessing complete. Files saved in:", processed_dir)



In [40]:
# Entry point: run the preprocessing pipeline with its default settings.
if __name__ == "__main__":
    main()

Data preprocessing complete. Files saved in: ../data/processed
