# Normalize part: convert a line-delimited JSON file into a single JSON array

In [6]:
import json
import os

# Normalize the entire .json file and save it to a new file
def normalize_json_file(file_path):
    """Convert a line-delimited JSON file into one pretty-printed JSON array.

    Every non-blank line of ``file_path`` is parsed as a single JSON value.
    Lines that fail to parse are reported (with their line number in the
    file) and skipped. The parsed values are then written as a list to
    ``normalized_<original file name>`` in the same directory as the input.

    Problems are reported with ``print`` instead of being raised, so the
    function always returns ``None``.

    Args:
        file_path: Path of the line-delimited JSON file to read. The file is
            decoded as UTF-8.
    """
    try:
        # Get the new file name
        base_name = os.path.basename(file_path)
        new_file_name = f"normalized_{base_name}"
        new_file_path = os.path.join(os.path.dirname(file_path), new_file_name)

        items = []
        with open(file_path, 'r', encoding='utf-8') as file:
            for line_number, line in enumerate(file, start=1):
                stripped = line.strip()
                if not stripped:
                    # Blank lines (e.g. a trailing newline) hold no record and
                    # must not be reported as parse failures.
                    continue
                try:
                    # Attempt to parse each line as a JSON object
                    items.append(json.loads(stripped))
                except json.JSONDecodeError as e:
                    # The decoder only knows about the single line it was
                    # given, so add the position within the file ourselves.
                    print(f"Error parsing line {line_number}: {e}")

        # Open the output only after the input was read completely, so a
        # failure while reading never leaves an empty/truncated output file.
        with open(new_file_path, 'w', encoding='utf-8') as new_file:
            json.dump(items, new_file, indent=4, ensure_ascii=False)
        print(f"Successfully normalized the entire file and saved to {new_file_path}")
    except FileNotFoundError:
        print(f"File {file_path} not found.")
    except Exception as e:
        print(f"Error reading the file: {e}")

# Main function
def main():
    """Ask the user for a .json path and normalize that file."""
    normalize_json_file(input("Enter the path of the .json file to normalize: "))

# Run the interactive entry point only when this code executes as the
# top-level module (``__name__`` is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()

File  not found.


# Preprocess part: extract the most popular shelves (arms) and their rating pools (rewards)

In [None]:
import pandas as pd
import numpy as np
import json
import os
from collections import defaultdict

def preprocess_data(input_file, save_path, top_n=20, excluded_tags=('to-read',)):
    """Build the arms (most frequent shelves) and their reward pools from book data.

    The input must be a JSON array of book objects. For each book the code
    reads ``average_rating`` (converted with ``float`` and rounded to 3
    decimals) and ``popular_shelves`` (a list of objects with a ``name`` key).
    Every shelf occurrence that is not excluded contributes the book's rating
    to that shelf's reward list.

    Two files are written into ``save_path``:

    * ``arms.json``    - list of the ``top_n`` most frequent shelf names.
    * ``rewards.json`` - mapping of each of those shelves to its list of ratings.

    Args:
        input_file: Path to the JSON file holding the list of books (UTF-8).
        save_path: Directory for the output files; created if missing. An
            empty string means the current directory.
        top_n: Number of most frequent shelves to keep as arms (non-negative).
        excluded_tags: Shelf names that are never counted or used as arms.

    Raises:
        FileNotFoundError: If ``input_file`` does not exist.
        json.JSONDecodeError: If ``input_file`` is not valid JSON.
        KeyError: If a shelf entry has no ``name`` key.
    """
    excluded = set(excluded_tags)
    # os.makedirs('') raises, so treat an empty answer as "current directory".
    save_path = save_path or "."

    # The normalize step writes UTF-8 with ensure_ascii=False, so decode
    # explicitly instead of relying on the platform default encoding.
    with open(input_file, 'r', encoding='utf-8') as file:
        books_data = json.load(file)

    # Parallel lists: one (shelf, rating) entry per non-excluded shelf occurrence.
    book_shelves = []
    ratings = []

    for book in books_data:
        try:
            # Round rating to 3 decimal places
            book_rating = round(float(book.get('average_rating', 0)), 3)
        except (TypeError, ValueError):
            # Empty/null/non-numeric ratings are treated exactly like a
            # missing rating (0) instead of aborting the whole run.
            book_rating = 0.0

        # ``or []`` also covers an explicit null for popular_shelves.
        for shelf in book.get('popular_shelves') or []:
            shelf_name = shelf['name']
            if shelf_name not in excluded:
                book_shelves.append(shelf_name)
                ratings.append(book_rating)

    # Count occurrences of each shelf; value_counts() sorts by frequency,
    # most frequent first. dtype=object keeps an empty input well-defined.
    shelf_counts = pd.Series(book_shelves, dtype=object).value_counts()

    # Excluded tags were never collected, so the head of the index is the answer.
    top_shelves = shelf_counts.index[:top_n].tolist()

    # Print the selected arms (shelves)
    print(f"Using the following {len(top_shelves)} arms (shelves): {top_shelves}")

    # Create reward pools for each shelf, keyed in the same order as the arms.
    rewards_by_shelf = {shelf: [] for shelf in top_shelves}
    for shelf, rating in zip(book_shelves, ratings):
        pool = rewards_by_shelf.get(shelf)
        if pool is not None:
            pool.append(rating)  # Store the rounded rating

    # Ensure the save path exists
    os.makedirs(save_path, exist_ok=True)

    # Save arms and rewards to JSON files
    with open(os.path.join(save_path, "arms.json"), "w", encoding='utf-8') as f:
        json.dump(top_shelves, f)

    with open(os.path.join(save_path, "rewards.json"), "w", encoding='utf-8') as f:
        json.dump(rewards_by_shelf, f)

    print(f"Arms and rewards saved to {save_path}")

def main():
    """Prompt for the books file and the output directory, then preprocess."""
    # The two prompts are asked in this order: source file first, then target directory.
    source_json = input("Enter the path to the small_goodreads_books.json file: ")
    output_dir = input("Enter the directory to save the processed files (e.g., ../../save/): ")
    preprocess_data(source_json, output_dir)

# Run the interactive entry point only when this code executes as the
# top-level module (``__name__`` is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()


Using the following 20 arms (shelves): ['owned-books', 'fiction', 'currently-reading', 'owned', 'default', 'favorites', 'books-i-own', 'library', 'books', 'ebook', 'to-buy', 'wish-list', 'my-books', 'unfinished', 'adult', 'own-it', 'series', 'novels', 'literature', 'my-library']
Arms and rewards saved to ../../save/
