In [None]:
import pickle
import json

def load_retrieved_data(pkl_file_path):
    """Read a pickle file and return the object stored in it.

    Parameters:
    - pkl_file_path: path of the pickle file, opened in binary mode.

    Returns:
    - the object reconstructed by pickle.load.
    """
    # NOTE(review): unpickling can run arbitrary code, so this should only be
    # pointed at files from a trusted source.
    with open(pkl_file_path, 'rb') as pkl_handle:
        return pickle.load(pkl_handle)

def get_user_sequences(user_id, umap, train, val):
    """Return a user's train sequence followed by their val sequence.

    Parameters:
    - user_id: key to look up in umap.
    - umap: mapping from user ID to the index used as key in train/val.
    - train, val: mappings from user index to a sequence; a missing index
      counts as an empty list.

    Returns:
    - the concatenated sequence, or None (after printing a message) when
      user_id is not a key of umap.
    """
    if user_id in umap:
        idx = umap[user_id]
        # Train part first, then val, so the original ordering is kept.
        return train.get(idx, []) + val.get(idx, [])

    print(f"User ID {user_id} not found in umap.")
    return None

def map_item_ids_to_product_info(item_ids, meta, smap):
    """Attach a product name and an ASIN to each item ID.

    Parameters:
    - item_ids: iterable of item IDs.
    - meta: mapping from item ID to product name.
    - smap: mapping from ASIN to item ID (inverted here for the lookup).

    Returns:
    - list of dicts with keys 'item_id', 'product_name' and 'asin', one per
      item ID for which both lookups gave a truthy value. Other IDs are
      reported on stdout and left out.
    """
    asin_by_id = dict((idx, asin) for asin, idx in smap.items())
    resolved = []
    for item_id in item_ids:
        name = meta.get(item_id)
        asin = asin_by_id.get(item_id)
        if not (name and asin):
            print(f"Item ID {item_id} not found in meta or smap.")
            continue
        resolved.append({'item_id': item_id, 'product_name': name, 'asin': asin})
    return resolved

def get_user_reviews(user_id, user_reviews_data):
    """Return the 'reviews' value of the first entry whose 'user_id' matches.

    Parameters:
    - user_id: ID to search for.
    - user_reviews_data: iterable of dicts with 'user_id' and 'reviews' keys.

    Returns:
    - the matching entry's 'reviews' value, or an empty list (after printing
      a message) when no entry matches.
    """
    not_found = object()  # sentinel, so a falsy 'reviews' value still counts as a hit
    hit = next(
        (entry['reviews'] for entry in user_reviews_data if entry['user_id'] == user_id),
        not_found,
    )
    if hit is not_found:
        print(f"User ID {user_id} not found in user reviews data.")
        return []
    return hit

def match_sequences_to_reviews(item_info, user_reviews):
    """Keep the reviews whose 'parent_asin' belongs to one of the given items.

    Parameters:
    - item_info: iterable of dicts, each with an 'asin' key.
    - user_reviews: iterable of dicts, each with a 'parent_asin' key.

    Returns:
    - list of the matching review dicts, in their original order.
    """
    wanted_asins = {entry['asin'] for entry in item_info}
    return [
        candidate
        for candidate in user_reviews
        if candidate['parent_asin'] in wanted_asins
    ]

# Load data: unpickle the dataset and pull out the pieces this cell uses.
pkl_file_path = 'data/dataset.pkl'  # Update with your actual path
retrieved_data = load_retrieved_data(pkl_file_path)

# These subscripts raise KeyError if the pickle lacks any of the five keys.
train = retrieved_data['train']
val = retrieved_data['val']
umap = retrieved_data['umap']
meta = retrieved_data['meta']
smap = retrieved_data['smap']

# User ID whose interactions and reviews are inspected below
user_id = 'AFSKPY37N3C43SOI5IEXEK5JSIYA'  # Replace with your user ID

# Get user sequences (train followed by val); None means the ID is not in umap
user_sequence = get_user_sequences(user_id, umap, train, val)
if user_sequence is None:
    raise ValueError(f"No sequences found for user ID {user_id}")

# Map item IDs to product names and ASINs
item_info = map_item_ids_to_product_info(user_sequence, meta, smap)

# Load your user reviews JSON data
with open('input_set.json', 'r') as f:
    user_reviews_data = json.load(f)

# Get user's reviews; an empty result (unknown user or no reviews) stops the cell
user_reviews = get_user_reviews(user_id, user_reviews_data)
if not user_reviews:
    raise ValueError(f"No reviews found for user ID {user_id}")

# Match sequences to reviews: keep reviews whose parent_asin is in item_info
matched_reviews = match_sequences_to_reviews(item_info, user_reviews)

# Display matched reviews (each review must carry product_name, parent_asin and text)
print(f"Matched reviews for user {user_id}:")
for review in matched_reviews:
    print(f"- {review['product_name']} (ASIN: {review['parent_asin']}) (review: {review['text']})")


In [None]:
import ast

def load_data_from_txt(file_path):
    """Rebuild a dictionary of datasets from a plain-text dump.

    The file is read line by line, each line stripped of surrounding
    whitespace:
    - lines starting with 'Keys in' or 'dict_keys' are ignored;
    - a line starting with 'Key:' opens a new section named by the rest of
      that line;
    - every other line is buffered as data for the section currently open.
    When a section ends, its buffered lines are joined with single spaces and
    parsed with ast.literal_eval.

    Parameters:
    - file_path: str, path to the text file.

    Returns:
    - data: dict mapping section names to parsed values. A section that
      cannot be parsed is reported on stdout and stored as {}; a section with
      no data lines at all is left out.
    """
    data = {}
    current_key = None
    current_data_lines = []

    def store_section(key, lines):
        # Parse one finished section. Nothing to do before the first key or
        # for a key that had no data lines.
        if key is None or not lines:
            return
        # Strip the joined text: a blank first line would otherwise leave a
        # leading space, which ast.literal_eval rejects before Python 3.10.
        dict_str = ' '.join(lines).strip()
        try:
            data[key] = ast.literal_eval(dict_str)
        except (SyntaxError, ValueError) as e:
            # literal_eval raises ValueError (not SyntaxError) for text that
            # is valid Python but not a plain literal, e.g. a function call.
            print(f"Error parsing data for key '{key}': {e}")
            data[key] = {}

    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if line.startswith('Keys in') or line.startswith('dict_keys'):
                # Header and dict_keys lines carry no data.
                continue
            if line.startswith('Key:'):
                store_section(current_key, current_data_lines)
                # Always start from an empty buffer so that stray lines seen
                # before the first key are not glued onto its data.
                current_data_lines = []
                current_key = line[len('Key:'):].strip()
            else:
                current_data_lines.append(line)

    # The last section has no following 'Key:' line to trigger parsing.
    store_section(current_key, current_data_lines)
    return data

def display_interaction_counts(umap, train, val, test):
    """Print a table of per-user sequence lengths for train, val and test.

    Parameters:
    - umap: mapping from user ID to user index (converted with int()).
    - train, val, test: mappings from user index to a sequence; a missing
      index counts as an empty list.

    Output goes to stdout: a header row, a 70-character rule, then one
    left-aligned row per entry of umap.
    """
    header = f"{'User ID':<40} {'Train':<10} {'Val':<10} {'Test':<10}"
    print(header)
    print("-" * 70)

    splits = (train, val, test)
    for user_id, raw_index in umap.items():
        idx = int(raw_index)
        n_train, n_val, n_test = [len(split.get(idx, [])) for split in splits]
        print(f"{user_id:<40} {n_train:<10} {n_val:<10} {n_test:<10}")

# Load data: parse the text dump back into a dict of datasets.
txt_file_path = 'retrieved_data_data_output.txt'  # Update with your actual path
retrieved_data = load_data_from_txt(txt_file_path)

# Verify keys in retrieved_data
print("Keys in retrieved_data:", retrieved_data.keys())

# Each section falls back to an empty dict when it is absent from the dump.
train = retrieved_data.get('train', {})
val = retrieved_data.get('val', {})
test = retrieved_data.get('test', {})
umap = retrieved_data.get('umap', {})
smap = retrieved_data.get('smap', {})
meta = retrieved_data.get('meta', {})

# Display interaction counts (smap and meta are loaded but not used by this call)
display_interaction_counts(umap, train, val, test)


In [None]:
import ast

def load_data_from_txt(file_path):
    data = {}
    current_key = None
    current_data_lines = []
    
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if line.startswith('Keys in'):
                # This is the header line; skip it
                continue
            elif line.startswith('dict_keys'):
                # Skip the dict_keys line
                continue
            elif line.startswith('Key:'):
                # When we encounter a new key, process the previous one
                if current_key is not None and current_data_lines:
                    # Combine the lines and parse the dictionary
                    dict_str = ' '.join(current_data_lines)
                    try:
                        data[current_key] = ast.literal_eval(dict_str)
                    except SyntaxError as e:
                        print(f"Error parsing data for key '{current_key}': {e}")
                        data[current_key] = {}
                    current_data_lines = []
                # Set the new key
                current_key = line.replace('Key:', '').strip()
            else:
                # Collect lines of data
                current_data_lines.append(line)
        # Don't forget to process the last key
        if current_key is not None and current_data_lines:
            dict_str = ' '.join(current_data_lines)
            try:
                data[current_key] = ast.literal_eval(dict_str)
            except SyntaxError as e:
                print(f"Error parsing data for key '{current_key}': {e}")
                data[current_key] = {}
    return data

def display_interaction_counts(umap, train, val, test, id_to_asin, max_items_display=5):
    """Print per-user sequence lengths together with a preview of the ASINs.

    Parameters:
    - umap: mapping from user ID to user index (converted with int()).
    - train, val, test: mappings from user index to a sequence of item IDs;
      a missing index counts as an empty list.
    - id_to_asin: mapping from item ID to ASIN; an ID without an entry is
      shown as 'Unknown(<id>)'.
    - max_items_display: number of leading items shown per sequence; longer
      sequences get a trailing '...'.

    Output goes to stdout: a header row, a 220-character rule, then one
    left-aligned row per entry of umap.
    """
    def preview(sequence):
        # ASINs of the leading items, with '...' appended when truncated.
        shown = [
            id_to_asin.get(item_id, f'Unknown({item_id})')
            for item_id in sequence[:max_items_display]
        ]
        if len(sequence) > max_items_display:
            shown.append('...')
        return ', '.join(shown)

    print(f"{'User ID':<40} {'Train':<10} {'Val':<10} {'Test':<10} {'Train ASINs':<50} {'Val ASINs':<50} {'Test ASINs':<50}")
    print("-" * 220)

    for user_id, raw_index in umap.items():
        idx = int(raw_index)
        train_seq = train.get(idx, [])
        val_seq = val.get(idx, [])
        test_seq = test.get(idx, [])

        train_preview = preview(train_seq)
        val_preview = preview(val_seq)
        test_preview = preview(test_seq)

        print(f"{user_id:<40} {len(train_seq):<10} {len(val_seq):<10} {len(test_seq):<10} {train_preview:<50} {val_preview:<50} {test_preview:<50}")

def write_interaction_counts_to_file(umap, train, val, test, id_to_asin, output_file='interaction_counts.csv'):
    """Write per-user sequence lengths and full ASIN lists to a CSV file.

    Parameters:
    - umap: mapping from user ID to user index (converted with int()).
    - train, val, test: mappings from user index to a sequence of item IDs;
      a missing index counts as an empty list.
    - id_to_asin: mapping from item ID to ASIN; an ID without an entry is
      written as 'Unknown(<id>)'.
    - output_file: path of the CSV file, written as UTF-8.

    One row is written per entry of umap, then a confirmation is printed.
    """
    import csv

    columns = ['User ID', 'Train Count', 'Val Count', 'Test Count', 'Train ASINs', 'Val ASINs', 'Test ASINs']

    def joined_asins(sequence):
        # Every item of the sequence, comma-separated (no truncation here).
        return ', '.join([id_to_asin.get(item_id, f'Unknown({item_id})') for item_id in sequence])

    with open(output_file, 'w', newline='', encoding='utf-8') as out:
        writer = csv.DictWriter(out, fieldnames=columns)
        writer.writeheader()

        for user_id, raw_index in umap.items():
            idx = int(raw_index)
            sequences = [split.get(idx, []) for split in (train, val, test)]
            counts = [len(seq) for seq in sequences]
            asin_cells = [joined_asins(seq) for seq in sequences]
            # Column order: ID, the three counts, then the three ASIN cells.
            writer.writerow(dict(zip(columns, [user_id] + counts + asin_cells)))

    print(f"Interaction counts have been written to {output_file}")

# This cell calls json below, but its import line only brings in ast; import
# it here so the cell also runs on a fresh kernel.
import json

# Load data
txt_file_path = 'retrieved_output.txt'  # Update with your actual path
retrieved_data = load_data_from_txt(txt_file_path)

# Access the datasets (a missing section falls back to an empty dict)
train = retrieved_data.get('train', {})
val = retrieved_data.get('val', {})
test = retrieved_data.get('test', {})
umap = retrieved_data.get('umap', {})
smap = retrieved_data.get('smap', {})

# Create reverse mapping from IDs to ASINs
id_to_asin = {int(v): k for k, v in smap.items()}

# Collect, for each user, the set of ASINs seen in train, val and test
user_asin_interactions = {}

for user_id, user_index in umap.items():
    user_index = int(user_index)
    # Combine all interactions
    total_sequence = (
        train.get(user_index, [])
        + val.get(user_index, [])
        + test.get(user_index, [])
    )
    # Item IDs with no smap entry are skipped directly, instead of being
    # turned into an 'Unknown(...)' placeholder and filtered out by prefix.
    # A set is used for fast membership tests below.
    user_asin_interactions[user_id] = {
        id_to_asin[item_id] for item_id in total_sequence if item_id in id_to_asin
    }

# Load the user reviews JSON data from both input_set.json and eval_set.json
input_json_file_path = 'input_set.json'  # Update with your actual path
eval_json_file_path = 'eval_set.json'    # Update with your actual path

# JSON text is UTF-8; say so explicitly rather than relying on the platform
# default encoding.
with open(input_json_file_path, 'r', encoding='utf-8') as f:
    input_user_reviews_data = json.load(f)

with open(eval_json_file_path, 'r', encoding='utf-8') as f:
    eval_user_reviews_data = json.load(f)

# Combine the user reviews data from both files
combined_user_reviews_data = input_user_reviews_data + eval_user_reviews_data

# Create a dictionary to hold the combined user reviews
user_reviews_dict = {}

# Create a set of user IDs we are interested in
user_ids_set = set(user_asin_interactions.keys())

# Iterate over the combined user reviews data
for user_data in combined_user_reviews_data:
    user_id = user_data['user_id']
    if user_id not in user_ids_set:
        continue

    # Get the ASINs this user interacted with
    user_asins = user_asin_interactions[user_id]

    # An entry carries either a list under 'reviews' or one review under
    # 'review'; entries with neither are skipped.
    if 'reviews' in user_data:
        user_reviews = user_data['reviews']
    elif 'review' in user_data:
        user_reviews = [user_data['review']]
    else:
        continue

    # Keep only reviews of items the user interacted with
    filtered_reviews = [
        review for review in user_reviews
        if review['parent_asin'] in user_asins
    ]
    if filtered_reviews:
        # The same user can appear in both files; accumulate their reviews.
        user_reviews_dict.setdefault(user_id, []).extend(filtered_reviews)

# Convert the user_reviews_dict to a list of dictionaries
filtered_user_reviews = [
    {'user_id': user_id, 'reviews': reviews}
    for user_id, reviews in user_reviews_dict.items()
]

# Save the filtered data to a new JSON file
output_json_file = 'data/filtered_user_reviews.json'  # Update with your desired output path
with open(output_json_file, 'w', encoding='utf-8') as f:
    json.dump(filtered_user_reviews, f, indent=4)
print(f"Filtered user reviews have been saved to {output_json_file}")

# Display a sample of the filtered data
print("\nSample of filtered user reviews:")
for user_data in filtered_user_reviews[:2]:  # Display first 2 users
    user_id = user_data['user_id']
    print(f"User ID: {user_id}")
    for review in user_data['reviews']:
        print(f"  ASIN: {review['parent_asin']}")
        print(f"  Product Name: {review['product_name']}")
        print(f"  Rating: {review['rating']}")
        print(f"  Title: {review['title']}")
        print(f"  Text: {review['text'][:100]}...")  # Display first 100 characters
        print()



In [30]:
import ast
import json

def load_data_from_txt(file_path):
    """Rebuild a dictionary of datasets from a plain-text dump.

    The file is read line by line, each line stripped of surrounding
    whitespace:
    - lines starting with 'Keys in' or 'dict_keys' are ignored;
    - a line starting with 'Key:' opens a new section named by the rest of
      that line;
    - every other line is buffered as data for the section currently open.
    When a section ends, its buffered lines are joined with single spaces and
    parsed with ast.literal_eval.

    Parameters:
    - file_path: str, path to the text file.

    Returns:
    - data: dict mapping section names to parsed values. A section that
      cannot be parsed is reported on stdout and stored as {}; a section with
      no data lines at all is left out.
    """
    data = {}
    current_key = None
    current_data_lines = []

    def store_section(key, lines):
        # Parse one finished section. Nothing to do before the first key or
        # for a key that had no data lines.
        if key is None or not lines:
            return
        # Strip the joined text: a blank first line would otherwise leave a
        # leading space, which ast.literal_eval rejects before Python 3.10.
        dict_str = ' '.join(lines).strip()
        try:
            data[key] = ast.literal_eval(dict_str)
        except (SyntaxError, ValueError) as e:
            # literal_eval raises ValueError (not SyntaxError) for text that
            # is valid Python but not a plain literal, e.g. a function call.
            print(f"Error parsing data for key '{key}': {e}")
            data[key] = {}

    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if line.startswith('Keys in') or line.startswith('dict_keys'):
                # Header and dict_keys lines carry no data.
                continue
            if line.startswith('Key:'):
                store_section(current_key, current_data_lines)
                # Always start from an empty buffer so that stray lines seen
                # before the first key are not glued onto its data.
                current_data_lines = []
                current_key = line[len('Key:'):].strip()
            else:
                current_data_lines.append(line)

    # The last section has no following 'Key:' line to trigger parsing.
    store_section(current_key, current_data_lines)
    return data

# Load data
txt_file_path = 'data/dataset.txt'  # Update with your actual path
retrieved_data = load_data_from_txt(txt_file_path)

# Access the datasets (a missing section falls back to an empty dict)
train = retrieved_data.get('train', {})
val = retrieved_data.get('val', {})
test = retrieved_data.get('test', {})
umap = retrieved_data.get('umap', {})
smap = retrieved_data.get('smap', {})

# Create reverse mapping from IDs to ASINs
id_to_asin = {int(v): k for k, v in smap.items()}

# Collect ASINs for each user
user_train_val_asins = {}  # ASINs from train and val
user_test_asins = {}       # ASINs from test

for user_id, user_index in umap.items():
    user_index = int(user_index)
    # Get interaction sequences for the user
    train_val_sequence = train.get(user_index, []) + val.get(user_index, [])
    test_sequence = test.get(user_index, [])

    # Item IDs with no smap entry are skipped directly, instead of being
    # turned into an 'Unknown(...)' placeholder and filtered out by prefix.
    # Sets are used for efficient lookup below.
    user_train_val_asins[user_id] = {
        id_to_asin[item_id] for item_id in train_val_sequence if item_id in id_to_asin
    }
    user_test_asins[user_id] = {
        id_to_asin[item_id] for item_id in test_sequence if item_id in id_to_asin
    }

# Load the user reviews JSON data
json_file_path = 'data/user_reviews.json'  # Update with your actual path
# JSON text is UTF-8; say so explicitly rather than relying on the platform
# default encoding.
with open(json_file_path, 'r', encoding='utf-8') as f:
    user_reviews_data = json.load(f)

# Create lists to hold the filtered user reviews
train_val_user_reviews = []
test_user_reviews = []

# Create a set of user IDs we are interested in
user_ids_set = set(umap.keys())

# Iterate over the user reviews data
for user_data in user_reviews_data:
    user_id = user_data['user_id']
    if user_id not in user_ids_set:
        continue

    # Get the ASINs this user interacted with
    train_val_asins = user_train_val_asins.get(user_id, set())
    test_asins = user_test_asins.get(user_id, set())

    # Sets to track unique reviews
    train_val_unique_review_ids = set()
    test_unique_review_ids = set()

    # Lists to hold filtered and unique reviews
    train_val_filtered_reviews = []
    test_filtered_reviews = []

    # Iterate over the user's reviews
    for review in user_data['reviews']:
        asin = review['parent_asin']
        # A review is identified by its ASIN together with its timestamp.
        review_id = (asin, review['timestamp'])

        # Train/val is checked first, so an ASIN present in both sets is
        # assigned to train/val only.
        if asin in train_val_asins:
            seen_ids, kept_reviews = train_val_unique_review_ids, train_val_filtered_reviews
        elif asin in test_asins:
            seen_ids, kept_reviews = test_unique_review_ids, test_filtered_reviews
        else:
            # The review is not part of our interactions
            continue

        # Keep only the first occurrence of each review
        if review_id not in seen_ids:
            seen_ids.add(review_id)
            kept_reviews.append(review)

    # Add the filtered reviews to the respective lists
    if train_val_filtered_reviews:
        train_val_user_reviews.append({
            'user_id': user_id,
            'reviews': train_val_filtered_reviews
        })
    if test_filtered_reviews:
        test_user_reviews.append({
            'user_id': user_id,
            'reviews': test_filtered_reviews
        })

# Save the train and validation reviews to a JSON file
train_val_output_json = 'data/train_val_user_reviews.json'
with open(train_val_output_json, 'w', encoding='utf-8') as f:
    json.dump(train_val_user_reviews, f, indent=4)
print(f"Train and validation user reviews have been saved to {train_val_output_json}")

# Save the test reviews to a separate JSON file
test_output_json = 'data/test_user_reviews.json'
with open(test_output_json, 'w', encoding='utf-8') as f:
    json.dump(test_user_reviews, f, indent=4)
print(f"Test user reviews have been saved to {test_output_json}")

# Display a sample of each group: train/validation first, then test
for sample_heading, sample_reviews in (
    ("\nSample of train and validation user reviews:", train_val_user_reviews),
    ("\nSample of test user reviews:", test_user_reviews),
):
    print(sample_heading)
    for user_data in sample_reviews[:2]:  # Display first 2 users
        user_id = user_data['user_id']
        print(f"User ID: {user_id}")
        for review in user_data['reviews']:
            print(f"  ASIN: {review['parent_asin']}")
            print(f"  Product Name: {review['product_name']}")
            print(f"  Rating: {review['rating']}")
            print(f"  Title: {review['title']}")
            print(f"  Text: {review['text'][:100]}...")  # Display first 100 characters
            print()


Train and validation user reviews have been saved to data/train_val_user_reviews.json
Test user reviews have been saved to data/test_user_reviews.json

Sample of train and validation user reviews:
User ID: AFSKPY37N3C43SOI5IEXEK5JSIYA
  ASIN: B07J3GH1W1
  Product Name: Manicure and Pedicure Nail Clipper from POWERGROOMING - Powerful Trimmer for Thick and Thin Finger Nails and Toe Nails - Included Nail File and"Catcher" for Easy Cleanup (1 Pack)
  Rating: 5.0
  Title: Nice manicure set for men or women
  Text: This a really cute kit which would make for a great gift for someone. It is in a little leather like...

  ASIN: B07W397QG4
  Product Name: Iryasa Night Indulge Cream - Natural Face Cream for Dry Skin - Vegan Anti Aging Night Cream for Women - Firming Cream for Face and Neck - Organic Vitamin C Moisturizer for Face - 1.7oz
  Rating: 5.0
  Title: Wonderful overnight cream!
  Text: To be honest, I rarely have used an overnight cream. Typically, my skin care routine is the same mor..

In [24]:
def count_interactions(json_file_path):
    """
    Tally the reviews stored in a user-reviews JSON file and print a summary.

    The file is expected to hold a list of entries, each with a 'user_id'
    and a 'reviews' list.

    Parameters:
    - json_file_path: str, path to the JSON file.

    Returns:
    - total_interactions: int, number of reviews summed over all entries.
    - interactions_per_user: dict, mapping of user IDs to their review counts
      (a later entry for the same user ID replaces the earlier count).
    """
    import json

    with open(json_file_path, 'r') as source:
        entries = json.load(source)

    total_interactions = 0
    interactions_per_user = {}
    for entry in entries:
        uid = entry['user_id']
        review_count = len(entry['reviews'])
        interactions_per_user[uid] = review_count
        total_interactions += review_count

    # Report the totals, then one line per user
    print(f"Total number of interactions (reviews): {total_interactions}")
    print("\nNumber of interactions per user:")
    for uid, count in interactions_per_user.items():
        print(f"User ID: {uid}, Interactions: {count}")

    if not interactions_per_user:
        print("\nNo interactions found in the data.")
        return total_interactions, interactions_per_user

    # Summary statistics only make sense when at least one user is present
    average_interactions = total_interactions / len(interactions_per_user)
    print(f"\nAverage interactions per user: {average_interactions:.2f}")

    busiest = max(interactions_per_user, key=interactions_per_user.get)
    quietest = min(interactions_per_user, key=interactions_per_user.get)
    print(f"User with most interactions: {busiest} ({interactions_per_user[busiest]} interactions)")
    print(f"User with least interactions: {quietest} ({interactions_per_user[quietest]} interactions)")

    return total_interactions, interactions_per_user


In [29]:
# Test the function: count the reviews in the filtered file and print the summary.
# Both return values are kept as notebook globals.
json_file_path = 'data/filtered_user_reviews.json'  # Update this path as needed
total_interactions, interactions_per_user = count_interactions(json_file_path)


Total number of interactions (reviews): 2544

Number of interactions per user:
User ID: AFSKPY37N3C43SOI5IEXEK5JSIYA, Interactions: 7
User ID: AHV6QCNBJNSGLATP56JAWJ3C4G2A, Interactions: 14
User ID: AFJBKPK5W56XWSNPQU2WW66ISWYQ, Interactions: 14
User ID: AFXF3EGQTQDXMRLDWFU7UBFQZB7Q, Interactions: 26
User ID: AFWVN52MRBWOTIK7UGXBWGOY4HBA, Interactions: 10
User ID: AFQQQ5LGNSQUEBGDCYBAZZE5T3DA, Interactions: 10
User ID: AGAM2CCKV52HI4YZU7ASZTSXA7YQ, Interactions: 15
User ID: AF2BLE54TEMGZ546U763ZHZRXC4A, Interactions: 29
User ID: AGZZXSMMS4WRHHJRBUJZI4FZDHKQ, Interactions: 7
User ID: AGD25H7BIT2JUXSIOPYCYB23J3ZQ, Interactions: 7
User ID: AEXGISIVX7WBUNI7UHHERVB3DF7Q, Interactions: 7
User ID: AEZP6Z2C5AVQDZAJECQYZWQRNG3Q, Interactions: 59
User ID: AGTW6ZGPUAORQ7X6CNBP6PJW7OTA, Interactions: 11
User ID: AHALZ7AKVAVL7QEVBCI55JVLGXOQ, Interactions: 7
User ID: AHTLWVDXSMG5YMVMEIWWOU6XBZMA, Interactions: 7
User ID: AHMG3ALUBE3FEBHODTBHP5J24YDA, Interactions: 5
User ID: AG73BVBKUOH22USSFJA5ZWL

In [22]:
import ast
import json

def load_original_data(txt_file_path):
    """Rebuild a dictionary of datasets from a plain-text dump.

    The file is read line by line, each line stripped of surrounding
    whitespace:
    - lines starting with 'Keys in' or 'dict_keys' are ignored;
    - a line starting with 'Key:' opens a new section named by the rest of
      that line;
    - every other line is buffered as data for the section currently open.
    When a section ends, its buffered lines are joined with single spaces and
    parsed with ast.literal_eval.

    Parameters:
    - txt_file_path: str, path to the text file.

    Returns:
    - data: dict mapping section names to parsed values. A section that
      cannot be parsed is reported on stdout and stored as {}; a section with
      no data lines at all is left out.
    """
    data = {}
    current_key = None
    current_data_lines = []

    def store_section(key, lines):
        # Parse one finished section. Nothing to do before the first key or
        # for a key that had no data lines.
        if key is None or not lines:
            return
        # Strip the joined text: a blank first line would otherwise leave a
        # leading space, which ast.literal_eval rejects before Python 3.10.
        dict_str = ' '.join(lines).strip()
        try:
            data[key] = ast.literal_eval(dict_str)
        except (SyntaxError, ValueError) as e:
            # literal_eval raises ValueError (not SyntaxError) for text that
            # is valid Python but not a plain literal, e.g. a function call.
            print(f"Error parsing data for key '{key}': {e}")
            data[key] = {}

    with open(txt_file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if line.startswith('Keys in') or line.startswith('dict_keys'):
                # Header and dict_keys lines carry no data.
                continue
            if line.startswith('Key:'):
                store_section(current_key, current_data_lines)
                # Always start from an empty buffer so that stray lines seen
                # before the first key are not glued onto its data.
                current_data_lines = []
                current_key = line[len('Key:'):].strip()
            else:
                current_data_lines.append(line)

    # The last section has no following 'Key:' line to trigger parsing.
    store_section(current_key, current_data_lines)
    return data

def calculate_original_interactions(data):
    """
    Count each user's interactions across the train, val and test splits.

    Parameters:
    - data: dict that may contain 'train', 'val', 'test' and 'umap'; any
      missing entry is treated as an empty dict.

    Returns:
    - original_interactions_per_user: dict mapping each user ID in 'umap' to
      the summed length of that user's three sequences (a user index absent
      from a split contributes 0).
    """
    splits = [data.get(name, {}) for name in ('train', 'val', 'test')]
    umap = data.get('umap', {})

    original_interactions_per_user = {}
    for user_id, raw_index in umap.items():
        idx = int(raw_index)
        original_interactions_per_user[user_id] = sum(
            len(split.get(idx, [])) for split in splits
        )
    return original_interactions_per_user

def load_filtered_user_reviews(json_file_path):
    """
    Loads the filtered user reviews from JSON and counts the number of interactions per user.

    The file is expected to hold a list of entries, each with a 'user_id'
    and a 'reviews' list. (This function used to be defined twice in a row
    with identical bodies; only one definition is kept.)

    Parameters:
    - json_file_path: str, path to the filtered_user_reviews.json file.

    Returns:
    - filtered_interactions_per_user: dict, mapping of user IDs to their interaction counts.
      If a user ID appears in several entries, the last entry's count wins.
    """
    # Load the filtered user reviews JSON data
    with open(json_file_path, 'r') as f:
        filtered_user_reviews = json.load(f)

    filtered_interactions_per_user = {}

    for user_data in filtered_user_reviews:
        user_id = user_data['user_id']
        num_reviews = len(user_data['reviews'])
        filtered_interactions_per_user[user_id] = num_reviews

    return filtered_interactions_per_user

def compare_interactions(original_counts, filtered_counts):
    """
    Compares the interaction counts per user between the original data and the filtered reviews.

    Parameters:
    - original_counts: dict, interaction counts from the original data
    - filtered_counts: dict, interaction counts from the filtered reviews

    Returns:
    - comparison_results: list of dicts, each containing 'user_id',
      'original_count', 'filtered_count' and 'discrepancy' (original minus
      filtered). A user missing from one side counts as 0 there. Rows follow
      the order of original_counts, then any users that appear only in
      filtered_counts.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order. Iterating a
    # plain set of string IDs would give a different row order on every run.
    all_user_ids = dict.fromkeys([*original_counts, *filtered_counts])

    comparison_results = []
    for user_id in all_user_ids:
        original_count = original_counts.get(user_id, 0)
        filtered_count = filtered_counts.get(user_id, 0)
        comparison_results.append({
            'user_id': user_id,
            'original_count': original_count,
            'filtered_count': filtered_count,
            'discrepancy': original_count - filtered_count
        })

    return comparison_results
def display_comparison_results(comparison_results):
    """
    Prints a fixed-width table of the comparison rows, then a summary.

    The summary lists every row whose 'discrepancy' is non-zero, or a single
    confirmation line when all rows match.

    Parameters:
    - comparison_results: list of dicts with 'user_id', 'original_count',
      'filtered_count' and 'discrepancy' entries
    """
    # Table header and rule (40 + 15 + 15 + 12 columns plus 3 separators = 85)
    print(f"{'User ID':<40} {'Original Count':<15} {'Filtered Count':<15} {'Discrepancy':<12}")
    print("-" * 85)

    for row in comparison_results:
        uid = row['user_id']
        before, after, delta = row['original_count'], row['filtered_count'], row['discrepancy']
        print(f"{uid:<40} {before:<15} {after:<15} {delta:<12}")

    # Summary section: only rows where the two counts differ
    mismatches = list(filter(lambda row: row['discrepancy'] != 0, comparison_results))
    if not mismatches:
        print("\nAll user interaction counts match between the original data and filtered reviews.")
        return

    print("\nUsers with discrepancies:")
    for row in mismatches:
        print(f"User ID: {row['user_id']}, Discrepancy: {row['discrepancy']}")



In [28]:
# Driver cell: compare per-user interaction counts taken from the original
# dataset text file against the counts in the filtered reviews JSON, then
# print the comparison table.
#
# NOTE(review): this cell uses calculate_original_interactions as if it
# returned a single dict (its result is handed straight to
# compare_interactions, which calls .keys()/.get() on it). The definition
# that appears later in this notebook returns a (train_val, test) tuple, so
# running this cell after that one would pass a tuple -- confirm which
# definition is meant to be in scope here.

# Paths to your data files (relative to the notebook's working directory)
txt_file_path = 'data/dataset.txt'  # Update this path as needed
json_file_path = 'data/filtered_user_reviews.json'  # Update this path as needed

# Step 1: Load original data and calculate interaction counts
original_data = load_original_data(txt_file_path)
original_interactions_per_user = calculate_original_interactions(original_data)

# Step 2: Load filtered user reviews and count interactions
# (one count per 'user_id' entry in the JSON file)
filtered_interactions_per_user = load_filtered_user_reviews(json_file_path)

# Step 3: Compare interaction counts
# (one row per user found in either mapping; discrepancy = original - filtered)
comparison_results = compare_interactions(original_interactions_per_user, filtered_interactions_per_user)

# Step 4: Display and analyze results
# (prints the full table followed by the list of users whose counts differ)
display_comparison_results(comparison_results)


User ID                                  Original Count  Filtered Count  Discrepancy 
-------------------------------------------------------------------------------------
AHBEKWBIK2I7EPZH4L2Z2G4IZWNA             5               5               0           
AE5IMGWRBJA7JQFBQTBK25HDYGVA             14              14              0           
AE3PLZHW6NXWBMZ76TDVFQG2MJFA             15              15              0           
AHCA3FF2KQI7SRT32XBZCNHFEHOQ             8               8               0           
AHT7TDFPRBZE3GH7RFKCPSCEAR5Q             5               5               0           
AHTLWVDXSMG5YMVMEIWWOU6XBZMA             7               7               0           
AESCUI6VXJSHLUIO44Y5ERAHZE4A             9               9               0           
AEGTJSI4X2EZHAL5VWJV3RCJIO4A             7               7               0           
AFNCHMAKUAJOGVCKOA4XGLINHPDQ             15              15              0           
AGC7QCUXJISMEA6RKUGWBKYFA3EA             7            

In [32]:
import ast
import json

def _parse_section(key, lines):
    """Parse the buffered text lines of one 'Key:' section into a Python literal.

    Returns an empty dict, after printing a diagnostic, when the joined text
    is not a valid literal.
    """
    dict_str = ' '.join(lines)
    try:
        return ast.literal_eval(dict_str)
    except (SyntaxError, ValueError, TypeError) as e:
        # literal_eval signals non-literal content (e.g. a bare name such as
        # nan) with ValueError and unhashable dict keys with TypeError, not
        # only SyntaxError; treat all of them as "section could not be parsed".
        print(f"Error parsing data for key '{key}': {e}")
        return {}


def load_original_data(txt_file_path):
    """
    Loads a text dump made of 'Key: <name>' header lines, each followed by the
    printed Python literal for that key, into a dict.

    Lines starting with 'Keys in' or 'dict_keys' are ignored. The lines that
    follow a 'Key:' header (up to the next header or end of file) are joined
    with spaces and parsed with ast.literal_eval. A section that fails to
    parse is reported on stdout and stored as an empty dict. A key with no
    data lines at all is not added to the result.

    Parameters:
    - txt_file_path: str, path to the text dump.

    Returns:
    - data: dict, mapping of each key name to its parsed value

    Raises:
    - OSError if the file cannot be opened.
    """
    data = {}
    current_key = None
    current_data_lines = []

    with open(txt_file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if line.startswith(('Keys in', 'dict_keys')):
                # Header / dict_keys summary lines carry no data
                continue
            if line.startswith('Key:'):
                # A new section begins: flush the previous one first
                if current_key is not None and current_data_lines:
                    data[current_key] = _parse_section(current_key, current_data_lines)
                # Always start the new section with an empty buffer, so that
                # stray lines seen before the first 'Key:' header cannot leak
                # into (and corrupt) the first section.
                current_data_lines = []
                current_key = line.replace('Key:', '').strip()
            else:
                # Collect lines of data for the current section
                current_data_lines.append(line)

    # Flush the last section, which has no following 'Key:' line to trigger it
    if current_key is not None and current_data_lines:
        data[current_key] = _parse_section(current_key, current_data_lines)
    return data

def calculate_original_interactions(data):
    """
    Counts each user's interactions in the original dataset splits.

    Parameters:
    - data: dict; its 'train', 'val', 'test' and 'umap' entries are read, each
      treated as an empty dict when absent. 'umap' maps user IDs to indices,
      which are converted with int() before being looked up in the splits.

    Returns:
    - tuple of two dicts keyed by the user IDs of 'umap':
      (train + val interaction counts, test interaction counts).
      A user whose index is missing from a split contributes 0 for it.
    """
    train_split = data.get('train', {})
    val_split = data.get('val', {})
    test_split = data.get('test', {})

    train_val_totals, test_totals = {}, {}

    for uid, raw_index in data.get('umap', {}).items():
        idx = int(raw_index)
        # Fetch the three sequences first, then measure them
        sequences = [split.get(idx, []) for split in (train_split, val_split, test_split)]
        n_train, n_val, n_test = map(len, sequences)

        train_val_totals[uid] = n_train + n_val
        test_totals[uid] = n_test

    return train_val_totals, test_totals

def load_filtered_user_reviews(json_file_path):
    """
    Loads user reviews from JSON and counts the number of reviews per user.

    The file is expected to hold a JSON list of objects, each with a
    'user_id' entry and a 'reviews' list. If the same 'user_id' appears more
    than once, the count of the last occurrence is kept.

    Parameters:
    - json_file_path: str, path to the user_reviews.json file.

    Returns:
    - filtered_interactions_per_user: dict, mapping of user IDs to their interaction counts

    Raises:
    - OSError if the file cannot be opened.
    - json.JSONDecodeError if the file is not valid JSON.
    - KeyError if an entry lacks 'user_id' or 'reviews'.
    """
    # JSON is UTF-8 by specification; pin the encoding so that reading does
    # not depend on the platform's locale default (e.g. cp1252 on Windows).
    with open(json_file_path, 'r', encoding='utf-8') as f:
        filtered_user_reviews = json.load(f)

    filtered_interactions_per_user = {
        user_data['user_id']: len(user_data['reviews'])
        for user_data in filtered_user_reviews
    }

    return filtered_interactions_per_user

def compare_interactions(original_counts, filtered_counts):
    """
    Compares the interaction counts per user between the original data and the filtered reviews.

    Every user ID present in either mapping produces one row; a user missing
    from one side is counted as 0 on that side. Rows come back in a
    deterministic order: users of `original_counts` in their insertion order,
    followed by users that only appear in `filtered_counts`.

    Parameters:
    - original_counts: dict, interaction counts from the original data
    - filtered_counts: dict, interaction counts from the filtered reviews

    Returns:
    - comparison_results: list of dicts, each containing 'user_id',
      'original_count', 'filtered_count' and 'discrepancy'
      (original_count - filtered_count)
    """
    # Iterating a set union yields hash order, which differs between
    # interpreter runs for string keys. dict.fromkeys removes duplicates while
    # preserving first-seen order, making the printed table reproducible.
    all_user_ids = dict.fromkeys([*original_counts, *filtered_counts])

    comparison_results = []

    for user_id in all_user_ids:
        original_count = original_counts.get(user_id, 0)
        filtered_count = filtered_counts.get(user_id, 0)
        comparison_results.append({
            'user_id': user_id,
            'original_count': original_count,
            'filtered_count': filtered_count,
            'discrepancy': original_count - filtered_count
        })

    return comparison_results

def display_comparison_results(comparison_results):
    """
    Prints the comparison rows as a fixed-width table, followed by a summary.

    After the table, either every row with a non-zero 'discrepancy' is listed,
    or a single line states that all counts match.

    Parameters:
    - comparison_results: list of dicts with 'user_id', 'original_count',
      'filtered_count' and 'discrepancy' entries
    """
    columns = ('user_id', 'original_count', 'filtered_count', 'discrepancy')

    print(f"{'User ID':<40} {'Original Count':<15} {'Filtered Count':<15} {'Discrepancy':<12}")
    print("-" * 85)
    for entry in comparison_results:
        uid, n_original, n_filtered, delta = (entry[col] for col in columns)
        print(f"{uid:<40} {n_original:<15} {n_filtered:<15} {delta:<12}")

    # Rows whose original and filtered counts differ
    flagged = [entry for entry in comparison_results if entry['discrepancy'] != 0]
    if not flagged:
        print("\nAll user interaction counts match between the original data and filtered reviews.")
    else:
        print("\nUsers with discrepancies:")
        for entry in flagged:
            print(f"User ID: {entry['user_id']}, Discrepancy: {entry['discrepancy']}")

# Driver statements: compare per-user interaction counts from the original
# dataset text file against two reviews JSON files, once for the combined
# train+val split and once for the test split, printing a table for each.

# Paths to your data files (relative to the notebook's working directory)
txt_file_path = 'data/dataset.txt'  # Update this path as needed
train_val_json_path = 'data/train_val_user_reviews.json'  # Path to train and val JSON
test_json_path = 'data/test_user_reviews.json'  # Path to test JSON

# Step 1: Load original data and calculate interaction counts
# (calculate_original_interactions returns two dicts: train+val counts and test counts)
original_data = load_original_data(txt_file_path)
original_train_val_counts, original_test_counts = calculate_original_interactions(original_data)

# Step 2: Load filtered user reviews and count interactions
# (the same loader is applied to both JSON files)
filtered_train_val_counts = load_filtered_user_reviews(train_val_json_path)
filtered_test_counts = load_filtered_user_reviews(test_json_path)

# Step 3: Compare interaction counts for train and val
# (discrepancy = original count - filtered count, per user)
print("Comparing Train and Validation Interactions:")
train_val_comparison_results = compare_interactions(original_train_val_counts, filtered_train_val_counts)
display_comparison_results(train_val_comparison_results)

# Step 4: Compare interaction counts for test
print("\nComparing Test Interactions:")
test_comparison_results = compare_interactions(original_test_counts, filtered_test_counts)
display_comparison_results(test_comparison_results)


Comparing Train and Validation Interactions:
User ID                                  Original Count  Filtered Count  Discrepancy 
-------------------------------------------------------------------------------------
AHBEKWBIK2I7EPZH4L2Z2G4IZWNA             4               4               0           
AE5IMGWRBJA7JQFBQTBK25HDYGVA             13              13              0           
AE3PLZHW6NXWBMZ76TDVFQG2MJFA             14              14              0           
AHCA3FF2KQI7SRT32XBZCNHFEHOQ             7               7               0           
AHT7TDFPRBZE3GH7RFKCPSCEAR5Q             4               4               0           
AHTLWVDXSMG5YMVMEIWWOU6XBZMA             6               6               0           
AESCUI6VXJSHLUIO44Y5ERAHZE4A             8               8               0           
AEGTJSI4X2EZHAL5VWJV3RCJIO4A             6               6               0           
AFNCHMAKUAJOGVCKOA4XGLINHPDQ             14              14              0           
AGC7QCUXJ