In [5]:
import gzip
import shutil
import os

def extract_gz_to_file(gz_file_path, output_file_path=None):
    """
    Extracts a .gz (gzip) file and saves it as a new file.

    Parameters:
    ----------
    gz_file_path : str
        The path to the .gz file you want to extract.

    output_file_path : str, optional
        The path where the extracted file will be saved. If not provided,
        the file is saved next to the .gz file with the trailing ".gz"
        suffix removed. If the input name does not end in ".gz", ".out" is
        appended instead so that the input file is never overwritten.

    Returns:
    -------
    output_file_path : str
        The path to the extracted file.

    Raises:
    ------
    FileNotFoundError:
        If the specified .gz file does not exist.

    OSError:
        If there is an error during the extraction process. The original
        error is attached as the cause of the raised exception.
    """
    if not os.path.isfile(gz_file_path):
        raise FileNotFoundError(f"The file {gz_file_path} does not exist.")

    # Set output path by default in the same directory as gz_file_path, with .gz removed
    if output_file_path is None:
        suffix = '.gz'
        # str.rstrip('.gz') would strip every trailing '.', 'g' or 'z'
        # character (e.g. 'events.log.gz' -> 'events.lo'), so remove the
        # suffix explicitly instead.
        if gz_file_path.lower().endswith(suffix):
            output_file_path = gz_file_path[:-len(suffix)]
        else:
            # Without a .gz suffix the derived name would equal the input and
            # opening it for writing would truncate the archive.
            output_file_path = gz_file_path + '.out'

    try:
        # Open the gzip file and the output file, then copy contents
        with gzip.open(gz_file_path, 'rb') as f_in, open(output_file_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    except OSError as e:
        # Chain the original error so the underlying traceback is preserved
        raise OSError(f"Error during extraction: {e}") from e

    print(f"Extraction complete: {output_file_path}")
    return output_file_path

# Example usage: unpack the validation split archive.
gz_input_path = 'All_Beauty.valid.csv.gz'   # source archive; point this at your own .gz file
gz_output_path = 'All_Beauty.valid.csv'     # explicit destination for the extracted file

try:
    extracted_file_path = extract_gz_to_file(
        gz_file_path=gz_input_path,
        output_file_path=gz_output_path,
    )
except Exception as extraction_error:
    # Report the failure instead of aborting the notebook run.
    print(extraction_error)


Extraction complete: All_Beauty.valid.csv


In [24]:
# Import necessary libraries
import pandas as pd
import json
from collections import defaultdict

# Input locations used by the loading cell below; adjust them to your own layout.
REVIEWS_FILE = 'data/All_beauty_more_than_3_with_product.jsonl'        # review records, read line by line as JSON objects
TRAIN_SPLIT_FILE = 'All_Beauty.train.csv'      # training split, read with pandas.read_csv
TEST_SPLIT_FILE = 'All_Beauty.test.csv'        # test split, read with pandas.read_csv
VAL_SPLIT_FILE = 'All_Beauty.valid.csv'          # validation split; same path the gzip example above extracts to


In [25]:
# Step 3: Load Reviews Data
# REVIEWS_FILE is parsed as JSON Lines: one JSON object per non-empty line.
reviews = []
with open(REVIEWS_FILE, 'r', encoding='utf-8') as f:
    for line_number, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline at end of file) are not
            # records; skip them instead of reporting a decode error.
            continue
        try:
            review = json.loads(line)
            reviews.append(review)
        except json.JSONDecodeError as e:
            # Keep going on malformed records, but say where they are.
            print(f"Error decoding JSON on line {line_number}: {e}")
            continue

reviews_df = pd.DataFrame(reviews)
print(reviews_df.columns)
print(f"Total reviews loaded: {len(reviews_df)}")

# Verify if 'product_name' exists
if 'product_name' not in reviews_df.columns:
    print("'product_name' is missing. Attempting to fill with 'asin'.")
    # Option 1: Use 'asin' as 'product_name'
    reviews_df['product_name'] = reviews_df['asin']
    # Option 2: If you have another key, adjust accordingly
    # Example:
    # if 'productTitle' in reviews_df.columns:
    #     reviews_df.rename(columns={'productTitle': 'product_name'}, inplace=True)
else:
    print("'product_name' is present.")

# Step 4: Load Split Data
train_df = pd.read_csv(TRAIN_SPLIT_FILE)
test_df = pd.read_csv(TEST_SPLIT_FILE)
val_df = pd.read_csv(VAL_SPLIT_FILE)

print(f"Train reviews: {len(train_df)}")
print(f"Test reviews: {len(test_df)}")
print(f"Validation reviews: {len(val_df)}")

Index(['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id',
       'timestamp', 'helpful_vote', 'verified_purchase', 'product_name'],
      dtype='object')
Total reviews loaded: 39427
'product_name' is present.
Train reviews: 2029
Test reviews: 253
Validation reviews: 253


In [26]:
def remove_duplicates(df, subset_cols, keep='first'):
    """
    Drops rows that repeat the same values in the given columns and reports
    how many rows were dropped.

    Parameters:
    ----------
    df : pandas.DataFrame
        Source table. It is not modified; a new DataFrame is returned.
    subset_cols : list
        Column names that together define what counts as a duplicate.
    keep : {'first', 'last', False}, default 'first'
        Passed through to DataFrame.drop_duplicates:
        - 'first': keep the first row of each duplicate group.
        - 'last': keep the last row of each duplicate group.
        - False: drop every row that has a duplicate.

    Returns:
    -------
    pandas.DataFrame
        De-duplicated copy with a fresh 0..n-1 index.
    """
    deduped = df.drop_duplicates(subset=subset_cols, keep=keep)
    deduped = deduped.reset_index(drop=True)

    # Report the number of rows that disappeared.
    dropped_count = len(df) - len(deduped)
    print(f"Duplicates removed: {dropped_count}")
    return deduped

# Rows sharing the same user, product and timestamp are treated as duplicates.
duplicate_subset = ['user_id', 'parent_asin', 'timestamp']

# De-duplicate the full review table, keeping the first row of each group.
reviews_df = remove_duplicates(reviews_df, duplicate_subset)

# (Optional) Apply the same rule to each split table.
train_df = remove_duplicates(train_df, duplicate_subset)
test_df = remove_duplicates(test_df, duplicate_subset)
val_df = remove_duplicates(val_df, duplicate_subset)


Duplicates removed: 2575
Duplicates removed: 0
Duplicates removed: 0
Duplicates removed: 0


In [27]:
# Initialize 'split' column as a placeholder; every row is overwritten further
# down in this cell when determine_split is applied to reviews_df.
reviews_df['split'] = None

# Build the lookup keys for one split table
def assign_split(df, split_label):
    """
    Collects the (user_id, parent_asin, timestamp) keys present in a table.

    Despite its name, this function does not label anything: it only builds
    the set of keys used later to look up which split a review belongs to.

    Parameters:
    ----------
    df : pandas.DataFrame
        Table with 'user_id', 'parent_asin' and 'timestamp' columns.
    split_label : str
        Accepted for the caller's convenience but not used.

    Returns:
    -------
    set of tuple
        One (user_id, parent_asin, timestamp) tuple per distinct row key.
    """
    key_columns = zip(df['user_id'], df['parent_asin'], df['timestamp'])
    return {(user_id, parent_asin, timestamp) for user_id, parent_asin, timestamp in key_columns}

# Key sets for each split; determine_split looks reviews up in these.
train_identifiers = assign_split(df=train_df, split_label='train')
test_identifiers = assign_split(df=test_df, split_label='test')
val_identifiers = assign_split(df=val_df, split_label='val')

# Map one review row to the name of the split that contains it
def determine_split(row):
    """
    Returns the split a review belongs to.

    The row's (user_id, parent_asin, timestamp) key is looked up in the
    module-level train_identifiers, test_identifiers and val_identifiers
    sets, in that order; the first set containing the key wins.

    Parameters:
    ----------
    row : mapping
        Anything indexable by 'user_id', 'parent_asin' and 'timestamp',
        e.g. a DataFrame row.

    Returns:
    -------
    str
        'train', 'test', 'val', or 'unknown' when no set contains the key.
    """
    key = (row['user_id'], row['parent_asin'], row['timestamp'])

    if key in train_identifiers:
        return 'train'
    if key in test_identifiers:
        return 'test'
    if key in val_identifiers:
        return 'val'
    return 'unknown'  # Or handle as needed

# Label every review row with its split (row-wise lookup).
split_labels = reviews_df.apply(determine_split, axis=1)
reviews_df['split'] = split_labels

# Check distribution
split_counts = reviews_df['split'].value_counts()
print(split_counts)


split
unknown    34317
train       2029
test         253
val          253
Name: count, dtype: int64


In [28]:
# Display the column index to confirm 'split' now sits alongside the review fields.
reviews_df.columns

Index(['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id',
       'timestamp', 'helpful_vote', 'verified_purchase', 'product_name',
       'split'],
      dtype='object')

In [29]:
# Group the reviews of one split by user, ready for JSON serialisation
def create_json_split(df, split_label):
    """
    Builds a per-user list of reviews for the rows of one split.

    Parameters:
    ----------
    df : pandas.DataFrame
        Review table with a 'split' column plus 'user_id', 'product_name',
        'parent_asin', 'rating', 'title', 'text' and 'timestamp'.
    split_label : str
        Value of the 'split' column to select.

    Returns:
    -------
    list of dict
        One {"user_id": ..., "reviews": [...]} entry per user, in order of
        each user's first appearance; reviews keep their row order.
    """
    review_fields = ("product_name", "parent_asin", "rating", "title", "text", "timestamp")
    rows_in_split = df[df['split'] == split_label]

    reviews_by_user = {}
    for _, row in rows_in_split.iterrows():
        entry = {field: row[field] for field in review_fields}
        reviews_by_user.setdefault(row['user_id'], []).append(entry)

    # Convert to the desired list of dictionaries
    return [
        {"user_id": user_id, "reviews": entries}
        for user_id, entries in reviews_by_user.items()
    ]

# Build the per-user review structure for each labelled split.
train_json = create_json_split(df=reviews_df, split_label='train')
test_json = create_json_split(df=reviews_df, split_label='test')
val_json = create_json_split(df=reviews_df, split_label='val')

# Preview the first two users of the training split.
sample_users = train_json[:2]
print(json.dumps(sample_users, indent=4))


[
    {
        "user_id": "AFSKPY37N3C43SOI5IEXEK5JSIYA",
        "reviews": [
            {
                "product_name": "Keratin Secrets Do It Yourself Home Keratin System",
                "parent_asin": "B07SLFWZKN",
                "rating": 3.0,
                "title": "Just ok",
                "text": "I try to get Keratin treatments every 3 months, but honestly it has been getting costly. So, when I saw this I was excited to try it. I found it difficult to use and almost impossible to get to saturate the back of my hair and straight iron it the way they do in the salon. Front and sides were ok, but I couldn't maneuver the back to get it straight. Then I saw the ingredients after the first time and saw it contained formaldehyde and that was the last time I used the actual treatment. I did, however, use the shampoo and conditioner (and I still am). I wish they sold the S&C separate because I really did like it and I am always in the market for a good hair wash which won't s

In [30]:
# Write each split to its own UTF-8 JSON file, keeping non-ASCII text readable.
for output_name, payload in (
    ('train_output.json', train_json),
    ('test_output.json', test_json),
    ('val_output.json', val_json),
):
    with open(output_name, 'w', encoding='utf-8') as f:
        json.dump(payload, f, ensure_ascii=False, indent=4)

print("JSON files saved successfully.")


JSON files saved successfully.


In [4]:
import re

def extract_skipped_users(file_path):
    """
    Collects the user numbers reported as skipped in a results text file.

    A line counts when, after stripping surrounding whitespace, it starts
    with "User X skipped after Y retries." where X and Y are integers.

    Returns the X values as a list of ints, in file order.
    """
    skipped_line = re.compile(r'User (\d+) skipped after \d+ retries\.')

    with open(file_path, 'r') as file:
        hits = (skipped_line.match(raw_line.strip()) for raw_line in file)
        # Keep only the lines that matched and pull out the user number (X)
        return [int(hit.group(1)) for hit in hits if hit]

# Example usage: list the skipped users recorded in each results file.
for file_path in (
    'results_None_2024-11-15_with_description_2_samples.txt',
    'results_None_2024-11-15_with_description_3_samples.txt',
    'results_None_2024-11-14_12pm_samples.txt',
    'results_None_2024-11-14_3_samples.txt',
    'results_None_2024-11-14_2_samples.txt',
):
    skipped_users = extract_skipped_users(file_path)
    print("Skipped Users:", skipped_users)


Skipped Users: [4, 7, 8, 12, 17, 28, 31, 35, 40, 41, 43, 62, 65, 81, 87, 90, 93, 124, 129, 143, 159, 164, 174, 204, 205, 207, 218, 221, 231, 236, 240, 244]
Skipped Users: [4, 7, 8, 12, 17, 28, 31, 35, 40, 41, 43, 62, 65, 81, 87, 90, 93, 124, 129, 143, 159, 164, 174, 204, 205, 207, 218, 221, 231, 236, 240, 244]
Skipped Users: [4, 7, 8, 12, 17, 28, 31, 35, 40, 41, 43, 62, 65, 81, 87, 90, 93, 124, 129, 143, 159, 164, 174, 204, 205, 207, 218, 221, 231, 236, 240, 244]
Skipped Users: [4, 7, 8, 12, 17, 28, 31, 35, 40, 41, 43, 62, 65, 81, 87, 90, 93, 124, 129, 143, 159, 164, 174, 204, 205, 207, 218, 221, 231, 236, 240, 244]
Skipped Users: [4, 7, 8, 12, 17, 28, 31, 35, 40, 41, 43, 62, 65, 81, 87, 90, 93, 124, 129, 143, 159, 164, 174, 204, 205, 207, 218, 221, 231, 236, 240, 244]


In [None]:
import re

def extract_perfect_matches(file_path):
    """
    Extract users, product IDs, and positions where similarity is 100.00% and the product matched.

    The file is scanned line by line (surrounding whitespace stripped):
    - a line of the form "User N:" starts the results of user N;
    - a line containing "Profile" ends the current user's results;
    - while inside a user's results, a line containing
      "Similarity: 100.00% - Match" and starting with
      "<position>. <PRODUCT_ID> -" is recorded.

    Every qualifying line is recorded, so a user can appear several times.
    Note that a later cell in this notebook redefines a function with the
    same name that keeps only the best position per user.

    Returns a list of (user, product_id, position) tuples in file order.
    """
    # Compiled once instead of on every line.
    user_header = re.compile(r'^User (\d+):$')
    ranked_product = re.compile(r'^\s*(\d+)\.\s+([A-Z0-9]+)\s+-')

    perfect_matches = []
    current_user = None
    capturing = False

    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()

            # Match the start of a user's results
            user_match = user_header.match(line)
            if user_match:
                current_user = int(user_match.group(1))
                capturing = True
                continue

            # Stop capturing at a "Profile" line. A new "User N:" header never
            # reaches this point (handled above), so no second header check
            # is needed here.
            if "Profile" in line:
                capturing = False

            # Capture lines with "100.00% - Match"
            if capturing and "Similarity: 100.00% - Match" in line:
                # Match the product ID and position
                product_match = ranked_product.match(line)
                if product_match:
                    position = int(product_match.group(1))
                    product_id = product_match.group(2)
                    perfect_matches.append((current_user, product_id, position))

    return perfect_matches

# Example usage: print every perfect match found in each results file.
# One loop replaces the copy-pasted per-file stanzas; the extra re-parse of the
# last file that followed them was redundant and is gone (after the loop,
# `matches` already holds the last file's results).
for file_path in (
    'results_None_2024-11-15_with_description_2_samples.txt',
    'results_None_2024-11-15_with_description_3_samples.txt',
    'results_None_2024-11-14_12pm_samples.txt',
    'results_None_2024-11-14_3_samples.txt',
    'results_None_2024-11-14_2_samples.txt',
):
    matches = extract_perfect_matches(file_path)
    print(f"File path {file_path}")
    for user, product_id, position in matches:
        print(f"User {user}, Product ID: {product_id}, Position: {position}")



File path results_None_2024-11-15_with_description_2_samples.txt
User 23, Product ID: B07JGD2T2J, Position: 5
User 61, Product ID: B092M5K59T, Position: 15
User 61, Product ID: B092M5K59T, Position: 16
User 67, Product ID: B09C5NQSC5, Position: 13
User 67, Product ID: B09C5NQSC5, Position: 17
User 72, Product ID: B09GVHT2D3, Position: 3
User 75, Product ID: B08MC3ZLV4, Position: 14
User 196, Product ID: B08KWN77LW, Position: 18
User 203, Product ID: B09GVHT2D3, Position: 1
File path results_None_2024-11-15_with_description_3_samples.txt
User 20, Product ID: B08S1LWF9V, Position: 9
User 23, Product ID: B07JGD2T2J, Position: 7
User 23, Product ID: B07JGD2T2J, Position: 9
User 54, Product ID: B07SLFWZKN, Position: 5
User 61, Product ID: B092M5K59T, Position: 19
User 72, Product ID: B09GVHT2D3, Position: 4
User 95, Product ID: B08W8LKLHB, Position: 7
User 120, Product ID: B088PYN4VM, Position: 7
User 126, Product ID: B07SLFWZKN, Position: 5
User 160, Product ID: B0949MJRHK, Position: 6
Use

In [10]:
import re
from collections import Counter

def extract_perfect_matches(file_path):
    """
    Extract users, product IDs, and positions where similarity is 100.00% and the product matched.

    The file is scanned line by line (surrounding whitespace stripped):
    - a line of the form "User N:" starts the results of user N;
    - a line containing "Profile" ends the current user's results;
    - while inside a user's results, a line containing
      "Similarity: 100.00% - Match" and starting with
      "<position>. <PRODUCT_ID> -" is a candidate.

    Only the candidate with the smallest position is kept for each user
    (the first one seen wins a tie). This definition replaces the earlier
    function of the same name in this notebook, which kept every candidate.

    Returns a list of (user, product_id, position) tuples, one per user, in
    order of each user's first candidate.
    """
    # Compiled once instead of on every line.
    user_header = re.compile(r'^User (\d+):$')
    ranked_product = re.compile(r'^\s*(\d+)\.\s+([A-Z0-9]+)\s+-')

    perfect_matches = {}
    current_user = None
    capturing = False

    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()

            # Match the start of a user's results
            user_match = user_header.match(line)
            if user_match:
                current_user = int(user_match.group(1))
                capturing = True
                continue

            # Stop capturing at a "Profile" line. A new "User N:" header never
            # reaches this point (handled above), so no second header check
            # is needed here.
            if "Profile" in line:
                capturing = False

            # Capture lines with "100.00% - Match"
            if capturing and "Similarity: 100.00% - Match" in line:
                # Match the product ID and position
                product_match = ranked_product.match(line)
                if product_match:
                    position = int(product_match.group(1))
                    product_id = product_match.group(2)

                    # Only keep the best position for each user
                    if current_user not in perfect_matches or perfect_matches[current_user][1] > position:
                        perfect_matches[current_user] = (product_id, position)

    # Convert to a list of tuples for compatibility
    return [(user, product_id, position) for user, (product_id, position) in perfect_matches.items()]

def analyze_matches(all_matches):
    """
    Counts how often each user and each product occurs in the matches.

    Parameters:
    ----------
    all_matches : iterable of (user, product, position)
        Match triples; the position is ignored.

    Returns:
    -------
    tuple of (list, list)
        (user, count) pairs and (product, count) pairs, each ordered from
        most to least frequent as given by Counter.most_common().
    """
    # Single pass over the input; each item must unpack into exactly three parts.
    pairs = [(user, product) for user, product, _ in all_matches]

    user_counter = Counter(user for user, _ in pairs)
    product_counter = Counter(product for _, product in pairs)

    return user_counter.most_common(), product_counter.most_common()

def process_files(file_paths):
    """
    Extracts the perfect matches from each results file, prints them, and
    then prints how often each user and product occurs across all files.

    Parameters:
    ----------
    file_paths : iterable of str
        Results files to parse, processed in the given order.

    Returns:
    -------
    None
        Everything is reported through print().
    """
    combined = []

    for file_path in file_paths:
        print(f"Processing file: {file_path}")
        file_matches = extract_perfect_matches(file_path)
        combined += file_matches
        for user, product_id, position in file_matches:
            print(f"User {user}, Product ID: {product_id}, Position: {position}")
        print("-" * 40)

    # Frequency analysis over the matches of every file together
    print("\nAnalyzing all matches...")
    user_ranking, product_ranking = analyze_matches(combined)

    print("\nMost common users:")
    for user, count in user_ranking:
        print(f"User {user}: {count} times")

    print("\nMost common products:")
    for product, count in product_ranking:
        print(f"Product {product}: {count} times")

# Example usage: results files to analyse, processed in the order listed.
file_paths = [
    'results_None_2024-11-15_with_description_2_samples.txt',
    'results_None_2024-11-15_with_description_3_samples.txt',
    'results_None_2024-11-14_12pm_samples.txt',
    'results_None_2024-11-14_3_samples.txt',
    'results_None_2024-11-14_2_samples.txt'
]

# Prints the per-file matches followed by the user and product frequency counts.
process_files(file_paths)



Processing file: results_None_2024-11-15_with_description_2_samples.txt
User 23, Product ID: B07JGD2T2J, Position: 5
User 61, Product ID: B092M5K59T, Position: 15
User 67, Product ID: B09C5NQSC5, Position: 13
User 72, Product ID: B09GVHT2D3, Position: 3
User 75, Product ID: B08MC3ZLV4, Position: 14
User 196, Product ID: B08KWN77LW, Position: 18
User 203, Product ID: B09GVHT2D3, Position: 1
----------------------------------------
Processing file: results_None_2024-11-15_with_description_3_samples.txt
User 20, Product ID: B08S1LWF9V, Position: 9
User 23, Product ID: B07JGD2T2J, Position: 7
User 54, Product ID: B07SLFWZKN, Position: 5
User 61, Product ID: B092M5K59T, Position: 19
User 72, Product ID: B09GVHT2D3, Position: 4
User 95, Product ID: B08W8LKLHB, Position: 7
User 120, Product ID: B088PYN4VM, Position: 7
User 126, Product ID: B07SLFWZKN, Position: 5
User 160, Product ID: B0949MJRHK, Position: 6
User 162, Product ID: B09C5NQSC5, Position: 12
User 200, Product ID: B08G4Y4SFV, Posi

### Combine the train and validation sets

In [25]:
import json

# Step 1: Load JSON files
def load_json(file_path):
    """
    Reads a JSON document from file_path and returns the parsed object.

    The file is decoded as UTF-8 explicitly rather than with the platform's
    default encoding: the JSON files saved earlier in this notebook are
    written with encoding='utf-8' and ensure_ascii=False, so they can
    contain raw non-ASCII characters.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# File paths for the JSON files to merge.
# NOTE(review): these live under new_data/ and are not the train_output.json /
# val_output.json files written earlier in this notebook — confirm how they
# were produced.
file1_path = 'new_data/new_train_output.json'  # first dataset; passed to merge_json as data1
file2_path = 'new_data/new_val_output.json'  # second dataset; passed to merge_json as data2

# Load JSON data
data1 = load_json(file1_path)
data2 = load_json(file2_path)

# Step 2: Merge JSON data while retaining all of JSON 1
def merge_json(data1, data2):
    """
    Merges the users of data2 into data1 and returns data1.

    Both arguments are lists of {"user_id": ..., "reviews": [...]} dicts.
    For a user_id already present, data2's reviews are appended to the
    existing entry. For a new user_id, a copy of the entry is appended to
    data1 and registered in the lookup, so a user_id that occurs several
    times in data2 still ends up as a single merged entry.

    data1 is modified in place (and returned). data2 is left untouched: new
    entries are copied, including their review list, so later in-place
    changes to the merged data (such as sorting reviews) do not leak back
    into data2.
    """
    # Create a dictionary for quick lookup of user_id in data1
    user_map = {user['user_id']: user for user in data1}

    for user in data2:
        user_id = user['user_id']
        if user_id in user_map:
            # Add reviews to the existing user in data1
            user_map[user_id]['reviews'].extend(user['reviews'])
        else:
            # Add a copy of the new user to data1 and remember it, so a
            # repeated user_id in data2 extends this entry instead of
            # creating a duplicate.
            new_user = dict(user)
            new_user['reviews'] = list(user.get('reviews', []))
            data1.append(new_user)
            user_map[user_id] = new_user
    return data1

# Step 3: Sort reviews by timestamp
def sort_reviews_by_timestamp(data):
    """
    Sorts each user's 'reviews' list in place by ascending 'timestamp'.

    Reviews with equal timestamps keep their relative order. The same
    `data` list that was passed in is returned.
    """
    def timestamp_of(review):
        return review['timestamp']

    for entry in data:
        entry['reviews'].sort(key=timestamp_of)
    return data

# Fold the users of data2 into data1. merge_json returns data1 itself, so
# merged_data and data1 refer to the same list.
merged_data = merge_json(data1=data1, data2=data2)

# Order each user's reviews by timestamp. The sort happens in place, so
# sorted_data is the same list as merged_data.
sorted_data = sort_reviews_by_timestamp(data=merged_data)

# Step 4: Test the merging
def test_merge(data1, data2, merged_data):
    for user in data2:
        user_id = user['user_id']
        # Find the user in the merged data
        merged_user = next((u for u in merged_data if u['user_id'] == user_id), None)
        if not merged_user:
            print(f"Test failed: User {user_id} not found in merged data.")
            return False
        # Check if all reviews from data2 are in the merged data
        for review in user['reviews']:
            if review not in merged_user['reviews']:
                print(f"Test failed: Review {review} not found in merged data for user {user_id}.")
                return False
    print("Test passed: All reviews from data2 are correctly merged into data1.")
    return True

# Sanity-check the merge. The boolean result is not acted on here; test_merge
# prints the outcome itself.
test_merge(data1, data2, merged_data=sorted_data)

# Step 5: Save the sorted merged data to a file (optional)
output_path = "sorted_merged_data.json"
with open(output_path, 'w') as sink:
    json.dump(sorted_data, sink, indent=4)

print(f"Sorted merged data saved to {output_path}")


Test passed: All reviews from data2 are correctly merged into data1.
Sorted merged data saved to sorted_merged_data.json
