# Data Preprocessing for Data Mining
This notebook preprocesses `Amazon-Products.csv` into a format suitable for data mining, matching the format of `new_product.csv`.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import os
import re

# Paths: raw dataset location and where the processed CSV will be written.
input_path = '../dataset/Amazon-Products.csv'
output_dir = '../pre_processing'
output_filename = 'Amazon-Products_processed.csv'
output_path = os.path.join(output_dir, output_filename)

# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

print(f"Loading {input_path}...")
df = pd.read_csv(input_path)

In [None]:
# First plain decimal number in a string, e.g. "1299", "1299.50" or ".5".
_PRICE_NUMBER_RE = re.compile(r'\d*\.?\d+')


def clean_price(price_str):
    """Convert a raw price value into a float.

    Thousands separators (commas) are removed and the first decimal number
    found in the text is parsed. Anything around that number (currency
    symbols, text prefixes, whitespace) is ignored.

    Extracting the first number, rather than deleting every non-digit
    character and concatenating the rest, avoids fusing unrelated pieces:
    "Rs. 499" yields 499.0 (not 0.499) and "100 - 200" yields 100.0
    (not 100200.0).

    Args:
        price_str: Raw price; may be a string, a number, None or NaN.

    Returns:
        The parsed price as a float, or 0.0 when the value is missing,
        empty, or contains no number.
    """
    if pd.isna(price_str) or price_str == '':
        return 0.0
    # Drop thousands separators so "1,299.00" reads as one number.
    text = str(price_str).replace(',', '')
    match = _PRICE_NUMBER_RE.search(text)
    if match is None:
        return 0.0
    return float(match.group())

def clean_ratings_count(count):
    """Parse a ratings-count value into a float.

    Commas and double quotes are stripped before conversion. Missing, empty
    or non-numeric values yield 0.0.
    """
    is_missing = pd.isna(count) or count == ''
    if is_missing:
        return 0.0

    # Remove thousands separators and stray quote characters.
    text = str(count)
    for unwanted in (',', '"'):
        text = text.replace(unwanted, '')

    try:
        parsed = float(text)
    except ValueError:
        parsed = 0.0
    return parsed

print("Cleaning numeric columns...")

# Both price columns are parsed with the same rule.
for price_col in ('discount_price', 'actual_price'):
    df[price_col] = df[price_col].apply(clean_price)

df['no_of_ratings'] = df['no_of_ratings'].apply(clean_ratings_count)

# Non-numeric ratings are coerced to NaN and then replaced with 0.0.
numeric_ratings = pd.to_numeric(df['ratings'], errors='coerce')
df['ratings'] = numeric_ratings.fillna(0.0)

In [None]:
# Approximate number of INR per one USD; prices are divided by this rate.
exchange_rate = 83.0
print(f"Converting prices to USD (Rate: {exchange_rate})...")

# (source column, converted column) pairs; results are rounded to 2 decimals.
for inr_col, usd_col in (
    ('discount_price', 'discount_price_usd'),
    ('actual_price', 'actual_price_usd'),
):
    converted = df[inr_col] / exchange_rate
    df[usd_col] = converted.round(2)

In [None]:
print("Encoding categories...")

# One encoder per column; values are cast to str before fitting.
le_main = LabelEncoder()
main_labels = df['main_category'].astype(str)
df['main_category_encoded'] = le_main.fit_transform(main_labels)

le_sub = LabelEncoder()
sub_labels = df['sub_category'].astype(str)
df['sub_category_encoded'] = le_sub.fit_transform(sub_labels)

In [None]:
# Maps a lowercase main-category keyword to its three-letter ID prefix.
prefix_map = {
    'home & kitchen': 'hkt',
    'appliances': 'apl',
    'electronics': 'ele',
    'accessories': 'acs',
    'toys & games': 'toy',
    'beauty & health': 'bth',
    'grocery & gourmet foods': 'gro',
    'sports & outdoors': 'spo',
    'clothing & accessories': 'cla',
    'shoes': 'sho'
}


def generate_id(row):
    """Build a product ID from a row's main category and index label.

    The lowercased ``main_category`` is searched for the keywords in
    ``prefix_map``. When several keywords match, the longest one wins, so a
    specific key such as 'clothing & accessories' is not shadowed by the
    shorter 'accessories'. Rows matching no keyword get the prefix 'prd'.

    Args:
        row: A DataFrame row (Series) with a 'main_category' entry and an
            integer ``name`` (its index label).

    Returns:
        The prefix followed by the index label zero-padded to six digits.
    """
    cat = str(row['main_category']).lower()
    matches = [key for key in prefix_map if key in cat]
    # Longest match = most specific keyword; ties keep dict order.
    prefix = prefix_map[max(matches, key=len)] if matches else 'prd'
    return f"{prefix}{row.name:06d}"

print("Generating unique IDs...")
# Apply generate_id row by row (axis=1) and store the result as the 'id' column.
product_ids = df.apply(generate_id, axis=1)
df['id'] = product_ids

In [None]:
# Columns written to the processed CSV, in output order.
final_cols = [
    'id',
    'name',
    'main_category',
    'main_category_encoded',
    'sub_category',
    'sub_category_encoded',
    'image',
    'link',
    'ratings',
    'no_of_ratings',
    'discount_price_usd',
    'actual_price_usd',
]

print(f"Saving processed data to {output_path}...")
# Select only the export columns and write them without the index column.
processed = df[final_cols]
processed.to_csv(output_path, index=False)
print("Preprocessing Complete!")