In [7]:
import pandas as pd
import json
import ast
import datetime
import random
from dateutil.relativedelta import relativedelta

# Category identifier written into the id_category column of every product row.
id_category = "bdab0052-6bd4-45bd-a98c-f9579b45d2d2"
# Directory that holds the input CSV and receives the generated CSV/SQL files.
folder_data = "data/earring"

In [8]:
import pandas as pd
import datetime
import csv
import uuid

# Read the raw product table from the data folder
products_csv_path = f'{folder_data}/tiki_products.csv'
df = pd.read_csv(products_csv_path)

# Convert images to list
def parse_images(img_str):
    """Split a comma-separated image string into a list of URLs.

    Args:
        img_str: Raw cell value from the ``images`` column. Anything that is
            not a ``str`` (for example the NaN pandas uses for a missing
            cell) is treated as "no images".

    Returns:
        list[str]: The URLs with surrounding whitespace removed. Empty
        fragments (from an empty string, doubled commas, or a trailing
        comma) are dropped so callers never receive ``''`` as a URL.
    """
    if not isinstance(img_str, str):
        return []
    # Strip first, then filter: "a, ,b" must not yield a blank entry.
    stripped = (part.strip() for part in img_str.split(','))
    return [url for url in stripped if url]

# Per-row list of image URLs, plus the first URL (or None) as the avatar
image_lists = df['images'].map(parse_images)
df['image_list'] = image_lists
df['avatar'] = image_lists.map(lambda urls: urls[0] if urls else None)

# Convert to Postgres array format
def to_pg_array(url_list):
    """Render a list of strings as a PostgreSQL array literal.

    Args:
        url_list: Sequence of strings. A falsy value (empty list, ``None``)
            produces the empty array.

    Returns:
        str: ``"{}"`` for no elements, otherwise ``{"a","b",...}`` with each
        element wrapped in double quotes. Backslashes and double quotes
        inside an element are backslash-escaped so a single element cannot
        break out of its quotes and corrupt the literal.
    """
    if not url_list:
        return "{}"

    def quote_element(element):
        # Escape backslashes first so the backslashes added for quotes
        # are not themselves doubled.
        escaped = element.replace("\\", "\\\\").replace('"', '\\"')
        return f'"{escaped}"'

    return "{" + ",".join(quote_element(u) for u in url_list) + "}"

# Postgres array literal of every image URL for the row
df['url_img'] = df['image_list'].map(to_pg_array)

# Every row is assigned the same category
df['id_category'] = id_category


# Pricing helpers: both results are rounded to the nearest thousand.
def _opening_bid(price):
    """Half of the listed price."""
    return int(round(price * 0.5, -3))


def _bid_increment(price):
    """2% of the listed price, but never less than 1000."""
    return int(round(max(1000, price * 0.02), -3))


listed_price = df['price']
df['immediate_purchase_price'] = listed_price
df['starting_price'] = listed_price.map(_opening_bid)
df['pricing_step'] = listed_price.map(_bid_increment)

# One shared posting timestamp for the whole batch
now = datetime.datetime.now()
df['posted_date_time'] = now.isoformat()

def generate_end_date():
    """Return an ISO-8601 end timestamp a random distance in the future.

    The result is the current time plus 14 days, then plus a whole number
    of months drawn from 1 to 3 inclusive.
    """
    start = datetime.datetime.now()
    months_ahead = random.randint(1, 3)
    # Add the days first and the months second; the order is kept deliberately.
    after_fortnight = start + relativedelta(days=14)
    return (after_fortnight + relativedelta(months=months_ahead)).isoformat()

# One independently generated end date per product row
df['end_date_time'] = [generate_end_date() for _ in range(len(df))]

# User: the same account is recorded as both creator and last updater
seed_user_id = "158f7354-ce82-42ca-a4f2-e276f2fdfb01"
df['created_by'] = seed_user_id
df['updated_by'] = seed_user_id

# The HTML description column is exposed as plain 'description'
df.rename(columns={'desc_html': 'description'}, inplace=True)

# Fresh random UUID for each product
df['id_product'] = [str(uuid.uuid4()) for _ in range(len(df))]

# Columns exported for the product table, in output order
final_cols = [
    # identity
    'id_product', 'id_category',
    # presentation
    'avatar', 'name',
    # listed and buy-now price
    'price', 'immediate_purchase_price',
    # listing window
    'posted_date_time', 'end_date_time',
    'description',
    # bidding parameters
    'pricing_step', 'starting_price',
    'url_img',
    # audit
    'updated_by', 'created_by',
]

products_df = df[final_cols]

# Persist the normalized table next to the input
products_df.to_csv(f'{folder_data}/normalized_products.csv', index=False)

# ================================
# GENERATE SQL FOR PRODUCT TABLE
# ================================
def escape_sql(val):
    """Render a scalar as a SQL literal.

    Missing values (as judged by ``pd.isna``) become ``NULL``; strings are
    wrapped in single quotes with embedded single quotes doubled; anything
    else is passed through ``str``.
    """
    if pd.isna(val):
        return "NULL"
    if not isinstance(val, str):
        return str(val)
    doubled = val.replace("'", "''")
    return "'" + doubled + "'"

# Build one INSERT statement per product row.
#
# Every value goes through escape_sql so that missing values (None / NaN,
# e.g. a product without an avatar) are written as NULL instead of the bare
# tokens None / nan, and single quotes inside any string -- including the
# url_img array literal -- are doubled instead of ending the SQL string early.
sql_product = ["-- INSERT INTO product"]

# The column list is identical for every statement, so join it once.
column_list = ', '.join(final_cols)

for _, row in products_df.iterrows():
    vals = [escape_sql(row[col]) for col in final_cols]
    sql_product.append(
        f"INSERT INTO product ({column_list}) VALUES ({', '.join(vals)});"
    )

with open(f"{folder_data}/insert_product.sql", "w", encoding="utf-8") as f:
    f.write("\n".join(sql_product))


# ==========================================
# GENERATE SQL FOR description_history TABLE
# ==========================================
history_sql = ["-- INSERT INTO description_history"]

# Walk the three needed columns in lockstep, one statement per product
history_columns = zip(
    products_df['id_product'],
    products_df['posted_date_time'],
    products_df['description'],
)
history_sql.extend(
    f"INSERT INTO description_history (id_product, time, description) "
    f"VALUES ('{product_id}', '{posted_at}', {escape_sql(description)});"
    for product_id, posted_at, description in history_columns
)

with open(f"{folder_data}/insert_description_history.sql", "w", encoding="utf-8") as f:
    f.write("\n".join(history_sql))

print("Generated normalized_products.csv, insert_product.sql, insert_description_history.sql successfully!")

Generated normalized_products.csv, insert_product.sql, insert_description_history.sql successfully!
