In [None]:
import requests, time, csv, os
import random
from datetime import datetime, timedelta
import cloudinary
import cloudinary.uploader
from googletrans import Translator
import pandas as pd
from ctgan import CTGAN
import ast

In [None]:
# Shared googletrans client used by translate_vi_to_en().
translator = Translator()

# Memo of source text -> translated text, filled by translate_vi_to_en() so a
# repeated string is only sent to the translation service once.
translate_cache = {}

def translate_vi_to_en(text):
    """Translate Vietnamese ``text`` to English, memoising successful results.

    Args:
        text: Source string. Falsy values (``None``, ``""``) are returned
            unchanged without contacting the translation service.

    Returns:
        The English translation, or the original ``text`` when the lookup
        fails (best effort: one failed translation must not abort a run).
    """
    if not text:
        return text
    if text in translate_cache:
        return translate_cache[text]
    try:
        result = translator.translate(text, src="vi", dest="en")
        translated = result.text
    except Exception as exc:
        # ``except Exception`` instead of a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still stop a long-running cell, and
        # the failure is reported rather than silently hidden.
        print(f"[WARN] Translation failed for {text!r}: {exc!r}")
        return text
    translate_cache[text] = translated
    # Short pause after every real request to stay gentle on the service.
    time.sleep(0.15)
    return translated

# Cloudinary credentials are read from the environment instead of being
# written into the notebook, so they are not shared or committed with it.
# NOTE(review): the key/secret that used to be hardcoded here have been
# exposed wherever this notebook was shared and should be rotated.
cloudinary.config(
    cloud_name=os.environ.get("CLOUDINARY_CLOUD_NAME"),
    api_key=os.environ.get("CLOUDINARY_API_KEY"),
    api_secret=os.environ.get("CLOUDINARY_API_SECRET"),
    secure=True,
)

# --- Scraper settings -------------------------------------------------------
CATEGORY_ID = 53540   # category id placed in the product listing URL
MAX_PRODUCTS = 100    # the scraping loop runs until this many items exist
MAX_PAGE = 150        # intended upper bound on listing pages

# Browser-like User-Agent sent with every HTTP request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}

# --- Mutable scraper state --------------------------------------------------
items = []   # collected product records
page = 1     # next listing page to request

def random_date(start_year=2022, end_year=2025):
    """Pick a random calendar day between two years, both inclusive.

    The day is drawn with ``random.randint`` from 1 January of ``start_year``
    through 31 December of ``end_year`` and returned as a ``YYYY-MM-DD`` string.
    """
    first_day = datetime(start_year, 1, 1)
    last_day = datetime(end_year, 12, 31)
    span_days = (last_day - first_day).days
    offset = timedelta(days=random.randint(0, span_days))
    picked = first_day + offset
    return picked.strftime("%Y-%m-%d")

# Placeholder attribute pools: the scraping loop draws one value from each pool
# at random for every product.
FAKE_MATERIALS = ["ABS Plastic", "Alloy", "Stainless Steel", "Silicone", "Synthetic Fabric"]

FAKE_SIZES = ["S", "M", "L"]

FAKE_COLORS = [
    "Red", "Blue", "Green", "Yellow", "Orange", "Pink",
    "Purple", "Black", "White", "Gray", "Brown",
]

def sql_escape(value):
    """Render ``value`` as a SQL string literal, or ``NULL`` when it is missing.

    Args:
        value: Any value. ``None`` and the pandas/NumPy missing markers
            (``NaN``, ``NaT``, ``pd.NA``) become the bare keyword ``NULL``;
            everything else is converted with ``str()``.

    Returns:
        ``"NULL"``, or a single-quoted literal in which every embedded single
        quote is doubled.
    """
    # Records exported from a DataFrame carry NaN/NaT rather than None; without
    # this check they would be written as the literal strings 'nan' / 'NaT'.
    # is_scalar() keeps pd.isna() from returning an array for list-like input.
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return "NULL"
    return "'" + str(value).replace("'", "''") + "'"

def _sql_number(value):
    """Render a numeric column value, mapping ``None``/``NaN`` to SQL ``NULL``."""
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return "NULL"
    return f"{value}"


def _sql_comment_text(value):
    """Collapse line breaks so ``value`` cannot escape a ``--`` comment line."""
    return " ".join(str(value).splitlines())


def write_sql(items, filename="insert_accessory.sql"):
    """Write one INSERT statement per product record to a SQL script.

    Each record becomes a single statement: a ``new_product`` CTE inserts the
    ``product`` row and returns its ``product_id``, one ``img_<n>`` CTE per
    image URL inserts into ``productimage``, and the main query inserts the
    matching ``accessory`` row.

    Args:
        items: Iterable of dicts with the keys ``product_name``, ``price``,
            ``manufacture_date``, ``entry_date``, ``expiry_date``, ``stock``,
            ``type``, ``size``, ``color``, ``material`` and optionally
            ``imgList`` (a list of image URLs). Must support ``len()``.
        filename: Path of the script to (over)write. Defaults to the name the
            import cell reads back.
    """
    with open(filename, "w", encoding="utf-8") as f:
        for item in items:
            f.write("-- ================================\n")
            # The name comes from scraped/translated text; a line break in it
            # would end the comment and leak the remainder into the script.
            f.write(f"-- {_sql_comment_text(item['product_name'])}\n")

            # 1. Product
            f.write("WITH new_product AS (\n")
            f.write("  INSERT INTO product (\n")
            f.write("    product_name, price, manufacture_date, entry_date, expiry_date, stock, type\n")
            f.write("  ) VALUES (\n")
            f.write(f"    {sql_escape(item['product_name'])},\n")
            # Numbers are written unquoted, so a missing value must become
            # NULL instead of the Python repr None/nan.
            f.write(f"    {_sql_number(item['price'])},\n")
            f.write(f"    {sql_escape(item['manufacture_date'])},\n")
            f.write(f"    {sql_escape(item['entry_date'])},\n")
            f.write(f"    {sql_escape(item['expiry_date'])},\n")
            f.write(f"    {_sql_number(item['stock'])},\n")
            f.write(f"    {sql_escape(item['type'])}\n")
            f.write("  ) RETURNING product_id\n")
            f.write(")")

            # 2. Product images (if any); anything that is not a list/tuple
            #    (missing key, None, NaN) is treated as "no images".
            images = item.get("imgList")
            if not isinstance(images, (list, tuple)):
                images = []
            for idx, img in enumerate(images):
                f.write(",\n")
                f.write(
                    f"img_{idx} AS (\n"
                    "  INSERT INTO productimage (product_id, image_url)\n"
                    f"  SELECT product_id, {sql_escape(img)} FROM new_product\n"
                    ")"
                )
            # Close the CTE list on its own line, with or without images.
            f.write("\n")

            # 3. Accessory
            f.write(
                "INSERT INTO accessory (product_id, size, color, material)\n"
                "SELECT product_id, "
                f"{sql_escape(item['size'])}, "
                f"{sql_escape(item['color'])}, "
                f"{sql_escape(item['material'])} "
                "FROM new_product;\n\n"
            )

    print(f"üíæ ƒê√£ l∆∞u {len(items)} item v√†o {filename}")

In [None]:
# Walk the category listing page by page and fetch each product's detail
# record until MAX_PRODUCTS items are collected or MAX_PAGE pages were tried.
REQUEST_TIMEOUT = 15  # seconds; without a timeout a stalled connection hangs the cell


def _fetch_json(url):
    """GET ``url`` and return the decoded JSON body, or ``None`` on any HTTP or parse failure."""
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, ValueError) as exc:
        print(f"[WARN] Request failed for {url}: {exc}")
        return None


# The page bound matters: an exhausted category returns empty pages forever,
# and the original loop would then never terminate.
while len(items) < MAX_PRODUCTS and page <= MAX_PAGE:
    LIST_API = f"https://tiki.vn/api/v2/products?category={CATEGORY_ID}&page={page}"
    resp = _fetch_json(LIST_API) or {}
    data = resp.get("data", [])

    if not data:
        print(f"[WARN] Page {page} has no data, skipping...")
        page += 1
        continue

    for p in data:
        if len(items) >= MAX_PRODUCTS:
            break

        pid = p["id"]
        detail = _fetch_json(f"https://tiki.vn/api/v2/products/{pid}")
        if not detail:
            continue

        product_name = detail.get("name")
        price = detail.get("price")
        # A payload without name/price cannot produce a valid product row
        # (and slicing a None name below would raise).
        if not product_name or price is None:
            print(f"[WARN] Product {pid} has no name or price, skipping...")
            continue

        stock = random.randint(50, 200)
        type_ = "accessory"

        # Keep at most three image URLs; fall back to a placeholder image.
        imgList = [img["base_url"] for img in detail.get("images", []) if "base_url" in img][:3]
        if not imgList:
            imgList = ["https://dummyimage.com/600x600"]

        manufacture_date = random_date(2022, 2024)
        entry_date = random_date(2024, 2025)
        expiry_date = random_date(2026, 2028)

        material = random.choice(FAKE_MATERIALS)
        size = random.choice(FAKE_SIZES)
        color = random.choice(FAKE_COLORS)

        items.append({
            "product_name": translate_vi_to_en(product_name),
            "price": price,
            "manufacture_date": manufacture_date,
            "entry_date": entry_date,
            "expiry_date": expiry_date,
            "stock": stock,
            "type": type_,
            "imgList": imgList,
            "size": size,
            "color": color,
            "material": material,
        })

        print(f"GET {len(items)} | {product_name[:40]}")

    page += 1

if len(items) < MAX_PRODUCTS:
    print(f"[WARN] Stopped after page limit {MAX_PAGE} with {len(items)}/{MAX_PRODUCTS} products")

In [None]:
# Dump the collected records, then report how many image URLs each one holds.
print(items)
print(len(items))
for idx, row in enumerate(items):
    images = row.get("imgList", [])
    if isinstance(images, list):
        img_count = len(images)
    else:
        img_count = 0
    print(f"S·∫£n ph·∫©m {idx+1}: {img_count} ·∫£nh")

In [None]:
# Tunables for the synthetic-data step.
CTGAN_EPOCHS = 500    # raise this when the dataset is small so the model learns better
N_SYNTHETIC = 3900    # rows to generate, for a total of about 4000 with the scraped ones

# 1. Load the original dataset (about 100 scraped products).
df = pd.DataFrame(items)  # or pd.read_csv("accessory_products.csv")

# List-valued cells are not handled directly by the synthesizer, so imgList is
# flattened into a single ";"-joined string (done once; a repeat would be a no-op).
df['imgList'] = df['imgList'].apply(lambda x: ";".join(x) if isinstance(x, list) else x)

# Dates are modelled as categorical strings.
for col in ["manufacture_date", "entry_date", "expiry_date"]:
    df[col] = df[col].astype(str)

# Columns CTGAN should treat as categorical.
discrete_columns = ["product_name", "type", "size", "color", "material",
                    "manufacture_date", "entry_date", "expiry_date", "imgList"]

# 2. Build the model (once) and fit it.
model = CTGAN(epochs=CTGAN_EPOCHS)
model.fit(df, discrete_columns=discrete_columns)

# 3. Generate the additional synthetic rows.
synthetic_df = model.sample(N_SYNTHETIC)

# price and stock are not in discrete_columns, so they are modelled as
# continuous values and samples can fall outside the scraped range (e.g. a
# negative price). Clamp them to the observed bounds so every row stays valid.
for col in ["price", "stock"]:
    synthetic_df[col] = synthetic_df[col].clip(lower=df[col].min(), upper=df[col].max())

def fix_imglist(imglist):
    """Normalise an ``imgList`` cell into a non-empty list of image URLs.

    A string is split on ``;`` and whitespace-only pieces are dropped; a list
    has its falsy entries removed; any other type counts as "no images".
    When nothing is left, a single placeholder URL is returned instead.
    """
    if isinstance(imglist, str):
        urls = [piece for piece in imglist.split(";") if piece.strip() != ""]
    elif isinstance(imglist, list):
        urls = list(filter(None, imglist))
    else:
        urls = []

    return urls if urls else ["https://dummyimage.com/600x600"]

# Turn the ";"-joined strings back into lists of URLs for the original frame ...
df['imgList'] = df['imgList'].apply(fix_imglist)

# ... and for the synthetic frame.
synthetic_df['imgList'] = synthetic_df['imgList'].apply(fix_imglist)

# 6. Stack original and synthetic rows into one frame with a fresh 0..n-1 index.
frames = [df, synthetic_df]
full_df = pd.concat(frames, ignore_index=True)

# Parse the three date columns; values that cannot be parsed become NaT.
full_df = full_df.assign(**{
    col: pd.to_datetime(full_df[col], errors='coerce')
    for col in ("manufacture_date", "entry_date", "expiry_date")
})

In [None]:
print(len(full_df))  # check the total number of products
# Report the image count per row, reading the imgList column directly.
for idx, images in full_df["imgList"].items():
    if isinstance(images, list):
        img_count = len(images)
    else:
        img_count = 0
    print(f"S·∫£n ph·∫©m {idx+1}: {img_count} ·∫£nh")

In [None]:
# ==== Once everything has been collected, write the SQL script and the CSV a single time ====
records = full_df.to_dict(orient="records")
write_sql(records)
full_df.to_csv("accessory_products_4000.csv", index=False, encoding='utf-8')
print(f"[INFO] ƒê√£ thu th·∫≠p t·ªïng c·ªông {len(full_df)} s·∫£n ph·∫©m")

In [1]:
import os

import psycopg2

# The database password is read from the environment instead of being stored
# in the notebook.
# NOTE(review): the password that used to be hardcoded here has been exposed
# wherever this notebook was shared and should be rotated.
db_password = os.environ.get("SUPABASE_DB_PASSWORD")
if not db_password:
    raise RuntimeError("Set the SUPABASE_DB_PASSWORD environment variable before running this cell")

# Read the script first so a missing file fails before a connection is opened.
with open("insert_accessory.sql", "r", encoding="utf-8") as f:
    sql = f.read()

conn = psycopg2.connect(
    host=os.environ.get("SUPABASE_DB_HOST", "aws-1-ap-south-1.pooler.supabase.com"),
    database=os.environ.get("SUPABASE_DB_NAME", "postgres"),
    user=os.environ.get("SUPABASE_DB_USER", "postgres.rruavcjmtgpxyznzwkhw"),
    password=db_password,
    port=int(os.environ.get("SUPABASE_DB_PORT", "5432")),
)
try:
    # `with conn` commits when the block succeeds and rolls back if it raises;
    # it does not close the connection, hence the surrounding try/finally.
    with conn:
        with conn.cursor() as cur:
            cur.execute(sql)
finally:
    conn.close()
