### Add header names

In [None]:
import numpy as np
import pandas as pd

# Column names for the header-less raw export, in file order
header_names = ['item_path', 'item_image', 'item_name', 'fixed_item_price', 'sale_item_price', 'sales_number', 'shop_path', 'shop_name']

# Read the raw CSV (it has no header row) with every field kept as text.
# With pandas' default type inference a dot-grouped price such as "150.000"
# can be parsed as the float 150.0 and would then be written back as "150.0",
# losing the original digits before the price-cleaning step runs.
df = pd.read_csv("data.csv", header=None, dtype=str)

# Save the DataFrame to a new CSV file with the specified headers and no index column
df.to_csv('./data_preprocess.csv', header=header_names, index=False)

### Preprocess

In [None]:
import numpy as np
import pandas as pd

# Read the headed CSV into a pandas DataFrame, keeping every field as text.
# convert_to_numeric, process_sale_item_price and process_fixed_item_price all
# call str.replace on the cell values, so the columns must not be inferred as
# numbers (e.g. "150.000" -> 150.0, or already-cleaned values when this
# notebook is re-run after data_preprocess.csv has been overwritten).
df = pd.read_csv("data_preprocess.csv", dtype=str)

In [None]:
# Define a function to convert sales number strings to numeric values
def convert_to_numeric(value):
    """Convert a sales label such as 'Đã bán 1,5k' to an integer string.

    The 'Đã bán ' prefix is removed first. If the remainder contains 'k',
    it is read as a count in thousands, with ',' taken as the decimal
    separator ('1,5k' -> '1500', '1,25k' -> '1250', '2k' -> '2000').
    Otherwise commas are removed and the number is converted directly.

    Args:
        value: Raw sales label (a string).

    Returns:
        The sales count as a string holding an integer.

    Raises:
        ValueError: If the remaining text cannot be parsed as a number.
    """
    count_text = value.replace('Đã bán ', '')

    if 'k' in count_text:
        # NOTE(review): assumes ',' is a decimal comma in 'k' values - confirm
        # against the scraped data. The previous "strip the comma, multiply by
        # 100" rule gave the same result only for exactly one decimal digit.
        thousands = float(count_text.replace('k', '').replace(',', '.'))
        # round() instead of int(): the float product can land a hair below
        # the intended whole number, and int() would truncate it
        return str(round(thousands * 1000))

    # No 'k' suffix: remove commas and convert via float, as before
    return str(int(float(count_text.replace(',', ''))))

# Treat a missing 'sales_number' as "Đã bán 0", then run every label through
# 'convert_to_numeric' and store the result back in the same column
df['sales_number'] = (
    df['sales_number']
    .fillna("Đã bán 0")
    .map(convert_to_numeric)
)

In [None]:
# Custom converter for 'sale_item_price' values
def process_sale_item_price(value):
    """Drop every '.' from *value* and return it as an integer string.

    Args:
        value: Price text such as '150.000'.

    Returns:
        The string form of the integer left after the dots are removed.
    """
    # Splitting on '.' and re-joining removes all dots in one pass
    without_dots = "".join(value.split("."))

    # Going through int() normalises the text before it is returned as a string
    return f"{int(without_dots)}"

# Run each 'sale_item_price' value through process_sale_item_price and
# write the cleaned values back to the same column
df['sale_item_price'] = df['sale_item_price'].map(process_sale_item_price)

In [None]:
# Custom converter for 'fixed_item_price' values
def process_fixed_item_price(value):
    """Remove '.' and the '₫' symbol from *value*; return an integer string.

    Args:
        value: Price text such as '₫150.000'.

    Returns:
        The string form of the integer left after both characters are removed.
    """
    # A deletion table removes both characters in a single translate() pass
    unwanted = str.maketrans("", "", ".₫")
    cleaned = value.translate(unwanted)

    # Going through int() normalises the text before it is returned as a string
    return f"{int(cleaned)}"

# Run each 'fixed_item_price' value through process_fixed_item_price and
# write the cleaned values back to the same column
df['fixed_item_price'] = df['fixed_item_price'].map(process_fixed_item_price)

In [None]:
# Write the cleaned DataFrame over 'data_preprocess.csv', leaving out the index column
df.to_csv(path_or_buf='data_preprocess.csv', index=False)

### Remove duplicates

In [None]:
import pandas as pd

# Load the preprocessed data into a DataFrame. This must be
# 'data_preprocess.csv': the raw 'data.csv' has no header row, so it has no
# 'item_image' column to look duplicates up by.
df = pd.read_csv('data_preprocess.csv')

# Select rows whose 'item_image' repeats an earlier row
# (the first occurrence of each value is not flagged)
duplicate_rows = df[df.duplicated(subset=['item_image'])]

# Print the duplicate rows (if any)
if duplicate_rows.empty:
    print("No duplicate rows found.")
else:
    print("Duplicate Rows:")
    print(duplicate_rows)

In [None]:
# Keep only the first row seen for each 'item_image' value by filtering out
# every row that duplicated() flags as a repeat
df_no_duplicates = df[~df.duplicated(subset=['item_image'])]

# Write the de-duplicated rows to their own CSV file, without the index column
df_no_duplicates.to_csv(path_or_buf='data_no_duplicates.csv', index=False)