In [23]:
import pandas as pd

# Source sheet for the catalog pipeline.
input_file = '../../data/sheets/main_data.csv'

# Read the sheet and, in the same chain, map its headers onto the short
# column names that the rest of this cell refers to.
df = pd.read_csv(input_file).rename(
    columns={
        'STYLE NO.': 'Code',
        'SIZE_US': 'Size',
        'COLOR': 'Color',
        'QUANTITY': 'Quantity',
        'BRAND NAME': 'Brand',
    }
)

# Define validation criteria for each column


def validate_row(row):
    """Return True when a row carries usable Brand, Code, Color, Quantity and Size.

    A row is rejected (False) when any of the five fields is null, when
    Brand, Code or Color is not a non-blank string, or when Quantity or
    Size cannot be converted with ``float()``.

    Args:
        row: Object indexable by column name (a pandas Series when this is
            used through ``DataFrame.apply(..., axis=1)``).

    Returns:
        bool: True if every check passes, False otherwise.

    Raises:
        KeyError: If one of the five expected columns is absent from ``row``.
    """
    text_columns = ('Brand', 'Code', 'Color')
    numeric_columns = ('Quantity', 'Size')

    # Missing values in any required field disqualify the row.
    if any(pd.isnull(row[column]) for column in text_columns + numeric_columns):
        return False

    # The type check has to come before .strip(): calling .strip() on a
    # non-string (e.g. a code that was parsed as a number) would raise
    # AttributeError instead of simply marking the row invalid.
    for column in text_columns:
        value = row[column]
        if not isinstance(value, str) or value.strip() == '':
            return False

    # float() raises ValueError for unparseable text and TypeError for
    # unsupported objects; both mean the field is not numeric.
    for column in numeric_columns:
        try:
            float(row[column])
        except (TypeError, ValueError):
            return False

    return True


# Columns (under their renamed headers) that make it into the output file
selected_columns = ['Brand', 'Code', 'Color', 'Quantity', 'Size']

# Boolean mask: one entry per row, True where validate_row accepts the row
valid_rows = df.apply(validate_row, axis=1)

# Drop rejected rows first, then narrow to the columns of interest
filtered_df = df[valid_rows]
new_df = filtered_df[selected_columns]

# Persist the cleaned rows without the index column
output_file = '../../scripts/catalog/filtered_data.csv'
new_df.to_csv(output_file, index=False)

# Report the number of distinct codes kept, then where the file was written
print(
    new_df['Code'].nunique(),
    f"Selected columns have been written to {output_file}",
    sep='\n',
)

559
Selected columns have been written to ../../scripts/catalog/filtered_data.csv


In [24]:
import pandas as pd

# Load the validated rows written by the previous cell
df = pd.read_csv("../../scripts/catalog/filtered_data.csv")

# Make Quantity numeric: unparseable entries become NaN and are then counted as 0
df['Quantity'] = pd.to_numeric(df['Quantity'], errors='coerce').fillna(0)

# Collapse duplicate (Code, Color, Size) rows into one, summing their quantities
aggregated_df = df.groupby(['Code', 'Color', 'Size'], as_index=False).agg({
    'Brand': 'first',  # keep the first Brand seen for each group
    'Quantity': 'sum'
})

# Sanity check: aggregation should not lose any Code.
# The messages are split with implicit string concatenation so that no
# replacement field spans a line break; a newline inside the braces of a
# single-quoted f-string is a SyntaxError before Python 3.12.
print("Number of unique Codes in the original DataFrame: "
      f"{df['Code'].nunique()}")
print("Number of unique Codes after aggregation: "
      f"{aggregated_df['Code'].nunique()}")

# Write the aggregated data for the next step
aggregated_df.to_csv('../../scripts/catalog/grouped_data.csv', index=False)

Number of unique Codes in the original DataFrame: 559
Number of unique Codes after aggregation: 559


In [25]:
import pandas as pd

# Codes to keep in the filtered catalog
code_list = [
    'F570', 'F1116', 'B8865', 'B8998', 'E1270', 'E1412', 'E1582',
    'E1168', 'E1264', 'E1295', 'E1696', 'E1799', 'E1531', 'E1013',
    'E1260', 'E1729', 'E1657', 'B9000', 'E1463', 'E1379', 'B8608',
    'E1340', 'E1520', 'E1502', 'E1391', 'E1730', 'E1420',
    'E1421', 'E1796', 'E1663', 'AA9270', 'B8808', 'E1387', 'E1544',
    'E1371', 'E2107', 'E1285', 'E1581', 'E2106', 'E1573', 'E1546',
    'E1822', 'E2253', 'AA9315', 'E2227', 'E2545', 'E1194', 'E1931', 'E2027', 'AA9308'
]

# Number of distinct codes requested (exposes accidental duplicates in the list)
print(len(set(code_list)))

# Load the grouped data; columns: Code, Color, Size, Brand, Quantity.
# input_file holds the path that is actually read instead of an empty placeholder.
input_file = '../../scripts/catalog/grouped_data.csv'
df = pd.read_csv(input_file)

# Keep only the rows whose Code is in the requested list
filtered_df = df[df['Code'].isin(code_list)]

# Print the number of rows after filtering
print(f"Number of rows after filtering: {filtered_df.shape[0]}")

# Save the filtered DataFrame to a new CSV file
output_file = '../../scripts/catalog/grouped_top_50.csv'
filtered_df.to_csv(output_file, index=False)

print(filtered_df['Code'].nunique())

# Unique codes present in the CSV; the set gives constant-time membership
# checks instead of scanning the array once per requested code.
unique_codes_in_csv = df['Code'].unique()
known_codes = set(unique_codes_in_csv)

# Requested codes that never appear in the CSV, in code_list order
codes_not_found = [code for code in code_list if code not in known_codes]
print(f"Codes not found in the CSV: {codes_not_found}")


50
Number of rows after filtering: 1349
50
Codes not found in the CSV: []


In [26]:
import pandas as pd

def _size_sort_key(label):
    """Sort key for a size label: numeric labels by value, then the rest as text."""
    try:
        number = float(label)
    except ValueError:
        return (1, 0.0, label)
    if number != number:
        # NaN does not order consistently; treat it like a non-numeric label.
        return (1, 0.0, label)
    return (0, number, label)


def _join_sizes(sizes):
    """Join the unique sizes of one group with ';', numeric sizes in numeric order."""
    labels = [str(size) for size in sizes.unique()]
    # Sorting the string labels directly would place "10.0" before "6.0";
    # the key compares numeric-looking labels by value and falls back to
    # plain string order for everything else.
    return ';'.join(sorted(labels, key=_size_sort_key))


def _join_colors(colors):
    """Join the unique colors of one group with ';' in alphabetical order."""
    return ';'.join(sorted(colors.unique()))


# Load the filtered rows written by the previous cell
input_file = '../../scripts/catalog/grouped_top_50.csv'
df = pd.read_csv(input_file)

# One row per Code, listing every Color and Size seen for it
aggregated_df = df.groupby('Code').agg({
    'Color': _join_colors,
    'Size': _join_sizes
}).reset_index()

# Save the result to a new CSV file
output_file = '../../scripts/catalog/final_data.csv'
aggregated_df.to_csv(output_file, index=False)

print(aggregated_df['Code'].nunique())

50
