###### Step 1: Get the Style code, color, quantity, and size from the main sheet into a new sheet


In [5]:
import pandas as pd

input_file = '../../data/product_data/main_data.csv'

# Read the main sheet and map its headers onto the short column names used by
# the rest of this cell. (A disabled variant mapped 'STYLE NO.' to 'Handle'.)
df = pd.read_csv(input_file).rename(
    columns={
        'STYLE NO.': 'Code',
        'SIZE_US': 'Size',
        'COLOR': 'Color',
        'QUANTITY': 'Quantity',
        'BRAND NAME': 'Brand',
        'CATEGORIES': 'Category',
    }
)

# Define validation criteria for each column


def validate_row(row):
    """Decide whether one row of the main sheet is usable.

    A row is valid when Brand, Code, Color, Quantity and Size are all present,
    Brand/Code/Color are non-blank strings, and Quantity and Size can be
    converted with ``float()``.

    Args:
        row: Mapping-like row (e.g. a ``pandas.Series``) holding the keys
            'Brand', 'Code', 'Color', 'Quantity' and 'Size'.

    Returns:
        True if the row passes every check, otherwise False.
    """
    # Any missing required value disqualifies the row
    required = ('Brand', 'Code', 'Color', 'Quantity', 'Size')
    if any(pd.isnull(row[name]) for name in required):
        return False

    # The type check must come before .strip(): a value parsed as a number has
    # no .strip() and would otherwise raise AttributeError instead of failing
    # validation.
    for name in ('Brand', 'Code', 'Color'):
        value = row[name]
        if not isinstance(value, str) or value.strip() == '':
            return False

    # Quantity and Size must be numeric; float() raises ValueError for
    # non-numeric text and TypeError for unsupported types.
    try:
        float(row['Quantity'])
        float(row['Size'])
    except (TypeError, ValueError):
        return False

    return True


# Boolean mask: True for every row that validate_row accepts
valid_rows = df.apply(validate_row, axis=1)
filtered_df = df[valid_rows]

# Optional, currently disabled: drop handles listed in another CSV file
# handles_file_path = '../../data/product_data/Homecoming.csv'  # Replace with your actual file path
# handles_df = pd.read_csv(handles_file_path)
# handles_to_discard = handles_df['Handle'].tolist()
# filtered_df = filtered_df[~filtered_df['Handle'].isin(handles_to_discard)]

# Columns carried forward, using the renamed headers
selected_columns = ['Code', 'Brand', 'Color', 'Quantity', 'Size', 'Category']
new_df = filtered_df.loc[:, selected_columns]

# Persist the validated subset
output_file = '../../data/main/filtered_data.csv'
new_df.to_csv(output_file, index=False)

# Report how many distinct Codes were kept
unique_code_count = new_df['Code'].nunique()
print(unique_code_count)
print(f"Selected columns have been written to {output_file}")

481
Selected columns have been written to ../../data/main/filtered_data.csv


###### Step 2: Group this sheet by style code, color, and size, and sum the quantity within each group to get the total quantity


In [6]:
import pandas as pd

# Load the sheet written by Step 1
df = pd.read_csv("../../data/main/filtered_data.csv")

# Make Quantity numeric; values that cannot be parsed become NaN and are then
# counted as 0.
df['Quantity'] = pd.to_numeric(df['Quantity'], errors='coerce').fillna(0)

# One output row per (Code, Color, Size): quantities are summed, while Brand
# and Category take the first value found in the group.
aggregated_df = df.groupby(['Code', 'Color', 'Size'], as_index=False).agg({
    'Brand': 'first',
    'Quantity': 'sum',
    'Category': 'first',
})

# The counts are computed first so each f-string stays on one line; a
# replacement field split across lines is a SyntaxError before Python 3.12.
original_code_count = df['Code'].nunique()
aggregated_code_count = aggregated_df['Code'].nunique()
print(f"Number of unique Codes in the original DataFrame: {original_code_count}")
print(f"Number of Codes after aggregation: {aggregated_code_count}")
# NOTE(review): this reports the same value as the line above under a
# different label - confirm which count "Mapped" was meant to show.
print(f"Number of Mapped after aggregation: {aggregated_code_count}")

# Write the aggregated data to a new CSV file
aggregated_df.to_csv('../../data/main/step_2_output.csv', index=False)

Number of unique Codes in the original DataFrame: 481
Number of Codes after aggregation: 481
Number of Mapped after aggregation: 481


###### Step 3: Get the prices of available product codes


In [7]:
import pandas as pd

# Price sheet: provides the 'Code' and 'Price' columns
file1_path = '../../data/product_data/combined_prices.csv'
file1 = pd.read_csv(file1_path)

# Aggregated variants written by Step 2
file2_path = '../../data/main/step_2_output.csv'
file2 = pd.read_csv(file2_path)

# Keep a single price per Code (the first one listed). Without this, a Code
# that appears more than once in the price sheet makes the left merge return
# more rows than file2, and copying the column back would then attach prices
# to the wrong variants.
price_lookup = file1[['Code', 'Price']].drop_duplicates(subset='Code', keep='first')

# Left merge keeps every variant; variants without a price get NaN
merged_df = pd.merge(file2, price_lookup, on='Code', how='left',
                     validate='many_to_one')

# merged_df now has exactly one row per file2 row, in file2's order, so the
# prices are copied by position rather than by index label.
file2['Price'] = merged_df['Price'].to_numpy()

# Save the priced variants to a new CSV file
output_file_path = '../../data/main/step_3_output.csv'

print(merged_df['Code'].nunique())
file2.to_csv(output_file_path, index=False)

print(f"Updated file has been saved to {output_file_path}")

481
Updated file has been saved to ../../data/main/step_3_output.csv


###### Step 4: Combine title, description, keywords, and tags with this new sheet based on product code


In [34]:
# import pandas as pd

# # Load the first CSV file (file_1)
# # Replace with the actual file path
# file1_path = '../../data/product_data/chatgtp_data_sheet.csv'
# file1 = pd.read_csv(file1_path)
# # removing copies
# file1 = file1.drop_duplicates(subset=['Handle'], keep='first')
# # Load the second CSV file (file_2)
# # Replace with the actual file path
# file2_path = '../../data/main/step_3_output.csv'
# file2 = pd.read_csv(file2_path)

# # Remove 'Color' column from file_1 if it exists
# if 'Color' in file1.columns:
#     file1 = file1.drop(columns=['Color'])

# # Merge the two DataFrames on the 'Code' column, keeping details from file_1
# merged_df = pd.merge(file2, file1, on='Handle', how='inner')

# # Define the columns from file_1 to copy to file_2
# columns_to_copy = ['Title', 'Long Description', 'Short Description',
#                    'Dress Type', 'Occasions', 'Keywords', 'Tags']

# # For each column to copy, fill NaN values in merged_df with the corresponding values from file_1
# for column in columns_to_copy:
#     merged_df[column] = merged_df.groupby('Handle')[column].transform('first')

# # Save the merged DataFrame to a new CSV file
# # Replace with your desired output file path
# output_file_path = '../../data/main/step_4_output.csv'
# print(merged_df['Handle'].nunique())
# merged_df.to_csv(output_file_path, index=False)

# print(f"Merged file has been saved to {output_file_path}")

284
Merged file has been saved to ../../data/main/step_4_output.csv


###### Step 5: Combine image URLs with the product code and add additional URLs


In [9]:
# import pandas as pd

# # Load the CSV files
# file1_path = '../../data/main/step_4_output.csv'
# file2_path = '../../data/product_data/s3_urls.csv'  # Replace with the actual file path

# file1 = pd.read_csv(file1_path)
# file2 = pd.read_csv(file2_path)

# print(file2['Handle'].nunique())

# # Merge the two DataFrames on the 'Handle' column
# merged_df = pd.merge(file1, file2[['Handle', 'url']], on='Handle', how='left')

# # Group URLs by 'Handle' and create a list of URLs for each 'Handle'
# url_grouped = file2.groupby('Handle')['url'].apply(lambda x: ','.join(x)).reset_index()

# # Merge the grouped URLs back to the original DataFrame
# final_df = pd.merge(file1, url_grouped, on='Handle', how='left')

# # Rename the 'url' column to 'Urls'
# final_df = final_df.rename(columns={'url': 'Urls'})

# # Discard rows where 'Urls' is NaN (no URL available)
# final_df = final_df.dropna(subset=['Urls'])

# # Save the final DataFrame to a new CSV file
# output_file_path = '../../data/main/step_5_output.csv'
# final_df.to_csv(output_file_path, index=False)

# print(f"Final CSV file with appended URLs has been saved to {output_file_path}")


import pandas as pd

# Inputs: the priced variants from Step 3 and the sheet of image URLs per Code
file1_path = '../../data/main/step_3_output.csv'
file2_path = '../../data/product_data/shopify_urls.csv'  # Replace with the actual file path

file1 = pd.read_csv(file1_path)
file2 = pd.read_csv(file2_path)

print(file2['Code'].nunique())

# Collapse the URL sheet to one row per Code holding a comma-separated list of
# its URLs. Rows with a missing url are dropped first: ','.join() raises
# TypeError on NaN, and a Code with no usable URL should end up without 'Urls'.
# (The former row-level merge of file1 with the raw URL rows was never used
# and has been removed.)
url_grouped = (
    file2.dropna(subset=['url'])
    .groupby('Code')['url']
    .apply(lambda urls: ','.join(urls.astype(str)))
    .reset_index()
)

# Attach the URL list to every variant of the matching Code
final_df = pd.merge(file1, url_grouped, on='Code', how='left')

# Rename the 'url' column to 'Urls'
final_df = final_df.rename(columns={'url': 'Urls'})

# Discard variants whose Code has no URL at all
final_df = final_df.dropna(subset=['Urls'])

# Save the final DataFrame to a new CSV file
output_file_path = '../../data/main/step_5_output.csv'
final_df.to_csv(output_file_path, index=False)

print(f"Final CSV file with appended URLs has been saved to {output_file_path}")


342
Final CSV file with appended URLs has been saved to ../../data/main/step_5_output.csv


###### Step 6: Create the Shopify CSV file to upload


In [17]:

import pandas as pd
import re


# Read the sheet written by Step 5
file1_path = '../../data/main/step_5_output.csv'
file1 = pd.read_csv(file1_path)

# Column headers for the Shopify CSV structure
shopify_columns = [
    'Handle',
    'Title',
    'Body (HTML)',
    'Vendor',
    'Type',
    'Tags',
    'Published',
    'Option1 Name',
    'Option1 Value',
    'Option2 Name',
    'Option2 Value',
    'Variant SKU',
    'Variant Grams',
    'Variant Inventory Tracker',
    'Variant Inventory Qty',
    'Variant Inventory Policy',
    'Variant Fulfillment Service',
    'Variant Price',
    'Variant Compare at Price',
    'Variant Requires Shipping',
    'Variant Taxable',
    'Variant Barcode',
    'Image Src',
    'Image Position',
    'Image Alt Text',
]

# Start from an empty frame that has only those headers
shopify_df = pd.DataFrame(columns=shopify_columns)

# State shared with the row-building loop below
all_entries = []      # collected output rows
current_code = None   # Code of the product currently being processed
other_images = []     # image URLs of that product after the first one


# def check_sleeve_preference(combined_list):
#     # Define sleeve-related terms
#     sleeve_terms = ["Sleeveless", "Short Sleeve",
#                     "Long Sleeve", "Spaghetti Strap"]

#     # Check for the presence of sleeve terms
#     found_sleeves = set()
#     for term in sleeve_terms:
#         # Check each column in the row for the term
#         if any(re.search(rf'\b{term}\b', col, re.IGNORECASE) for col in combined_list):
#             found_sleeves.add(term)

#     # Determine the preferred sleeve term
#     if "Spaghetti Strap" in found_sleeves:
#         return "Spaghetti Strap"
#     elif "Short Sleeve" in found_sleeves:
#         return "Short Sleeve"
#     elif found_sleeves:
#         # Return any other found term if Spaghetti Strap or Short Sleeve is not found
#         return list(found_sleeves)[0]
#     else:
#         return None


def _extra_image_rows(handle, image_urls):
    """Build the image-only rows for one product.

    Args:
        handle: Handle value the images belong to.
        image_urls: URLs of every image after the first one.

    Returns:
        A list of row dicts with 'Handle', 'Image Src' and 'Image Position'.
        Positions start at 2 because position 1 is carried by the variant rows.
    """
    return [
        {'Handle': handle, 'Image Src': url, 'Image Position': position}
        for position, url in enumerate(image_urls, start=2)
    ]


# Rows are collected as plain dicts in output order and converted to a single
# DataFrame at the end, instead of building one DataFrame per row and
# concatenating them. The state is (re)initialised here so this part does not
# depend on leftovers from a previous run.
all_entries = []
current_code = None   # Code of the product whose variants are being emitted
first_image = ''      # primary image of the current row ('' when it has none)
other_images = []     # remaining images of the current product

for _, row in file1.iterrows():
    code = row['Code']
    brand = row['Brand']
    color = row['Color']

    # A change of Code means the previous product is complete: emit its
    # additional images right after its variant rows. This relies on all rows
    # of one Code being adjacent in file1.
    if current_code is not None and current_code != code:
        all_entries.extend(_extra_image_rows(current_code.lower(), other_images))
        other_images = []

    current_code = code

    category = row['Category'] if pd.notna(row['Category']) else "Uncategorized"
    # Codes containing "AA" are always categorised as Wedding
    if "AA" in code:
        category = "Wedding"

    keywords = [str(color), str(color) + ' ' + str(category)]
    meta_features = ", ".join(keywords)

    # Only the intro paragraph is emitted. The previous string ended with an
    # opening <ul> that was never closed, because the list items are disabled.
    # 'brand' is a local so the f-string does not reuse its own quote
    # character, which is a SyntaxError before Python 3.12.
    body_html = f"<p>This is a {category} dress by {brand}.</p>"

    if isinstance(row['Urls'], str):
        first_image, *other_images = row['Urls'].split(",")
    else:
        # No URL list on this row: never fall back to the image of a previous
        # row (and avoid a NameError when this happens on the very first row).
        first_image = ''

    # Leave the category out of the title when there is none
    if category != 'Uncategorized':
        title = brand + " " + category + " " + code
    else:
        title = brand + " " + code
    title = title.replace('"', "")
    title = re.sub(r'\s+', ' ', title)

    # Main entry for this variant. The key order of this dict defines the
    # column order of the output file.
    all_entries.append({
        'Handle': code.lower(),
        'Title': title,
        'Body (HTML)': body_html,
        'Vendor': brand,
        'Type': category,  # You can customize this
        'Tags': ",".join(keywords),
        'Published': 'TRUE',
        "Product Category": "Apparel & Accessories > Clothing > Dresses",
        'Option1 Name': 'Color',
        'Option1 Value': color,
        'Option2 Name': 'Size',
        'Option2 Value': row['Size'],
        'Variant SKU': f"{code}_{color.upper()}_{row['Size']}",
        'Variant Grams': '',  # Add weight if available
        'Variant Inventory Tracker': 'shopify',
        'Variant Inventory Qty': row['Quantity'],
        'Variant Inventory Policy': 'deny',
        'Variant Fulfillment Service': 'manual',
        'Variant Price': row['Price'],
        'Variant Compare at Price': '',
        'Variant Requires Shipping': 'TRUE',
        'Variant Taxable': 'TRUE',
        'Variant Barcode': '',
        'Image Src': first_image,  # First image URL
        'Image Position': 1,
        'Image Alt Text': brand + " " + code.lower() + " " + color,
        'metafield.custom.clothing_features': meta_features,
        'metafield.custom.dress_occasion': re.sub(r'\s+', ' ', category),
        'metafield.custom.dress_style': code.upper(),
        'metafield.custom.skirt_dress_length_type': category,
    })

# The loop only emits extra images when the Code changes, so the last product
# still has to be flushed here.
if current_code is not None:
    all_entries.extend(_extra_image_rows(current_code.lower(), other_images))

if not all_entries:
    raise ValueError(f"No rows found in {file1_path}; nothing to export")

# Build the final frame in one step from the collected row dicts
shopify_df = pd.DataFrame(all_entries)

print(shopify_df['Handle'].nunique())
output_file_path = '../../data/shopify_sheets/main_shopify.csv'
shopify_df.to_csv(output_file_path, index=False)
print(f"Shopify CSV file has been saved to {output_file_path}")

340
Shopify CSV file has been saved to ../../data/shopify_sheets/main_shopify.csv


###### Some verifications for understanding


In [37]:
# import pandas as pd

# # Load the CSV files
# # Replace with the actual file path
# file1_path = '../../data/sheets/step_5_output.csv'
# file2_path = '../../data/s3_url_images.csv'  # Replace with the actual file path

# file1 = pd.read_csv(file1_path)
# file2 = pd.read_csv(file2_path)

# # Identify common codes between file1 and file2
# common_codes = set(file1['Code']).intersection(set(file2['Product Code']))
# print(len(common_codes))
# #
# # Filter file1 to include only rows with common codes
# filtered_file1 = file1[file1['Code'].isin(common_codes)]

# # Group by 'Code' and count unique combinations of 'Size' and 'Color'
# variant_counts = filtered_file1.groupby('Code').apply(lambda x: x[[
#     'Size', 'Color']].drop_duplicates().shape[0], include_groups=False).reset_index(name='Variant Count')

# # Print the variant counts
# # print(variant_counts)

# # Calculate the sum of the variant counts
# total_variants = variant_counts['Variant Count'].sum()

# # Print the total number of variants
# print(f"Total number of variants: {total_variants}")

FileNotFoundError: [Errno 2] No such file or directory: '../../data/sheets/step_5_output.csv'

In [1]:
import pandas as pd

# Load the generated sheet and the products export to compare it against
df1 = pd.read_csv('../../data/shopify_sheets/main_shopify.csv')
df2 = pd.read_csv('../../data/shopify_sheets/products_export.csv')

# Collect the distinct 'Handle' values of each file; missing handles are
# ignored so that NaN is never counted as a code.
set1 = set(df1['Handle'].dropna())
set2 = set(df2['Handle'].dropna())

# Find common and uncommon codes
common_codes = set1 & set2
unique_in_file1 = set1 - set2
unique_in_file2 = set2 - set1

# Print results
print(f"Common codes: {len(common_codes)}")
# print(common_codes)

print(f"Unique codes in file1: {len(unique_in_file1)}")
print(unique_in_file1)

print(f"Unique codes in file2: {len(unique_in_file2)}")
print(unique_in_file2)

# Optional: save each group to its own CSV file. Sets have no defined order,
# so the values are sorted to make the files identical from run to run.
for handles, csv_name in (
    (common_codes, 'common_codes.csv'),
    (unique_in_file1, 'unique_in_file1.csv'),
    (unique_in_file2, 'unique_in_file2.csv'),
):
    pd.DataFrame({'Code': sorted(handles)}).to_csv(csv_name, index=False)

Common codes: 338
Unique codes in file1: 2
{'e2253', 'f570'}
Unique codes in file2: 0
set()


In [None]:
import pandas as pd

# Handles to remove from the sheet
handles_to_discard = ['handle1', 'handle2', 'handle3']  # Replace with actual handle values you want to discard

# Read the input sheet
input_file_path = 'path/to/your/input.csv'
df = pd.read_csv(input_file_path)

# True for every row whose 'Handle' is NOT in the discard list
keep_mask = ~df['Handle'].isin(handles_to_discard)
filtered_df = df[keep_mask]

# Write the remaining rows to a new CSV file
output_file_path = 'path/to/your/output.csv'
filtered_df.to_csv(output_file_path, index=False)

print(f"Filtered file has been saved to {output_file_path}")
