###### Step 1: Extract the brand, style code, color, quantity, and size from the main sheet into a new sheet


In [51]:
import pandas as pd

# Source sheet, plus the mapping from its original headers to the short
# column names used throughout the rest of the notebook.
input_file = '../../data/sheets/main_sheet.csv'
column_aliases = {
    'STYLE NO.': 'Code',
    'SIZE_US': 'Size',
    'COLOR': 'Color',
    'QUANTITY': 'Quantity',
    'BRAND NAME': 'Brand',
}

# Read the sheet and apply the renames in one pass.
df = pd.read_csv(input_file).rename(columns=column_aliases)

# Define validation criteria for each column
def validate_row(row):
    """Tell whether a row carries usable Brand/Code/Color/Quantity/Size values.

    Args:
        row: Mapping-like object (a DataFrame row or a dict) indexed by the
            column names 'Brand', 'Code', 'Color', 'Quantity' and 'Size'.

    Returns:
        True when none of the five values is missing, the three text columns
        hold non-blank strings, and Quantity and Size can be converted with
        ``float()``; False otherwise.
    """
    text_columns = ('Brand', 'Code', 'Color')
    numeric_columns = ('Quantity', 'Size')

    # Reject rows with any missing value first.
    for column in text_columns + numeric_columns:
        if pd.isnull(row[column]):
            return False

    for column in text_columns:
        value = row[column]
        # The type check has to come before .strip(): a numeric cell has no
        # .strip() and would otherwise raise instead of being rejected.
        if not isinstance(value, str) or value.strip() == '':
            return False

    for column in numeric_columns:
        try:
            float(row[column])
        except (TypeError, ValueError):
            # ValueError: unparseable text; TypeError: a type float() refuses.
            return False

    return True

# Destination of the cleaned sheet and the (renamed) columns it keeps.
output_file = '../../data/sheets/filtered_main_sheet.csv'
selected_columns = ['Brand', 'Code', 'Color', 'Quantity', 'Size']

# Boolean mask holding one validate_row() verdict per row.
valid_rows = df.apply(validate_row, axis=1)

# Keep the rows that passed, then narrow them to the selected columns.
filtered_df = df[valid_rows]
new_df = filtered_df[selected_columns]

# Persist the cleaned sheet without the index column.
new_df.to_csv(output_file, index=False)

# Report how many distinct style codes survived validation.
unique_code_count = new_df['Code'].nunique()
print(unique_code_count)
print(f"Selected columns have been written to {output_file}")

558
Selected columns have been written to ../../data/sheets/filtered_main_sheet.csv


###### Step 2: Group this sheet by style code, color, and size, and sum the quantity within each group to get the total quantity


In [52]:
import pandas as pd

# Read back the validated sheet written by Step 1.
df = pd.read_csv("../../data/sheets/filtered_main_sheet.csv")

# Force Quantity to numbers: unparseable cells become NaN and then count as 0.
df['Quantity'] = pd.to_numeric(df['Quantity'], errors='coerce').fillna(0)

# One output row per (Code, Color, Size): total the quantities and carry
# along the first non-null Brand seen in the group.
variant_keys = ['Code', 'Color', 'Size']
aggregated_df = df.groupby(variant_keys, as_index=False).agg(
    Brand=('Brand', 'first'),
    Quantity=('Quantity', 'sum'),
)

# Sanity check: aggregation must not change the number of distinct codes.
print(f"Number of unique Codes in the original DataFrame: {df['Code'].nunique()}")
print(f"Number of unique Codes after aggregation: {aggregated_df['Code'].nunique()}")

# Persist the aggregated sheet for Step 3.
aggregated_df.to_csv('../../data/sheets/step_2_output.csv', index=False)


Number of unique Codes in the original DataFrame: 558
Number of unique Codes after aggregation: 558


###### Step 3: Get the prices of available product codes


In [68]:
import pandas as pd

# Product sheet that carries a 'Price' for each 'Code'.
file1_path = '../../data/transformed_products.csv'
file1 = pd.read_csv(file1_path)

# Aggregated stock sheet produced by Step 2.
file2_path = '../../data/sheets/step_2_output.csv'
file2 = pd.read_csv(file2_path)

# Keep exactly one price per Code (the first one listed). Without this, a
# Code that appears several times in file1 would multiply rows in the merge
# below, and copying the Price column back by index would then attach
# prices to the wrong rows of file2.
price_lookup = file1[['Code', 'Price']].drop_duplicates(subset=['Code'], keep='first')

# Left merge keeps every row of file2, in order; codes without a price get NaN.
# validate= makes pandas raise if the lookup is ever not unique per Code.
merged_df = pd.merge(file2, price_lookup, on='Code', how='left',
                     validate='many_to_one')

# merged_df has the same rows in the same order as file2, so the prices can
# be copied over positionally.
file2['Price'] = merged_df['Price'].to_numpy()

# Save the priced sheet for Step 4.
output_file_path = '../../data/sheets/step_3_output.csv'

print(merged_df['Code'].nunique())
file2.to_csv(output_file_path, index=False)

print(f"Updated file has been saved to {output_file_path}")

558
Updated file has been saved to ../../data/sheets/step_3_output.csv


###### Step 4: Combine title, description, keywords, and tags with this new sheet based on product code


In [76]:
import pandas as pd

# Inputs: descriptive product info (file_1) and the priced stock sheet from
# Step 3 (file_2).
file1_path = '../../data/sheets/product_info_final.csv'
file2_path = '../../data/sheets/step_3_output.csv'

# Keep one info row per Code, and discard file_1's own 'Color' column (when
# present) so it cannot collide with the 'Color' column of file_2.
file1 = (
    pd.read_csv(file1_path)
    .drop_duplicates(subset=['Code'], keep='first')
    .drop(columns=['Color'], errors='ignore')
)
file2 = pd.read_csv(file2_path)

# Inner join: only codes present in both sheets survive.
merged_df = file2.merge(file1, on='Code', how='inner')

# Descriptive columns that came from file_1.
columns_to_copy = ['Title', 'Long Description', 'Short Description',
                   'Dress Type', 'Occasions', 'Keywords', 'Tags']

# Give every row of a Code the first non-null value found for that Code.
for info_column in columns_to_copy:
    per_code = merged_df.groupby('Code')[info_column]
    merged_df[info_column] = per_code.transform('first')

# Persist the merged sheet for Step 5.
output_file_path = '../../data/sheets/step_4_output.csv'
print(merged_df['Code'].nunique())
merged_df.to_csv(output_file_path, index=False)

print(f"Merged file has been saved to {output_file_path}")

314
Merged file has been saved to ../../data/sheets/step_4_output.csv


###### Step 5: Combine image URLs with the product code and add additional URLs


In [77]:
import pandas as pd

# Inputs: the product sheet from Step 4 and the list of uploaded image URLs.
file1_path = '../../data/sheets/step_4_output.csv'
file2_path = '../../data/s3_url_images.csv'

file1 = pd.read_csv(file1_path)
file2 = pd.read_csv(file2_path)

print(file2['Product Code'].nunique())
# Use the same key name as file1 so the two sheets can be merged on 'Code'.
file2 = file2.rename(columns={'Product Code': 'Code'})

# Collapse the image sheet to one row per Code holding all of its URLs as a
# comma-separated string. Rows without a URL are dropped first, because
# str.join raises TypeError on NaN.
url_grouped = (
    file2.dropna(subset=['url'])
    .groupby('Code')['url']
    .agg(lambda urls: ','.join(urls.astype(str)))
    .reset_index()
)

# Attach the URL string to every variant row; codes without images get NaN.
final_df = pd.merge(file1, url_grouped, on='Code', how='left')
final_df = final_df.rename(columns={'url': 'Urls'})

# Persist the sheet for Step 6.
output_file_path = '../../data/sheets/step_5_output.csv'
final_df.to_csv(output_file_path, index=False)

# Kept on one line: a replacement field split across lines only parses on
# Python 3.12 and later.
print(f"Final CSV file with appended URLs has been saved to {output_file_path}")

322
Final CSV file with appended URLs has been saved to ../../data/sheets/step_5_output.csv


###### Step 6: Creating shopify csv file to upload


In [81]:

import pandas as pd
import re


# Input: the sheet written by Step 5.
file1_path = '../../data/sheets/step_5_output.csv'
file1 = pd.read_csv(file1_path)

# Shopify product-import headers, one per line for readability.
shopify_columns = [
    'Handle',
    'Title',
    'Body (HTML)',
    'Vendor',
    'Type',
    'Tags',
    'Published',
    'Option1 Name',
    'Option1 Value',
    'Option2 Name',
    'Option2 Value',
    'Variant SKU',
    'Variant Grams',
    'Variant Inventory Tracker',
    'Variant Inventory Qty',
    'Variant Inventory Policy',
    'Variant Fulfillment Service',
    'Variant Price',
    'Variant Compare at Price',
    'Variant Requires Shipping',
    'Variant Taxable',
    'Variant Barcode',
    'Image Src',
    'Image Position',
    'Image Alt Text',
]

# Header-only placeholder; it is replaced once the rows have been built.
shopify_df = pd.DataFrame(columns=shopify_columns)

# State shared with the row-building loop below.
all_entries = []      # output entries collected so far
current_code = None   # Code of the product currently being processed
other_images = []     # image URLs after the first one for current_code


def check_sleeve_preference(combined_list):
    """Pick the preferred sleeve label mentioned in any of the given texts.

    Args:
        combined_list: Iterable of values to search. Entries that are not
            strings (for example NaN or None from empty cells) are ignored.

    Returns:
        The first of "Spaghetti Strap", "Short Sleeve", "Sleeveless",
        "Long Sleeve" that occurs as a whole phrase (case-insensitive) in any
        entry, or None when none of them occurs.
    """
    # Ordered from most to least preferred. A fixed order makes the result
    # deterministic when several terms are present; picking an arbitrary
    # element of a set is not.
    sleeve_priority = ("Spaghetti Strap", "Short Sleeve",
                       "Sleeveless", "Long Sleeve")

    # re.search raises TypeError on non-string input, so filter first.
    texts = [value for value in combined_list if isinstance(value, str)]

    for term in sleeve_priority:
        pattern = re.compile(rf'\b{re.escape(term)}\b', re.IGNORECASE)
        if any(pattern.search(text) for text in texts):
            return term

    return None


def _clean_text(value):
    """Return ``value`` as a string, mapping missing cells (NaN/None) to ''."""
    return '' if pd.isna(value) else str(value)


def _image_rows(code, urls):
    """Build the image-only rows for a product's additional images.

    Positions start at 2 because position 1 is the image carried by the
    product's own variant rows.
    """
    return [
        {'Handle': code.lower(), 'Image Src': url, 'Image Position': position}
        for position, url in enumerate(urls, start=2)
    ]


# Walk the sheet row by row (one row per variant). Rows of the same Code are
# expected to be adjacent: the extra image rows of a product are emitted when
# the Code changes.
for _, row in file1.iterrows():
    code = _clean_text(row['Code'])

    # A new Code starts: flush the previous product's additional images.
    if current_code is not None and current_code != code:
        all_entries.extend(_image_rows(current_code, other_images))
        other_images = []
    current_code = code

    # Variants without any image are not exported.
    urls = row['Urls']
    if not isinstance(urls, str):
        continue
    first_image, *other_images = urls.split(",")

    # Keywords feed both the bullet list and the clothing_features metafield.
    # meta_features is recomputed for every row so that a product without
    # keywords gets an empty value instead of a NameError or the previous
    # product's keywords.
    keywords_text = _clean_text(row['Keywords'])
    keywords = keywords_text.split(", ") if keywords_text else []
    meta_features = ", ".join(keywords)

    # Body: description paragraph followed by one bullet per keyword.
    description = _clean_text(row['Long Description'])
    body_html = f'<p>{description}</p>\n<ul>'
    body_html += ''.join(f'\n<li>{keyword}</li>' for keyword in keywords)
    body_html += '\n</ul>'

    # Text cells used in concatenations; missing cells become '' so they
    # cannot raise TypeError.
    tags = _clean_text(row['Tags'])
    occasions = _clean_text(row['Occasions'])
    brand = _clean_text(row['Brand'])
    filename = _clean_text(row['Filename'])
    color = _clean_text(row['Color'])

    # Title: "<Title> <Code>", without double quotes and with runs of
    # whitespace collapsed to single spaces.
    title = " ".join(part for part in (_clean_text(row['Title']), code) if part)
    title = re.sub(r'\s+', ' ', title.replace('"', ""))

    meta_sleeve_length_type = check_sleeve_preference(
        [occasions, keywords_text, tags])

    # Main product/variant row.
    product_entry = {
        'Handle': code.lower(),
        'Title': title,
        'Body (HTML)': body_html,
        'Vendor': row['Brand'],
        'Type': row['Dress Type'],  # You can customize this
        'Tags': ",".join(part for part in (tags, occasions) if part),
        'Published': 'TRUE',
        "Product Category": "Apparel & Accessories > Clothing > Dresses",
        'Option1 Name': 'Size',
        'Option1 Value': row['Size'],
        'Option2 Name': 'Color',
        'Option2 Value': row['Color'],
        'Variant SKU': f"{code}_{color.upper()}_{row['Size']}",
        'Variant Grams': '',  # Add weight if available
        'Variant Inventory Tracker': 'shopify',
        'Variant Inventory Qty': row['Quantity'],
        'Variant Inventory Policy': 'deny',
        'Variant Fulfillment Service': 'manual',
        'Variant Price': row['Price'],
        'Variant Compare at Price': '',
        'Variant Requires Shipping': 'TRUE',
        'Variant Taxable': 'TRUE',
        'Variant Barcode': '',
        'Image Src': first_image,  # First image URL
        'Image Position': 1,
        'Image Alt Text': " ".join(
            part for part in (brand, code.lower(), filename) if part),
        'metafield.custom.clothing_features': meta_features,
        'metafield.custom.dress_occasion': re.sub(r'\s+', ' ', occasions),
        'metafield.custom.dress_style': code.upper(),
        'metafield.custom.skirt_dress_length_type': row['Dress Type'],
        'metafield.custom.sleeve_length_type': meta_sleeve_length_type,
    }
    all_entries.append(product_entry)

# Flush the additional images of the last product.
if other_images:
    all_entries.extend(_image_rows(current_code, other_images))

# Build the frame once from the collected records instead of concatenating
# one single-row DataFrame per entry.
shopify_df = pd.DataFrame(all_entries)

print(shopify_df['Handle'].nunique())
output_file_path = '../../data/sheets/shopify_output_demo.csv'
shopify_df.to_csv(output_file_path, index=False)
print(f"Shopify CSV file has been saved to {output_file_path}")

314
Shopify CSV file has been saved to ../../data/sheets/shopify_output_demo.csv


###### Some verifications for understanding


In [75]:
import pandas as pd

# Inputs: the Step 5 sheet and the list of uploaded image URLs.
file1_path = '../../data/sheets/step_5_output.csv'
file2_path = '../../data/s3_url_images.csv'

file1 = pd.read_csv(file1_path)
file2 = pd.read_csv(file2_path)

# Codes that appear in both sheets.
common_codes = set(file1['Code']).intersection(set(file2['Product Code']))
print(len(common_codes))

# Keep only the rows of file1 whose Code also has images.
filtered_file1 = file1[file1['Code'].isin(common_codes)]

# Count the distinct (Size, Color) combinations per Code. De-duplicating the
# three columns and taking the group size gives the same count as a
# per-group apply, without the `include_groups` keyword that only exists in
# pandas 2.2 and later.
variant_counts = (
    filtered_file1[['Code', 'Size', 'Color']]
    .drop_duplicates()
    .groupby('Code')
    .size()
    .reset_index(name='Variant Count')
)

# Total number of variants across all common codes.
total_variants = variant_counts['Variant Count'].sum()

print(f"Total number of variants: {total_variants}")

314
Total number of variants: 2383


In [79]:
import pandas as pd

# Inputs: the Step 5 sheet and the list of uploaded image URLs.
df1 = pd.read_csv('../../data/sheets/step_5_output.csv')
df2 = pd.read_csv('../../data/s3_url_images.csv')

# Distinct codes on each side.
set1 = set(df1['Code'])
set2 = set(df2['Product Code'])

# Codes shared by both sheets, and codes present on one side only.
common_codes = set1.intersection(set2)
unique_in_file1 = set1.difference(set2)
unique_in_file2 = set2.difference(set1)

# Print results
print(f"Common codes: {len(common_codes)}")

print(f"Unique codes in file1: {len(unique_in_file1)}")
print(unique_in_file1)

print(f"Unique codes in file2: {len(unique_in_file2)}")
print(unique_in_file2)

# Optional: save each group of codes to its own CSV file. The codes are
# sorted first because set iteration order is arbitrary, which would make
# the row order differ between runs. key=str keeps the sort working if a
# missing value (NaN) is mixed in with the string codes.
for codes, csv_name in (
    (common_codes, 'common_codes.csv'),
    (unique_in_file1, 'unique_in_file1.csv'),
    (unique_in_file2, 'unique_in_file2.csv'),
):
    pd.DataFrame({'Code': sorted(codes, key=str)}).to_csv(csv_name, index=False)


Common codes: 314
Unique codes in file1: 0
set()
Unique codes in file2: 8
{'E2394', 'E1028', 'E2434', 'E1984', 'E2276', 'E2152', 'E2225', 'E1340'}
