In [None]:
import pandas as pd

In [None]:
# This script reads a CSV file containing product data from Waitrose and displays basic information about the dataset.
waitrose_df = pd.read_csv('./Waitrose/waitrose_products_playwright.csv')

waitrose_df.head()  # Display the first few rows of the DataFrame


In [None]:
# This script reads a CSV file containing product data from Tesco and displays basic information about the dataset.
tesco_df = pd.read_csv('./Tesco/tesco_products_playwright.csv')
tesco_fc_df = pd.read_csv('./Tesco/tesco_products_food-cupboard_playwright.csv')

print(f"Shape before concat duplicates: {tesco_df.shape}")

tesco_df = pd.concat([tesco_df, tesco_fc_df], ignore_index=True)
print(f"Shape before removing duplicates: {tesco_df.shape}")
# Drop duplicate products based on product_id
tesco_df.drop_duplicates(subset=['product_id', 'parent_category', 'subcategory'], keep='first', inplace=True)

# Display information about the dataframe after removing duplicates
print(f"Shape after removing duplicates: {tesco_df.shape}")

tesco_df.head()


In [None]:
# Create a copy of both dataframes with the needed columns and add source column
waitrose_subset = waitrose_df[['product_id', 'product_name', 'parent_category', 'subcategory']].copy()
waitrose_subset['source'] = 'Waitrose'

tesco_subset = tesco_df[['product_id', 'product_name', 'parent_category', 'subcategory']].copy()
tesco_subset['source'] = 'Tesco'

# Combine the dataframes
combined_df = pd.concat([waitrose_subset, tesco_subset], ignore_index=True)

# Display the first few rows to verify
combined_df.head()

# Display some basic info about the combined dataframe
print(f"Combined shape: {combined_df.shape}")
print(f"Number of Waitrose products: {len(waitrose_subset)}")
print(f"Number of Tesco products: {len(tesco_subset)}")

In [21]:
combined_df['cleaned_product_name'] = combined_df['product_name'].str.replace(r'\s+', ' ', regex=True).str.strip()
combined_df['newCat'] = ''
combined_df['newSubCat'] = ''

combined_df.head()

Unnamed: 0,product_id,product_name,parent_category,subcategory,source,cleaned_product_name,newCat,newSubCat
0,568810,Waitrose Cantaloupe Melon,Fresh & Chilled,Fresh Fruit,Waitrose,Waitrose Cantaloupe Melon,,
1,38528,Waitrose Perfectly Ripe Kiwi,Fresh & Chilled,Fresh Fruit,Waitrose,Waitrose Perfectly Ripe Kiwi,,
2,55061,Waitrose Seedless Sable Grapes,Fresh & Chilled,Fresh Fruit,Waitrose,Waitrose Seedless Sable Grapes,,
3,88903,Essential Fairtrade Bananas,Fresh & Chilled,Fresh Fruit,Waitrose,Essential Fairtrade Bananas,,
4,88528,Essential Fairtrade Bananas,Fresh & Chilled,Fresh Fruit,Waitrose,Essential Fairtrade Bananas,,


In [22]:
grouped_df = combined_df.groupby(['source', 'parent_category', 'subcategory'])['product_id'].count().reset_index()
grouped_df.columns = ['source', 'parent_category', 'subcategory', 'Answer']
grouped_df.to_csv('./combined_grouped.csv', index=False)

In [25]:
mapping_df = pd.read_csv('./CategoryMapping.csv')

In [28]:
import uuid

# Create a distinct list of categories from the NewCat column, excluding 'Drop'
categories_df = mapping_df[mapping_df['NewCat'] != 'Drop']['NewCat'].drop_duplicates().reset_index(drop=True)

# Create a DataFrame with CategoryId and CategoryName
cat_df = pd.DataFrame({
    'Id': [str(uuid.uuid4()) for _ in range(len(categories_df))],
    'Name': categories_df
})

# Create a mapping dictionary for category names to their GUIDs
cat_mapping = dict(zip(cat_df['Name'], cat_df['Id']))

# Create a distinct list of subcategories from the NewSubCat column, excluding rows where NewCat is 'Drop'
subcategories_df = mapping_df[mapping_df['NewCat'] != 'Drop'][['NewCat', 'NewSubCat']].drop_duplicates().reset_index(drop=True)

# Create a DataFrame with SubCategoryId, SubCategoryName and ParentCategoryId
subcat_df = pd.DataFrame({
    'Id': [str(uuid.uuid4()) for _ in range(len(subcategories_df))],
    'Name': subcategories_df['NewSubCat'],
    'ParentCategoryId': subcategories_df['NewCat'].map(cat_mapping)
})

# Save to CSV files
cat_df.to_csv('./categories.csv', index=False)
subcat_df.to_csv('./subcategories.csv', index=False)

print(f"Categories created: {len(cat_df)}")
print(f"Subcategories created: {len(subcat_df)}")

# Display samples of both dataframes
print("\nCategories sample:")
print(cat_df.head())

print("\nSubcategories sample:")
print(subcat_df.head())

Categories created: 12
Subcategories created: 163

Categories sample:
                                     Id                       Name
0  3d6bc8ae-0b9b-4ded-badc-c4047b39e081             Baby & Toddler
1  21373bc4-0361-452d-9a05-a6d9e81a3f3a                     Bakery
2  95ba94ba-e070-4350-977e-95b732ed3021       Beer, Wine & Spirits
3  b451869b-93c4-4b9d-ab04-5aa05e72a319  Tea, Coffee & Soft Drinks
4  9602ff1f-7545-480f-913d-bd929004af34            Treats & Snacks

Subcategories sample:
                                     Id                   Name  \
0  6aed5989-9de8-4afd-a7c3-0d46a049d690              Baby Food   
1  e213fd22-9874-4493-bee2-2f84845e044e    Baby Milk & Formula   
2  67145a1b-788f-4b67-b1a4-c27f187e5804        Baby Toiletries   
3  1f9a6397-8de5-4bd2-abdc-1df19a0e0ec8  Baby Wipes & Changing   
4  0e9eef1b-9af1-4190-a418-ca2ac9aae244    Feeding Accessories   

                       ParentCategoryId  
0  3d6bc8ae-0b9b-4ded-badc-c4047b39e081  
1  3d6bc8ae-0b9b-4ded-ba