In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the Waitrose product data from its CSV export into a DataFrame.
waitrose_csv_path = './Waitrose/waitrose_products_playwright.csv'
waitrose_df = pd.read_csv(waitrose_csv_path)

# Last expression of the cell: renders a preview of the first five rows.
waitrose_df.head()


Unnamed: 0,parent_category,subcategory,product_id,product_name,price,price_per_unit,size,availability,on_offer,product_url,image_url
0,Fresh & Chilled,Fresh Fruit,568810,Waitrose Cantaloupe Melon,£2.90,Price per unit\r\n\r\n£2.90 each,Each,available,True,https://www.waitrose.com/ecom/products/waitros...,https://ecom-su-static-prod.wtrecom.com/images...
1,Fresh & Chilled,Fresh Fruit,38528,Waitrose Perfectly Ripe Kiwi,£2.60,Price per unit\r\n\r\n65p each,4s,available,True,https://www.waitrose.com/ecom/products/waitros...,https://ecom-su-static-prod.wtrecom.com/images...
2,Fresh & Chilled,Fresh Fruit,55061,Waitrose Seedless Sable Grapes,£2.48,Price per unit\r\n\r\n£6.20/kg,400g,available,True,https://www.waitrose.com/ecom/products/waitros...,https://ecom-su-static-prod.wtrecom.com/images...
3,Fresh & Chilled,Fresh Fruit,88903,Essential Fairtrade Bananas,20.,Price per unit\r\n\r\n£1.10/kg,Typical weight 0.18kg,available,False,https://www.waitrose.com/ecom/products/essenti...,https://ecom-su-static-prod.wtrecom.com/images...
4,Fresh & Chilled,Fresh Fruit,88528,Essential Fairtrade Bananas,95,Price per unit\r\n\r\n19p each,5s,available,False,https://www.waitrose.com/ecom/products/essenti...,https://ecom-su-static-prod.wtrecom.com/images...


In [3]:
# Load the two Tesco product exports: the main file and the separate
# food-cupboard file.
tesco_df = pd.read_csv('./Tesco/tesco_products_playwright.csv')
tesco_fc_df = pd.read_csv('./Tesco/tesco_products_food-cupboard_playwright.csv')

# Shape of the main export on its own, before the food-cupboard rows are appended.
print(f"Shape before concat duplicates: {tesco_df.shape}")

# Stack both exports into a single frame with a fresh 0..n-1 index.
tesco_df = pd.concat([tesco_df, tesco_fc_df], ignore_index=True)
print(f"Shape before removing duplicates: {tesco_df.shape}")

# A row is treated as a duplicate only when product_id, parent_category AND
# subcategory all match, so the same product_id listed under a different
# category/subcategory is kept. The first occurrence wins.
# The result is reassigned instead of mutating with inplace=True; surviving
# rows keep their original index labels, so the index may contain gaps.
tesco_df = tesco_df.drop_duplicates(
    subset=['product_id', 'parent_category', 'subcategory'],
    keep='first',
)

# Report the frame size once duplicates are gone.
print(f"Shape after removing duplicates: {tesco_df.shape}")

tesco_df.head()


Shape before concat duplicates: (31331, 16)
Shape before removing duplicates: (37541, 16)
Shape after removing duplicates: (32522, 16)


Unnamed: 0,product_id,product_name,standard_price,price_per_unit,clubcard_price,clubcard_price_per_unit,has_clubcard_offer,offer_dates,other_promotion,rating,review_count,is_sponsored,product_url,image_url,parent_category,subcategory
0,284477542,Tesco Pink Lady Apples 5 Pack,£2.80,£0.56/each,£2.50 Clubcard Price,(£0.50/each),True,Offer valid for delivery from 08/04/2025 until...,False,2.8,143.0,False,https://www.tesco.com/groceries/en-GB/products...,https://digitalcontent.api.tesco.com/v2/media/...,Fresh Food,Fresh Fruit
1,288115395,Tesco Finest Sweet Easy Peelers 600g,£2.20,£3.67/kg,,,False,,False,2.6,158.0,False,https://www.tesco.com/groceries/en-GB/products...,https://digitalcontent.api.tesco.com/v2/media/...,Fresh Food,Fresh Fruit
2,288135169,Jaffa Easy Peelers 600g,£2.00,£3.33/kg,,,False,,False,2.4,334.0,False,https://www.tesco.com/groceries/en-GB/products...,https://digitalcontent.api.tesco.com/v2/media/...,Fresh Food,Fresh Fruit
3,284475227,Tesco Conference Pears Pack 610G,£1.75,£2.87/kg,,,False,,False,,161.0,False,https://www.tesco.com/groceries/en-GB/products...,https://digitalcontent.api.tesco.com/v2/media/...,Fresh Food,Fresh Fruit
4,305831903,Rosedene Farms Gala Apples 6 Pack,£1.49,£0.25/each,,,False,,False,3.2,75.0,False,https://www.tesco.com/groceries/en-GB/products...,https://digitalcontent.api.tesco.com/v2/media/...,Fresh Food,Fresh Fruit


In [4]:
# Columns present in both retailers' frames that are carried into the combined frame.
shared_columns = ['product_id', 'product_name', 'parent_category', 'subcategory']

# Take an independent copy of those columns from each retailer's frame and
# tag every row with the retailer it came from.
waitrose_subset = waitrose_df[shared_columns].copy()
waitrose_subset['source'] = 'Waitrose'

tesco_subset = tesco_df[shared_columns].copy()
tesco_subset['source'] = 'Tesco'

# Stack the Waitrose rows first, then the Tesco rows, under a fresh 0..n-1 index.
combined_df = pd.concat([waitrose_subset, tesco_subset], ignore_index=True)

# Print the size of the combined frame and of each retailer's contribution.
# (No bare `combined_df.head()` here: an expression that is not the last
# statement of a cell is evaluated but never rendered.)
print(f"Combined shape: {combined_df.shape}")
print(f"Number of Waitrose products: {len(waitrose_subset)}")
print(f"Number of Tesco products: {len(tesco_subset)}")

Combined shape: (58121, 5)
Number of Waitrose products: 25599
Number of Tesco products: 32522


In [5]:
# Add three columns to the combined frame in a single step:
#   cleaned_product_name - product_name with every run of whitespace collapsed
#                          to one space and leading/trailing whitespace removed
#   newCat, newSubCat    - empty-string columns (presumably placeholders to be
#                          filled in later; confirm against downstream usage)
combined_df = combined_df.assign(
    cleaned_product_name=(
        combined_df['product_name']
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    ),
    newCat='',
    newSubCat='',
)

# Preview the first rows with the new columns.
combined_df.head()

Unnamed: 0,product_id,product_name,parent_category,subcategory,source,cleaned_product_name,newCat,newSubCat
0,568810,Waitrose Cantaloupe Melon,Fresh & Chilled,Fresh Fruit,Waitrose,Waitrose Cantaloupe Melon,,
1,38528,Waitrose Perfectly Ripe Kiwi,Fresh & Chilled,Fresh Fruit,Waitrose,Waitrose Perfectly Ripe Kiwi,,
2,55061,Waitrose Seedless Sable Grapes,Fresh & Chilled,Fresh Fruit,Waitrose,Waitrose Seedless Sable Grapes,,
3,88903,Essential Fairtrade Bananas,Fresh & Chilled,Fresh Fruit,Waitrose,Essential Fairtrade Bananas,,
4,88528,Essential Fairtrade Bananas,Fresh & Chilled,Fresh Fruit,Waitrose,Essential Fairtrade Bananas,,


In [None]:
# Write the combined frame to CSV in the working directory; index=False keeps
# the row index out of the file.
combined_df.to_csv(path_or_buf='./combined_cleaned.csv', index=False)
