In [2]:
import pandas as pd

In [14]:
# Load the CSV files with the correct line terminator
images_df = pd.read_csv('Datasets/images.csv', sep=',', lineterminator='\n')
products_df = pd.read_csv('Datasets/products.csv', sep=',', lineterminator='\n')
# Remove \r from column names and values
images_df.columns = images_df.columns.str.replace('\r', '')
images_df['product_id'] = images_df['product_id'].str.replace('\r', '')
images_df.head()
# products_df.head()

Unnamed: 0.1,Unnamed: 0,id,product_id
0,0,912bb259-3ad9-457b-9db1-ce1da9016057,5f5f57d7-778f-4336-bb10-b43863418c8c
1,1,b166d305-b852-4bdd-83f4-465b20da94fa,5f5f57d7-778f-4336-bb10-b43863418c8c
2,2,68f5a29d-0075-4d60-81c1-ab684a82e50c,c2c8949f-3cde-4651-a234-4a4a1b2a9ad4
3,3,f6a309d7-d247-446a-9b5e-aceefdd4334d,c2c8949f-3cde-4651-a234-4a4a1b2a9ad4
4,4,2c2b3a6f-15b3-4289-937a-15482d9f5781,8292aa4e-7f1b-4655-bf0e-f1f2c9e3ffaf


In [16]:
# Step 1: Extract root categories and assign labels
root_categories = products_df['category'].apply(lambda x: x.split('/')[0].strip())
root_categories_unique = root_categories.unique()
label_encoder = {category: label for label, category in enumerate(root_categories_unique)}
print(label_encoder)

{'Home & Garden': 0, 'Baby & Kids Stuff': 1, 'DIY Tools & Materials': 2, 'Music, Films, Books & Games': 3, 'Phones, Mobile Phones & Telecoms': 4, 'Clothes, Footwear & Accessories': 5, 'Other Goods': 6, 'Health & Beauty': 7, 'Sports, Leisure & Travel': 8, 'Appliances': 9, 'Computers & Software': 10, 'Office Furniture & Equipment': 11, 'Video Games & Consoles': 12}


In [17]:
# Apply label encoding to products_df
products_df['label'] = root_categories.map(label_encoder)
products_df.head()

Unnamed: 0,Column1,id,product_name,category,product_description,price,location\r\r,label
0,0,243809c0-9cfc-4486-ad12-3b7a16605ba9,mirror wall art wokingham berkshir gumtre,"Home & Garden / Dining, Living Room Furniture ...",mirror wall art post nisha dine live room furn...,5.0,"Wokingham, Berkshire\r\r",0
1,1,1c58d3f9-8b93-47ea-9415-204fcc2a22e6,stainless steel food steamer inver highland gu...,Home & Garden / Other Household Goods,morphi richard model 48755stainless steel 3 ti...,20.0,"Inverness, Highland\r\r",0
2,2,860673f1-57f6-47ba-8d2f-13f9e05b8f9a,sun lounger skeg lincolnshir gumtre,Home & Garden / Garden & Patio / Outdoor Setti...,2 collect dont drive 20 ono,20.0,"Skegness, Lincolnshire\r\r",0
3,3,59948726-29be-4b35-ade5-bb2fd7331856,coffe side tabl ammunit ammo box hairpin leg r...,"Home & Garden / Dining, Living Room Furniture ...",great reclaim armi ammunit box use coffe side ...,115.0,"Radstock, Somerset\r\r",0
4,5,5707be4f-49d9-4feb-b2c8-aa0868405c65,spotlight newent gloucestershir gumtre,Home & Garden / Other Household Goods,new box need spotlight post sue home garden ho...,9.0,"Newent, Gloucestershire\r\r",0


In [18]:
# Merge images and products dataframes to assign labels to images
print(images_df["product_id"])
merged_df = images_df.merge(products_df[['id', 'label']], left_on='product_id', right_on='id', how='inner')
merged_df.head()

0        5f5f57d7-778f-4336-bb10-b43863418c8c
1        5f5f57d7-778f-4336-bb10-b43863418c8c
2        c2c8949f-3cde-4651-a234-4a4a1b2a9ad4
3        c2c8949f-3cde-4651-a234-4a4a1b2a9ad4
4        8292aa4e-7f1b-4655-bf0e-f1f2c9e3ffaf
                         ...                 
12599    2b0a652b-46a2-4297-b619-5efeeb222787
12600    719fd40a-870e-4144-b324-55dff2e66fb4
12601    719fd40a-870e-4144-b324-55dff2e66fb4
12602    86d1806b-5575-4a7e-9160-f24f12be6c95
12603    86d1806b-5575-4a7e-9160-f24f12be6c95
Name: product_id, Length: 12604, dtype: object


Unnamed: 0.1,Unnamed: 0,id_x,product_id,id_y,label
0,0,912bb259-3ad9-457b-9db1-ce1da9016057,5f5f57d7-778f-4336-bb10-b43863418c8c,5f5f57d7-778f-4336-bb10-b43863418c8c,0
1,1,b166d305-b852-4bdd-83f4-465b20da94fa,5f5f57d7-778f-4336-bb10-b43863418c8c,5f5f57d7-778f-4336-bb10-b43863418c8c,0
2,2,68f5a29d-0075-4d60-81c1-ab684a82e50c,c2c8949f-3cde-4651-a234-4a4a1b2a9ad4,c2c8949f-3cde-4651-a234-4a4a1b2a9ad4,0
3,3,f6a309d7-d247-446a-9b5e-aceefdd4334d,c2c8949f-3cde-4651-a234-4a4a1b2a9ad4,c2c8949f-3cde-4651-a234-4a4a1b2a9ad4,0
4,4,2c2b3a6f-15b3-4289-937a-15482d9f5781,8292aa4e-7f1b-4655-bf0e-f1f2c9e3ffaf,8292aa4e-7f1b-4655-bf0e-f1f2c9e3ffaf,0


In [19]:
# Save the merged dataframe as training_data.csv
merged_df.to_csv('Datasets/training_data.csv', index=False)

In [20]:
# Save label encoder as a dictionary
import json

with open('Datasets/label_encoder.json', 'w') as f:
    json.dump(label_encoder, f)