In [1]:
import numpy as np
import pandas as pd
import re
pd.set_option('display.max_colwidth', 500) 
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000) 

In [2]:
df = pd.read_csv("amazon_products.csv")
df.shape

(205, 10)

In [3]:
df = df.drop_duplicates()
df.shape

(205, 10)

In [4]:
# -------- Avg Review --------
df["Avg Review"] = (
    df["Avg Review"]
    .astype(str)
    .str.extract(r"([\d\.]+)")   # get 4.7
    .astype(float)
)

# -------- Review Count --------
df["Review Count"] = (
    df["Review Count"]
    .astype(str)
    .str.replace(r"[^\d]", "", regex=True)  # keep digits only
    .replace("", np.nan)
    .astype("Int64")
)

# -------- Has Prime --------
df["Has Prime"] = df["Has Prime"].notna().map({True: "Yes", False: "No"})

# -------- Price (convert "€405\n.\n18" → 405.18) --------
def clean_price(x):
    if not isinstance(x, str):
        return np.nan
    x = x.replace("\n", "")           # remove newlines
    x = x.replace("€", "")            # drop symbol
    x = x.replace(".", "")            # remove thousand dot if exists
    m = re.search(r"(\d+)\s*\,?\s*(\d{2})$", x)
    if m:  # standard case
        return float(f"{m.group(1)}.{m.group(2)}")
    # fallback: extract decimal via regex
    nums = re.findall(r"\d+", x)
    if len(nums) == 2:
        return float(f"{nums[0]}.{nums[1]}")
    return np.nan

df["Price (€)"] = df["Price"].apply(clean_price)

In [5]:
# Specifications split
def parse_spec(spec):
    if not isinstance(spec, str):
        return {}  # return empty dict for NaN or non-string
    lines = spec.split("\n")[2:]  # drop first 2 header lines
    data = {}
    for line in lines:
        if "\t" in line:
            key, value = line.split("\t", 1)
            data[key.strip()] = value.strip()
    return data

# Apply parsing and expand into columns
specs_expanded = df["Specifications"].apply(parse_spec).apply(pd.Series)

# Merge back with original dataframe (drop old column if needed)
df = pd.concat([df.drop(columns=["Specifications"]), specs_expanded], axis=1)

In [6]:
list(df.columns)

['Image',
 'Title',
 'Avg Review',
 'Review Count',
 'Has Prime',
 'Price',
 'Delivery',
 'Availability',
 'URL',
 'Price (€)',
 'Item Weight',
 'Product Dimensions',
 'Item model number',
 'Special features',
 'Form Factor',
 'Colour',
 'Guaranteed software updates until',
 'ASIN',
 'Customer Reviews',
 'Best Sellers Rank',
 'Date First Available',
 'Brand',
 'Manufacturer',
 'Series',
 'Standing screen display size',
 'Screen Resolution',
 'Resolution',
 'Number of USB 3.0 Ports',
 'Number of HDMI Ports',
 'Voltage',
 'Are Batteries Included',
 'Batteries',
 'Processor Brand',
 'Processor Type',
 'Processor Speed',
 'Processor Count',
 'RAM Size',
 'Memory Technology',
 'Computer Memory Type',
 'Maximum Memory Supported',
 'Memory Clock Speed',
 'Hard Drive Size',
 'Hard disk description',
 'Hard Drive Interface',
 'Audio Details',
 'Speaker Description',
 'Graphics Coprocessor',
 'Graphics Chipset Brand',
 'Graphics Card Description',
 'Graphics RAM Type',
 'Graphics Card Interface'

In [7]:
newdf = df[[
    "Title",
    "Price (€)",
    "Avg Review",
    "Review Count",
    "Availability",
    "Delivery",
    "Has Prime",
    "Image",
    "ASIN",
    "Brand",
    "Model Number",
    "Product Dimensions",
    "Item Weight",
    "Best Sellers Rank",
    "URL",
]]
newdf.shape

(205, 15)

In [8]:
newdf

Unnamed: 0,Title,Price (€),Avg Review,Review Count,Availability,Delivery,Has Prime,Image,ASIN,Brand,Model Number,Product Dimensions,Item Weight,Best Sellers Rank,URL
0,Comfyable 33 cm 14 inch Padded Laptop Sleeve Carry Case for MacBook Air M4 2025 M3 M2 M1 MacBook Pro M4 Pro/Max Rice White,25.73,4.7,1964,In stock,"FREE delivery Thursday, 4 December. Order within 21 hrs 47 mins",Yes,https://m.media-amazon.com/images/I/51u-bT0CBkL._AC_SX679_.jpg,B0BHW2975C,,,‎36 x 25 x 3 cm; 350 g,‎350 g,,https://aax-eu.amazon.nl/x/c/JIcaazgp5UoRiia7qaAYM-oAAAGa2WVm2AMAAAH2AQBvbm9fdHhuX2JpZDMgICBvbm9fdHhuX2ltcDIgICDSsluT/clv1_CEuOPUxokZA0iHrVaMJw6iH8Wk9HA7pk781WxzaSycZx139bsQr7OP018HHz9hvVjq0yzpNb3XnMfU0fAmcAB1h7LQa6EM7IE24uRFtsU1yLE5QiskfB8IkGXCQZUuW6XJeXalC7rdIQEQQ8kV_3ZVhXLA8QC8x8H15tWbp4SRWS0aSJkEohXxuzLJWvKbYuDzUcfMyagcpNBHhtJZhsefQQh8yliMIIKin6O61_zAYOsoJhkeWuV2gvjBmW5lJBN2PMN-SRnR9AjX0lL9xFahCwdXiTzSjwM7ud24izaJsmYmWl1aK1xkuNcA7FBIOT3_cq0eVrbNPflpL5Dua_CpRb3-RmYjAfo4oi3PYwG7UNw10jCROW_...
1,"2025 Newest Portable Monitor 15.6 Inch, FHD 1080P IPS Ultra Slim With Kickstand, USB Connection For Laptop, PC, Mac, PS4/5, Xbox and Switch",63.49,4.5,195,In stock,"FREE delivery Tomorrow, 2 December. Order within 12 hrs 2 mins",Yes,https://m.media-amazon.com/images/I/71GnA9jckyL._AC_SX679_.jpg,‎B0DF7FB27H,‎Dopesplay,,‎1.3 x 36.2 x 36.2 cm; 660 g,,,https://www.amazon.nl/-/en/Dopesplay-Portable-Monitor-Kickstand-Connection/dp/B0DF7FB27H/ref=sr_1_225
2,"HP OmniBook Ultra 14-fd0010nd 14"" 2240 x 1400 pixels Touchscreen AMD Ryzen AI 9 16GB 1000GB SSD",899.00,,,In stock,"FREE delivery Thursday, 4 December. Order within 12 hrs 46 mins",Yes,https://m.media-amazon.com/images/I/81dAKnF+FsL._AC_SX679_.jpg,‎B0DVCCYHNQ,‎HP,,‎22.76 x 31.51 x 1.64 cm; 1.57 kg,,,https://www.amazon.nl/-/en/OmniBook-14-fd0010nd-pixels-Touchscreen-1000GB/dp/B0DVCCYHNQ/ref=sr_1_133
3,Lenovo IdeaPad Slim 5 Laptop | 14 Inch OLED Display | Intel Core i7-13620H | 16GB RAM | 512GB SSD | Intel Graphics | Windows 11 Home | QWERTZ | Grey | 3 Months Premium Care,699.62,4.6,37,Only 2 left in stock.,"FREE delivery Friday, 5 December. Order within 12 hrs 46 mins",No,https://m.media-amazon.com/images/I/71s6Hw-TwxL._AC_SX679_.jpg,‎B0DDCRR1QS,‎Lenovo,,,,,https://www.amazon.nl/-/en/IdeaPad-Display-i7-13620H-Graphics-Windows/dp/B0DDCRR1QS/ref=sr_1_105
4,"ASUS ROG Zephyrus G16 GA605KP-QR022W | 16"" | OLED | AMD Ryzen AI 7 350 | 32GB RAM | 1TB SSD | NVIDIA GeForce RTX 5070 | Windows OS | QWERTY Keyboard",99.00,4.0,1,Only 5 left in stock.,"FREE delivery Friday, 5 December. Order within 12 hrs 45 mins",Yes,https://m.media-amazon.com/images/I/71koHMfmWcL._AC_SX679_.jpg,‎B0F6NVVKCF,‎ASUS,,‎35.4 x 24.6 x 1.64 cm; 4.15 kg,,,https://www.amazon.nl/-/en/Zephyrus-GA605KP-QR022W-GeForce-Windows-Keyboard/dp/B0F6NVVKCF/ref=sr_1_154
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,"Joseph Joseph Sipp Travel Mug, Hygienic, Leak-proof, Reusable Mug, Coffee & Tea Insulated Cup with Lid, 450ml (16 fl.oz), Steel",27.57,4.2,3990,In stock,"FREE delivery Tomorrow, 2 December. Order within 9 hrs 56 mins",Yes,https://m.media-amazon.com/images/I/41IKiYuRzrL._AC_SX679_.jpg,B0B1MTZXXZ,,,‎8.13 x 8.13 x 17.6 cm; 272.16 g,,,https://www.amazon.nl/-/en/Joseph-Hygienic-Leak-proof-Reusable-Insulated/dp/B0B1MTZXXZ/ref=sr_1_225
201,Disney - Mrs. Potts (Mrs.Bric) - Teiera 3D Sagomata In Ceramica,11.17,4.6,8423,In stock,"FREE delivery Tomorrow, 2 December on your first order shipped by Amazon. Order within 9 hrs 11 mins",No,https://m.media-amazon.com/images/I/71rlwuAtEvL._AC_SX522_.jpg,‎B0722XD7KD,‎Paladone,‎PP3556DP,‎11 x 11 x 10 cm; 270 g,,,https://www.amazon.nl/-/en/Disney-Mrs-Bric-Teiera-Sagomata-Ceramica/dp/B0722XD7KD/ref=sr_1_159
202,"Stanley Quencher H2.0 Flowstate Tumbler 1.2L - 11 Hours Cold - 48 Hours With Ice - Drinking Bottle With Straw, Handle And Lid - Dishwasher Safe - Thermos Cup For Cold Drinks - Plum",34.99,4.4,10538,In stock,"FREE delivery Thursday, 4 December. Order within 22 hrs 10 mins",Yes,https://m.media-amazon.com/images/I/61gpaVoAi2L._AC_SX679_.jpg,B0DFWW8JZN,,,‎14.8 x 14.8 x 31.8 cm; 650 g,,,https://www.amazon.nl/-/en/Stanley-Quencher-H2-0-Flowstate-Tumbler/dp/B0DFWW8JZN/ref=sr_1_32
203,"MAMEIDO Thermocup 350 ml, coffee cup to go made of stainless steel, double-walled insulated, leak-proof, coffee to go cup with keep warm function (Oak Wood)",11.19,4.5,7818,,FREE delivery 27 December - 8 January on your first order shipped by Amazon,No,https://m.media-amazon.com/images/I/71i9GTeBozL._AC_SX679_.jpg,‎B08THHFHZV,‎MAMEIDO,‎HU-XI-170,‎9.1 x 9.1 x 13.7 cm; 252 g,,,https://www.amazon.nl/-/en/Thermocup-stainless-double-walled-insulated-leak-proof/dp/B08THHFHZV/ref=sr_1_10


In [9]:
newdf = newdf.drop_duplicates()
newdf.shape

(204, 15)

In [10]:
newdf.to_csv("product.csv", index=False)