# Import Necessary Libraries

In [1]:
import json
import numpy as np
import pandas as pd

### Read and Process Data

In [2]:
# Metadata fields kept from each record; a key missing from a record is
# stored as None (dict.get default).
# NOTE(review): the saved outputs further down show 'description' and 'price'
# columns that this cell does not extract -- confirm whether they belong here.
METADATA_FIELDS = ("title", "brand", "rank", "category", "asin", "also_buy")

preprocessed_data = []

# The file is read as JSON Lines: one JSON object per line.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale.
with open("Sampled_Amazon_Meta.json", 'r', encoding="utf-8") as file:
    for line in file:
        # A blank line (e.g. a trailing newline at the end of the file) is not
        # a JSON document; skip it instead of letting json.loads raise.
        if not line.strip():
            continue
        data = json.loads(line)

        # Keep only the selected fields of this record
        preprocessed_data.append({field: data.get(field) for field in METADATA_FIELDS})

In [3]:
# Preview only the first few records; echoing the whole list would render
# every parsed record into the cell output.
preprocessed_data[:3]

[{'title': "QIBOE Men's Baggy Jeans Denim Sweatpants Loose Pants",
  'description': ['<b>pant size(Unit:inch)</b><br> W30(tag30) Waist: 30.0 Hip: 41.7 Length: 43.3 Thigh: 26.8 Leg opening: 16.5 <br> W32(tag32) Waist: 32.0 Hip: 43.7 Length: 43.7 Thigh: 27.6 Leg opening: 16.9 <br> W33(tag34) Waist: 33.9 Hip: 45.7 Length: 44.1 Thigh: 28.3 Leg opening: 17.3 <br> W34(tag36) Waist: 35.8 Hip: 47.6 Length: 44.5 Thigh: 29.1 Leg opening: 17.7 <br> W36(tag38) Waist: 37.8 Hip: 49.6 Length: 44.9 Thigh: 29.9 Leg opening: 18.1 <br> W38(tag40) Waist: 40.0 Hip: 51.6 Length: 45.3 Thigh: 30.7 Leg opening: 18.5 <br> W40(tag42) Waist: 42.0 Hip: 53.5 Length: 45.7 Thigh: 31.5 Leg opening: 18.9 <br> W42(tag44) Waist: 44.0 Hip: 55.5 Length: 46.1 Thigh: 32.3 Leg opening: 19.3 <br> W44(tag46) Waist: 46.0 Hip: 57.5 Length: 46.5 Thigh: 33.1 Leg opening: 19.7 <br>'],
  'brand': 'QIBOE',
  'rank': '1,506,383 in Clothing, Shoes & Jewelry (',
  'price': '$38.99',
  'category': ['Clothing, Shoes & Jewelry',
   'Men',
 

### Create DataFrame

In [27]:
# Build one DataFrame row per parsed record, then display it
df = pd.DataFrame(data=preprocessed_data)
df

Unnamed: 0,title,description,brand,rank,price,category,asin,also_buy
0,QIBOE Men's Baggy Jeans Denim Sweatpants Loose...,[<b>pant size(Unit:inch)</b><br> W30(tag30) Wa...,QIBOE,"1,506,383 in Clothing, Shoes & Jewelry (",$38.99,"[Clothing, Shoes & Jewelry, Men, Clothing, Jea...",6342509379,"[B077GQQKRV, B07CBJQTF6, B07H2Z6S9J, B06Y26PZ5..."
1,Crazy Women's Voile Crinkle Scarf Shawl,[Feature <br> -Great quality winter scarf. <br...,Crazy,"273,519 in Clothing, Shoes & Jewelry (",$0.50,"[Clothing, Shoes & Jewelry, Women, Accessories...",6342502315,"[B018YRBB80, B07FD9HWPM, B017M55DI4, B07KX6PPW..."
2,FQQ Women Sexy Lingerie Lace Dress Sheer Babyd...,[Material : Core-spun fabric silk <br> feature...,FQQ,"3,266,227 in Clothing, Shoes & Jewelry (",$2.80,"[Clothing, Shoes & Jewelry, Women, Clothing, L...",6342522545,[B00VBVXVPI]
3,Crazy Women's Sexy Leather Backless Bodycon Cl...,[Material : Core-spun fabric silk <br> feature...,Crazy,"641,576 in Clothing, Shoes & Jewelry (",$8.50,"[Clothing, Shoes & Jewelry, Women, Clothing, D...",6342522898,"[B01AHZSZ9A, B01I809NCO, B07219C7LQ, B06ZZBQMT..."
4,FQQ Women's Sexy Lingerie Babydoll Dress Sleep...,[Material : Core-spun fabric silk <br> feature...,FQQ,"1,761,440 in Clothing, Shoes & Jewelry (",$4.50,"[Clothing, Shoes & Jewelry, Women, Clothing, L...",6342523002,"[B06XY5N95G, B01LY4VKTL, B01EKRMG8C, B004SLKRY..."
...,...,...,...,...,...,...,...,...
2686213,AIRSOFTPEAK Military Belt Security Mission Fir...,[<b>Specifications:</b><br><br>Adjustable Leng...,AIRSOFTPEAK,"244,921 in Sports & Outdoors (",$9.98,"[Sports & Outdoors, Sports & Fitness, Hunting ...",B013QOPJ6G,"[B01H6Z7X74, B0013AVGKY, B01DNBIX38, B01M0MDRF..."
2686214,Hot Sale High Quality Model Js Superior Metal ...,[<b>Model--------line Capacity(LBS-YDS)-------...,Ballant,"1,136,500 in Sports & Outdoors (",,"[Sports & Outdoors, Sports & Fitness, Hunting ...",B013QPG7E8,[B01DVWV16A]
2686215,Hasika All-Weather Diversified 8 x 8 Instant S...,[],Hasika,"416,498 in Sports & Outdoors (",$134.99,"[Sports & Outdoors, Outdoor Recreation, Campin...",B013QPIVY2,"[B004Z10D16, B00VTJKB1O, B07CMZY6JB, B004MMEHT..."
2686216,Aluminum Camping Tent Guyline Runner Rope Tigh...,"[, <b>Description:</b>, 100% Brand new and hig...",Generic,"255,216 in Sports & Outdoors (",$4.99,"[Sports & Outdoors, Outdoor Recreation, Campin...",B013QQKMO8,"[B00LLUDFMI, B071WPWPGP, B01IHRAXD6, B07BHZDLQ..."


In [28]:
# Keep the first row of every (title, asin) pair and discard repeats
df = df.drop_duplicates(subset=['title', 'asin'])

### Replace Empty Values With NaN

In [29]:
def replace_empty_with_nan(value):
    """Map "empty" cell values to ``np.nan`` and pass everything else through.

    A value counts as empty when it is a string that contains nothing but
    whitespace (including the empty string) or a list with no elements.

    Parameters
    ----------
    value : object
        A single cell value.

    Returns
    -------
    object
        ``np.nan`` for an empty value, otherwise ``value`` unchanged.
    """
    is_blank_string = isinstance(value, str) and value.strip() == ""
    is_empty_list = isinstance(value, list) and not value

    if is_blank_string or is_empty_list:
        return np.nan
    return value
    
# Apply the replacement to every cell of the frame.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map, so use map when this pandas provides it and fall back to
# applymap on older versions.
elementwise = df.map if hasattr(pd.DataFrame, "map") else df.applymap
df = elementwise(replace_empty_with_nan)

# Count the missing values per column
df.isnull().sum()

title            1233
description    524203
brand          169880
rank            18488
price          925269
category       112001
asin                0
also_buy            0
dtype: int64

In [30]:
# Discard every row that has a missing value in any column
df = df.dropna(axis=0, how="any")

# Confirm that no missing values remain
df.isna().sum()


title          0
description    0
brand          0
rank           0
price          0
category       0
asin           0
also_buy       0
dtype: int64

In [31]:
# Display the frame as it stands after the rows with missing values were dropped
df

Unnamed: 0,title,description,brand,rank,price,category,asin,also_buy
0,QIBOE Men's Baggy Jeans Denim Sweatpants Loose...,[<b>pant size(Unit:inch)</b><br> W30(tag30) Wa...,QIBOE,"1,506,383 in Clothing, Shoes & Jewelry (",$38.99,"[Clothing, Shoes & Jewelry, Men, Clothing, Jea...",6342509379,"[B077GQQKRV, B07CBJQTF6, B07H2Z6S9J, B06Y26PZ5..."
1,Crazy Women's Voile Crinkle Scarf Shawl,[Feature <br> -Great quality winter scarf. <br...,Crazy,"273,519 in Clothing, Shoes & Jewelry (",$0.50,"[Clothing, Shoes & Jewelry, Women, Accessories...",6342502315,"[B018YRBB80, B07FD9HWPM, B017M55DI4, B07KX6PPW..."
2,FQQ Women Sexy Lingerie Lace Dress Sheer Babyd...,[Material : Core-spun fabric silk <br> feature...,FQQ,"3,266,227 in Clothing, Shoes & Jewelry (",$2.80,"[Clothing, Shoes & Jewelry, Women, Clothing, L...",6342522545,[B00VBVXVPI]
3,Crazy Women's Sexy Leather Backless Bodycon Cl...,[Material : Core-spun fabric silk <br> feature...,Crazy,"641,576 in Clothing, Shoes & Jewelry (",$8.50,"[Clothing, Shoes & Jewelry, Women, Clothing, D...",6342522898,"[B01AHZSZ9A, B01I809NCO, B07219C7LQ, B06ZZBQMT..."
4,FQQ Women's Sexy Lingerie Babydoll Dress Sleep...,[Material : Core-spun fabric silk <br> feature...,FQQ,"1,761,440 in Clothing, Shoes & Jewelry (",$4.50,"[Clothing, Shoes & Jewelry, Women, Clothing, L...",6342523002,"[B06XY5N95G, B01LY4VKTL, B01EKRMG8C, B004SLKRY..."
...,...,...,...,...,...,...,...,...
2686209,Pro Impact Pro Swivel - Multi-Direction Heavy ...,"[, In order to get your speed bag set up, you&...",Pro Impact,"109,374 in Sports & Outdoors (",$29.99,"[Sports & Outdoors, Sports & Fitness, Other Sp...",B013QN02Y6,"[B002ZPX2QW, B004JPUTJY, B008R166C0, B002XP18R..."
2686210,Pro Impact Rugby Kicking Tee - Heavy Duty Adju...,"[, Whether you&rsquo;re <b>practicing</b> or <...",Pro Impact,"73,247 in Sports & Outdoors (",$9.99,"[Sports & Outdoors, Sports & Fitness, Team Spo...",B013QN0Y3K,"[B003NO7SPU, B074YF68P3, B01FG07PII, B0088R319..."
2686213,AIRSOFTPEAK Military Belt Security Mission Fir...,[<b>Specifications:</b><br><br>Adjustable Leng...,AIRSOFTPEAK,"244,921 in Sports & Outdoors (",$9.98,"[Sports & Outdoors, Sports & Fitness, Hunting ...",B013QOPJ6G,"[B01H6Z7X74, B0013AVGKY, B01DNBIX38, B01M0MDRF..."
2686216,Aluminum Camping Tent Guyline Runner Rope Tigh...,"[, <b>Description:</b>, 100% Brand new and hig...",Generic,"255,216 in Sports & Outdoors (",$4.99,"[Sports & Outdoors, Outdoor Recreation, Campin...",B013QQKMO8,"[B00LLUDFMI, B071WPWPGP, B01IHRAXD6, B07BHZDLQ..."


## Resolve `also_buy` ASINs to Product Titles

In [32]:
# Map every ASIN present in the frame to its product title so that the codes
# in 'also_buy' can be resolved with dictionary lookups.
asin_to_title = dict(zip(df['asin'], df['title']))


def _resolve_also_buy(asin, title, also_buy_codes):
    """Return ``(codes, names)`` for one product, led by the product itself.

    Parameters
    ----------
    asin : str
        The product's own ASIN; always the first element of ``codes``.
    title : str
        The product's own title; always the first element of ``names``.
    also_buy_codes : list
        ASIN codes from the row's 'also_buy' cell.

    Returns
    -------
    tuple of (list, list)
        ``codes`` holds the product's ASIN followed by every 'also_buy' code
        found in ``asin_to_title``; ``names`` holds the matching titles in
        the same order. Codes without a known title are dropped.
    """
    codes = [asin]
    names = [title]
    for code in also_buy_codes:
        # The product's own ASIN is already the first entry. Skipping it keeps
        # this cell idempotent: on a re-run 'also_buy' already starts with the
        # product's ASIN, which would otherwise be appended a second time.
        if code != asin and code in asin_to_title:
            codes.append(code)
            names.append(asin_to_title[code])
    return codes, names


# Resolve all rows in one pass over the three columns instead of writing to
# the frame cell by cell while iterating over it.
resolved = [
    _resolve_also_buy(asin, title, also_buy_codes)
    for asin, title, also_buy_codes in zip(df['asin'], df['title'], df['also_buy'])
]

# Build object Series aligned to the frame's index so each cell holds a list
df['also_buy'] = pd.Series(
    [codes for codes, names in resolved], index=df.index, dtype=object
)
df['also_buy_product_names'] = pd.Series(
    [names for codes, names in resolved], index=df.index, dtype=object
)

In [33]:
# Display the frame with the rewritten 'also_buy' and the new 'also_buy_product_names' column
df

Unnamed: 0,title,description,brand,rank,price,category,asin,also_buy,also_buy_product_names
0,QIBOE Men's Baggy Jeans Denim Sweatpants Loose...,[<b>pant size(Unit:inch)</b><br> W30(tag30) Wa...,QIBOE,"1,506,383 in Clothing, Shoes & Jewelry (",$38.99,"[Clothing, Shoes & Jewelry, Men, Clothing, Jea...",6342509379,[6342509379],[QIBOE Men's Baggy Jeans Denim Sweatpants Loos...
1,Crazy Women's Voile Crinkle Scarf Shawl,[Feature <br> -Great quality winter scarf. <br...,Crazy,"273,519 in Clothing, Shoes & Jewelry (",$0.50,"[Clothing, Shoes & Jewelry, Women, Accessories...",6342502315,"[6342502315, B00NSF70KM, B00KU1NLGO, B00G9EMVIA]","[Crazy Women's Voile Crinkle Scarf Shawl, Here..."
2,FQQ Women Sexy Lingerie Lace Dress Sheer Babyd...,[Material : Core-spun fabric silk <br> feature...,FQQ,"3,266,227 in Clothing, Shoes & Jewelry (",$2.80,"[Clothing, Shoes & Jewelry, Women, Clothing, L...",6342522545,[6342522545],[FQQ Women Sexy Lingerie Lace Dress Sheer Baby...
3,Crazy Women's Sexy Leather Backless Bodycon Cl...,[Material : Core-spun fabric silk <br> feature...,Crazy,"641,576 in Clothing, Shoes & Jewelry (",$8.50,"[Clothing, Shoes & Jewelry, Women, Clothing, D...",6342522898,"[6342522898, B01H43Z5GY]",[Crazy Women's Sexy Leather Backless Bodycon C...
4,FQQ Women's Sexy Lingerie Babydoll Dress Sleep...,[Material : Core-spun fabric silk <br> feature...,FQQ,"1,761,440 in Clothing, Shoes & Jewelry (",$4.50,"[Clothing, Shoes & Jewelry, Women, Clothing, L...",6342523002,[6342523002],[FQQ Women's Sexy Lingerie Babydoll Dress Slee...
...,...,...,...,...,...,...,...,...,...
2686209,Pro Impact Pro Swivel - Multi-Direction Heavy ...,"[, In order to get your speed bag set up, you&...",Pro Impact,"109,374 in Sports & Outdoors (",$29.99,"[Sports & Outdoors, Sports & Fitness, Other Sp...",B013QN02Y6,"[B013QN02Y6, B00LLXIMZU, B001B88VXI, B00175YK9...",[Pro Impact Pro Swivel - Multi-Direction Heavy...
2686210,Pro Impact Rugby Kicking Tee - Heavy Duty Adju...,"[, Whether you&rsquo;re <b>practicing</b> or <...",Pro Impact,"73,247 in Sports & Outdoors (",$9.99,"[Sports & Outdoors, Sports & Fitness, Team Spo...",B013QN0Y3K,"[B013QN0Y3K, B000TE9ZFI, B00WMJR00U, B001MGQBA...",[Pro Impact Rugby Kicking Tee - Heavy Duty Adj...
2686213,AIRSOFTPEAK Military Belt Security Mission Fir...,[<b>Specifications:</b><br><br>Adjustable Leng...,AIRSOFTPEAK,"244,921 in Sports & Outdoors (",$9.98,"[Sports & Outdoors, Sports & Fitness, Hunting ...",B013QOPJ6G,"[B013QOPJ6G, B0013AVGKY, B00AOCFPPE, B005DT80S...",[AIRSOFTPEAK Military Belt Security Mission Fi...
2686216,Aluminum Camping Tent Guyline Runner Rope Tigh...,"[, <b>Description:</b>, 100% Brand new and hig...",Generic,"255,216 in Sports & Outdoors (",$4.99,"[Sports & Outdoors, Outdoor Recreation, Campin...",B013QQKMO8,"[B013QQKMO8, B013FOVIGC, B005188T90, B006ZC5KLG]",[Aluminum Camping Tent Guyline Runner Rope Tig...


## Output to JSON

In [34]:
# Write the frame as JSON Lines: one JSON object per row, one row per line.
# itertuples(index=False, name=None) yields a plain tuple of cell values per
# row, which avoids the Series object that iterrows constructs for every row.
column_names = list(df.columns)

with open('output.json', 'w', encoding='utf-8') as f:
    for values in df.itertuples(index=False, name=None):
        # Pair each value with its column name to rebuild the row as a dict
        f.write(json.dumps(dict(zip(column_names, values))) + '\n')
