# Scrape and save 

In [None]:
import requests
from bs4 import BeautifulSoup
import re,json

In [None]:
class Categories():
    """Level-by-level crawler for the Amazon category tree on browsenodes.com.

    Attributes:
        target: ``None`` to start from the site's home page, or an
            ``(id, name)`` pair naming the category to start from.
        website: Base URL that every page address is built from.
        level: Index of the level whose children are fetched next; level 0
            holds the starting categories.
        edges: Maps a level to a list of ``(parent_id, child_id)`` pairs. A
            ``child_id`` of ``None`` records a parent with no children listed.
        id_to_name: Maps a category id (string) to its display name.
        n: Number of table cells that make up one category row on the pages
            currently being parsed.
    """

    def __init__(self, target=None):
        """Seed level 0 from the home page or from a single given category.

        Args:
            target: ``None`` to load the main root categories from the home
                page, or an ``(id, name)`` pair whose id is a string of
                digits. A malformed target is reported with ``print`` and
                leaves level 0 empty.
        """
        self.target = target  # either None or (id, name)
        self.website = 'https://www.browsenodes.com/amazon.com'
        self.level = 0  # category level; level 0 holds the root categories
        self.edges = {self.level: []}  # level: [(parent_id, child_id), ...]
        self.id_to_name = {}

        if target is None:
            # Home page rows use the <name> <id> <name> <url> layout.
            self.n = 4
            self.get_info(self._fetch_soup(self.website))
        elif len(target) == 2:
            # All other pages use the <name> <id> <url> layout.
            self.n = 3
            # No page is fetched here: the start node is fully described by
            # the caller, and its children are loaded by process_descendents.
            if not re.search(r"^\d+$", target[0]):
                print("Error: please enter the category id as the first element of target.")
            else:
                self.id_to_name[target[0]] = target[1]
                self.edges[0].append((None, target[0]))
        else:
            print("Error: please enter target in the right format.")

    def _fetch_soup(self, url):
        """Download ``url`` and parse the response text with the lxml parser."""
        return BeautifulSoup(requests.get(url).text, 'lxml')

    def num_children(self):
        """Return how many edges of the current level point at a real child."""
        return sum(1 for _, child in self.edges[self.level] if child is not None)

    def get_url(self, category_id):
        """Return the lookup-page URL for ``category_id``."""
        return f"{self.website}/browseNodeLookup/{category_id}.html"

    def get_info(self, soup, parent_id=None):
        """Record the categories listed in a parsed page.

        Args:
            soup: Parsed page; only ``find_all("td")`` and each cell's
                ``text`` are used.
            parent_id: Id of the category the page belongs to, or ``None``
                for the home page. Children of ``None`` are stored at level
                0, all others at ``self.level + 1``.
        """
        newlevel = self.level + 1
        table = soup.find_all("td")  # cells of the data table on the page

        # No data, or the page states that this category is a leaf.
        if len(table) == 0 or "is a leaf node" in table[0].text:
            # setdefault: the level list may not have been created yet.
            self.edges.setdefault(newlevel, []).append((parent_id, None))
            return

        curr_name = None
        for idx, entry in enumerate(table):
            text = re.sub(r"\s{2,}", "", entry.text)
            position = (idx + 1) % self.n
            if position == 1:  # first cell of a row: the category name
                curr_name = text
            elif position == 2:  # second cell of a row: the category id
                curr_id = text
                if curr_id not in self.id_to_name:
                    self.id_to_name[curr_id] = curr_name
                if parent_id is None:
                    self.edges[0].append((None, curr_id))
                else:
                    self.edges.setdefault(newlevel, []).append((parent_id, curr_id))
                curr_name = None

    def process_descendents(self, filename):
        """Fetch children level by level until a level has no children.

        Args:
            filename: Name of the JSON file inside ``../categories/``. It is
                rewritten with ``(edges, id_to_name)`` after every fetched
                page, so it always reflects the pages processed so far.
        """
        self.n = 3
        while self.num_children() > 0:
            self.edges[self.level + 1] = []
            for _, curr in self.edges[self.level]:
                if curr is None:
                    continue
                self.get_info(self._fetch_soup(self.get_url(curr)), curr)
                print(f"Processed level {self.level+1} children of {self.id_to_name[curr]} ({curr})")
                with open(f"../categories/{filename}", "w+") as file:
                    json.dump((self.edges, self.id_to_name), file)
            self.level += 1

def process(parent_id, parent_name):
    """Crawl all descendants of one category and print the collected data.

    Args:
        parent_id: Id of the starting category.
        parent_name: Name of the starting category; also used as the stem of
            the output file ``<parent_name>.json``.
    """
    start = (parent_id, parent_name)
    crawler = Categories(start)
    output_name = f"{parent_name}.json"
    crawler.process_descendents(output_name)
    print(crawler.edges, "\n", crawler.id_to_name)

In [None]:
# Crawl the subtree rooted at "Nail Art & Polish".
parent_id, parent_name = '11059311', "Nail Art & Polish"
process(parent_id, parent_name)

In [None]:
# Crawl the subtree rooted at "Smoking Cessation".
parent_id, parent_name = '3764401', "Smoking Cessation"
process(parent_id, parent_name)

In [None]:
# Crawl the subtree rooted at the women's jewelry category.
parent_id, parent_name = '7192394011', "Clothing, Shoes & Jewelry - Women - Jewelry"
process(parent_id, parent_name)

In [None]:
# Crawl the subtree rooted at the men's jewelry category.
parent_id, parent_name = '3887881', "Clothing, Shoes & Jewelry - Men - Jewelry"
process(parent_id, parent_name)

In [None]:
# Crawl the subtree rooted at the boys' jewelry category.
parent_id, parent_name = '3880611', "Clothing, Shoes & Jewelry - Boys - Jewelry"
process(parent_id, parent_name)

In [None]:
# Crawl the subtree rooted at the girls' jewelry category.
parent_id, parent_name = '3880961', "Clothing, Shoes & Jewelry - Girls - Jewelry"
process(parent_id, parent_name)

In [None]:
# Crawl the subtree rooted at the orchestral strings accessories category.
parent_id, parent_name = (
    '5524110011',
    "Musical Instruments - Instrument Accessories - Orchestral Strings Accessories & Parts",
)
process(parent_id, parent_name)

# Combine and translate 

In [1]:
import os 
import json
import pandas as pd 
# Merge every per-category crawl file in ../categories into
#   all_id_to_name: category id -> name (first-seen order across files)
#   levels:         level -> parent id -> list of child ids
all_id_to_name = {}
levels = {}
for filename in os.listdir("../categories"):
    # Skip the combined graph and the CSV side files, as before.
    if "categories_graph.json" in filename or ".csv" in filename:
        continue
    # os.listdir also returns sub-directories and stray files (for example
    # notebook checkpoints); only crawl outputs, which end in .json, can be
    # opened and parsed here.
    if not filename.endswith(".json"):
        continue
    filepath = f"../categories/{filename}"
    with open(filepath, "r") as file:
        edges, id_to_name = json.load(file)
    all_id_to_name.update(id_to_name)
    for level, edge_list in edges.items():
        for s, t in edge_list:
            # A None child marks a parent without children: nothing to link.
            if t is not None:
                levels.setdefault(level, {}).setdefault(s, []).append(t)

# One name per row, in the iteration order of all_id_to_name.
names = [{"name": v} for v in all_id_to_name.values()]
df = pd.DataFrame(names)
df.to_csv("../categories/names.csv")
with open("../data/categories_graph.json", "w+") as file:
    json.dump(levels, file)

In [2]:
# Pair each category with the translation stored on the same row index of
# translations.csv, then write the combined table to ../data/ID_TO_NAME.csv.
# NOTE(review): rows are matched purely by position, so translations.csv is
# presumably in the same order as all_id_to_name -- confirm.
zh = pd.read_csv("../categories/translations.csv")
complete_id_to_name = []
for idx, (cat_id, name) in enumerate(all_id_to_name.items()):
    trans = zh['name'][idx]
    complete_id_to_name.append(dict(id=cat_id, name=name, translation=trans))
df = pd.DataFrame(complete_id_to_name)
df.to_csv("../data/ID_TO_NAME.csv")

# Visualizations + relevant category ids & paths  

In [7]:
import pandas as pd 
# Rebuild the id -> name lookup from the saved table; ids become strings.
data = pd.read_csv("../data/ID_TO_NAME.csv")
all_id_to_name = {
    str(cat_id): name for cat_id, name in zip(data['id'], data['name'])
}

In [5]:
import networkx as nx 
import matplotlib.pyplot as plt 

def graph(filepath, figsize=(15, 15), show=False):
    """Build an undirected graph from one saved crawl file.

    Args:
        filepath: JSON file holding an ``(edges, id_to_name)`` pair.
        figsize: Figure size used when the graph is drawn.
        show: When true, draw the graph with labels and display it.

    Returns:
        ``(G, id_to_name)`` where every node of ``G`` is a ``(name, id)``
        tuple and ``id_to_name`` is the mapping read from the file.
    """
    with open(filepath, "r") as handle:
        edges, id_to_name = json.load(handle)

    G = nx.Graph()
    for edgelist in edges.values():
        for pair in edgelist:
            # Edges with a missing endpoint (root marker / childless parent)
            # carry no link to draw.
            if any(part is None for part in pair):
                continue
            parent_id, child_id = pair[0], pair[1]
            G.add_edge(
                (id_to_name[parent_id], parent_id),
                (id_to_name[child_id], child_id),
            )

    if show:
        plt.figure(figsize=figsize)
        nx.draw(G, node_color="lightblue", with_labels=True)
        plt.show()
    return G, id_to_name

def find_node(target, id_to_name):
    """Return the ids of every category whose name equals ``target`` exactly.

    Ids are returned in the iteration order of ``id_to_name``; the list is
    empty when no name matches.
    """
    return [cat_id for cat_id, name in id_to_name.items() if name == target]


def bfs(G, source, target):
    '''
    Breadth-first search for the shortest path from source to a matching node.

    G: graph whose nodes are (name, id) tuples; only G.neighbors(node) is used
    source: (name, id)
    target: id (a node matches when target is one of its tuple elements)

    Returns the path as a list of nodes starting with source, or None when no
    reachable node matches.
    '''
    from collections import deque

    queue = deque([[source]])
    # Remember every node already queued: without this, an undirected graph
    # makes the search bounce between neighbours forever when the target is
    # unreachable, and re-expand the same nodes exponentially otherwise.
    visited = {source}
    while queue:
        path = queue.popleft()
        node = path[-1]
        if target in node:
            return path
        for neighbor in G.neighbors(node):
            if neighbor in visited:
                continue
            visited.add(neighbor)
            queue.append(path + [neighbor])
    return None

def find_paths(source, keyword):
    """Find the path from a root category to each category named ``keyword``.

    Args:
        source: Id of the root category; its crawl file
            ``../categories/<name>.json`` is loaded via ``all_id_to_name``.
        keyword: Exact category name to look for.

    Returns:
        ``(targets, paths)``: the ids whose name equals ``keyword``, and a
        dict mapping the id at the end of each found path to that path (a
        list of ``(name, id)`` nodes). Targets with no path are left out of
        ``paths``.
    """
    G, id_to_name = graph(f"../categories/{all_id_to_name[source]}.json")
    root = (id_to_name[source], source)
    targets = find_node(keyword, id_to_name)
    paths = {}
    for cat_id in targets:
        path = bfs(G, root, cat_id)
        print(path, "\n")
        if path is None:
            # bfs reports an unreachable target with None; indexing it would
            # raise TypeError, so skip the target instead.
            continue
        final = path[-1]
        paths[final[-1]] = path
    return targets, paths

In [None]:
# Draw the crawl result for category 5524110011 at the default figure size.
G, id_to_name = graph(
    filepath=f"../categories/{all_id_to_name['5524110011']}.json",
    show=True,
)

In [None]:
# Draw the crawl result for category 7192394011 on a larger 25x25 figure.
G, id_to_name = graph(
    filepath=f"../categories/{all_id_to_name['7192394011']}.json",
    figsize=(25, 25),
    show=True,
)

In [None]:
# Save the ids named "Violin" under category 5524110011, with their paths.
targets, paths = find_paths(source='5524110011', keyword='Violin')
with open("../data/categories/Violin.json", "w+") as file:
    file.write(json.dumps((targets, paths)))

In [8]:
# Save the ids named "Smokeless Inhalers" under category 3764401, with paths.
targets, paths = find_paths(source='3764401', keyword='Smokeless Inhalers')
with open("../data/categories/SmokelessInhalers.json", "w+") as file:
    file.write(json.dumps((targets, paths)))

[('Smoking Cessation', '3764401'), ('Smokeless Inhalers', '4078751')] 



In [None]:
# Look up every jewelry keyword under each of the four jewelry root
# categories and save all matching ids with their paths in one file.
sources = [
    '7192394011',
    '3887881',
    '3880961',
    '3880611',
]
keywords = [
    'Bracelets',
    'Necklaces',
    'Rings',
    'Anklets',
    'Earrings',
    'Brooches & Pins',
    'Necklaces & Pendants',
    'Wedding & Engagement',
    'Body Jewelry',
    'Jewelry Sets',
]

search_ids, search_paths = [], {}
for s in sources:
    for k in keywords:
        targets, paths = find_paths(s, k)
        search_ids += targets
        search_paths.update(paths)

with open("../data/categories/Jewelry.json", "w+") as file:
    file.write(json.dumps((search_ids, search_paths)))

In [None]:
import os,json
# For every target id saved in ../data/categories, collect the ids of the
# other nodes on its path (every path entry whose id is not the target).
search_ids = {}
for filename in os.listdir("../data/categories"):
    # os.listdir also returns sub-directories and stray files (for example
    # notebook checkpoints); only the saved .json exports can be parsed.
    if not filename.endswith(".json"):
        continue
    filepath = f"../data/categories/{filename}"
    with open(filepath, "r") as file:
        ids, paths = json.load(file)
    for i, j in paths.items():
        # Each path entry is a [name, id] pair; keep the ids except i itself.
        search_ids[i] = [k[1] for k in j if k[1] != i]
with open("../data/search_ids.json", "w+") as file:
    json.dump(search_ids, file)