In [3]:
from haralyzer import HarParser, HarPage
import json

# Parse the browser-exported HAR capture that the following cells mine for data.
har_path = 'www.zomato.com.har'
har_parser = HarParser.from_file(har_path)

In [4]:
# Gather the response body text of every entry on the first HAR page whose
# request URL contains the substring 'home'.
homes = []
for entry in har_parser.pages[0].entries:
	if 'home' in entry.request.url:
		homes.append(entry.response.text)

In [5]:
# Each captured body is a JSON document; concatenate the entries found under
# sections -> SECTION_SEARCH_RESULT of every document into one flat list.
restraunts = [
	result
	for home in homes
	for result in json.loads(home)['sections']['SECTION_SEARCH_RESULT']
]

In [6]:
# Persist the combined search results so later runs can reuse them.
with open('restrauntList.json', 'w') as out_file:
	out_file.write(json.dumps(restraunts))

In [7]:
import pycurl
from io import BytesIO
from bs4 import BeautifulSoup
from rich import print
from pathlib import Path
import re
# import tempfile
import orjson

# Third-party requirements for this cell: pip install pycurl orjson

def request(url, header, post_data=None):
	"""Perform an HTTP request with pycurl and return ``(status_code, body_text)``.

	Args:
		url: Address to fetch.
		header: List of ``"Name: value"`` strings passed to curl as HTTPHEADER.
		post_data: Optional request body. When truthy the request is sent as
			a POST with this payload; otherwise curl's default method is used.

	Returns:
		A tuple of the HTTP response code and the response body decoded as UTF-8.

	Raises:
		Whatever ``perform()`` raises on a transfer failure (``pycurl.error``
		per the pycurl docs), and ``UnicodeDecodeError`` if the body is not
		valid UTF-8.
	"""
	contentBuffer = BytesIO()

	c = pycurl.Curl()
	try:
		c.setopt(c.URL, url)

		# Set the HTTP headers
		c.setopt(c.HTTPHEADER, header)
		c.setopt(c.WRITEDATA, contentBuffer)
		if post_data:
			c.setopt(c.POSTFIELDS, post_data)
			c.setopt(c.CUSTOMREQUEST, "POST")
		c.perform()

		# The response code must be read before the handle is closed.
		status_code = c.getinfo(c.RESPONSE_CODE)
	finally:
		# Always release the curl handle, even when the transfer fails;
		# otherwise every failed download leaks a handle.
		c.close()

	# Get the response content
	response_content = contentBuffer.getvalue().decode('utf-8')

	return status_code, response_content

def save_data(data, filename):
	"""Write ``data`` to ``filename`` as 2-space-indented JSON (UTF-8).

	Missing parent directories of ``filename`` are created first.
	"""
	target = Path(filename)
	target.parent.mkdir(parents=True, exist_ok=True)
	serialized = orjson.dumps(data, option=orjson.OPT_INDENT_2)
	target.write_text(serialized.decode('utf-8'), encoding='utf-8')

# Matches the double-quoted JS string literal handed to JSON.parse(...). The
# capture ends at the first *unescaped* quote, so trailing script code on the
# same line that also contains `")` is not swallowed.
_PRELOADED_STATE_RE = re.compile(
	r'window\.__PRELOADED_STATE__\s*=\s*JSON\.parse\("([^"\\]*(?:\\.[^"\\]*)*)"\)'
)

def extract_preloaded(response_content):
	"""Extract the ``window.__PRELOADED_STATE__`` object embedded in a page.

	Looks for ``window.__PRELOADED_STATE__ = JSON.parse("...")`` in
	``response_content``, un-escapes the string literal and parses the
	resulting JSON document.

	Args:
		response_content: Page source as text.

	Returns:
		The parsed state, or ``None`` when the assignment is not found.

	Raises:
		ValueError: if the embedded payload is not valid JSON
			(orjson's decode error is a ``ValueError`` subclass).
	"""
	extraction = _PRELOADED_STATE_RE.search(response_content)
	if not extraction:
		return None

	literal = extraction.group(1)
	try:
		# The literal is normally a JSON-compatible string: decoding it as a
		# JSON string keeps non-ASCII text intact and joins surrogate pairs.
		state_json = orjson.loads(f'"{literal}"')
	except ValueError:
		# JS-only escapes (\x3c, \') are not valid JSON. Fall back to Python's
		# escape decoder; encoding via latin-1 + backslashreplace keeps
		# non-ASCII characters from being mangled into mojibake.
		state_json = literal.encode('latin-1', 'backslashreplace').decode('unicode_escape')
	return orjson.loads(state_json)

# Browser-like request headers sent with every scrape request, kept as
# (name, value) pairs and rendered into the "Name: value" strings that
# pycurl's HTTPHEADER option takes.
_HEADER_FIELDS = (
	('__fetch_req__', 'true'),
	('accept', '*/*'),
	('accept-language', 'en-US,en;q=0.9'),
	('content-type', 'application/json'),
	('priority', 'u=1, i'),
	('sec-ch-ua', '"Not A;Brand";v="99", "Microsoft Edge";v="127", "Chromium";v="127"'),
	('sec-ch-ua-mobile', '?0'),
	('sec-ch-ua-platform', '"Windows"'),
	('sec-fetch-dest', 'empty'),
	('sec-fetch-mode', 'cors'),
	('sec-fetch-site', 'same-origin'),
	('user-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'),
)
common_headers = [': '.join(field) for field in _HEADER_FIELDS]

In [9]:
country, cityName = 'india', 'surat'

# Item fields that hold lists of tag slugs; they are concatenated in this
# order (followed by the single primary slug) into each item's 'slugs' list.
_SLUG_LIST_KEYS = (
	'tag_slugs', 'service_slugs', 'dietary_slugs', 'inapplicable_filter_tag_slugs',
	'secondary_tag_slugs', 'disclaimer_tag_slugs',
)


def _flatten_menu_items(menu_list):
	"""Flatten the nested menus -> categories -> items structure into flat dicts."""
	flat_items = []
	for menu_wrapper in menu_list['menus']:
		menu = menu_wrapper['menu']
		for category_wrapper in menu['categories']:
			category = category_wrapper['category']
			for item_wrapper in category['items']:
				item = item_wrapper['item']
				slugs = []
				for key in _SLUG_LIST_KEYS:
					slugs.extend(item[key])
				slugs.append(item['primary_tag_slug'])
				flat_items.append({
					'name': item['name'],
					'description': item['desc'],
					'rating': item['rating'],
					'media': item.get('item_image_url'),
					'price': item['price'],
					'menuName': menu['name'],
					'categoryName': category['name'],
					'slugs': slugs,
				})
	return flat_items


def _build_processed(data):
	"""Reduce a raw getPage response to the fields kept in the processed output."""
	sections = data['page_data']['sections']
	basic_info = sections['SECTION_BASIC_INFO']
	return {
		'basicInfo': {
			'name': basic_info['name'],
			'cuisine_string': basic_info['cuisine_string'],
			'canonicalUrl': data['page_info']['canonicalUrl'],
		},
		'ratings': basic_info['rating_new']['ratings'],
		'establishment': sections['SECTION_RES_CONTACT'],
		'items': _flatten_menu_items(data['page_data']['order']['menuList']),
	}


# Download the order page of every restaurant card, store the raw response and
# a trimmed-down "processed" version. The loop is best-effort: a failure on one
# restaurant is reported and the run moves on to the next.
for index, restraunt in enumerate(restraunts):
	if restraunt['type'] != 'restaurant': continue
	name = restraunt['info']['name']
	print(f" ⇢ Downloading {name} ({index + 1}/{len(restraunts)})")
	# presumably clickUrl looks like "/<city>/<slug>/..." — TODO confirm against the HAR data
	href = restraunt['cardAction']['clickUrl'].split('/')
	url = f"https://www.zomato.com/webroutes/getPage?page_url=/{href[1]}/{href[2]}/order&location=&isMobile=0"
	try:
		status_code, response_content = request(url, common_headers)
	except Exception as error:
		# `Exception` rather than a bare except so Ctrl-C / kernel interrupt
		# still stops the run.
		print(f"Error: {name} failed to download ({error})")
		continue

	if status_code != 200:
		print(f"Error: {name} failed to download (HTTP {status_code})")
		continue

	try:
		data = orjson.loads(response_content)
	except ValueError:
		# orjson's decode error is a ValueError subclass; a non-JSON body must
		# not abort the whole scrape.
		print(f"Error: {name} returned a response that is not valid JSON")
		continue
	save_data(data, f'raw/{country}/{cityName}/{href[2]}.json')

	try:
		if 'order' not in data['page_data']:
			print(f"Error: {name} does not have a menu")
			continue
		processed = _build_processed(data)
	except (KeyError, TypeError, IndexError) as error:
		# The raw response is already on disk, so it can be re-processed later.
		print(f"Error: {name} has an unexpected page layout ({error!r})")
		continue

	save_data(processed, f'processed/{cityName}/{href[2]}.json')