In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import time 
import concurrent.futures
import re
import user_agent
import random

In [13]:
''' 
Here you should add you cookies and headers to avoid from recaptcha.
You can use this site: https://curlconverter.com/
'''

params = {
    'topArea': '2',
    'area': '1',
    'city': '5000',
    'propertyGroup': 'apartments',
    'property' : [1,3,4,49] ,
    'price': '600000--1',
    'page': 1,
}

In [14]:
def more_details(item_id,headers=None ,cookies =None):
    '''
    This function gets additional details of the real estate item by its ID from Yad2.co.il
    '''
    details  = {}
#         headers= generate_http_headers()
    response = requests.get(f'https://www.yad2.co.il/api/item/{item_id}', headers=headers, cookies=cookies)
    if 'ShieldSquare Captcha' in response.text:
        print(f"(more_details) Captcha item_id: {item_id}")
        return "break"
    
    response_json = response.json()

    try:
        details['Price'] = response_json['price']
        details['Street'] = response_json['street']
        details['Neighborhood'] = response_json['neighborhood']
        details['Parking'] = response_json['parking']
        details['Balconies'] = response_json['balconies']
        details['Size'] = response_json['square_meters']
        details['Floors'] = response_json['TotalFloor_text']
        
        # analytics_items
        details['Asset_classification'] = response_json['analytics_items']['asset_classification']
        details['Ac'] = response_json['analytics_items']['air_conditioner']
        details['Furniture'] = response_json['analytics_items']['furniture']
        details['Handicapped'] = response_json['analytics_items']['handicapped']
        details['Immediate'] = response_json['analytics_items']['immediate']
        details['Floors_text'] = response_json['analytics_items']['number_of_floors']
        details['Street_id'] = response_json['analytics_items']['street_id']
        details['Rooms'] = response_json['analytics_items']['rooms']
        details['Floor'] = response_json['analytics_items']['floor']
        details['Shelter'] = response_json['analytics_items']['shelter_room']
        details['On_pillars'] = response_json['analytics_items']['on_pillars']
        details['Elevator'] = response_json['analytics_items']['elevator']
        details['Storeroom'] = response_json['analytics_items']['storeroom']

        details['Lat']  = response_json['analytics_items']['lat']
        details['Long']  = response_json['analytics_items']['long']
        try:
            details['Home_number'] = response_json['address_home_number']
        except:
            details['Home_number'] = ''
            
#         details['home_number_2'] = response_json['address_home_num']

        ## Drop after
        details['Images'] = response_json['images']
        details['Text'] = response_json['info_text']
        details['Date_of_entry'] = response_json['date_of_entry']        
        details['Item_id'] = item_id
    except Exception as e:
        print(f"{item_id} error get details: {e}")
    return details

    

In [15]:
def get_details(ad_ids_list  ,headers=None , cookies= None):
    data = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
        futures = []
        for ad_id in ad_ids_list:
            
            futures.append(executor.submit(more_details, ad_id, headers , cookies))
        for index , future in enumerate(concurrent.futures.as_completed(futures)):
            try:
                new_data = future.result()
                if new_data == 'break':
                    df = pd.DataFrame(data)
                    return df
                
                data.append(new_data)
                
            except Exception as e:
                print(f"Error future: {e}")
         
            if index % 10 == 0:
                print(f"{index}/ {len(futures)}")
            
    return pd.DataFrame(data)

In [19]:

def get_page_data(page, headers=None , cookies=None):
    params = {
        'topArea': '2',
        'area': '1',
        'city': '5000',
        'propertyGroup': 'apartments',
        'property' : [1,3,4,49] ,
        'price': '600000--1',
        'page': page,
    }
    url = 'https://www.yad2.co.il/realestate/forsale'

    
    if page % 20 == 0:
        print(page)
    try:
        headers= generate_http_headers()
        response = requests.get(url, params=params, headers=headers, cookies=cookies)
        soup = BeautifulSoup(response.content, 'html5lib')
        text = soup.find('title').text
        if text == 'ShieldSquare Captcha':
            print(f"Captcha page: {page}")

            
    except requests.exceptions.ConnectionError:
        print(f'ConnectionError: Retrying in 3 seconds for page {page}')
        time.sleep(3)
#         headers=generate_http_headers()
        response = requests.get(url, params=params, headers=headers ,cookies=cookies)
    
#     print(f'{page} - {response}')

    if response.status_code != 200:
        print(f'-----{page}-{response}----')
        return []

    soup = BeautifulSoup(response.content, 'html5lib')
    table = soup.find('div', attrs={'class': 'column_large'})
    rooms = table.find_all('div', attrs={'class': 'rooms-item'})
    
    if not rooms:
        print(f'------last_page:{page}-------------')
        print(soup.find('title').text)
        return []
    
    ad_ids = []
    for item in table.find_all('div', attrs={'class': 'feeditem table'})[:37]:
        item_id = item.select_one('div')['item-id']
        ad_ids.append(item_id)
        
    return ad_ids

def get_ad_ids_list( headers=None, cookies = None):
    try:
        length = 120
#         length = find_number_of_pages(headers)
    except (requests.exceptions.Timeout, requests.exceptions.ConnectionError):
        print(f'Retrying in 5 seconds to find number of pages')
        time.sleep(5)
        try:
            length = find_number_of_pages(headers , cookies)
        except:
            return 

    
    ids =[]
    
    with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
        future = []
        for page_no in range(1,length):
            future.append(executor.submit(get_page_data, page_no , headers , cookies))
        
        for future in concurrent.futures.as_completed(future):
            try:
                ids.append(future.result())
            except:
                pass
    
    ids_list = []
    for sublist in ids:
        if sublist:  # check if sublist is not empty
            ids_list.extend(sublist)
            
    ids_list = set(ids_list) # clean duplicates
    return list(ids_list)

def find_number_of_pages(headers=None , cookies=None):
    params = {
            'topArea': '2',
            'area': '1',
            'city': '5000',
            'propertyGroup': 'apartments',
            'price': '600000--1',
            'page': 1,
        }
    response = requests.get('https://www.yad2.co.il/realestate/forsale', params=params , headers=headers ,cookies =cookies)

    
    soup = BeautifulSoup(response.content, 'html5lib')
    pages = soup.find('div', attrs = {'class':'page-num-container'}).get_text(strip = True)
    print(pages)
    numbers = re.findall(r'\d+', pages)
    last_page = int(max(map(int, numbers)))
    
    if soup.title.text == 'ShieldSquare Captcha':
        return print(soup.title.text)
#     elif str(response) !='<Response [200]>':
#         return print(f'-Error:---{response}----')
    print(last_page)
    
    return last_page - 1

headers = {'Accept': '*/*', 'Connection': 'keep-alive', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.158 Safari/537.36', 'Accept-Encoding': 'gzip, deflate, br', 'Accept-Language': 'en-US;q=0.5,en;q=0.3', 'Cache-Control': 'max-age=0', 'Upgrade-Insecure-Requests': '1', 'Referer': 'https://google.com'}

cookies = {
    '__uzma': '8366b179-a759-4275-a4a3-6b498c09436c',
    '__uzmb': '1680255804',
    '__uzme': '6555',
    '__uzmc': '172267335151',
    '__uzmd': '1681637572',
    '__uzmf': '7f600077151f84-b092-4464-9c79-2405e145574616802558048911381767769-27ff023f0123139573',
    '_ga_GQ385NHRG1': 'GS1.1.1681637572.6.0.1681637572.60.0.0',
    '_ga': 'GA1.3.786487878.1680255806',
    'guest_token': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJwYXlsb2FkIjp7InV1aWQiOiI4MTBmYTY1Yy1lNzcwLTQ2ODktOWM0My0xYmFlMmZlNGIzNDAifSwiaWF0IjoxNjgwMjU1ODA5LCJleHAiOjE3MTM0NzIwNjU1MDF9.-zfPfbRBprhYZkvP5tdnFI_EIOmD5tIB60Q2Sj2A8fs',
    '_hjSessionUser_266550': 'eyJpZCI6IjYzZDMzOTk2LTNmZTgtNWFjYy05ODNjLWUwNzkzOThhZTliNiIsImNyZWF0ZWQiOjE2ODAyNTU4MDc0MzYsImV4aXN0aW5nIjp0cnVlfQ==',
    '_fbp': 'fb.2.1680255807603.1051794013',
    '__gads': 'ID=79bf36825a7c5bfd:T=1680255813:S=ALNI_Ma8_ILszTO_GgX-0fIXwvIYfAQ9iQ',
    '__gpi': 'UID=00000bf9063d568a:T=1680255813:RT=1681637574:S=ALNI_MaBc6WEHoL3OLd0p7Oa6q3e7Z9SWQ',
    'bc.visitor_token': '7047503683635204096',
    'canary': 'never',
    '_gid': 'GA1.3.837312329.1681637573',
    '_gat_UA-708051-1': '1',
    '_hjIncludedInSessionSample_266550': '0',
    '_hjSession_266550': 'eyJpZCI6ImIzOTI5NjMyLTJkNDAtNDVkNi1iYjE3LWJjOWQ3Yjg4YWI4ZiIsImNyZWF0ZWQiOjE2ODE2Mzc1NzQzNDYsImluU2FtcGxlIjpmYWxzZX0=',
    '_hjAbsoluteSessionInProgress': '0',
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    # 'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    # 'Cookie': '__uzma=8366b179-a759-4275-a4a3-6b498c09436c; __uzmb=1680255804; __uzme=6555; __uzmc=172267335151; __uzmd=1681637572; __uzmf=7f600077151f84-b092-4464-9c79-2405e145574616802558048911381767769-27ff023f0123139573; _ga_GQ385NHRG1=GS1.1.1681637572.6.0.1681637572.60.0.0; _ga=GA1.3.786487878.1680255806; guest_token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJwYXlsb2FkIjp7InV1aWQiOiI4MTBmYTY1Yy1lNzcwLTQ2ODktOWM0My0xYmFlMmZlNGIzNDAifSwiaWF0IjoxNjgwMjU1ODA5LCJleHAiOjE3MTM0NzIwNjU1MDF9.-zfPfbRBprhYZkvP5tdnFI_EIOmD5tIB60Q2Sj2A8fs; _hjSessionUser_266550=eyJpZCI6IjYzZDMzOTk2LTNmZTgtNWFjYy05ODNjLWUwNzkzOThhZTliNiIsImNyZWF0ZWQiOjE2ODAyNTU4MDc0MzYsImV4aXN0aW5nIjp0cnVlfQ==; _fbp=fb.2.1680255807603.1051794013; __gads=ID=79bf36825a7c5bfd:T=1680255813:S=ALNI_Ma8_ILszTO_GgX-0fIXwvIYfAQ9iQ; __gpi=UID=00000bf9063d568a:T=1680255813:RT=1681637574:S=ALNI_MaBc6WEHoL3OLd0p7Oa6q3e7Z9SWQ; bc.visitor_token=7047503683635204096; canary=never; _gid=GA1.3.837312329.1681637573; _gat_UA-708051-1=1; _hjIncludedInSessionSample_266550=0; _hjSession_266550=eyJpZCI6ImIzOTI5NjMyLTJkNDAtNDVkNi1iYjE3LWJjOWQ3Yjg4YWI4ZiIsImNyZWF0ZWQiOjE2ODE2Mzc1NzQzNDYsImluU2FtcGxlIjpmYWxzZX0=; _hjAbsoluteSessionInProgress=0',
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
}
ids_list = get_ad_ids_list(headers= headers)


20
40
60
80
100


In [17]:
print(f"total ads :{len(ids_list)}")
# ids_list


total ads :0


In [None]:
headers = [ {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299','Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',     'Accept-Language': 'en-US,en;q=0.5',     'Referer': 'https://www.google.com/',     'Connection': 'keep-alive'},   
           {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0',     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',     'Accept-Language': 'en-US,en;q=0.5',     'Referer': 'https://www.google.com/',     'Connection': 'keep-alive'},  
           {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36','Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',     'Accept-Language': 'en-US,en;q=0.5',     'Referer': 'https://www.google.com/',     'Connection': 'keep-alive'},  
           {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',     'Accept-Language': 'en-us',     'Referer': 'https://www.google.com/',     'Connection': 'keep-alive'},   
           {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36','Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',     'Accept-Language': 'en-US,en;q=0.5',     'Referer': 'https://www.google.com/',  'Connection': 'keep-alive'}]
cookies = {

}

headers = {'Accept': '*/*',
           'Connection': 'keep-alive',
           'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0",
           'Accept-Encoding': 'gzip, deflate, br', 'Accept-Language': 'en-US;q=0.5,en;q=0.3',
           'Cache-Control': 'max-age=0',
           'Upgrade-Insecure-Requests': '1',
           'Referer': 'https://google.com'}

df = get_details(ids_list , headers=headers, cookies=cookies)
df

In [None]:
df2  = df.dropna(subset=['Price']).reset_index(drop=True)
df2.shape

In [None]:
#(935, 27)

In [None]:

headers = {'Accept': '*/*',
           'Connection': 'keep-alive',
           'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0",
           'Accept-Encoding': 'gzip, deflate, br', 'Accept-Language': 'en-US;q=0.5,en;q=0.3',
           'Cache-Control': 'max-age=0',
           'Upgrade-Insecure-Requests': '1',
           'Referer': 'https://google.com'}

user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8"
    ]

response = requests.get(f'https://www.yad2.co.il/' , headers=None)
soup = BeautifulSoup(response.content, 'html5lib')
print(soup.find('title').text)
headers = [ {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299','Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',     'Accept-Language': 'en-US,en;q=0.5',     'Referer': 'https://www.google.com/',     'Connection': 'keep-alive'},   
           {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0',     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',     'Accept-Language': 'en-US,en;q=0.5',     'Referer': 'https://www.google.com/',     'Connection': 'keep-alive'},  
           {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36','Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',     'Accept-Language': 'en-US,en;q=0.5',     'Referer': 'https://www.google.com/',     'Connection': 'keep-alive'},  
           {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',     'Accept-Language': 'en-us',     'Referer': 'https://www.google.com/',     'Connection': 'keep-alive'},   
           {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36','Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',     'Accept-Language': 'en-US,en;q=0.5',     'Referer': 'https://www.google.com/',  'Connection': 'keep-alive'}]


In [3]:
def generate_http_headers():
    user_agents = generate_user_agent()
    accept_types = ["text/html", "application/json", "application/xml", "text/plain"]
    accept_encodings = ["gzip", "deflate", "br"]
    language_codes = ["en-US", "en-GB", "fr-FR", "es-ES", "de-DE"]
    
    headers = {
        'Connection': 'keep-alive',
        "User-Agent": user_agents,
        "Accept": random.choice(accept_types),
        "Accept-Encoding": random.choice(accept_encodings),
        "Accept-Language": random.choice(language_codes),
        "Cache-Control": "no-cache"
    }
    
    return headers


def generate_user_agent():
    # Randomly choose parameters for user agent
    NAVIGATOR = ['chrome', 'firefox'] 
    OS = ['mac', 'linux']
    DEVICE_TYPE = ['desktop', 'smartphone']
    
    navigator = random.choice(NAVIGATOR)
    os = random.choice(OS)
    device_type = random.choice(DEVICE_TYPE)

    # Generate user agent string with chosen parameters
    ua = user_agent.generate_user_agent(os=os, navigator=navigator, device_type=device_type)
    return ua



In [246]:
# df2.to_csv('Data/.csv')

In [9]:
response = requests.get(f'https://www.yad2.co.il/api/item/cirybsfg')
if 'ShieldSquare Captcha' in response.text:
    print(f"(more_details) Captcha item_id:")
    
response_json = response.json()
soup = BeautifulSoup(response.content, 'html5lib')



(more_details) Captcha item_id:


JSONDecodeError: Expecting value: line 1 column 1 (char 0)