# Presets

In [None]:
import argparse
import json
import math
import os
import time
from urllib.error import HTTPError
from urllib.parse import ParseResult
import re
from urllib.parse import urlparse
from urllib.request import Request
from urllib.request import urlopen
from urllib.request import urlretrieve
from bs4 import BeautifulSoup as bs
import zlib
from time import sleep

# Functions

In [None]:
def download_page(pageid):
    """Download one page of the Warsaw flats-for-sale search results.

    Args:
        pageid: Page number to request; it is appended to the ``page=``
            query parameter. The page size is fixed at 24 offers by the
            ``limit=24`` parameter in the URL.

    Returns:
        The result of ``soup.find('script', type="application/json")`` for
        the downloaded page, i.e. the first JSON ``<script>`` element.
        Callers read its ``.string`` attribute to get the JSON text.

    Raises:
        urllib.error.HTTPError: propagated from ``urlopen`` on HTTP errors.
    """
    url = (
        'https://www.otodom.pl/pl/wyniki/sprzedaz/mieszkanie/mazowieckie/'
        'warszawa/warszawa/warszawa?limit=24&page='
        f'{pageid}'
    )
    # A browser-like User-Agent is sent instead of urllib's default one.
    req = Request(url, headers={'User-Agent': "Mozilla/5.0"})
    # Close the HTTP response deterministically instead of leaving the
    # socket open until garbage collection.
    with urlopen(req) as response:
        sauce = response.read()
    soup = bs(sauce, 'html5lib')
    return soup.find('script', type="application/json")

In [None]:
def download_desc(url):
    """Download a single offer page and return its embedded JSON element.

    Args:
        url: Absolute URL of the offer page.

    Returns:
        The result of ``soup.find('script', type="application/json")`` for
        the downloaded page, i.e. the first JSON ``<script>`` element.
        Callers read its ``.string`` attribute to get the JSON text.

    Raises:
        urllib.error.HTTPError: propagated from ``urlopen`` on HTTP errors.
    """
    # A browser-like User-Agent is sent instead of urllib's default one.
    req = Request(url, headers={'User-Agent': "Mozilla/5.0"})
    # Close the HTTP response deterministically instead of leaving the
    # socket open until garbage collection.
    with urlopen(req) as response:
        sauce = response.read()
    soup = bs(sauce, 'html5lib')
    return soup.find('script', type="application/json")

In [None]:
def get_n_offers(offers):
    """Return the total number of offers announced by a results page.

    The count is read from the page description found at
    ``props -> pageProps -> pageDescription`` of the embedded JSON, which
    has the form ``"Zobacz <N> ogłoszeń o mieszkania na sprzedaż ..."``.
    The first number in that text is taken as the count, so the result no
    longer depends on the exact wording around it, and a count written
    with space (or non-breaking space) thousands separators is accepted.

    Args:
        offers: Object whose ``string`` attribute holds the JSON text of
            the results page (as returned by ``download_page``).

    Returns:
        The number of offers as an ``int``.

    Raises:
        ValueError: if the description contains no number.
    """
    description = json.loads(offers.string)['props']['pageProps']['pageDescription']
    # Either a thousands-grouped number ("12 345") or a plain digit run.
    match = re.search(r"\d{1,3}(?:[ \u00a0]\d{3})+(?!\d)|\d+", description)
    if match is None:
        raise ValueError(
            f"no offer count found in page description: {description!r}"
        )
    return int(re.sub(r"[ \u00a0]", "", match.group()))

In [None]:
def get_offers(offers):
    """Extract the offer previews of one results page as flat dicts.

    The listing items are read from
    ``props -> pageProps -> data -> searchAds -> items`` of the embedded
    JSON. Each item is reduced in place: a few nested fields are flattened
    into new top-level keys and the fields listed in ``dropped_keys`` are
    removed. Any other key of the item is kept untouched.

    Keys added to every item:
        loc:         ``"<street name>, <street number>"`` or ``None`` when
                     the item has no street.
        loc_label:   ``locationLabel['value']`` as a string.
        desc:        ``seo['details']['description']`` as a string.
        price:       ``totalPrice['value']`` or ``None`` when there is no
                     total price.
        url:         offer URL built from the item's ``slug``.
        agency_name: ``agency['name']`` or ``None`` when there is no agency.

    Args:
        offers: Object whose ``string`` attribute holds the JSON text of
            the results page (as returned by ``download_page``).

    Returns:
        The list of reduced offer dicts (empty if the page has no items).

    Raises:
        KeyError: if a field that is read (``location``, ``locationLabel``,
            ``seo``, ``totalPrice``, ``slug``, ``agency``) is missing.
    """
    # Fields that are discarded, either because they were flattened above
    # or because they are not used. They are removed with a default so an
    # item that lacks one of them no longer aborts the whole page.
    dropped_keys = (
        'location', 'locationLabel', 'images', 'openDays', 'rentPrice',
        'priceFromPerSquareMeter', 'investmentState',
        'investmentUnitsRoomsNumber', 'investmentEstimatedDelivery',
        'investmentUnitsNumber', 'seo', 'totalPossibleImages', '__typename',
        'pushedUpAt', 'investmentUnitsAreaInSquareMeters', 'peoplePerRoom',
        'totalPrice', 'pricePerSquareMeter', 'slug', 'agency',
        'specialOffer', 'transaction',
    )

    offers_lst = json.loads(offers.string)['props']['pageProps']['data']['searchAds']['items']

    # One pass per offer instead of one throwaway list per field.
    for offer in offers_lst:
        street = offer['location']['address']['street']
        offer['loc'] = (
            str(street['name'] + ', ' + street['number'])
            if street is not None else None
        )
        offer['loc_label'] = str(offer['locationLabel']['value'])
        offer['desc'] = str(offer['seo']['details']['description'])

        total_price = offer['totalPrice']
        offer['price'] = total_price['value'] if total_price is not None else None

        offer['url'] = 'https://www.otodom.pl/pl/oferta/' + offer['slug']

        agency = offer['agency']
        offer['agency_name'] = agency['name'] if agency is not None else None

        for key in dropped_keys:
            offer.pop(key, None)

    return offers_lst

In [None]:
def create_overall_lst(sleep_sec=1):
    """Download every results page and return all offer previews.

    The first page is fetched to learn the total number of offers; the
    remaining pages are then requested one by one, pausing ``sleep_sec``
    seconds after each request.

    Args:
        sleep_sec: Seconds to wait after each page request from page 2 on.

    Returns:
        A list with the reduced offer dicts (see ``get_offers``) of all
        pages, in page order.
    """
    # Must match the ``limit=24`` query parameter used by download_page.
    page_size = 24

    first_page = download_page(1)
    n_offers = get_n_offers(first_page)
    full_list = list(get_offers(first_page))

    # Round up: a trailing, partially filled page still has to be fetched.
    # (Flooring the quotient silently dropped the last page of offers.)
    n_pages = math.ceil(n_offers / page_size)

    for page_no in range(2, n_pages + 1):
        full_list.extend(get_offers(download_page(page_no)))
        sleep(sleep_sec)
    return full_list

In [None]:
def get_clear_desc(desc):
    """Flatten the detail data of one offer page into a single-level dict.

    The offer is read from ``props -> pageProps -> ad`` of the embedded
    JSON. Only the keys in ``keys_to_keep`` are taken over; nested
    structures are then flattened into top-level keys and removed.

    Resulting keys (besides ``description`` and ``features``):
        <characteristic key>: one entry per item of ``characteristics``.
        lat, lon:     coordinates of the location.
        district:     district name, or ``None`` when there is no district.
        features2:    joined ``featuresWithoutCategory``.
        p_*:          fields of ``property`` (``''`` when it is ``None``).
        owner_type, owner_name: fields of ``owner``.
        object_name:  the former ``developmentTitle``.
        pprice:       the ``price`` characteristic, ``None`` when absent.

    List-valued fields (``features``, ``features2``, ``p_parking``,
    ``p_rooms``, ``p_bwindows``) are joined with ``', '``; ``description``
    is converted from HTML to plain text.

    Args:
        desc: Object whose ``string`` attribute holds the JSON text of the
            offer page (as returned by ``download_desc``).

    Returns:
        The flattened dict, or ``{}`` when the page data has no ``'ad'``.

    Raises:
        KeyError: if one of the kept keys or a nested field that is read
            is missing from the ad.
    """
    def join_labels(values):
        # None and '' both stand for "nothing listed" and give ''.
        return ', '.join(values) if values else ''

    # Parse the JSON text once (it used to be parsed twice).
    page_props = json.loads(desc.string)['props']['pageProps']
    if 'ad' not in page_props:
        return {}
    ad = page_props['ad']

    keys_to_keep = (
        'developmentTitle', 'description', 'location', 'characteristics',
        'features', 'featuresWithoutCategory', 'property', 'owner',
    )
    offer_detailed = {
        key: value for key, value in ad.items() if key in keys_to_keep
    }

    # Promote every characteristic to a top-level key.
    for item in offer_detailed['characteristics']:
        offer_detailed[item['key']] = item['value']

    location = offer_detailed['location']
    offer_detailed['lat'] = location['coordinates']['latitude']
    offer_detailed['lon'] = location['coordinates']['longitude']
    district = location['address']['district']
    offer_detailed['district'] = district['name'] if district is not None else None

    offer_detailed['features'] = join_labels(offer_detailed['features'])
    offer_detailed['features2'] = join_labels(offer_detailed['featuresWithoutCategory'])

    # Output key -> source key inside property['properties'] and
    # property['buildingProperties'] respectively.
    unit_fields = {
        'p_kitchen': 'kitchen',
        'p_parking': 'parking',
        'p_rooms': 'rooms',
        'p_type': 'type',
        'p_windows': 'windowsOrientation',
    }
    building_fields = {
        'p_btype': 'type',
        'p_bmaterial': 'material',
        'p_bwindows': 'windows',
        'p_bheating': 'heating',
    }
    # Removed in both branches so every result has the same set of keys
    # (a stray 'property': None used to survive when it was missing).
    prop = offer_detailed.pop('property')
    if prop is not None:
        for target, source in unit_fields.items():
            offer_detailed[target] = prop['properties'][source]
        for target, source in building_fields.items():
            offer_detailed[target] = prop['buildingProperties'][source]
    else:
        for target in (*unit_fields, *building_fields):
            offer_detailed[target] = ''

    owner = offer_detailed['owner']
    offer_detailed['owner_type'] = owner['type']
    offer_detailed['owner_name'] = owner['name']
    offer_detailed['object_name'] = offer_detailed['developmentTitle']
    # 'price' is not among keys_to_keep, so it can only come from the
    # characteristics; an offer without that characteristic gets None
    # instead of raising KeyError.
    offer_detailed['pprice'] = offer_detailed.pop('price', None)

    for key in ('location', 'featuresWithoutCategory', 'owner',
                'developmentTitle', 'characteristics'):
        offer_detailed.pop(key)

    for key in ('p_parking', 'p_rooms', 'p_bwindows'):
        offer_detailed[key] = join_labels(offer_detailed[key])

    # The description is HTML; keep only its text content.
    offer_detailed['description'] = bs(offer_detailed['description'], "lxml").text
    return offer_detailed

# Get full previews list

In [None]:
# Fetch the first results page: it yields the first batch of previews and
# the total offer count from which the number of pages is derived.
pageid = 1
offers = download_page(pageid)
n_offers = get_n_offers(offers)
offers_lst = get_offers(offers)
full_list = []
full_list.extend(offers_lst)
# 24 offers per page (the ``limit=24`` parameter in download_page). Round
# up so a trailing, partially filled page is not dropped, which flooring
# the quotient did.
n_pages = math.ceil(n_offers / 24)

In [None]:
# Slow: one request plus a 2-second pause per page (the original note for
# this cell says it takes around 22 min).
# Page 1 is already in full_list, so continue from page 2. The loop used to
# start at a hard-coded page 370, which skips pages 2-369 on a fresh run.
# To resume an interrupted run, raise the start to the first page that has
# not been fetched yet.
for i in range(2, n_pages + 1):
    offers = download_page(i)
    offers_lst = get_offers(offers)
    full_list.extend(offers_lst)
    sleep(2)

In [None]:
# Sanity check: number of offer previews collected so far.
len(full_list)

In [None]:
# Save the collected previews. Create the target directory first so the
# dump does not fail with FileNotFoundError on a fresh checkout.
os.makedirs('data', exist_ok=True)
with open('data/data.json', 'w', encoding='utf-8') as f:
    json.dump(full_list, f)

# Update each offer with its detail-page data

In [None]:
# Optional: reload the previews saved above instead of scraping them again.
#with open('data/data.json') as f:
#    full_list = json.load(f)

In [None]:
# Enrich every preview in full_list with the data of its own offer page.
# Iteration starts at index 0: the loop used to start at 1, so the first
# offer never received its details.
os.makedirs('data', exist_ok=True)
for i, preview in enumerate(full_list):
    print(i)  # progress indicator
    url = preview['url']
    # Only the download can raise HTTPError, so only it sits in the try.
    try:
        desc = download_desc(url)
    except HTTPError as e:
        if e.code == 308:
            # Redirected offer page: leave this preview as it is and go on
            # (presumably the offer has moved or is gone - TODO confirm).
            continue
        # Any other HTTP error stops the run; the file written after the
        # previous offer holds everything gathered so far.
        print(f"HTTP Error {e.code}: {e.reason}")
        break
    desc_clear = get_clear_desc(desc)
    # Detail keys override preview keys of the same name.
    full_list[i] = {**preview, **desc_clear}
    # The whole list is rewritten after every offer, so an interrupted run
    # keeps the progress made up to that point.
    with open('data/data2.json', 'w', encoding='utf-8') as f:
        json.dump(full_list, f)
    sleep(2)