In [1]:
# Data Extraction
import base64
import requests as rq
import json
import pandas as pd

In [None]:
# Function to obtain personalised token

def get_oauth_token(api_key=None, secret=None, timeout=30):
    '''
    Request a personalised OAuth bearer token from the Idealista API.

    Parameters
    ----------
    api_key : str, optional
        API key provided by Idealista. When omitted, the value of the
        IDEALISTA_API_KEY environment variable is used.
    secret : str, optional
        Secret code provided by Idealista. When omitted, the value of the
        IDEALISTA_SECRET environment variable is used.
    timeout : float, optional
        Seconds to wait for the token endpoint before giving up.

    Returns
    -------
    str
        The 'access_token' field of the JSON response.

    Raises
    ------
    ValueError
        If no credentials were passed and the environment variables are unset.
    requests.HTTPError
        If the token endpoint answers with an error status.
    '''
    import os   # Local import: only needed to look the credentials up

    # Credentials are never stored in the notebook itself
    api_key = api_key or os.environ.get("IDEALISTA_API_KEY")
    secret = secret or os.environ.get("IDEALISTA_SECRET")
    if not api_key or not secret:
        raise ValueError(
            "Idealista credentials are missing: pass api_key/secret or set the "
            "IDEALISTA_API_KEY and IDEALISTA_SECRET environment variables."
        )

    message = api_key + ":" + secret   # get personalised message

    auth = "Basic " + base64.b64encode(message.encode("ascii")).decode("ascii")   # Encode the message

    headers_dic = {"Authorization" : auth,
                   "Content-Type" : "application/x-www-form-urlencoded;charset=UTF-8"}   # Define headers

    params_dic = {"grant_type" : "client_credentials",   # Define request params
                  "scope" : "read"}

    r = rq.post("https://api.idealista.com/oauth/token",   # Perform the request with the api url, headers and params
                headers = headers_dic,
                params = params_dic,
                timeout = timeout)

    # Fail with the real HTTP error instead of a KeyError on 'access_token'
    r.raise_for_status()

    return r.json()['access_token']   # Get bearer token from the JSON api response

In [3]:
# The next step will be to define the search URL. This will be achieved by defining the filter parameters of the search 
# and combining them into the resulting URL.

In [4]:
# These are the parameters used to filter the search.

# Base search url
base_url = 'https://api.idealista.com/3.5/'
# Search country (es, it, pt)
country = 'pt'
# Search language (es, it, pt, en, ca)
language = 'pt'
# Max items per call, the maximum set by Idealista is 50
max_items = '50'
# Kind of operation (sale, rent)
operation = 'rent'
# Type of property (homes, offices, premises, garages, bedrooms)
property_type = 'homes'
# Language of the descriptions (Portuguese)
locale = 'pt'
# Coordinates of the search center "Hospital de Santa Maria"
center = '38.74856538317771,-9.159916501020044'
# Max distance from the center "8km" (in order to cover the area of Lisbon)
distance = '8000'
# How to sort the found items
sort = 'desc'

In [5]:
# Function to obtain search url with filters

def define_search_url(page_number):
    '''
    Build the search URL for one page of results.

    The filters are read from the module-level search parameters
    (base_url, country, operation, max_items, locale, center, distance,
    property_type, sort, language).

    Parameters
    ----------
    page_number : int or str
        Value placed in the numPage query parameter.

    Returns
    -------
    str
        The complete search URL, without any embedded whitespace.
    '''
    # Joining the pairs avoids a backslash continuation inside the string,
    # which would copy the next line's indentation into the URL.
    query_params = [
        ("operation", operation),
        ("maxItems", max_items),
        ("locale", locale),
        ("center", center),
        ("distance", distance),
        ("propertyType", property_type),
        ("sort", sort),
        ("numPage", page_number),
        ("language", language),
    ]
    query = "&".join(f"{key}={value}" for key, value in query_params)
    return f"{base_url}{country}/search?{query}"

In [6]:
# Smoke-test the URL builder and keep the resulting URL for inspection
url = define_search_url("test") # "test" is a placeholder standing in for the page number

In [7]:
url

'https://api.idealista.com/3.5/pt/search?operation=rent&maxItems=50&locale=pt&center=38.74856538317771,-9.159916501020044&distance=8000&propertyType=homes&sort=desc&numPage=test&language=pt'

In [8]:
# Create a function in order to do the search

In [9]:
def search_api(url, token=None, timeout=30):
    '''
    Send one search request to the API and return the parsed JSON response.

    Parameters
    ----------
    url : str
        Search URL, including its filters and page number.
    token : str, optional
        Bearer token to authenticate with. When omitted, a new one is
        requested through get_oauth_token(); pass a token to reuse it
        across several pages.
    timeout : float, optional
        Seconds to wait for the API before giving up.

    Returns
    -------
    dict
        The search results decoded from the JSON response body.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    '''
    if token is None:
        token = get_oauth_token()   #  Get the bearer token

    # NOTE(review): the Content-Type value repeats the header name. It is kept
    # exactly as it was (specified in idealista email) -- confirm before changing.
    headers = {'Content-Type': 'Content-Type: multipart/form-data;',
               'Authorization' : 'Bearer ' + token}

    content = rq.post(url, headers = headers, timeout = timeout)   # Perform the search request

    # Surface HTTP errors here instead of returning an error payload
    content.raise_for_status()

    return content.json()   # Decode the JSON body of the response

In [10]:
# Every search URL carries a page number; this is the first search, so page 1 is used.
first_search_url = define_search_url(1)

In [11]:
first_search_url

'https://api.idealista.com/3.5/pt/search?operation=rent&maxItems=50&locale=pt&center=38.74856538317771,-9.159916501020044&distance=8000&propertyType=homes&sort=desc&numPage=1&language=pt'

In [12]:
# Run the search for the first page; the parsed JSON response is kept in `results`
results = search_api(first_search_url)

In [13]:
# show first page results
results

{'elementList': [{'propertyCode': '33240191',
   'thumbnail': 'https://img3.idealista.pt/blur/WEB_LISTING/0/id.pro.pt.image.master/58/83/bd/236971517.jpg',
   'externalReference': '59691',
   'numPhotos': 50,
   'price': 1000.0,
   'propertyType': 'flat',
   'operation': 'rent',
   'size': 96.0,
   'rooms': 4,
   'bathrooms': 1,
   'address': 'Rua Cidade de Moçâmedes s/n',
   'province': 'Lisboa',
   'municipality': 'Camarate - Unhos - Apelação',
   'district': 'Camarate',
   'country': 'pt',
   'latitude': 38.7938568,
   'longitude': -9.1428376,
   'showAddress': True,
   'url': 'https://www.idealista.pt/imovel/33240191/',
   'distance': '5249',
   'description': 'Apartamento T4 Arrendar Camarate. Este apartamento está localizado numa zona tranquila com 4 quartos, completamente mobilado e equipada. Com excelentes acessos a Lisboa e apenas a 5 min do aeroporto. Marque já a sua visita...',
   'hasVideo': False,
   'status': 'good',
   'newDevelopment': False,
   'hasLift': False,
   'pr

In [20]:
# At this point, all the necessary code to extract the data is available, but data extraction is required for
# all the pages of the results. 
# To achieve this, a new function will be created to execute a search for each page number until reaching the last one. For each
# query, it is necessary to save the results in a dataframe and subsequently join the dataframes created from each query into a
# final dataframe.

In [21]:
def extract_data(pagination):
    '''
    Download every results page and combine them into one dataframe.

    One search request is made per page, from page 1 up to and including
    `pagination`.

    Parameters
    ----------
    pagination : int
        Number of the last page to request.

    Returns
    -------
    pandas.DataFrame
        The rows of the 'elementList' of every page, in page order. Each page
        keeps its own row index, so index values repeat across pages. An empty
        dataframe is returned when `pagination` is lower than 1.

    Raises
    ------
    KeyError
        If a response does not contain 'elementList'.
    '''
    # One dataframe per page; they are joined once at the end rather than
    # re-copying the accumulated data on every iteration
    page_frames = []

    # This loop will iterate over all the pages in the search results
    for page_number in range(1, pagination + 1):

        url_page = define_search_url(page_number)   # Add the pagination to the url
        results = search_api(url_page)   # Get the search results
        page_frames.append(pd.DataFrame(results['elementList']))   # Save the results as a dataframe

    # pd.concat rejects an empty list, so the no-pages case is handled here
    if not page_frames:
        return pd.DataFrame()

    return pd.concat(page_frames)

In [14]:
# Check the total number of result pages: the Idealista API has a limitation of 100 search queries/month
# and each page represents one query.

total_pages = results['totalPages']
total_pages

59

In [None]:
# Request every results page (one search per page) and combine them into a single dataframe
df_tot = extract_data(total_pages)

In [29]:
df_tot

Unnamed: 0,propertyCode,thumbnail,externalReference,numPhotos,price,propertyType,operation,size,rooms,bathrooms,...,has360,hasStaging,topNewDevelopment,topPlus,floor,highlight,parkingSpace,neighborhood,labels,newDevelopmentFinished
0,33240191,https://img3.idealista.pt/blur/WEB_LISTING/0/i...,59691,50,1000.0,flat,rent,96.00,4,1,...,False,False,False,False,,,,,,
1,33240181,https://img3.idealista.pt/blur/WEB_LISTING/0/i...,W-02V128,39,1750.0,flat,rent,127.00,2,2,...,False,False,False,False,,,,,,
2,32620556,https://img3.idealista.pt/blur/WEB_LISTING/0/i...,EX/002/2023,10,1250.0,flat,rent,60.00,1,1,...,False,False,False,False,1,{'groupDescription': 'Destaque'},,,,
3,33200256,https://img3.idealista.pt/blur/WEB_LISTING/0/i...,olivais/001/2024,17,1500.0,flat,rent,85.00,1,1,...,True,False,False,False,3,{'groupDescription': 'Destaque'},"{'hasParkingSpace': True, 'isParkingSpaceInclu...",,,
4,33240084,https://img3.idealista.pt/blur/WEB_LISTING/0/i...,APA_1952,16,1500.0,flat,rent,72.45,2,1,...,False,False,False,False,3,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41,33227645,,,0,4800.0,flat,rent,160.00,3,2,...,False,False,False,False,6,,"{'hasParkingSpace': True, 'isParkingSpaceInclu...",,"[{'name': 'luxuryType', 'text': 'De luxo'}]",
42,32994341,,,0,5500.0,flat,rent,280.00,4,3,...,False,False,False,False,5,,"{'hasParkingSpace': True, 'isParkingSpaceInclu...",,"[{'name': 'luxuryType', 'text': 'De luxo'}]",
43,32994662,,,0,5500.0,flat,rent,280.00,4,3,...,False,False,False,False,5,,"{'hasParkingSpace': True, 'isParkingSpaceInclu...",,"[{'name': 'luxuryType', 'text': 'De luxo'}]",
44,33168281,,,0,1600.0,flat,rent,160.00,4,3,...,False,False,False,False,12,,"{'hasParkingSpace': True, 'isParkingSpaceInclu...",,,


In [23]:
# Once all the data is collected, it just needs to be saved as a CSV file. 
# The following function has been created for that purpose:

file_path = 'idealista.csv'   # Destination of the exported CSV file

def df_to_csv(df):
    '''
    Write the given dataframe to the CSV file located at `file_path`.

    The dataframe's index is first turned into a regular column, so it is
    stored as data; the new positional index itself is not written.
    '''
    df.reset_index().to_csv(file_path, index=False)

In [24]:
# Save all the extracted data to the CSV file located at `file_path`.
df_to_csv(df_tot)

In [25]:
# Idealista API has a limitation of 100 search queries/month, so this should be taken into 
# consideration when performing the search queries