In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
#from tqdm import tqdm_notebook as tqdm
from tqdm.notebook import tqdm as tqdm
import csv
from joblib import Parallel, delayed
import multiprocessing

In [2]:
def get_pages(main):
    try:
        soup = connect(main)
        n_pages = [_.get_text(strip=True) for _ in soup.find('ul', {'class': 'pagination pagination__number'}).find_all('li')]
        #max = soup.find_all("span", class_="pagination__number")
        last_page = int(n_pages[-1])
        pages = [main]
        
        for n in range(2,last_page+1):    
            page_num = "/?pag={}".format(n)
            pages.append(main + page_num)
    except:
        pages = [main]
        
    return pages

def connect(web_addr):
    resp = requests.get(web_addr)
    return BeautifulSoup(resp.content, "html.parser")
    

def get_areas(website):
    data = connect(website)
    areas = []
    for ultag in data.find_all('ul', {'class': 'breadcrumb-list breadcrumb-list_list breadcrumb-list__related'}):
        for litag in ultag.find_all('li'):
            for i in range(len(litag.text.split(','))):
                areas.append(litag.text.split(',')[i])
    areas = [x.strip() for x in areas]
    urls = []
    
    for area in areas:
        url = website + '/' + area.replace(' ','-').lower()
        urls.append(url)
    
    return areas, urls

def get_apartment_links(website):
    data = connect(website)
    links = []
    for link in data.find_all('ul', {'class': 'annunci-list'}):
        for litag in link.find_all('li'):
            try:
                links.append(litag.a.get('href'))
            except:
                continue
    return links

def scrape_link(website):
    data = connect(website)
    info = data.find_all('dl', {'class': 'im-features__list'})
    comp_info = pd.DataFrame()
    cleaned_id_text = []
    cleaned_id__attrb_text = []
    for n in range(len(info)):
        for i in info[n].find_all('dt'):
            cleaned_id_text.append(i.text)
        for i in info[n].find_all('dd'):
            cleaned_id__attrb_text.append(i.text)

    comp_info['Id'] = cleaned_id_text
    comp_info['Attribute'] = cleaned_id__attrb_text
    comp_info
    feature = []
    for item in comp_info['Attribute']:
        try:
            feature.append(clear_df(item))
        except:
            feature.append(ultra_clear_df(item))

    comp_info['Attribute'] = feature
    return comp_info['Id'].values, comp_info['Attribute'].values
    

def remove_duplicates(x):
    return list(dict.fromkeys(x))

def clear_df(the_list):
    the_list = (the_list.split('\n')[1].split('  '))
    the_list = [value for value in the_list if value != ''][0]
    return the_list

def ultra_clear_df(the_list):
    the_list = (the_list.split('\n\n')[1].split('  '))
    the_list = [value for value in the_list if value != ''][0]
    the_list = (the_list.split('\n')[0])
    return the_list

In [3]:
## Main website link for city
## Get all areas inside the city (districts)

website = "https://www.immobiliare.it/affitto-case/torino"
areas, districts = get_areas(website)
print("Those are district's links \n")
print(districts)

Those are district's links 

['https://www.immobiliare.it/affitto-case/torino/aurora', 'https://www.immobiliare.it/affitto-case/torino/barriera-di-milano', 'https://www.immobiliare.it/affitto-case/torino/rebaudengo', 'https://www.immobiliare.it/affitto-case/torino/barriera-di-lanzo', 'https://www.immobiliare.it/affitto-case/torino/falchera', 'https://www.immobiliare.it/affitto-case/torino/barca', 'https://www.immobiliare.it/affitto-case/torino/bertolla', 'https://www.immobiliare.it/affitto-case/torino/borgo-san-paolo', 'https://www.immobiliare.it/affitto-case/torino/cenisia', 'https://www.immobiliare.it/affitto-case/torino/borgo-vittoria', 'https://www.immobiliare.it/affitto-case/torino/parco-dora', 'https://www.immobiliare.it/affitto-case/torino/campidoglio', 'https://www.immobiliare.it/affitto-case/torino/san-donato', 'https://www.immobiliare.it/affitto-case/torino/cit-turin', 'https://www.immobiliare.it/affitto-case/torino/cavoretto', 'https://www.immobiliare.it/affitto-case/torino/

## Scrape cycle initialization

In [9]:
## First of all we need to find all announces' links, in order to scrape informations inside them one by one

address = []
location = []

try:
    for url in tqdm(districts):
        pages = get_pages(url)
        for page in pages:
            add = get_apartment_links(page)
            address.append(add)
            for num in range(0,len(add)):
                location.append(url.rsplit('/', 1)[-1])
except Exception as e:
    print(e)
        
announces_links = [item for value in address for item in value]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=37.0), HTML(value='')))




In [10]:
## Check that what you scraped has a meaning...and save it

print("The numerosity of announces:\n")
print(len(announces_links))
with open('announces_list.csv', 'w') as myfile:
    wr = csv.writer(myfile)
    wr.writerow(announces_links)

The numerosity of announces:

5191


## Proper announce scraping and dataset creation 

In [11]:
## Now we pass all announces' links do the scrape_link function to obtain apartments' informations 

df_scrape = pd.DataFrame()
to_be_dropped = []
counter = 0
for link in tqdm(list(announces_links)):
    counter=counter+1
    try:
        names, values = scrape_link(link)
        temp_df = pd.DataFrame(columns=names)
        temp_df.loc[len(temp_df), :] = values[0:len(names)]
        df_scrape = df_scrape.append(temp_df, sort=False)
    except Exception as e:
        print(e)
        to_be_dropped.append(counter)
        print(to_be_dropped)
        continue

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5191.0), HTML(value='')))




In [12]:
## Eventually save useful informations odtained during the scrape process

pd.DataFrame(location).to_csv('location.csv', sep=';')
pd.DataFrame(to_be_dropped).to_csv('to_be_dropped.csv', sep=';')

In [13]:
## Eventually drop announces that reported an error during the scraping process

to_be_dropped.sort(reverse=True)

for index in to_be_dropped:
    del location[index-1]
for index in to_be_dropped:
    del announces_links[index-1]

In [22]:
## Check df size to see if we have truly collected info and save everything

print(df_scrape.shape)
df_scrape['district'] = location
df_scrape['links'] = announces_links
df_scrape.columns = map(str.lower, df_scrape.columns)
df_scrape.to_csv('dataset.csv', sep=";")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_scrape['district'] = location

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_scrape['links'] = announces_links



(5191, 16)


In [23]:
df_scrape = pd.read_csv('dataset.csv', sep=';')

In [24]:
df_scrape = df_scrape[['contratto', 'district', 'tipologia', 'superficie', 'locali', 'piano', 'tipo proprietà', 'prezzo', 'spese condominio', 'spese aggiuntive', 'anno di costruzione', 'stato', 'riscaldamento', 'climatizzazione', 'posti auto', 'links']] #'classe energetica'

In [25]:
## Unfortunately in most cases informations obtained from direct scrape are a litte bit dirty.
## I created a function that cleans my dataset. I did it in a really empirical way, 
## in order to obtain rows that can be cataloged by a specific dtype.

def cleanup(df):
    price = []
    rooms = []
    surface = []
    bathrooms = []
    floor = []
    contract = []
    tipo = []
    condominio = []
    heating = []
    built_in = []
    state = []
    riscaldamento = []
    cooling = []
    energy_class = []
    tipologia = []
    pr_type = []
    arredato = []
    
    for tipo in df['tipologia']:
        try:
            tipologia.append(tipo)
        except:
            tipologia.append(None)
    
    for superficie in df['superficie']:
        try:
            if "m" in superficie:
                #z = superficie.split('|')[0]
                s = superficie.replace(" m²", "")
                surface.append(s)
        except:
            surface.append(None)
    
    for locali in df['locali']:
        try:
            rooms.append(locali[0:1])
        except:
            rooms.append(None)
    
    for prezzo in df['prezzo']:
        try:
            price.append(prezzo.replace("Affitto ", "").replace("€ ", "").replace("/mese", "").replace(".",""))
        except:
            price.append(None)
            
    for contratto in df['contratto']:
        try:
            contract.append(contratto.replace("\n ",""))
        except:
            contract.append(None)
    
    for piano in df['piano']:
        try:
            floor.append(piano.split(' ')[0])
        except:
            floor.append(None)
    
    for tipologia in df['tipo proprietà']:
        try:
            pr_type.append(tipologia.split(',')[0])
        except:
            pr_type.append(None)
            
    for condo in df['spese condominio']:
        try:
            if "mese" in condo:
                condominio.append(condo.replace("€ ","").replace("/mese",""))
            else:
                condominio.append(None)
        except:
            condominio.append(None)
        
    for ii in df['spese aggiuntive']:
        try:
            if "anno" in ii:
                mese = int(int(ii.replace("€ ","").replace("/anno","").replace(".",""))/12)
                heating.append(mese)
            else:
                heating.append(None)
        except:
            heating.append(None)
   
    for anno_costruzione in df['anno di costruzione']:
        try:
            built_in.append(anno_costruzione)
        except:
            built_in.append(None)
    
    for stato in df['stato']:
        try:
            stat = stato.replace(" ","").lower()
            state.append(stat)
        except:
            state.append(None)
    
    for tipo_riscaldamento in df['riscaldamento']:
        try:
            if 'Centralizzato' in tipo_riscaldamento:
                riscaldamento.append('centralizzato')
            elif 'Autonomo' in tipo_riscaldamento:
                riscaldamento.append('autonomo')
        except:
            riscaldamento.append(None)
    
    for clima in df['climatizzazione']:
        try:
            cooling.append(clima.lower().split(',')[0])
        except:
            cooling.append('None')
    
    final_df = pd.DataFrame(columns=['contract', 'district', 'renting_type', 'surface', 'locals', 'floor', 'property_type', 'price', 'spese condominio', 'other_expences', 'building_year', 'status', 'heating', 'air_conditioning', 'energy_certificate', 'parking_slots'])#, 'Arredato S/N'])
    final_df['contract'] = contract
    final_df['renting_type'] = tipologia
    final_df['surface'] = surface
    final_df['locals'] = rooms
    final_df['floor'] = floor
    final_df['property_type'] = pr_type
    final_df['price'] = price
    final_df['spese condominio'] = condominio
    final_df['heating_expences'] = heating
    final_df['building_year'] = built_in
    final_df['status'] = state
    final_df['heating_system'] = riscaldamento
    final_df['air_conditioning'] = cooling
    #final_df['classe energetica'] = energy_class
    final_df['district'] = df['district'].values
    #inal_df['Arredato S/N'] = arredato
    final_df['announce_link'] = announces_links
    
    return final_df

In [26]:
## Save the cleaned dataset which is the fruit of your work.

final = cleanup(df_scrape)
final.to_csv('regression_dataset.csv', sep=";")

In [27]:
announces_links = pd.read_csv('announces_list.csv').T.index.values