In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm

In [2]:
website = "https://www.immobiliare.it/affitto-case/torino"

In [29]:
def get_pages(main):
    try:
        soup = connect(main)
        max = soup.find_all("span", class_="pagination__label")
        last_page = int(max[-1].contents[0])
        pages = [main]
        
        for n in range(2,last_page):    
            page_num = "/?pag={}".format(n)
            pages.append(main + page_num)
    except:
        pages = [main]
        
    return pages

def connect(web_addr):
    resp = requests.get(web_addr)
    return BeautifulSoup(resp.content, "html.parser")
    

def create_df(offers):
    price = []
    rooms = []
    surface = []
    bathrooms = []
    floor = []
    
    for offer in offers:
        l = list(offer.stripped_strings)
        
        if "€" in l[0]:
            stripped = l[0].replace("€ ", "").replace(".","")
            price.append(stripped)
        else:
            price.append(None)
            
        if "locali" in l:
            r = l.index("locali")-1
            rooms.append(l[r])
        else:
            rooms.append(None)
            
        if "m" in l:
            s = l.index("m")-1
            surface.append(l[s])
        else:
            surface.append(None)
            
        if "bagni" in l:
            b = l.index("bagni")-1
            bathrooms.append(l[b])
        else:
            bathrooms.append(None)
            
        if "piano" in l:
            fl = l.index("piano")-1
            floor.append(l[fl])
        else:
            floor.append(None)
                    
    return pd.DataFrame.from_dict({"Price": price, "Rooms": rooms, "Surface": surface, "Bathrooms": bathrooms, "Floor": floor})
    
def collect(url):
    pages = get_pages(url)
    df = pd.DataFrame(columns=["Price", "Rooms", "Surface", "Bathrooms", "Floor"])
    
    for page in tqdm(pages):
        soup = connect(page)
        offers = soup.find_all("ul", class_="listing-features list-piped")
        data = create_df(offers)
        df = df.append(data, ignore_index=True)
    
    df['Zona'] = url.rsplit('/', 1)[-1]

    return df

def get_areas(website):
    data = connect(website)
    areas = []
    for ultag in data.find_all('ul', {'class': 'breadcrumb-list breadcrumb-list_list thebigonelist--mouse'}):
        for litag in ultag.find_all('li'):
            for i in range(len(litag.text.split(','))):
                areas.append(litag.text.split(',')[i])
    areas = [x.strip() for x in areas]
    urls = []
    
    for area in areas:
        url = website + '/' + area.replace(' ','-').lower()
        urls.append(url)
    
    return urls

In [30]:
urls = get_areas(website)
data = pd.DataFrame()
for url in urls:
    print(url.rsplit('/', 1)[-1])
    dati = collect(url)
    data = data.append(dati)

#dati.to_csv('output.csv', sep=',', decimal='.')

centro


100%|██████████| 24/24 [00:54<00:00,  2.29s/it]


crocetta


100%|██████████| 9/9 [00:25<00:00,  2.88s/it]


san-secondo


100%|██████████| 3/3 [00:05<00:00,  1.77s/it]


cavoretto


100%|██████████| 1/1 [00:01<00:00,  1.54s/it]


gran-madre


100%|██████████| 1/1 [00:01<00:00,  1.70s/it]


colle-della-maddalena


100%|██████████| 1/1 [00:02<00:00,  2.02s/it]


superga


100%|██████████| 1/1 [00:03<00:00,  3.27s/it]


borgo-san-paolo


100%|██████████| 1/1 [00:01<00:00,  1.29s/it]


cenisia


100%|██████████| 6/6 [00:11<00:00,  1.96s/it]


lingotto


100%|██████████| 4/4 [00:11<00:00,  2.84s/it]


nizza-millefonti


100%|██████████| 2/2 [00:04<00:00,  2.05s/it]


regio-parco


100%|██████████| 2/2 [00:05<00:00,  2.57s/it]


vanchiglia


100%|██████████| 3/3 [00:05<00:00,  1.75s/it]


vanchiglietta


100%|██████████| 2/2 [00:05<00:00,  2.60s/it]


aurora


100%|██████████| 6/6 [00:12<00:00,  2.09s/it]


barriera-di-milano


100%|██████████| 3/3 [00:07<00:00,  2.49s/it]


rebaudengo


100%|██████████| 1/1 [00:01<00:00,  1.77s/it]


barriera-di-lanzo


100%|██████████| 1/1 [00:01<00:00,  1.69s/it]


falchera


100%|██████████| 1/1 [00:03<00:00,  3.28s/it]


barca


100%|██████████| 1/1 [00:01<00:00,  1.72s/it]


bertolla


100%|██████████| 1/1 [00:01<00:00,  1.54s/it]


borgo-vittoria


100%|██████████| 2/2 [00:05<00:00,  2.82s/it]


parco-dora


100%|██████████| 1/1 [00:01<00:00,  1.79s/it]


le-vallette


100%|██████████| 1/1 [00:01<00:00,  1.61s/it]


lucento


100%|██████████| 1/1 [00:03<00:00,  3.42s/it]


madonna-di-campagna


100%|██████████| 2/2 [00:02<00:00,  1.48s/it]


pozzo-strada


100%|██████████| 5/5 [00:11<00:00,  2.29s/it]


parella


100%|██████████| 5/5 [00:10<00:00,  2.08s/it]


santa-rita


100%|██████████| 7/7 [00:16<00:00,  2.30s/it]


mirafiori-nord


100%|██████████| 2/2 [00:04<00:00,  2.47s/it]


campidoglio


100%|██████████| 2/2 [00:03<00:00,  1.71s/it]


san-donato


100%|██████████| 3/3 [00:06<00:00,  2.20s/it]


cit-turin


100%|██████████| 1/1 [00:01<00:00,  1.91s/it]


madonna-del-pilone


100%|██████████| 1/1 [00:02<00:00,  2.76s/it]


sassi


100%|██████████| 1/1 [00:01<00:00,  1.64s/it]


mirafiori-sud


100%|██████████| 3/3 [00:05<00:00,  1.87s/it]


san-salvario


100%|██████████| 9/9 [00:18<00:00,  2.08s/it]


In [32]:
data.to_csv('output.csv', sep=',', decimal='.')

In [27]:
data.append(dati)

Unnamed: 0,Area,Bathrooms,Floor,Price,Rooms,Surface,Zona
0,,1,4,650,2,55,san-salvario
1,,1,T,280,1,30,san-salvario
2,,1,2,330,1,33,san-salvario
3,,1,4,270,1,15,san-salvario
4,,1,T,600,2,45,san-salvario
5,,1,1,450,1,35,san-salvario
6,,1,T,400,1,35,san-salvario
7,,1,5,750,3,75,san-salvario
8,,1,A,360,1,30,san-salvario
9,,1,2,350,2,35,san-salvario


In [28]:
data