In [1]:
!pip install beautifulsoup4
!pip install requests




In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import json

In [9]:
# Listing to scrape.
url = 'https://www.zillow.com/homedetails/229-W-Wood-Rd-Rensselaer-IN-47978/85412995_zpid/'

# Browser-like request headers.
# NOTE(review): 'br' is advertised below, but Requests only decodes Brotli
# responses when a Brotli library (brotli / brotlicffi) is installed. If the
# parsed page looks like binary noise, install one or drop 'br' from the list.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br',
    "Referer": "https://google.com",
    "DNT": "1",
    'Connection': 'keep-alive',
}

# Send the request through the session (it was previously created but never
# used) so the connection and any cookies are reused by later requests.
session = requests.Session()

# A timeout keeps the cell from hanging forever on a stalled connection.
response = session.get(url, headers=headers, timeout=30)

# Fail here, with the HTTP status in the message, instead of parsing an
# error/captcha page and crashing later with an unrelated exception.
response.raise_for_status()

soup = BeautifulSoup(response.content, 'html.parser')

# Lookup table: two-letter USPS code -> full name, covering the 50 states
# plus the District of Columbia.
State_abbreviations = {
    'AL': 'Alabama',
    'AK': 'Alaska',
    'AZ': 'Arizona',
    'AR': 'Arkansas',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticut',
    'DE': 'Delaware',
    'FL': 'Florida',
    'GA': 'Georgia',
    'HI': 'Hawaii',
    'ID': 'Idaho',
    'IL': 'Illinois',
    'IN': 'Indiana',
    'IA': 'Iowa',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisiana',
    'ME': 'Maine',
    'MD': 'Maryland',
    'MA': 'Massachusetts',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'MS': 'Mississippi',
    'MO': 'Missouri',
    'MT': 'Montana',
    'NE': 'Nebraska',
    'NV': 'Nevada',
    'NH': 'New Hampshire',
    'NJ': 'New Jersey',
    'NM': 'New Mexico',
    'NY': 'New York',
    'NC': 'North Carolina',
    'ND': 'North Dakota',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'SD': 'South Dakota',
    'TN': 'Tennessee',
    'TX': 'Texas',
    'UT': 'Utah',
    'VT': 'Vermont',
    'VA': 'Virginia',
    'WA': 'Washington',
    'WV': 'West Virginia',
    'WI': 'Wisconsin',
    'WY': 'Wyoming',
    'DC': 'District of Columbia',
}

# Square feet in one acre, used to normalise lot sizes to square feet.
_SQFT_PER_ACRE = 43560


def _parse_price(text):
    """Return the whole-dollar amount in a string such as "$299,999", or None.

    Cents are ignored. Amounts carrying a magnitude suffix directly after the
    number ("$1.2M", "$450K") are rejected rather than being truncated to
    their leading digits.
    """
    match = re.search(r'\$\s*(\d[\d,]*)(\.\d+)?([KkMm])?', text or '')
    if match is None or match.group(3):
        return None
    return int(match.group(1).replace(',', ''))


def _to_number(token):
    """Convert a numeric string to int when it is whole ("3"), else float ("2.5")."""
    value = float(token)
    return int(value) if value.is_integer() else value


def _find_price(page):
    """Return the listing price as an int, or None when no strategy finds one."""
    # STRATEGY 1: the data-testid attribute on the price span.
    element = page.find('span', {'data-testid': 'price'})
    if element is not None:
        price = _parse_price(element.get_text(strip=True))
        if price is not None:
            return price

    # STRATEGY 2: any <span> whose class starts with "Text-c11n" and whose
    # text is short and starts with '$' (e.g. "$549,900").
    for element in page.find_all('span', class_=re.compile(r'^Text-c11n')):
        text = element.get_text(strip=True)
        if text.startswith('$') and len(text) < 20:
            price = _parse_price(text)
            if price is not None:
                return price

    # STRATEGY 3: any <div>/<span> with "price" (case-insensitive) in its
    # class name and a short text that contains a dollar amount.
    for element in page.find_all(['div', 'span'], class_=re.compile(r'price', re.I)):
        text = element.get_text(strip=True)
        if '$' in text and len(text) < 20:
            price = _parse_price(text)
            if price is not None:
                return price

    # STRATEGY 4: look for "price":549900, or "amount":549900, in the raw
    # markup. The document is serialised once rather than once per pattern.
    raw_markup = str(page)
    for pattern in (r'"price":(\d+),', r'"amount":(\d+),'):
        match = re.search(pattern, raw_markup)
        if match:
            return int(match.group(1))

    # Nothing matched: report "unknown" instead of raising, so the remaining
    # fields of the listing can still be returned.
    return None


def _find_bed_bath_sqft(page):
    """Return [beds, baths, living-area sqft]; each entry is a number or 'N/A'."""
    # A separator keeps neighbouring text nodes apart, so digits from one
    # element (for example the price) cannot fuse with the bed count of the
    # next. Thousands separators are dropped so "1,248 sqft" reads as 1248.
    body_text = page.get_text(separator=' ').replace(',', '').lower()

    def first_number(pattern):
        match = re.search(pattern, body_text)
        return _to_number(match.group(1)) if match else 'N/A'

    return [
        # One or two digits that are not the tail of a longer number.
        first_number(r'(?<![\d.])(\d{1,2})\s*bed'),
        # Bath counts may be fractional ("2.5 baths").
        first_number(r'(\d+(?:\.\d+)?)\s*bath'),
        first_number(r'(\d+)\s*sqft'),
    ]


def _find_land_size_sqft(page):
    """Return the lot size in square feet as a float, or None if not found.

    Uses the first <span> whose text starts with a number followed by
    "sqft" / "Square Feet" (taken as is) or "Acres" (converted to sqft).
    """
    size_regex = re.compile(r'([\d,\.]+)\s*(Acres|lot|sqft)', re.IGNORECASE)

    for element in page.find_all('span', string=size_regex):
        text = element.get_text().strip()
        sqft_match = re.match(r"([\d,.]+)\s*(Square Feet|sqft)", text, re.IGNORECASE)
        acre_match = re.match(r"([\d,.]+)\s*Acres", text, re.IGNORECASE)
        match = sqft_match or acre_match
        if match is None:
            # e.g. "-- sqft": no leading number, try the next span.
            continue

        try:
            # Commas are removed for both units; float("1,200") would raise.
            amount = float(match.group(1).replace(',', ''))
        except ValueError:
            # The character class also accepts bare punctuation such as ".".
            continue

        if sqft_match:
            return amount
        return round(amount * _SQFT_PER_ACRE, 2)

    return None


def _find_city_state(page):
    """Return (city, full state name) parsed from the page's first <h1>.

    The heading is expected to look like "street, city, ST zip". The city is
    None when it cannot be determined, and the state is "Unknown State" when
    the abbreviation is missing or not in State_abbreviations.
    """
    heading = page.find('h1')
    if heading is None:
        return None, "Unknown State"

    # Non-breaking spaces become ordinary spaces so "IN\xa047978" still
    # splits into the state code and the zip.
    address = heading.get_text().replace('\xa0', ' ')
    parts = [part.strip() for part in address.split(',')]

    # The city is the component just before "ST zip"; counting from the end
    # keeps this right when the street part itself contains a comma.
    city = parts[-2] if len(parts) >= 2 and parts[-2] else None

    state_tokens = parts[-1].split()
    state_abbr = state_tokens[0].upper() if state_tokens else ''
    return city, State_abbreviations.get(state_abbr, "Unknown State")


def Property_Info(url, page=None):
    """Collect the key facts of one listing page into a one-row DataFrame.

    Parameters
    ----------
    url : str
        URL of the listing. It is not fetched here; it is kept so existing
        calls keep working.
    page : parsed document, optional
        BeautifulSoup document for the listing. When omitted, the
        module-level ``soup`` is used, so ``soup`` must already hold the page
        that belongs to ``url``.

    Returns
    -------
    pandas.DataFrame
        One row with the columns Price, Bed, Bath, Hsize, Land_Size, City and
        State. Price, Land_Size and City are None when not found; Bed, Bath
        and Hsize are 'N/A' when not found; State falls back to
        "Unknown State".
    """
    if page is None:
        page = soup

    # Each extractor runs exactly once per call.
    beds, baths, house_sqft = _find_bed_bath_sqft(page)
    city, state = _find_city_state(page)

    record = {
        'Price': _find_price(page),
        'Bed': beds,
        'Bath': baths,
        'Hsize': house_sqft,
        'Land_Size': _find_land_size_sqft(page),
        'City': city,
        'State': state,
    }
    return pd.DataFrame([record])


# Last expression of the cell: the DataFrame returned for the listing is
# rendered as the cell output.
Property_Info(url)

Unnamed: 0,Price,Bed,Bath,Hsize,Land_Size,City,State
0,299999,3,2,1248,43560.0,Rensselaer,Indiana
