<a href="https://colab.research.google.com/github/timmtimm1/Projects-/blob/main/WebScrapingRetail.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import random
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Getting the zillow website for homes in san diego
url = 'https://www.zillow.com/san-diego-ca/?searchQueryState=%7B"pagination'
headers = {
     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 OPR/109.0.0.0 (Edition std-1)",
     'DNT': '1',
     'Referer': 'https://www.google.com/',
     'Upgrade-Insecure-Requests': '1'
}

In [4]:
r = requests.get(url, headers=headers)
print(r)

<Response [200]>


In [5]:
soup = BeautifulSoup(r.content, 'html.parser')

In [6]:
soup.find('a', class_="StyledButton-c11n-8-102-0__sc-iv7357-0 dErLnE PaginationButton-c11n-8-102-0__sc-1i6hxyy-0 jBZehp").get('href')

'/san-diego-ca/2_p/'

In [7]:
# Creating a list for urls
urls = []
# Getting the first 30 pages
for i in range(1, 31):
  a = 'https://www.zillow.com/san-diego-ca/'+str(i)+'_p/'
  urls.append(a)

In [8]:
urls

['https://www.zillow.com/san-diego-ca/1_p/',
 'https://www.zillow.com/san-diego-ca/2_p/',
 'https://www.zillow.com/san-diego-ca/3_p/',
 'https://www.zillow.com/san-diego-ca/4_p/',
 'https://www.zillow.com/san-diego-ca/5_p/',
 'https://www.zillow.com/san-diego-ca/6_p/',
 'https://www.zillow.com/san-diego-ca/7_p/',
 'https://www.zillow.com/san-diego-ca/8_p/',
 'https://www.zillow.com/san-diego-ca/9_p/',
 'https://www.zillow.com/san-diego-ca/10_p/',
 'https://www.zillow.com/san-diego-ca/11_p/',
 'https://www.zillow.com/san-diego-ca/12_p/',
 'https://www.zillow.com/san-diego-ca/13_p/',
 'https://www.zillow.com/san-diego-ca/14_p/',
 'https://www.zillow.com/san-diego-ca/15_p/',
 'https://www.zillow.com/san-diego-ca/16_p/',
 'https://www.zillow.com/san-diego-ca/17_p/',
 'https://www.zillow.com/san-diego-ca/18_p/',
 'https://www.zillow.com/san-diego-ca/19_p/',
 'https://www.zillow.com/san-diego-ca/20_p/',
 'https://www.zillow.com/san-diego-ca/21_p/',
 'https://www.zillow.com/san-diego-ca/22_p/

In [10]:
properties = []

with requests.Session() as s:
    for url in urls:
        response = s.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all property cards
        property_cards = soup.find_all('article')

        for card in property_cards:
            price = card.find('span', class_='PropertyCardWrapper__StyledPriceLine-srp-8-102-0__sc-16e8gqd-1')
            address = card.find('address')
            details = card.find_all('li')

            # Initialize default values
            beds = baths = sqft = 'N/A'

            # Extract and parse details if available
            for detail in details:
                b_tag = detail.find('b')
                abbr_tag = detail.find('abbr')

                if b_tag and abbr_tag:
                    if abbr_tag.text == 'bds':
                        beds = b_tag.text.strip()
                    elif abbr_tag.text == 'ba':
                        baths = b_tag.text.strip()
                    elif abbr_tag.text == 'sqft':
                        sqft = b_tag.text.strip()

            properties.append({
                'Price': price.text if price else 'N/A',
                'Address': address.text if address else 'N/A',
                'Beds': beds,
                'Baths': baths,
                'Sqft': sqft
            })

In [67]:
# Converting the properties list to a DF, using Pandas
df = pd.DataFrame(properties)

In [68]:
df.head()

Unnamed: 0,Price,Address,Beds,Baths,Sqft
0,"$729,999","2883 Rhoades Rd, San Diego, CA 92139",3,2,1157
1,"$1,499,990","9159 Oviedo St, San Diego, CA 92129",5,3,2618
2,"$330,955","3950 Ohio St UNIT 230, San Diego, CA 92104",2,2,1013
3,"$299,900","1951 47th St SPACE 65, San Diego, CA 92102",3,2,1450
4,"$2,420,000","6057 Meadowpointe Row, La Jolla, CA 92037",3,3,2786


In [13]:
# Lets Clean the data

In [69]:
df['Price'] = df['Price'].replace('[\$,]', '', regex=True).astype(float)


In [70]:
df.head()

Unnamed: 0,Price,Address,Beds,Baths,Sqft
0,729999.0,"2883 Rhoades Rd, San Diego, CA 92139",3,2,1157
1,1499990.0,"9159 Oviedo St, San Diego, CA 92129",5,3,2618
2,330955.0,"3950 Ohio St UNIT 230, San Diego, CA 92104",2,2,1013
3,299900.0,"1951 47th St SPACE 65, San Diego, CA 92102",3,2,1450
4,2420000.0,"6057 Meadowpointe Row, La Jolla, CA 92037",3,3,2786


In [71]:
# There are a few N/A or -- values, lets correct them
print(df['Sqft'].unique())

['1,157' '2,618' '1,013' '1,450' '2,786' '1,264' '12,842' '3,723' '1,277'
 '1,686' '1,432' '2,201' '1,141' '1,488' '4,605' '692' '1,436' '1,808'
 '2,612' '1,600' '1,204' '1,586' '9,176' '1,767' '2,440' '--' '1,529'
 '872' '1,088' '1,440' '1,889' '1,725' '748' '3,495' '2,100' '1,748'
 '1,630' '2,429' '813' '728' '3,221' '1,446' '1,175' '1,771' '1,309'
 '2,102' '1,675' '1,248' '889' '850' 'N/A' '1,867' '1,110' '2,132' '4,735'
 '2,143' '1,036' '1,390' '3,740' '1,990' '1,228' '4,562' '1,514' '2,480'
 '1,378' '1,589' '1,041' '4,885' '1,749' '697' '2,182' '1,584' '1,150'
 '1,300' '1,232' '578' '1,628' '3,413' '2,415' '1,080' '1,395' '828'
 '1,795' '800' '2,101' '1,452' '2,352' '1,874' '1,828' '1,750' '1,438'
 '1,014' '8,317' '504' '4,824' '2,642' '2,660' '1,908' '1,346' '1,192'
 '1,718' '848' '1,217' '5,000' '1,642' '1,281' '3,100' '1,712' '1,092'
 '1,844' '2,155' '680' '2,604' '1,430' '1,974' '927' '2,402' '1,076'
 '1,371' '1,062' '5,748' '1,050' '2,112' '3,465' '878' '9,956' '1,064'
 '1,12

In [72]:
# First, importing Numpy
import numpy as np

In [73]:
df['Sqft'] = df['Sqft'].replace(['--', 'N/A'], np.nan)

# Remove commas
df['Sqft'] = df['Sqft'].replace(',', '', regex=True)

# Convert to numeric
df['Sqft'] = pd.to_numeric(df['Sqft'], errors='coerce')

In [74]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216 entries, 0 to 215
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Price    216 non-null    float64
 1   Address  216 non-null    object 
 2   Beds     216 non-null    object 
 3   Baths    216 non-null    object 
 4   Sqft     213 non-null    float64
dtypes: float64(2), object(3)
memory usage: 8.6+ KB


In [75]:
# Expanding the adress column
df[['Street_Address', 'City_State_Zip']] = df['Address'].str.split(', ', n=1, expand=True)


In [76]:
df.head()

Unnamed: 0,Price,Address,Beds,Baths,Sqft,Street_Address,City_State_Zip
0,729999.0,"2883 Rhoades Rd, San Diego, CA 92139",3,2,1157.0,2883 Rhoades Rd,"San Diego, CA 92139"
1,1499990.0,"9159 Oviedo St, San Diego, CA 92129",5,3,2618.0,9159 Oviedo St,"San Diego, CA 92129"
2,330955.0,"3950 Ohio St UNIT 230, San Diego, CA 92104",2,2,1013.0,3950 Ohio St UNIT 230,"San Diego, CA 92104"
3,299900.0,"1951 47th St SPACE 65, San Diego, CA 92102",3,2,1450.0,1951 47th St SPACE 65,"San Diego, CA 92102"
4,2420000.0,"6057 Meadowpointe Row, La Jolla, CA 92037",3,3,2786.0,6057 Meadowpointe Row,"La Jolla, CA 92037"


In [77]:
df['Zip'] = df['City_State_Zip'].str.extract(r'(\d{5})')

In [78]:
df.head()

Unnamed: 0,Price,Address,Beds,Baths,Sqft,Street_Address,City_State_Zip,Zip
0,729999.0,"2883 Rhoades Rd, San Diego, CA 92139",3,2,1157.0,2883 Rhoades Rd,"San Diego, CA 92139",92139
1,1499990.0,"9159 Oviedo St, San Diego, CA 92129",5,3,2618.0,9159 Oviedo St,"San Diego, CA 92129",92129
2,330955.0,"3950 Ohio St UNIT 230, San Diego, CA 92104",2,2,1013.0,3950 Ohio St UNIT 230,"San Diego, CA 92104",92104
3,299900.0,"1951 47th St SPACE 65, San Diego, CA 92102",3,2,1450.0,1951 47th St SPACE 65,"San Diego, CA 92102",92102
4,2420000.0,"6057 Meadowpointe Row, La Jolla, CA 92037",3,3,2786.0,6057 Meadowpointe Row,"La Jolla, CA 92037",92037


In [79]:
df['City_State'] = df['City_State_Zip'].str.replace(r'\d{5}', '')

In [80]:
df.head()

Unnamed: 0,Price,Address,Beds,Baths,Sqft,Street_Address,City_State_Zip,Zip,City_State
0,729999.0,"2883 Rhoades Rd, San Diego, CA 92139",3,2,1157.0,2883 Rhoades Rd,"San Diego, CA 92139",92139,"San Diego, CA 92139"
1,1499990.0,"9159 Oviedo St, San Diego, CA 92129",5,3,2618.0,9159 Oviedo St,"San Diego, CA 92129",92129,"San Diego, CA 92129"
2,330955.0,"3950 Ohio St UNIT 230, San Diego, CA 92104",2,2,1013.0,3950 Ohio St UNIT 230,"San Diego, CA 92104",92104,"San Diego, CA 92104"
3,299900.0,"1951 47th St SPACE 65, San Diego, CA 92102",3,2,1450.0,1951 47th St SPACE 65,"San Diego, CA 92102",92102,"San Diego, CA 92102"
4,2420000.0,"6057 Meadowpointe Row, La Jolla, CA 92037",3,3,2786.0,6057 Meadowpointe Row,"La Jolla, CA 92037",92037,"La Jolla, CA 92037"


In [81]:
df.drop('City_State_Zip', axis=1, inplace=True)
df.drop('Address', axis=1, inplace=True)


In [82]:
df.head()

Unnamed: 0,Price,Beds,Baths,Sqft,Street_Address,Zip,City_State
0,729999.0,3,2,1157.0,2883 Rhoades Rd,92139,"San Diego, CA 92139"
1,1499990.0,5,3,2618.0,9159 Oviedo St,92129,"San Diego, CA 92129"
2,330955.0,2,2,1013.0,3950 Ohio St UNIT 230,92104,"San Diego, CA 92104"
3,299900.0,3,2,1450.0,1951 47th St SPACE 65,92102,"San Diego, CA 92102"
4,2420000.0,3,3,2786.0,6057 Meadowpointe Row,92037,"La Jolla, CA 92037"


In [83]:
df['City_State'] = df['City_State'].str.replace(',', '', regex=False).str.rsplit(' ', n=1, expand=True)[0]


In [85]:
print(df.head())

       Price Beds Baths    Sqft         Street_Address    Zip    City_State
0   729999.0    3     2  1157.0        2883 Rhoades Rd  92139  San Diego CA
1  1499990.0    5     3  2618.0         9159 Oviedo St  92129  San Diego CA
2   330955.0    2     2  1013.0  3950 Ohio St UNIT 230  92104  San Diego CA
3   299900.0    3     2  1450.0  1951 47th St SPACE 65  92102  San Diego CA
4  2420000.0    3     3  2786.0  6057 Meadowpointe Row  92037   La Jolla CA


In [103]:
df[['City','Sate']]=df['City_State'].str.rsplit(' ', n=1, expand=True)

In [105]:
df.drop('City_State', axis=1, inplace=True)