Import Dependencies

In [2]:
import timeit
import pandas as pd
from requests import get
from bs4 import BeautifulSoup

import warnings
warnings.filterwarnings('ignore')

Scrape the Properties

In [53]:
start = 1
stop  = 10

headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
           'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
           'Accept-Encoding': 'none',
           'Accept-Language': 'en-US,en;q=0.8',
           'Connection': 'keep-alive'}

Define a function to retrieve each feature

In [90]:
# def property_name(soupy_object):   # return house or property name
#     try:
#         name = soupy_object.find('span', attrs = {'class':'undefined'}).text
#     except:
#         name = None
#     return name

def address_details(soupy_object):   # return address of property
    try:
        # address = soupy_object.find('div', attrs = {'id':'fullStreetAddress'}).text
        street_address = soupy_object.find('div', class_='street-address')['title']
        city_state_zip = soupy_object.find('div', class_='dp-subtext bp-cityStateZip').text.split(', ')
        city = city_state_zip[0]
        state, zipcode = city_state_zip[1].split(' ')
        address = ", ".join(filter(None, [street_address, city, state, zipcode[:5]]))
    except:
        address = None
    return address

def total_price(soupy_object):   # return total price of property
    try:
        price = soupy_object.find('div', class_='stat-block beds-section', attrs = {'data-rf-test-id':'abp-price'}).find('div', class_='statsValue').text
    except:
        price = None
    return price

# def rate_sqft(soupy_object):   # return total price of property
#     try:
#         rate = soupy_object.find('div', attrs = {'id':"pricePerUnitArea"}).text.split(' ')[1]
#     except:
#         rate = None
#     return rate

def area_type(soupy_object):   # return area parameters
    try:
        areatyp = soupy_object.find('span', text='Property Type').parent.findNext('div').text
    except:
        areatyp = None
    return areatyp

def bedroom_count(soupy_object):   # return number of bedrooms
    try:
        bedroom = soupy_object.find('div', class_='stat-block beds-section', attrs = {'data-rf-test-id':'abp-beds'}).find('div', class_='statsValue').text
    except:
        bedroom = None
    return bedroom

def bathroom_count(soupy_object):   # return number of bathrooms
    try:
        bathroom = soupy_object.find('div', class_='stat-block baths-section', attrs = {'data-rf-test-id':'abp-baths'}).find('div', class_='statsValue').text
    except:
        bathroom = None
    return bathroom

def square_feet(soupy_object):   # return number of square feet
    try:
        sqft_val = soupy_object.find('div', class_='stat-block sqft-section', attrs = {'data-rf-test-id':'abp-sqFt'}).find('span', class_='statsValue').text
        sqft_units = soupy_object.find('div', class_='stat-block sqft-section', attrs = {'data-rf-test-id':'abp-sqFt'}).find('div', class_='statsLabel').text
        sqft = sqft_val + ' ' + sqft_units
    except:
        sqft = None
    return sqft

def lot_size(soupy_object):
    try:
        lot_size = soupy_object.find('span', text='Lot Size').parent.findNext('div').text
    except:
        lot_size = None
    return lot_size

def year_built(soupy_object):   # return age of property
    try:
        year_built = soupy_object.find('span', text='Year Built').parent.findNext('div').text
    except:
        year_built = None
    return year_built

# def availability(soupy_object):   # return area parameters
#     try:
#         avail = soupy_object.find('span', attrs = {'id':'Availability_Lbl'}).text
#     except:
#         avail = None
#     return avail

Compile into a single dataframe

In [91]:
data_list = []
def get_all(start, stop):
    for pagenumber in range(start, stop):
        url = f'https://www.redfin.com/state/New-Jersey/filter/sort=lo-price,min-year-built=2021,max-year-built=2023/page-{pagenumber}'
        req = get(url, headers = headers)
        soup = BeautifulSoup(req.content, 'html.parser')
        links = soup.find_all('a', attrs= {"class":"slider-item"})
        # print(links)
        
        for k, item in enumerate(links):
            main_url = 'https://www.redfin.com'
            sub_url = item.get('href')
            data_url = main_url + sub_url
            request = get(data_url, headers=headers)
            soup_get = BeautifulSoup(request.content, 'html.parser')

            # name = property_name(soup_get)
            address = address_details(soup_get)
            price = total_price(soup_get)
            # rate = rate_sqft(soup_get)
            areatyp = area_type(soup_get)
            bedroom = bedroom_count(soup_get)
            bathroom = bathroom_count(soup_get)
            sqft = square_feet(soup_get)
            lot_siz = lot_size(soup_get)
            yr_built = year_built(soup_get)
            # avail = availability(soup_get)
        
            data = {'Location': address, 'Price':price, 'Area_Type':areatyp, 'Bedroom': bedroom, 'Bathroom':bathroom, 
            'Square_feet':sqft, 'Lot_Size':lot_siz, 'Year_Built':yr_built}
            # data = {'Bedroom': bedroom}
            data_list.append(data)
            # break
        # break
        timestart = timeit.default_timer()
        timestop = timeit.default_timer()
        print(f'You scraped page no : {pagenumber}')
        print('Time :', timestop - timestart)
        
    return data_list

Define Dataframe

In [92]:
# get_all(start, stop)
df1 = pd.DataFrame(get_all(start, stop))

You scraped page no : 1
Time : 9.999894245993346e-08
You scraped page no : 2
Time : 3.0000046535860747e-07
You scraped page no : 3
Time : 2.00001522898674e-07
You scraped page no : 4
Time : 1.9999970390927047e-07
You scraped page no : 5
Time : 2.00001522898674e-07
You scraped page no : 6
Time : 5.000001692678779e-07
You scraped page no : 7
Time : 4.000012268079445e-07
You scraped page no : 8
Time : 3.0000046535860747e-07
You scraped page no : 9
Time : 3.0000046535860747e-07


Verify Dataframe

In [93]:
df1.head()

Unnamed: 0,Location,Price,Area_Type,Bedroom,Bathroom,Square_feet,Lot_Size,Year_Built
0,"19 Goldenrod Pl, Vernon Twp., NJ, 07418","$79,900",Single Family Residential,2,2,— Sq Ft,0.37 Acres,2023
1,"45 Aldine Shirley Rd, Elmer, NJ, 08318","$84,900",Vacant Land,—,—,1.00 Acre (Lot),1 Acre,2022
2,"1 Victory Ave #16, Pennsville, NJ, 08070","$94,900",Single Family Residential,2,1,896 Sq Ft,18.45 Acres,2021
3,"1205 Farrell Ave #4, Cherry Hill, NJ, 08002","$109,000",Single Family Residential,2,1,784 Sq Ft,"1,950 Sq. Ft.",2023
4,"1887 N Delsea Dr #8, Vineland, NJ, 08360","$114,900",Single Family Residential,2,1,832 Sq Ft,—,2022


In [94]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Location     350 non-null    object
 1   Price        350 non-null    object
 2   Area_Type    350 non-null    object
 3   Bedroom      350 non-null    object
 4   Bathroom     350 non-null    object
 5   Square_feet  350 non-null    object
 6   Lot_Size     312 non-null    object
 7   Year_Built   350 non-null    object
dtypes: object(8)
memory usage: 22.0+ KB


In [95]:
df1.duplicated().sum()

25

In [96]:
df1.nunique()

Location       325
Price          219
Area_Type       11
Bedroom          7
Bathroom         9
Square_feet    140
Lot_Size       124
Year_Built       4
dtype: int64

Create csv file

In [97]:
df1.to_csv('Datasets/Prop_1to9.csv', index_label = False)

In [None]:
projectlist = pd.read_csv("Prop_1to9.csv")
projectlist.head(16)

Import all datasets and concatenate

In [None]:
df1 = pd.read_csv("Prop_1to9.csv")

In [None]:
# df = pd.concat([df1, df2, df3], ignore_index=False)
df = df1

In [None]:
df

In [None]:
# df["Property_Name"] = df["Property_Name"].str.replace('Toll Free 1800 41 99099','Unnamed Property')

In [None]:
df.isna().sum()

In [None]:
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
df.isna().sum()

In [None]:
df.duplicated().sum()

In [None]:
df = df.drop_duplicates(ignore_index=True)

In [None]:
df.to_csv('Raw_Property.csv', index=False)

In [None]:
df = pd.read_csv('Raw_Property.csv')

In [None]:
print('Shape of Data :', df.shape)
df