In [76]:
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime
import json
# import usaddress


In [77]:
# HTTP headers sent with every request in this notebook (the search POST and
# the per-property GET). They imitate a Safari XHR call to apartments.com.
#
# NOTE(review): 'Cookie' and 'X-CSRF-TOKEN' hold placeholder strings here;
# presumably real values from a browser session must be substituted before
# running. Avoid committing real session values with the notebook.
headers = {
    'Content-Type': 'application/json',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Sec-Fetch-Site': 'same-origin',
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Sec-Fetch-Mode': 'cors',
    'Host': 'www.apartments.com',
    'Origin': 'https://www.apartments.com',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.15',
    # Referer is pinned to one specific San Diego search URL.
    'Referer': 'https://www.apartments.com/san-diego-ca/3/?bb=w-6873v64M7upziI',
    'Connection': 'keep-alive',
    'Sec-Fetch-Dest': 'empty',
    'Cookie': 'your-cookie-values',  # placeholder
    'X-Requested-With': 'XMLHttpRequest',
    'X-CSRF-TOKEN': 'your-csrf-token'  # placeholder
}

def fetch_apartment_data(pages=12, timeout=30):
    """Download paginated search results from the apartments.com search service.

    The same San Diego search payload is POSTed once per page, starting at
    page 1, and every decoded JSON response is collected.

    Args:
        pages: Maximum number of result pages to request. Pass ``None`` to
            remove the cap and continue until the service stops advertising
            a next page.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        list: One decoded JSON object per successfully fetched page, in page
        order. Pagination stops early (returning whatever was collected so
        far) on a transport error, a non-200 status, a body that is not
        valid JSON, or a response without ``MetaState.PageNextUrl``.
    """
    url = 'https://www.apartments.com/services/search/'

    base_data = {
        "Map": {
            "BoundingBox": {
                "LowerRight": {"Latitude": 32.68368, "Longitude": -117.11836},
                "UpperLeft": {"Latitude": 32.739, "Longitude": -117.17948}
            },
            "CountryCode": "US"
        },
        "Geography": {
            "ID": "h6emeh3",
            "Display": "San Diego, CA",
            "GeographyType": 2,
            "Address": {
                "City": "San Diego",
                "CountryCode": "USA",
                "County": "San Diego",
                "State": "CA",
                "MarketName": "San Diego",
                "DMA": "San Diego, CA"
            },
            "Location": {"Latitude": 32.825, "Longitude": -117.094},
            "BoundingBox": {
                "LowerRight": {"Latitude": 32.53479, "Longitude": -116.90572},
                "UpperLeft": {"Latitude": 33.11425, "Longitude": -117.2823}
            },
            "v": 23508,
            "IsPmcSearchByCityState": False
        },
        "Listing": {},
        "IsBoundedSearch": True,
        "ResultSeed": 203723,
        "Options": 0,
        "CountryAbbreviation": "US"
    }

    all_results = []

    current_page = 1
    # `pages` bounds the loop so an endless "next page" chain cannot spin
    # forever against the remote service.
    while pages is None or current_page <= pages:
        # A shallow copy is sufficient: only the top-level "Paging" key is added.
        data = base_data.copy()
        data['Paging'] = {"Page": str(current_page)}

        try:
            response = requests.post(url, headers=headers, json=data, timeout=timeout)
        except requests.RequestException as e:
            # Keep the pages already collected instead of losing them.
            print(f"Request failed for page {current_page}: {e}")
            break

        if response.status_code != 200:
            print(f"Request failed for page {current_page} with status code {response.status_code}")
            break

        try:
            result = response.json()
        except ValueError as e:
            print(f"Could not decode JSON for page {current_page}: {e}")
            break

        all_results.append(result)
        print(f"Page {current_page} fetched successfully.")

        # 'MetaState' may be missing or null; treat that the same as "no next page".
        meta_state = result.get('MetaState') or {}
        if not meta_state.get('PageNextUrl'):
            print(f"Stopping at page {current_page} as there is no 'PageNextUrl'.")
            break

        current_page += 1
    else:
        # Reached only when the loop ended because the page cap was hit.
        print(f"Stopping after reaching the limit of {pages} pages.")

    return all_results

In [78]:
# example_data = fetch_apartment_data()

In [79]:
def clean_html(html_data):
    """Convert one page of search-result HTML into a list of listing dicts.

    Every ``<article class="placard">`` in ``html_data`` yields one dict with
    the keys ``title``, ``address``, ``pricing``, ``beds``, ``phone``,
    ``manager`` and ``property_link``. A field whose element is not found in
    the placard is ``None``.
    """

    def stripped_text(node):
        # Absent elements map to None rather than raising.
        return node.text.strip() if node else None

    parsed_page = BeautifulSoup(html_data, "html.parser")

    apartments = []
    for card in parsed_page.find_all("article", class_="placard"):
        logo = card.find("div", class_="property-logo")
        record = {
            "title": stripped_text(card.find("div", class_="property-title")),
            "address": stripped_text(card.find("div", class_="property-address")),
            "pricing": stripped_text(card.find("p", class_="property-pricing")),
            "beds": stripped_text(card.find("p", class_="property-beds")),
            "phone": stripped_text(card.find("a", class_="phone-link")),
            # The manager name is read from the logo element's aria-label attribute.
            "manager": logo.get("aria-label") if logo else None,
            "property_link": card.get("data-url"),
        }
        apartments.append(record)

    return apartments

In [80]:
def format_availability(s):
    """Normalise an availability label to ``"Now"`` or an ``M/D/YYYY`` date.

    Args:
        s: Label text such as ``"Available Now"``, ``"Jul. 4"`` or
            ``"May 5"``, optionally carrying an ``availibility`` /
            ``availability`` prefix and surrounding whitespace. ``None`` is
            accepted.

    Returns:
        ``None`` when ``s`` is ``None``; ``"Now"`` when the label contains
        ``"Now"``; otherwise the date in the current calendar year formatted
        without zero padding, e.g. ``"7/4/2024"``.

    Raises:
        ValueError: If the remaining text is not an abbreviated month
            followed by a day number.
    """
    if s is None:
        return None
    if "Now" in s:
        return "Now"

    # Strip the label prefix (both the historical misspelling and the correct
    # spelling) and the period after the month so "Jul. 4" and "May 5" parse
    # with the same format.
    date_string = s.replace('availibility', '').replace('availability', '')
    date_string = date_string.replace('.', '').strip()

    # Parse with the year included: without it strptime assumes 1900, which
    # rejects "Feb 29" even when the current year is a leap year.
    current_year = datetime.now().year
    date_obj = datetime.strptime(f"{date_string} {current_year}", '%b %d %Y')

    # Build the string from the date fields; the '%-m'/'%-d' strftime
    # directives are a platform extension and are not available everywhere.
    return f"{date_obj.month}/{date_obj.day}/{date_obj.year}"
        


def _stripped_text(node):
    """Return ``node.text`` stripped, or ``None`` when the element was not found."""
    return node.text.strip() if node is not None else None


def _to_int(text):
    """Parse text such as ``'$2,455'`` or ``'1,619'`` into an int; ``None`` if not numeric."""
    if not text:
        return None
    try:
        return int(text.replace('$', '').replace(',', ''))
    except ValueError:
        return None


def _leading_number(label, cast):
    """Apply ``cast`` to the first whitespace-separated token of ``label``; ``None`` on failure."""
    if not label:
        return None
    try:
        return cast(label.split()[0])
    except ValueError:
        return None


def _parse_unit_sq_ft(text):
    """Read the square footage from a unit's sqft column text.

    The value is taken from the second line of the text; if that line is
    missing or not numeric, the first number found anywhere in the text is
    used instead.
    """
    if not text:
        return None
    lines = text.split('\n')
    value = _to_int(lines[1]) if len(lines) > 1 else None
    if value is None:
        match = re.search(r'\d[\d,]*', text)
        value = _to_int(match.group()) if match else None
    return value


def extract_unit_data(soup):
    """Collect per-unit pricing rows from a parsed property page.

    Args:
        soup: Parsed page exposing BeautifulSoup-style ``find`` / ``find_all``.

    Returns:
        list[dict]: One dict per distinct unit with the keys ``unique_id``,
        ``floor_plan``, ``rent_range``, ``beds`` (``"Studio"`` or int),
        ``baths`` (float), ``unit``, ``price`` (int, or ``None`` when no
        numeric price is shown, e.g. "Call for Rent" or no price element)
        and ``sq_ft`` (int). A value that cannot be located or parsed is
        ``None`` instead of aborting the whole page. Units are de-duplicated
        on ``"<floor_plan>-<unit>"``; units without a unit number are skipped
        because they cannot be identified.
    """
    units = []
    seen_ids = set()  # O(1) duplicate check instead of rebuilding a list per unit

    for grid_item in soup.find_all('div', class_='pricingGridItem multiFamily hasUnitGrid'):
        floor_plan = _stripped_text(grid_item.find('span', class_='modelName'))
        rent_range = _stripped_text(grid_item.find('span', class_='rentLabel'))

        # The details label is comma-separated: beds first, baths second.
        details_text = _stripped_text(grid_item.find('h4', class_='detailsLabel')) or ''
        details = [part.strip() for part in details_text.split(',')]
        beds_label = details[0]
        baths_label = details[1] if len(details) > 1 else None
        beds = "Studio" if beds_label == "Studio" else _leading_number(beds_label, int)
        baths = _leading_number(baths_label, float)

        for unit in grid_item.find_all('li', class_='unitContainer js-unitContainer'):
            unit_number = _stripped_text(unit.find('span', title=True))
            if unit_number is None:
                continue

            unique_id = f"{floor_plan}-{unit_number}"
            if unique_id in seen_ids:
                continue
            seen_ids.add(unique_id)

            price_text = _stripped_text(unit.find('span', {'data-monetaryunittype': 'USD'}))
            sq_ft_text = _stripped_text(unit.find('div', class_='sqftColumn column'))

            # Availability (the unit's 'dateAvailable' span) is intentionally
            # not part of the record yet.
            units.append({
                'unique_id': unique_id,
                'floor_plan': floor_plan,
                'rent_range': rent_range,
                'beds': beds,
                'baths': baths,
                'unit': unit_number,
                'price': _to_int(price_text),
                'sq_ft': _parse_unit_sq_ft(sq_ft_text),
            })

    return units




def extract_individual_apartment_data(apartment, timeout=30):
    """Fetch a property's detail page and merge its details into ``apartment``.

    Args:
        apartment: Listing dict carrying at least ``'property_link'``. It is
            updated in place.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        dict: The same ``apartment`` object with the keys ``monthly_rent``,
        ``bedrooms``, ``bathrooms``, ``square_feet``, ``year_built``
        (strings as shown on the page), ``units`` and ``stories`` (ints) and
        ``unit_data`` (list from ``extract_unit_data``). Values not found on
        the page are ``None``. If the page cannot be fetched, any of these
        keys that are not already present are added with ``None`` (``[]`` for
        ``unit_data``) so every returned record has the same schema.
    """
    # Defaults for every key this function contributes, in output order.
    details = {
        'monthly_rent': None,
        'bedrooms': None,
        'bathrooms': None,
        'square_feet': None,
        'year_built': None,
        'units': None,
        'stories': None,
        'unit_data': [],
    }

    def _with_defaults():
        # Fill only missing keys so earlier data on the record is not clobbered.
        for key, default in details.items():
            apartment.setdefault(key, default)
        return apartment

    url = apartment.get('property_link')
    if not url:
        print(f"Request failed: no property link for {apartment.get('title')}")
        return _with_defaults()

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise an HTTPError for bad responses
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return _with_defaults()

    soup = BeautifulSoup(response.content, 'html.parser')

    # Summary row: map each on-page label to the key it populates.
    summary_keys = {
        "Monthly Rent": 'monthly_rent',
        "Bedrooms": 'bedrooms',
        "Bathrooms": 'bathrooms',
        "Square Feet": 'square_feet',
    }
    price_bed_range_info = soup.find('ul', class_='priceBedRangeInfo')
    if price_bed_range_info is not None:
        for item in price_bed_range_info.find_all('li', class_='column'):
            label = item.find('p', class_='rentInfoLabel')
            detail = item.find('p', class_='rentInfoDetail')
            if label is None or detail is None:
                continue
            key = summary_keys.get(label.text.strip())
            if key is not None:
                details[key] = detail.text.strip()

    # Year built, unit count and story count are pulled from free text.
    details_container = soup.find('div', id='profileV2FeesWrapper')
    if details_container is not None:
        details_text = details_container.text
        year_built_match = re.search(r'Built in (\d{4})', details_text)
        units_stories_match = re.search(r'(\d+) units/(\d+) stories', details_text)

        if year_built_match:
            details['year_built'] = year_built_match.group(1)
        if units_stories_match:
            # The groups match only digits, so int() cannot fail here; the
            # former ValueError fallback (which returned None) was unreachable.
            details['units'] = int(units_stories_match.group(1))
            details['stories'] = int(units_stories_match.group(2))

    # Unit-level rows
    details['unit_data'] = extract_unit_data(soup)

    apartment.update(details)
    return apartment


In [81]:
# # Example usage
# apartments = clean_html(example_data[0]["PlacardState"]["HTML"])

# # For each apartment, fetch additional data
# # test = extract_individual_apartment_data(apartments[0])

# output = []
# for apartment in apartments:
#     output.append(extract_individual_apartment_data(apartment))

In [82]:
# output

In [83]:
def extract_all_apartment_data():
    """Run the full scrape: search pages, then one detail page per listing.

    Calls ``fetch_apartment_data`` for the raw search pages, parses each
    page's placard HTML with ``clean_html``, and enriches every listing via
    ``extract_individual_apartment_data``. A listing whose title has already
    been collected is skipped without fetching its detail page.

    Returns:
        list[dict]: Enriched listing dicts in the order first encountered.
    """
    raw_pages_data = fetch_apartment_data()
    print("Fetched raw pages data.")

    all_apartments = []
    # Titles already collected; a set keeps the duplicate check O(1).
    # NOTE(review): de-duplicating on title treats distinct properties that
    # share a name (or have no title) as one -- confirm this is intended.
    seen_titles = set()

    for page_number, page_data in enumerate(raw_pages_data, start=1):
        # A page without placard HTML is skipped rather than aborting the run.
        placard_html = (page_data.get("PlacardState") or {}).get("HTML")
        if not placard_html:
            print(f"Skipping page {page_number}: no placard HTML in response.")
            continue

        for apartment in clean_html(placard_html):
            if apartment['title'] in seen_titles:
                continue

            apartment_data = extract_individual_apartment_data(apartment)
            if apartment_data:
                print(f"Extracted data for apartment: {apartment_data['title']}")
                all_apartments.append(apartment_data)
                seen_titles.add(apartment_data['title'])

    print("Completed extracting all apartment data.")
    return all_apartments

In [84]:
# Run the whole scrape. This is network-bound: it requests every search page
# and then one detail page per distinct listing, printing progress as it goes.
all_apartment_data = extract_all_apartment_data()

Page 1 fetched successfully.
Page 2 fetched successfully.
Page 3 fetched successfully.
Page 4 fetched successfully.
Page 5 fetched successfully.
Page 6 fetched successfully.
Page 7 fetched successfully.
Page 8 fetched successfully.
Page 9 fetched successfully.
Page 10 fetched successfully.
Page 11 fetched successfully.
Page 12 fetched successfully.
Stopping at page 12 as there is no 'PageNextUrl'.
Fetched raw pages data.
Extracted data for apartment: Pinnacle on The Park
Extracted data for apartment: ALX
Extracted data for apartment: Pinnacle Broadway
Extracted data for apartment: Stanza Little Italy
Extracted data for apartment: The Wyatt Makers Quarter
Extracted data for apartment: West
Extracted data for apartment: Asano on Ivy
Extracted data for apartment: K1 Apartments
Extracted data for apartment: Diega
Extracted data for apartment: Gema
Extracted data for apartment: 4th + J
Extracted data for apartment: Cielo
Extracted data for apartment: Simone
Extracted data for apartment: Rad

In [86]:
# Display the scraped records for inspection (this prints the entire list).
all_apartment_data

[{'title': 'Pinnacle on The Park',
  'address': '424 15th St, San Diego, CA 92101',
  'pricing': '$1,995 - $10,730',
  'beds': '1-3 Beds',
  'phone': '(619) 773-0335',
  'manager': None,
  'property_link': 'https://www.apartments.com/pinnacle-on-the-park-san-diego-ca/9r6e3y6/',
  'monthly_rent': '$1,995 - $10,730',
  'bedrooms': '1 - 3 bd',
  'bathrooms': '1 - 3 ba',
  'square_feet': '575 - 1,969 sq ft',
  'year_built': '2015',
  'units': 484,
  'stories': 45,
  'unit_data': [{'unique_id': 'SPIRE 1B-1006',
    'floor_plan': 'SPIRE 1B',
    'rent_range': '$2,425 – $2,665',
    'beds': 1,
    'baths': 1.0,
    'unit': '1006',
    'price': 2455,
    'sq_ft': 600},
   {'unique_id': 'SPIRE 1B-2207',
    'floor_plan': 'SPIRE 1B',
    'rent_range': '$2,425 – $2,665',
    'beds': 1,
    'baths': 1.0,
    'unit': '2207',
    'price': 2575,
    'sq_ft': 619},
   {'unique_id': 'SPIRE 1B-2906',
    'floor_plan': 'SPIRE 1B',
    'rent_range': '$2,425 – $2,665',
    'beds': 1,
    'baths': 1.0,
    

In [87]:
# Save the full property records (each with its nested unit list) as JSON.
with open('apartment_data.json', 'w', encoding='utf-8') as f:
    json.dump(all_apartment_data, f, indent=4)


# Flatten every property's unit records into one list and save it separately.
# A record may have no 'unit_data' key (e.g. when its detail page could not
# be fetched), so read it defensively instead of raising KeyError.
unit_data = []
for apartment in all_apartment_data:
    unit_data.extend(apartment.get('unit_data') or [])

with open('unit_data.json', 'w', encoding='utf-8') as f:
    json.dump(unit_data, f, indent=4)

In [16]:
# Scratch cell: try the `usaddress` parser on two sample addresses.
# NOTE(review): usaddress is imported here rather than in the import cell at
# the top (where it is commented out), and this cell's execution count is out
# of sequence with the rest of the notebook.
import usaddress

# usaddress.tag returns a sequence whose first element is the mapping of
# address components (see the displayed OrderedDict below).
test = usaddress.tag("675 Ninth Ave Unit 1906 San Diego, CA 92101")
test[0]  # not displayed: only the cell's last expression is rendered

# address = f"{test[0]['AddressNumber']} {test[0]['StreetName']} {test[0]['StreetNamePostType']}"
# address

test2 = usaddress.tag("321 Tenth Ave Unit FL10-ID40 San Diego, CA 92101")
test2[0]

# test3 = usaddress.tag("530 K St San Diego, CA 92101")
# test3[0]

OrderedDict([('AddressNumber', '321'),
             ('StreetName', 'Tenth'),
             ('StreetNamePostType', 'Ave'),
             ('OccupancyType', 'Unit'),
             ('OccupancyIdentifier', 'FL10-ID40'),
             ('PlaceName', 'San Diego'),
             ('StateName', 'CA'),
             ('ZipCode', '92101')])