# Web Scraping all of BiggerPockets

In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
url = "https://www.biggerpockets.com/blog/category/real-estate-news/page/2"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

In [3]:
titles = soup.find_all('a', class_="post-thumbnail")
for title in titles:
    print(title["href"])
    print()

https://www.biggerpockets.com/blog/money-548

https://www.biggerpockets.com/blog/department-of-justice-targets-buyer-agent-loopholes

https://www.biggerpockets.com/blog/top-6-mistakes-real-estate-investors-are-making-in-2024

https://www.biggerpockets.com/blog/are-3d-homes-the-future-of-home-construction

https://www.biggerpockets.com/blog/new-loan-modifications-bring-three-percent-rates-back

https://www.biggerpockets.com/blog/biden-proposes-rent-increase-caps-at-five-percent

https://www.biggerpockets.com/blog/slew-of-landlord-laws-are-going-to-determine-who-landlords-can-rent-to

https://www.biggerpockets.com/blog/11-formerly-hot-markets-that-are-not-so-hot-anymore

https://www.biggerpockets.com/blog/revisions-to-florida-live-local-act-take-opportunities-from-developers

https://www.biggerpockets.com/blog/luxury-homes-reach-highest-prices-yet-as-investors-look-for-deals

https://www.biggerpockets.com/blog/surprisingly-hot-short-term-rental-markets

https://www.biggerpockets.com/blog

In [4]:
index = 2
url = "https://www.biggerpockets.com/blog/category/real-estate-news/page/" + str(index)

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
}

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')
titles = soup.find_all('a', class_="post-thumbnail")

blogs = []

while len(titles) != 0:
    print(url)
    for title in titles:
        blogs.append(title["href"])

    index = index + 1
    url = url = "https://www.biggerpockets.com/blog/category/real-estate-news/page/" + str(index)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    titles = soup.find_all('a', class_="post-thumbnail")

https://www.biggerpockets.com/blog/category/real-estate-news/page/2
https://www.biggerpockets.com/blog/category/real-estate-news/page/3
https://www.biggerpockets.com/blog/category/real-estate-news/page/4
https://www.biggerpockets.com/blog/category/real-estate-news/page/5
https://www.biggerpockets.com/blog/category/real-estate-news/page/6
https://www.biggerpockets.com/blog/category/real-estate-news/page/7
https://www.biggerpockets.com/blog/category/real-estate-news/page/8
https://www.biggerpockets.com/blog/category/real-estate-news/page/9
https://www.biggerpockets.com/blog/category/real-estate-news/page/10
https://www.biggerpockets.com/blog/category/real-estate-news/page/11
https://www.biggerpockets.com/blog/category/real-estate-news/page/12
https://www.biggerpockets.com/blog/category/real-estate-news/page/13
https://www.biggerpockets.com/blog/category/real-estate-news/page/14
https://www.biggerpockets.com/blog/category/real-estate-news/page/15
https://www.biggerpockets.com/blog/categor

In [5]:
print(len(blogs))

1740


In [6]:
blog_posts = []
count = 0

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
}

for blog_url in blogs:
    response = requests.get(blog_url, headers=headers)
    if response.status_code == 403:
        print("Access denied: ", blog_url)
        continue
    soup = BeautifulSoup(response.content, 'html.parser')
    article = soup.find_all('article')
    if len(article) > 0:
        count += 1
        print(count, blog_url)
        article = article[0]
        title = article.find('h1').text.strip()
        content = article.find('main').text
        blog_posts.append({"title": title, "content": content})
    else:
        print("Skipped: ", blog_url)

print(len(blog_posts))

1 https://www.biggerpockets.com/blog/money-548
2 https://www.biggerpockets.com/blog/department-of-justice-targets-buyer-agent-loopholes
3 https://www.biggerpockets.com/blog/top-6-mistakes-real-estate-investors-are-making-in-2024
4 https://www.biggerpockets.com/blog/are-3d-homes-the-future-of-home-construction
5 https://www.biggerpockets.com/blog/new-loan-modifications-bring-three-percent-rates-back
6 https://www.biggerpockets.com/blog/biden-proposes-rent-increase-caps-at-five-percent
7 https://www.biggerpockets.com/blog/slew-of-landlord-laws-are-going-to-determine-who-landlords-can-rent-to
8 https://www.biggerpockets.com/blog/11-formerly-hot-markets-that-are-not-so-hot-anymore
9 https://www.biggerpockets.com/blog/revisions-to-florida-live-local-act-take-opportunities-from-developers
10 https://www.biggerpockets.com/blog/luxury-homes-reach-highest-prices-yet-as-investors-look-for-deals
11 https://www.biggerpockets.com/blog/surprisingly-hot-short-term-rental-markets
12 https://www.bigger

# Put into JSON

In [9]:
import json
blog_post_to_json = []
for blog in blog_posts:
    gpt_completion= "Title: " + f'{blog["title"]}' + "\n\n\nContent: " + f'{blog["content"]}'
    blog_post_to_json.append({
                                    "prompt": "Write a unique blog post with a new topic or title in relation to real estate and other topics related to real estate like market trends, investments, marketing, etc.",
                                    "completion": gpt_completion
                              })
with open("../example_docs/blog_posts.jsonl", "w") as json_file:
    for post in blog_post_to_json:
        json_file.write(json.dumps(post) + "\n")