**Tyler Osterberg**<br>
**MSDS: Data Engineering**<br>
**Project 6: Web Scraping**<br>

In [1]:
import requests as req
from bs4 import BeautifulSoup as bs
from pymongo import MongoClient

In [2]:
res = req.get('https://www.colorado.gov/articles')

In [3]:
soup = bs(res.content, 'html.parser')

In [4]:
news_items = soup.find_all('div', {'class': 'views-row'})

In [5]:
len(news_items)

12

In [6]:
news_items[0]

<div class="views-row">
<div class="my-8">
<div class="text-2xl text-co-md-blue hover:underline"><div class="views-field views-field-title"><span class="field-content"><a href="https://www.colorado.gov/governor/news/7821-colorado-cashback-governor-polis-legislative-leaders-announce-new-400-colorado-dividend">Colorado Cashback: Governor Polis, Legislative Leaders Announce New $400 Colorado Dividend for Every Hardworking Colorado Taxpayer</a></span></div> </div>
<div class="text-base text-gray-600 py-1"><div class="views-field views-field-created"><span class="field-content">April 25, 2022</span></div> </div>
<div class="text-base text-gray-600 hover:underline"><div class="views-field views-field-field-state-agency"><div class="field-content"><a href="/index.php/state-agencies/office-governor" hreflang="en">Office of the Governor</a></div></div> </div>
</div>
</div>

In [7]:
class NewsArticle:
    def __init__(self, item):
        self.title = item.find('div', class_='views-field-title').get_text()
        self.date = item.find('div', class_='views-field-created').get_text()
        self.link = item.find('a')['href']
    
    def show(self):
        print(f'Title: {self.title}\nDate: {self.date}\nLink: {self.link}')

In [8]:
def add_to_library(items, lib):
    for item in items:
        article = NewsArticle(item)
        lib.append(article)

In [9]:
library = []

add_to_library(news_items, library)
    
print(len(library))

12


**Let's get 10 pages of results now**

In [10]:
for i in range(1, 10):
    res = req.get(f'https://www.colorado.gov/articles?page={i}')
    soup = bs(res.content, 'html.parser')
    news_items = soup.find_all('div', {'class': 'views-row'})
    add_to_library(news_items, library)

In [11]:
for item in library:
    item.show()

Title: Colorado Cashback: Governor Polis, Legislative Leaders Announce New $400 Colorado Dividend for Every Hardworking Colorado Taxpayer
Date: April 25, 2022
Link: https://www.colorado.gov/governor/news/7821-colorado-cashback-governor-polis-legislative-leaders-announce-new-400-colorado-dividend
Title: Polis Administration Secures Agriculture Company for New Colorado Headquarters
Date: April 21, 2022
Link: https://oedit.colorado.gov/press-release/polis-administration-secures-agriculture-company-for-new-colorado-headquarters
Title: Press Release: Colorado Employment Situation – March 2022
Date: April 15, 2022
Link: https://cdle.colorado.gov/press-releases/press-release-colorado-employment-situation-march-2022
Title: Press Release: Colorado Employment Situation – March 2022
Date: April 15, 2022
Link: https://cdle.colorado.gov/press-releases/press-release-colorado-employment-situation-march-2022
Title: 4.7.2022: HB22-1328: Modify Main Street Business Recovery Loan Program CLIMBER in Commi

In [12]:
len(library)

88

**Persisting the scraped data**

In [13]:
client = MongoClient()
db = client['co_gov']
news = db['articles']

In [14]:
def make_dict(lib):
    res = []
    for item in lib:
        obj = {'title': item.title, 'date': item.date, 'link': item.link}
        res.append(obj)
    return res

In [15]:
obj_lib = make_dict(library)
obj_lib

[{'title': 'Colorado Cashback: Governor Polis, Legislative Leaders Announce New $400 Colorado Dividend for Every Hardworking Colorado Taxpayer',
  'date': 'April 25, 2022',
  'link': 'https://www.colorado.gov/governor/news/7821-colorado-cashback-governor-polis-legislative-leaders-announce-new-400-colorado-dividend'},
 {'title': 'Polis Administration Secures Agriculture Company for New Colorado Headquarters',
  'date': 'April 21, 2022',
  'link': 'https://oedit.colorado.gov/press-release/polis-administration-secures-agriculture-company-for-new-colorado-headquarters'},
 {'title': 'Press Release: Colorado Employment Situation – March 2022',
  'date': 'April 15, 2022',
  'link': 'https://cdle.colorado.gov/press-releases/press-release-colorado-employment-situation-march-2022'},
 {'title': 'Press Release: Colorado Employment Situation – March 2022',
  'date': 'April 15, 2022',
  'link': 'https://cdle.colorado.gov/press-releases/press-release-colorado-employment-situation-march-2022'},
 {'tit

In [19]:
records = news.insert_many(obj_lib)
print(records.inserted_ids)

ServerSelectionTimeoutError: localhost:27017: [Errno 111] Connection refused, Timeout: 30s, Topology Description: <TopologyDescription id: 62a646b7a9d2ef5ad0ae9183, topology_type: Unknown, servers: [<ServerDescription ('localhost', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27017: [Errno 111] Connection refused')>]>

In [None]:
news.find_one()