In [10]:
# Dependencies
from bs4 import BeautifulSoup
import requests
import pymongo
import pandas as pd

In [2]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)


In [3]:
# Define database and collection
db = client.nhl_db
collection = db.articles

In [4]:
# URL of page to be scraped
url = 'https://www.nhl.com/'

In [5]:
# Retrieve page with the requests module
response = requests.get(url)
# Create BeautifulSoup object; parse with 'lxml'
soup = BeautifulSoup(response.text, 'lxml')


In [6]:
# Retrieve the parent divs for all articles
results = soup.find_all('li', class_='mixed-feed__item--article')

In [7]:
# Loop through results to retrieve article title, header, and timestamp of article
for result in results:
    title = result.find('h4', class_='mixed-feed__header').text

    lede = result.find('h5', class_='mixed-feed__subheader').text

    # The time and date of article publication
    date = result.find('time')['datetime']
    # Slice the datetime string for the date
    article_date = date[:10]
    # Slice the datetime string for the time
    time = date[11:16]
    # Determine whether article was published in AM or PM
    if (int(time[:2]) >= 13):
        meridiem = 'pm'
    else:
        meridiem = 'am'

    # Concatenate time string
    time = time + meridiem
    print('-----------------')
    print(title)
    print(lede)
    print(article_date)
    print(time)

    # Dictionary to be inserted into MongoDB
    post = {
        'title': title,
        'lede': lede,
        'date': article_date,
        'time published': time
    }

    # Insert dictionary into MongoDB as a document
    collection.insert_one(post)

-----------------
Kadri suspension has Maple Leafs adjusting for Game 2 against Bruins
Toronto aims to even series in Boston without forward
2018-04-13
21:26pm
-----------------
Couturier's transition to scorer for Flyers evident in playoffs
Former checking-line center making offensive impact against Penguins
2018-04-14
11:09am
-----------------
Stanley Cup Playoffs Buzz: Avalanche, Devils try to bounce back
Maple Leafs, Ducks also seek to even series after losing openers
2018-04-14
09:41am
-----------------
Kings look for Quick to stand tall against Golden Knights in Game 3
Los Angeles, down 2-0 in first-round series, aims for better play in front of goaltender after 54-save performance
2018-04-14
10:50am
-----------------
Golden Knights make more history with double-overtime win against Kings
Vegas is third team with multi-OT victory in inaugural NHL season, fifth to capture first two playoff games
2018-04-14
11:40am
-----------------
Byfuglien has huge 'impact' for Jets in Game 2 wi

In [13]:
# Display the MongoDB records created above
articles = db.articles.find()
result = []
for article in articles:
   # print(article)
    result.append(article)

In [14]:
result

[{'_id': ObjectId('5ad24c0239fa064690109231'),
  'date': '2018-04-13',
  'lede': 'Toronto aims to even series in Boston without forward',
  'time published': '21:26pm',
  'title': 'Kadri suspension has Maple Leafs adjusting for Game 2 against Bruins'},
 {'_id': ObjectId('5ad24c0239fa064690109232'),
  'date': '2018-04-14',
  'lede': 'Former checking-line center making offensive impact against Penguins',
  'time published': '11:09am',
  'title': "Couturier's transition to scorer for Flyers evident in playoffs"},
 {'_id': ObjectId('5ad24c0239fa064690109233'),
  'date': '2018-04-14',
  'lede': 'Maple Leafs, Ducks also seek to even series after losing openers',
  'time published': '09:41am',
  'title': 'Stanley Cup Playoffs Buzz: Avalanche, Devils try to bounce back'},
 {'_id': ObjectId('5ad24c0239fa064690109234'),
  'date': '2018-04-14',
  'lede': 'Los Angeles, down 2-0 in first-round series, aims for better play in front of goaltender after 54-save performance',
  'time published': '10:50

In [16]:
article_db = pd.DataFrame(result)
article_db

Unnamed: 0,_id,date,lede,time published,title
0,5ad24c0239fa064690109231,2018-04-13,Toronto aims to even series in Boston without ...,21:26pm,Kadri suspension has Maple Leafs adjusting for...
1,5ad24c0239fa064690109232,2018-04-14,Former checking-line center making offensive i...,11:09am,Couturier's transition to scorer for Flyers ev...
2,5ad24c0239fa064690109233,2018-04-14,"Maple Leafs, Ducks also seek to even series af...",09:41am,"Stanley Cup Playoffs Buzz: Avalanche, Devils t..."
3,5ad24c0239fa064690109234,2018-04-14,"Los Angeles, down 2-0 in first-round series, a...",10:50am,Kings look for Quick to stand tall against Gol...
4,5ad24c0239fa064690109235,2018-04-14,Vegas is third team with multi-OT victory in i...,11:40am,Golden Knights make more history with double-o...
5,5ad24c0239fa064690109236,2018-04-14,Defenseman excites fans with big hit on Koivu ...,01:33am,Byfuglien has huge 'impact' for Jets in Game 2...
6,5ad24c0239fa064690109237,2018-04-13,Viewer's guide for upcoming NHL postseason games,10:23am,Where to watch 2018 Stanley Cup Playoffs
7,5ad24c0239fa064690109238,2018-04-14,"Updates from NHL.com correspondents, teams, re...",13:50pm,"Fantasy projected lineups, starting goalies"
8,5ad24c0239fa064690109239,2018-04-13,Forward disciplined for boarding Bruins forwar...,18:01pm,"Kadri suspended three games, out for Maple Lea..."
9,5ad24c0239fa06469010923a,2018-04-14,Even Eastern series with Penguins behind perfo...,01:01am,"Flyers stick with plan, respond in Game 2"


In [31]:
img_re = soup.find_all('img alt',class_ ='mediawall_panel-image swiper-lazy swiper-lazy-loaded').img
img_re

AttributeError: 'NoneType' object has no attribute 'img'

In [39]:
img_res = soup.find_all('img',class_='recent-media-navbar-module__image')
img_res[1]

<img class="recent-media-navbar-module__image" data-src="//nhl.bamcontent.com/images/photos/280306162/image/raw.jpg"/>