In [1]:
from bs4 import BeautifulSoup
import urllib.request

### Setup

The Player class will make the data easier to sort through and separate unique objects, since we'll be collecting photos from rosters over several seasons.

In [2]:
class Player:
    def __init__(self, name, team, jpg):
        self.name=name;
        self.team=team;
        self.jpg=jpg;
    def __eq__(self, other):
        return self.name==other.name
    def __hash__(self):
        return hash(('name', self.name, 'team', self.team, 'jpg', self.jpg))

The following functions are for scraping the NHL website. 
(An exception for the 2004-2005 season, which was cancelled due to the lockout, is handled accordingly. #history)

In [3]:
def get_multiseasons(base_url, seasonrange_floor, seasonrange_ceiling):
    allplayers=[];
    for year in range(seasonrange_floor, seasonrange_ceiling):
        try:
            url=base_url+"/"+str(year);
            team=base_url.split('/')[3];
            allplayers=allplayers+getPlayers(url, team);
            print("Team: ",team,", Season: ", year,"added \n")
        except:
            print("2004-2005 season unavailable.")
    return allplayers;
        
def getPlayers(url, team):
    
    player_list=[];
    html=urllib.request.urlopen(url)
    soup = BeautifulSoup(html, 'html.parser')
    
    images = soup.find_all("img","player-photo")
    lastnames=soup.find_all("span","name-col__item name-col__lastName")
    firstnames=soup.find_all("span","name-col__item name-col__firstName")
    
    player_zip=zip(firstnames, lastnames, images)
    for player in player_zip:
        name=player[0].string+' '+player[1].string;
        jpg=player[-1]['src'];
        player_list.append(Player(name=name, team=team, jpg=jpg))
        
    return player_list

We'll collect photos from the 2000-2001 season to the 2018-2019 season using a list of rosters from every team in the NHL. Duplicates will be removed before images are collected.

In [4]:
seasonrange_floor=2000;
seasonrange_ceiling=2020;
all_players=[];

In [5]:
fname="nhlroster_urls"

In [6]:
with open(fname) as f:
    content = f.read().splitlines()

In [None]:
for page in content:
    all_players=all_players+get_multiseasons(page, seasonrange_floor, seasonrange_ceiling);

2004-2005 season unavailable.
Team:  blackhawks , Season:  2001 added 

Team:  blackhawks , Season:  2002 added 

Team:  blackhawks , Season:  2003 added 



We can see there have 7557 unique players in the NHL within the last 20 years. 

In [None]:
reduced_allplayers=list(dict.fromkeys(all_players))
print("Unique players: ",len(reduced_allplayers))

In [None]:
len(reduced_allplayers)

We'll store the images in our data directory, removing any images that have been removed from the site.

In [None]:
data_dir="./project4_data/"
photos_unavailable=0;

In [None]:
for i in range(len(reduced_allplayers)):
    try:
        urllib.request.urlretrieve(reduced_allplayers[i].jpg, data_dir+reduced_allplayers[i].name+'.jpg')

    except: 
        print("Player photgraph unavailable.")
        print(reduced_allplayers[i].jpg,i,reduced_allplayers[i].name)
        photos_unavailable+=1