In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy               as np
import pandas              as pd
import matplotlib.pyplot   as plt
import seaborn             as sns

import re
from bs4 import BeautifulSoup
import requests
import validators

%matplotlib inline

In [None]:
# Interactively collect Elite Prospects player-page URLs until the user types 'exit'.
flag = False  # loop sentinel; it is never set to True -- the loop ends via `break`
player_urls = []

while not flag:
    # Strip surrounding whitespace: pasted URLs often carry a trailing space or newline.
    player_url = input("""Please copy and paste your prospect's URL (type 'exit' when done): """).strip()
    if player_url.lower() == 'exit':
        break
    # validators.url() returns True for a valid URL and a *falsy failure object*
    # (not the literal False) otherwise, so it must be truth-tested; comparing
    # the result with `== False` never matches and lets malformed URLs through.
    valid = validators.url(player_url)
    if not valid or not player_url.startswith('http') or "eliteprospects.com" not in player_url:
        print("Invalid URL, please try again.")
    else:
        player_urls.append(player_url)
        
# URL EXAMPLES
# player_urls = ["https://www.eliteprospects.com/player/90230/a.j.-white",
#                "https://www.eliteprospects.com/player/232966/matt-jurusik",
#                "https://www.eliteprospects.com/player/91186/mikko-rantanen",
#                "https://www.eliteprospects.com/player/199655/cale-makar"]

In [None]:
# Download the raw HTML of every collected player page, in the same order as
# player_urls (the reshape cell further down uses len(player_urls) as its row count).
data = []

for player in player_urls:
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(player, timeout=30)
    # Stop here on a 4xx/5xx answer instead of handing an error page to the
    # parser, where it would only surface later as a confusing scraping failure.
    response.raise_for_status()
    data.append(response.text)

In [None]:
# Parse each downloaded page into a BeautifulSoup tree (one tree per player, same order as `data`).
soup = [BeautifulSoup(page_html, "html.parser") for page_html in data]

## Player Info

In [None]:
# Column labels for the player-info table: 'Name' first, then the stripped text of
# every div in the FIRST player's page that carries one of the two label classes.
# All "col-6" matches come before all "col-7" matches.
player_data_header = ['Name'] + [
    label_div.text.strip()
    for label_class in ("col-xs-12 col-6 p-0", "col-xs-12 col-7 p-0")
    for label_div in soup[0].find_all("div", class_=label_class)
]

In [None]:
# Flat list of scraped values for all players. For each page, in order: the header
# name div(s), then every "col-18" value div, then every "col-17" value div.
# The list is reshaped into one row per player in a later cell.
player_data = [
    value_div.text.strip()
    for plyr in soup
    for value_class in (
        "ep-entity-header__name",
        "col-xs-12 col-18 text-right p-0 ep-text-color--black",
        "col-xs-12 col-17 text-right p-0 ep-text-color--black",
    )
    for value_div in plyr.find_all("div", class_=value_class)
]

In [None]:
# Display the scraped column labels so they can be checked by eye.
player_data_header

In [None]:
# Overwrite the seventh column label with a name that covers both skaters and goalies.
# NOTE(review): assumes index 6 holds the 'Shoots' / 'Catches' label scraped from the
# first player's page -- confirm against the header displayed above.
player_data_header[6] = 'Shoots/Catches'

In [None]:
# Display the flat list of scraped values (all players back to back) for inspection.
player_data

In [None]:
# One row per player, one column per header label. The column count is taken from
# player_data_header instead of a hard-coded 11 so the array always lines up with
# the column labels used for the DataFrame; np.reshape raises ValueError if the
# number of scraped values does not fill that shape exactly.
p_data = np.reshape(player_data, (len(player_urls), len(player_data_header)))

In [None]:
# Player-info table: one row per player, columns labelled by the scraped header.
df = pd.DataFrame(p_data, columns=player_data_header)

In [None]:
# Preview the first rows of the player-info table.
df.head()

## Player Statistics

In [None]:
# Take the first player's element with id="league-stats"; the [0] raises IndexError
# if the page has no such element.
# NOTE(review): the stats loop further down reassigns league_stats for every player,
# so this cell looks like an exploratory leftover -- confirm before relying on it.
league_stats = soup[0].find_all("div", id="league-stats")[0]

In [None]:
# Display column names for a skater's (non-goalie) stats table. The same six
# metrics appear once for the regular season and once for the postseason, with a
# 'Postseason Status' column in between.
_skater_metrics = ['Games Played', 'Goals', 'Assists', 'Total Points', 'PIM', '+/-']

non_g_cols = (
    ['Season', 'Team', 'League']
    + ['Regular ' + metric for metric in _skater_metrics]
    + ['Postseason Status']
    + ['Postseason ' + metric for metric in _skater_metrics]
)

In [None]:
# Display column names for a goalie's stats table. The same nine metrics appear
# once for the regular season and once for the postseason, with a bare
# 'Postseason' column in between.
# NOTE(review): 'Goal Difference' labels the 'gd' cells; on Elite Prospects goalie
# tables that abbreviation may mean something else (e.g. games dressed) -- confirm.
_goalie_metrics = ['Games Played', 'Goal Difference', 'Goal Against Average', 'Save %',
                   'Goals Against', 'Saves (Total)', 'Shutout', 'Season Record (W-L-T)',
                   'Time On Ice']

g_cols = (
    ['Season', 'Team', 'League']
    + ['Regular ' + metric for metric in _goalie_metrics]
    + ['Postseason']
    + ['Postseason ' + metric for metric in _goalie_metrics]
)

In [None]:
# CSS class strings of the <td> cells to read from a stats table. Each list is in
# the same order as its display-name list (g_cols / non_g_cols); the two are
# paired by position when the columns are renamed.
goalie_stat_keys = ['season sorted', 'team', 'league', 'regular gp',
                    'regular gd', 'regular gaa', 'regular svp', 'regular ga',
                    'regular svs', 'regular so', 'regular wlt', 'regular toi',
                    'postseason', 'postseason gp', 'postseason gd', 'postseason gaa',
                    'postseason svp', 'postseason ga', 'postseason svs',
                    'postseason so', 'postseason wlt', 'postseason toi']
skater_stat_keys = ['season sorted', 'team', 'league', 'regular gp',
                    'regular g', 'regular a', 'regular tp', 'regular pim',
                    'regular pm', 'postseason', 'postseason gp', 'postseason g',
                    'postseason a', 'postseason tp', 'postseason pim', 'postseason pm']


def _scrape_stats_table(stats_div, css_keys, column_names):
    """Build one player's statistics DataFrame from their league-stats element.

    Parameters
    ----------
    stats_div : BeautifulSoup tag whose first <tbody> holds the stats rows.
    css_keys : list of str, the class string of the <td> cells for each column.
    column_names : list of str, display names, positionally matched to css_keys.

    Returns
    -------
    pandas.DataFrame with one column per key (renamed to column_names); every
    cell is the stripped text of the matching <td>.

    Raises
    ------
    ValueError (from pandas) if the keys yield columns of unequal length or if
    column_names has a different length than css_keys.
    """
    stats_dict = {}
    for key in css_keys:
        if key == 'postseason':
            # class_='postseason' would also match cells that merely include
            # that class (e.g. 'postseason gp'), so require the class list to
            # be exactly ['postseason'].
            raw_list = stats_div.tbody.find_all(
                lambda tag: tag.name == 'td' and tag.get('class') == [key])
        else:
            raw_list = stats_div.tbody.find_all('td', class_=key)
        stats_dict[key] = [cell.text.strip() for cell in raw_list]

    stats_df = pd.DataFrame(stats_dict)
    stats_df.columns = column_names
    return stats_df


# One statistics DataFrame per player, in the same order as `soup`.
df_stats = []

for player in soup:
    league_stats = player.find_all("div", id="league-stats")[0]

    # The first div with this class decides the table layout: the text 'G' selects
    # the goalie columns, anything else the skater columns.
    # NOTE(review): presumably this div is the player's position field -- confirm.
    first_value = player.find('div', class_='col-xs-12 col-18 text-right p-0 ep-text-color--black')
    if first_value.text.strip() == 'G':
        temp_df = _scrape_stats_table(league_stats, goalie_stat_keys, g_cols)
    else:
        temp_df = _scrape_stats_table(league_stats, skater_stat_keys, non_g_cols)

    df_stats.append(temp_df)

In [None]:
# Display the first player's statistics table as a spot check.
df_stats[0]

In [None]:
# Attach each player's statistics DataFrame to their info row. Both df and
# df_stats were built by iterating `soup` in order, so they pair up by position.
# Note: this mutates df in place.
df['Stats'] = df_stats

In [None]:
# Display the statistics table stored in the first row (index label 0) of the 'Stats' column.
df.loc[0, 'Stats']