In [1]:
import time
import re
from collections import defaultdict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup

In [3]:
def _stat_float(row, stat):
    """Read the ``<td data-stat=stat>`` cell of *row* as a float.

    Returns NaN when the cell is absent, blank, or not numeric, so that a
    single unpopulated statistic does not abort the whole scrape.
    """
    cell = row.find('td', attrs={'data-stat': stat})
    if cell is None:
        return float('nan')
    try:
        return float(cell.text)
    except ValueError:
        return float('nan')


def work_player_profile(param, season):
    """Scrape one season of per-game and advanced stats from a player page.

    Parameters
    ----------
    param : str
        Site-relative path of the player page; it is appended to
        ``https://www.basketball-reference.com``.
    season : str
        Season identifier used in the table row ids, i.e. the rows looked up
        are ``per_game.<season>`` and ``advanced.<season>``.

    Returns
    -------
    dict
        Always contains the same keys, in the same order: ``pos``, the
        per-game stats (``fg``, ``fga``, ``fg3``, ``fg3a``, ``fta``, ``efg``,
        ``tov``, ``pf``), the advanced stats (``per``, ``ts_pct``, ``usg_pct``,
        ``ows``, ``dws``, ``obpm``, ``dbpm``, ``vorp``) and ``season``.
        Values that cannot be found on the page are NaN (``None`` for ``pos``),
        which keeps per-key lists built by callers the same length.

    Raises
    ------
    requests.HTTPError
        If the player page responds with an HTTP error status.
    """
    url = "https://www.basketball-reference.com" + param
    res = requests.get(url)
    # Fail loudly on error pages instead of silently returning an all-NaN row.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")

    # output key -> value of the cell's data-stat attribute
    per_game_stats = {
        'fg': 'fg_per_g',
        'fga': 'fga_per_g',
        'fg3': 'fg3_per_g',
        'fg3a': 'fg3a_per_g',
        'fta': 'fta_per_g',
        'efg': 'efg_pct',
        'tov': 'tov_per_g',
        'pf': 'pf_per_g',
    }
    advanced_stats = ('per', 'ts_pct', 'usg_pct', 'ows', 'dws', 'obpm', 'dbpm', 'vorp')

    # "1981" -> "1980-81"; left as None if the season is not numeric.
    try:
        season_label = str(int(season) - 1) + "-" + season[-2:]
    except ValueError:
        season_label = None

    # Pre-populate every key so the result has a fixed shape even when a
    # table or row is missing from the page.
    data_dict = {'pos': None}
    data_dict.update(dict.fromkeys(per_game_stats, float('nan')))
    data_dict.update(dict.fromkeys(advanced_stats, float('nan')))
    data_dict['season'] = season_label

    per_game = soup.find(attrs={'id': 'all_per_game'})
    if per_game is not None:
        for row in per_game.findAll("tr"):
            if row.attrs.get('id') != "per_game." + season:
                continue
            pos_cell = row.find('td', attrs={'data-stat': 'pos'})
            if pos_cell is not None:
                data_dict['pos'] = pos_cell.text
            for key, stat in per_game_stats.items():
                data_dict[key] = _stat_float(row, stat)
            break

    rows = []
    advanced_table = soup.find(attrs={'id': 'all_advanced'})
    if advanced_table is not None:
        # The table markup is looked for inside string children of the
        # wrapper (presumably an HTML comment -- TODO confirm against the
        # live page) and has to be parsed separately.
        for child in advanced_table.children:
            if isinstance(child, str) and "table_outer_container" in child:
                rows = BeautifulSoup(child, "html.parser").findAll("tr")
        # Fall back to live markup when no such string child exists.
        if not rows:
            rows = advanced_table.findAll("tr")

    for row in rows:
        if row.attrs.get('id') != "advanced." + season:
            continue
        for stat in advanced_stats:
            data_dict[stat] = _stat_float(row, stat)
        break

    return data_dict

In [1]:
def _team_win_pct(team_href, default=0.5):
    """Return a team's win fraction scraped from its season page.

    Parameters
    ----------
    team_href : str or None
        Site-relative path of the team page. ``None``/empty means the table
        cell carried no team link.
    default : float
        Value returned when there is no link or no usable "Record" line
        (0.5, i.e. an average team).

    Exactly one value is returned per call, so callers can append it and keep
    the ``win_pct`` column aligned with the other columns.
    """
    if not team_href:
        return default

    time.sleep(1)  # throttle requests to the site
    team_url = "https://www.basketball-reference.com" + team_href
    team_soup = BeautifulSoup(requests.get(team_url).text, "html.parser")

    for paragraph in team_soup.findAll("p"):
        if "Record" not in paragraph.text:
            continue
        match = re.search(r"(\d+)-(\d+)", paragraph.text)
        if match is None:
            continue
        wins, losses = float(match.group(1)), float(match.group(2))
        if wins + losses == 0:
            return default
        # Only the first record found is used.
        return wins / (wins + losses)
    return default


def get_stats_of_voting(url):
    """Scrape the first stats table of an awards page into column lists.

    Parameters
    ----------
    url : str
        Full URL of the awards page. The four characters before ``.html`` are
        taken as the season and passed on to ``work_player_profile``.

    Returns
    -------
    collections.defaultdict(list)
        Maps each ``data-stat`` name found in the table cells to the list of
        cell texts (blank cells become ``"0"``). Additionally holds
        ``win_pct`` (one float per team cell) and every key returned by
        ``work_player_profile`` for each player cell.

    Raises
    ------
    requests.HTTPError
        If the awards page responds with an HTTP error status.
    """
    res = requests.get(url)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")
    table = soup.find(attrs={'class': 'stats_table'})
    rows = table.findAll("tr")

    season = url.split(".html")[0][-4:]

    print(f"Current season: {season}")

    players_stats = defaultdict(list)

    for index, row in enumerate(rows):

        print(f"\tCurrent index: {index + 1} of {len(rows)}")
        # Header rows have no <td> cells, so the inner loop is a no-op for them.
        for cell in row.findAll("td"):
            stat = cell.attrs.get('data-stat')
            if stat is None:
                continue

            if stat == 'team_id':
                # A team cell without a link gets the default win percentage
                # instead of reusing a link from a previous row.
                team_link = cell.find("a")
                team_href = team_link.get('href') if team_link is not None else None
                players_stats['win_pct'].append(_team_win_pct(team_href))

            if stat == 'player':
                time.sleep(1)  # throttle requests to the site
                advanced_dict = work_player_profile(cell.find("a")['href'], season)
                for key in advanced_dict:
                    players_stats[key].append(advanced_dict[key])
                players_stats[stat].append(cell.getText())
            else:
                # Every non-player cell (team_id included) is stored as text.
                text = cell.getText() or "0"
                players_stats[stat].append(text)
    return players_stats

In [5]:
# Awards pages to scrape, one per season; the end year is exclusive.
seasons = range(1981, 2020)

# Column name -> values accumulated across every scraped season.
new_data = defaultdict(list)

for season in seasons:
    full_url = "https://www.basketball-reference.com/awards/awards_{}.html".format(season)
    cur_season_dict = get_stats_of_voting(full_url)
    # Append this season's values to the end of each running column.
    for key, values in cur_season_dict.items():
        new_data[key].extend(values)

Current season: 1981
	Current index: 1 of 33
	Current index: 2 of 33
	Current index: 3 of 33
	Current index: 4 of 33
	Current index: 5 of 33
	Current index: 6 of 33
	Current index: 7 of 33
	Current index: 8 of 33
	Current index: 9 of 33
	Current index: 10 of 33
	Current index: 11 of 33
	Current index: 12 of 33
	Current index: 13 of 33
	Current index: 14 of 33
	Current index: 15 of 33
	Current index: 16 of 33
	Current index: 17 of 33
	Current index: 18 of 33
	Current index: 19 of 33
	Current index: 20 of 33
	Current index: 21 of 33
	Current index: 22 of 33
	Current index: 23 of 33
	Current index: 24 of 33
	Current index: 25 of 33
	Current index: 26 of 33
	Current index: 27 of 33
	Current index: 28 of 33
	Current index: 29 of 33
	Current index: 30 of 33
	Current index: 31 of 33
	Current index: 32 of 33
	Current index: 33 of 33
Current season: 1982
	Current index: 1 of 27
	Current index: 2 of 27
	Current index: 3 of 27
	Current index: 4 of 27
	Current index: 5 of 27
	Current index: 6 of 2

	Current index: 6 of 19
	Current index: 7 of 19
	Current index: 8 of 19
	Current index: 9 of 19
	Current index: 10 of 19
	Current index: 11 of 19
	Current index: 12 of 19
	Current index: 13 of 19
	Current index: 14 of 19
	Current index: 15 of 19
	Current index: 16 of 19
	Current index: 17 of 19
	Current index: 18 of 19
	Current index: 19 of 19
Current season: 1997
	Current index: 1 of 22
	Current index: 2 of 22
	Current index: 3 of 22
	Current index: 4 of 22
	Current index: 5 of 22
	Current index: 6 of 22
	Current index: 7 of 22
	Current index: 8 of 22
	Current index: 9 of 22
	Current index: 10 of 22
	Current index: 11 of 22
	Current index: 12 of 22
	Current index: 13 of 22
	Current index: 14 of 22
	Current index: 15 of 22
	Current index: 16 of 22
	Current index: 17 of 22
	Current index: 18 of 22
	Current index: 19 of 22
	Current index: 20 of 22
	Current index: 21 of 22
	Current index: 22 of 22
Current season: 1998
	Current index: 1 of 21
	Current index: 2 of 21
	Current index: 3 of 21

Current season: 2014
	Current index: 1 of 19
	Current index: 2 of 19
	Current index: 3 of 19
	Current index: 4 of 19
	Current index: 5 of 19
	Current index: 6 of 19
	Current index: 7 of 19
	Current index: 8 of 19
	Current index: 9 of 19
	Current index: 10 of 19
	Current index: 11 of 19
	Current index: 12 of 19
	Current index: 13 of 19
	Current index: 14 of 19
	Current index: 15 of 19
	Current index: 16 of 19
	Current index: 17 of 19
	Current index: 18 of 19
	Current index: 19 of 19
Current season: 2015
	Current index: 1 of 14
	Current index: 2 of 14
	Current index: 3 of 14
	Current index: 4 of 14
	Current index: 5 of 14
	Current index: 6 of 14
	Current index: 7 of 14
	Current index: 8 of 14
	Current index: 9 of 14
	Current index: 10 of 14
	Current index: 11 of 14
	Current index: 12 of 14
	Current index: 13 of 14
	Current index: 14 of 14
Current season: 2016
	Current index: 1 of 12
	Current index: 2 of 12
	Current index: 3 of 12
	Current index: 4 of 12
	Current index: 5 of 12
	Current i

In [None]:
# One row per scraped column name; the row index is the dict's keys.
df = pd.DataFrame.from_dict(data=new_data, orient="index")

In [35]:
# Flip rows and columns so each scraped column name becomes a column.
data_frame = df.T

In [37]:
# Write the table to a CSV file in the current working directory.
data_frame.to_csv(path_or_buf="mvp_votings.csv")