In [1]:
# Created by: Anthony ElHabr
# Purpose: Extract scores for every NBA team for a given season (including current season) from espn.go.com
# Modified from: http://danielfrg.com/blog/2013/04/01/nba-scraping-data/

# import requests
from urllib2 import urlopen
import pandas as pd
from bs4 import BeautifulSoup
import re
import os
from collections import OrderedDict
from datetime import datetime, date
import csv


# Season to scrape. The scraping cell assigns months after July to `year - 1`,
# so this is the calendar year in which the season ends.
year = 2016

# Sportsbook names. Not referenced by the cells visible here -- presumably
# intended for the betting-line TODO items; confirm before removing.
sportsbooks = ['Westage', 'PinnacleSports.com', '5Dimes.eu',
               'BOVADA.lv', 'BETONLINE.ag', 'SportsBetting.ag']

# One row per team; the scraping cell reads the 'team_abbrv' and
# 'team_url_name' columns. The path is built with os.path.join so it resolves
# on every OS instead of only where '\\' is the path separator.
teams_df = pd.read_csv(os.path.join("csvs", "nba-teams.csv"), index_col=False)
# betting_df = pd.read_csv("nba-betting-lines.csv", index_col=False)

# Placeholders: {0} lower-case team abbreviation, {1} season year,
# {2} team URL name.
url_template = "http://espn.go.com/nba/team/schedule/_/name/{0}/year/{1}/{2}"

# TODO:
# 1) find games in betting_df without scores recorded
# 2) record scores
# 3) calculate betting line results

In [13]:
def _parse_game_date(date_raw, season_year):
    """Turn the schedule's date text (e.g. 'Tue, Oct 27') into a datetime.date.

    The page omits the year, so it is derived from `season_year`: months after
    July belong to the calendar year in which the season started
    (`season_year - 1`); every other month belongs to `season_year`.
    """
    try:
        parsed = datetime.strptime(date_raw, '%a, %b %d')
        month, day = parsed.month, parsed.day
    except ValueError:
        # With no year in the format, strptime assumes 1900, which has no
        # Feb 29, so a leap-day game fails to parse. As in the original cell,
        # an unparseable date is treated as Feb 29 of the season year.
        print("Leap day")
        month, day = 2, 29

    calendar_year = season_year - 1 if month > 7 else season_year
    return date(calendar_year, month, day)


def _parse_schedule_row(cols, team_abbrv, season_year):
    """Build one record (dict keyed by the `sched_dict` column names) from a row.

    cols        -- the <td> cells of one schedule <tr>
    team_abbrv  -- abbreviation of the team whose schedule page is being read
    season_year -- season identifier stored in 'season_yr' and used for dates

    Games without a final result (no recap link, missing record, ...) get
    'N/A' in every result column. All result values are computed before any is
    stored, so a failure part-way through can never leave a partial record.
    """
    game_date = _parse_game_date(cols[0].text, season_year).strftime('%m/%d/%Y')

    # The location marker is 'vs' for a home game ('@' when the team is away).
    home_bool = cols[1].li.text == 'vs'
    # The opponent abbreviation is the second-to-last path segment of the
    # second link in the opponent cell.
    other_team_abbrv = cols[1].find_all('a')[1]['href'].split('/')[-2]
    if home_bool:
        home_team, away_team = team_abbrv, other_team_abbrv
    else:
        home_team, away_team = other_team_abbrv, team_abbrv

    record = {
        'season_yr': season_year,
        'game_date': game_date,
        'team_schedule': team_abbrv,
        'home_flag': '1' if home_bool else '0',
        'home_team': home_team.upper(),
        'away_team': away_team.upper(),
    }

    try:
        game_id = cols[2].a['href'].split('recap?id=')[1]
        win_bool = cols[2].span.text == 'W'
        wl_record = cols[3].text.split('-')
        wins_to_date, losses_to_date = wl_record[0], wl_record[1]

        # split(' ') is to ignore a possible 'OT' label after the score.
        both_scores = cols[2].a.text.split(' ')[0].split('-')
        # The score text lists the winner's points first, so map it onto
        # home/away using both the result and the venue. (The original gave a
        # home loss the same mapping as a home win, swapping the two scores.)
        winner_score, loser_score = both_scores[0], both_scores[1]
        if win_bool:
            team_score, other_score = winner_score, loser_score
        else:
            team_score, other_score = loser_score, winner_score
        if home_bool:
            home_score, away_score = team_score, other_score
        else:
            home_score, away_score = other_score, team_score

        record.update({
            'game_id': game_id,
            'win_flag': '1' if win_bool else '0',
            'wins_to_date': wins_to_date,
            'losses_to_date': losses_to_date,
            'home_score': home_score,
            'away_score': away_score,
        })
    except Exception as error:
        # Best effort: report which game could not be read and fill in
        # placeholders so every column still receives exactly one value.
        print(game_date)
        print(error)
        default_val = 'N/A'
        record.update({
            'game_id': default_val,
            'win_flag': default_val,
            'wins_to_date': default_val,
            'losses_to_date': default_val,
            'home_score': default_val,
            'away_score': default_val,
        })

    return record


sched_dict_keys = ['game_id', 'season_yr', 'game_date', 'team_schedule', 'win_flag', 'wins_to_date', 'losses_to_date', 'home_flag', 'home_team', 'home_score', 'away_team', 'away_score']
# Created once, before the team loop, so the games of every team accumulate.
# (Re-creating it per team kept only the last team's schedule.)
sched_dict = OrderedDict((key, []) for key in sched_dict_keys)

start_all = datetime.now()
for _, team_row in teams_df.iterrows():
    start = datetime.now()

    team_abbrv = team_row['team_abbrv']
    url = url_template.format(
        team_abbrv.lower(), year, team_row['team_url_name'])

    # Close the HTTP response explicitly once the page has been parsed.
    response = urlopen(url)
    try:
        soup = BeautifulSoup(response, 'html.parser')
    finally:
        response.close()

    # Game rows are the <tr> elements whose class contains 'row'
    # (e.g. 'oddrow' / 'evenrow').
    sched_data = soup.find_all(name='tr', attrs={'class': re.compile('row')})

    for sched_row in sched_data:
        record = _parse_schedule_row(sched_row.find_all('td'), team_abbrv, year)
        for key in sched_dict_keys:
            sched_dict[key].append(record[key])

    end = datetime.now()
    time_diff = end - start
    print("Finished getting scores for {0} in {1} s".format(team_abbrv, time_diff))

end_all = datetime.now()
time_diff_all = end_all - start_all
print("Finished getting all scores for {0} in {1} s".format(year, time_diff_all))

03/17/2016
list index out of range
03/19/2016
'NoneType' object has no attribute '__getitem__'
03/21/2016
'NoneType' object has no attribute '__getitem__'
03/23/2016
'NoneType' object has no attribute '__getitem__'
03/25/2016
'NoneType' object has no attribute '__getitem__'
03/26/2016
'NoneType' object has no attribute '__getitem__'
03/28/2016
'NoneType' object has no attribute '__getitem__'
03/30/2016
'NoneType' object has no attribute '__getitem__'
04/01/2016
'NoneType' object has no attribute '__getitem__'
04/05/2016
'NoneType' object has no attribute '__getitem__'
04/07/2016
'NoneType' object has no attribute '__getitem__'
04/09/2016
'NoneType' object has no attribute '__getitem__'
04/11/2016
'NoneType' object has no attribute '__getitem__'
04/13/2016
'NoneType' object has no attribute '__getitem__'
Finished getting scores for ATL in 0:00:01.263000 s
Finished getting all scores for 2016 in 0:00:01.267000 s


In [15]:
# Sanity check: print each column name with the number of values collected
# for it, so columns of unequal length are easy to spot before the table is
# built.
sched_dict_items = sched_dict.items()

for column_name, column_values in sched_dict_items:
    print("{0} {1}".format(column_name, len(column_values)))

game_id 82
season_yr 82
game_date 82


In [17]:
# Assemble the per-column lists into a single table (one row per scraped game).
scores_df = pd.DataFrame(sched_dict)

# Build the output path with os.path.join so it resolves on every OS; a
# hard-coded '\\' is only a directory separator on Windows.
scores_f = os.path.join('csvs', 'nba-season-scores-{0}.csv'.format(year))

# Never overwrite an existing export: the file must be removed by hand first.
if os.path.isfile(scores_f):
    print("Failed to log data to existing {0}".format(scores_f))
    print("Please delete the existing file.")
else:
    scores_df.to_csv(scores_f, index=False)
    print("Creating new file '{0}'".format(scores_f))

Creating new file 'nba-season-scores-2016.csv'
