In [1]:
# Created by: Anthony ElHabr
# Purpose: Extract NBA betting lines from espn.go.com

from urllib2 import urlopen
import pandas as pd
from bs4 import BeautifulSoup, SoupStrainer
import re

TEAMS = ['ATL', 'BOS', 'BKN', 'CHA', 'CHI', 'CLE', 'DAL', 'DEN',\
        'DET', 'GSW', 'HOU', 'IND', 'LAC', 'LAL', 'MEM', 'MIA',\
        'MIL', 'MIN', 'NOP', 'NYK', 'OKC', 'ORL', 'PHI', 'PHX',\
        'POR', 'SAC', 'SAS', 'TOR', 'UTA', 'WAS']
sportsbooks = ['Westage', 'PinnacleSports.com', '5Dimes.eu', 'BOVADA.lv'\
               'BETONLINE.ag', 'SportsBetting.ag']

url = "http://www.espn.go.com/nba/lines"
html = urlopen(url)
soup = BeautifulSoup(html, 'html.parser')

all_rows = soup.find_all('tr')

'''
# alternate method
html_page = urlopen(url).read()
only_tr_tags = SoupStrainer('tr')
soup_2 = BeautifulSoup(html_page, 'html.parser', parse_only=only_tr_tags)
'''


"\n# alternate method\nhtml_page = urlopen(url).read()\nonly_tr_tags = SoupStrainer('tr')\nsoup_2 = BeautifulSoup(html_page, 'html.parser', parse_only=only_tr_tags)\n"

In [2]:
class MatchupNBA:
    'Class for NBA matchup'
    
    def __init__(self, book_number='N/A'):
        self.book_number = book_number
    
    def append_data(self, book_number, away_team, home_team,\
                    home_spread, away_spread_payout, home_spread_payout,\
                    point_total, over_payout, under_payout,\
                    away_money_line, home_money_line):
        # don't need to save away_spread because it is the opposite of
        # home_spread
        pass


In [3]:
matchup_date = soup.find('h1').text
date = matchup_date[re.search(', ', matchup_date).end():]
date

u'March 7'

In [4]:
all_data = []

for row in range(len(all_rows)):  
    for col in all_rows[row].find_all('td', recursive=False):   
        all_data.append(col.text)

all_data

[u'Minnesota at Charlotte, 7:00 PM ET (502)',
 u'\xa0',
 u'SPREAD',
 u'TOTAL',
 u'MONEY LINE',
 u'Westgate',
 u'+9.5-9.5MIN: -110CHA: -110',
 u'212o: -110u: -110',
 u'MIN: +405CHA: -525',
 u'+9.5-9.5',
 u'MIN: -110CHA: -110',
 u'212',
 u'o: -110u: -110',
 u'PinnacleSports.com',
 u'+9.5-9.5MIN: -105CHA: -105',
 u'212.5o: -106u: -104',
 u'MIN: +420CHA: -495',
 u'+9.5-9.5',
 u'MIN: -105CHA: -105',
 u'212.5',
 u'o: -106u: -104',
 u'5Dimes.eu',
 u'+9.5-9.5MIN: -110CHA: -110',
 u'212.5o: -110u: -110',
 u'MIN: +415CHA: -525',
 u'+9.5-9.5',
 u'MIN: -110CHA: -110',
 u'212.5',
 u'o: -110u: -110',
 u'BOVADA.lv',
 u'+9.5-9.5MIN: -115CHA: -105',
 u'N/A',
 u'MIN: 0CHA: 0',
 u'+9.5-9.5',
 u'MIN: -115CHA: -105',
 u'BETONLINE.ag',
 u'+9.5-9.5MIN: -110CHA: -110',
 u'212.5o: -110u: -110',
 u'MIN: +405CHA: -500',
 u'+9.5-9.5',
 u'MIN: -110CHA: -110',
 u'212.5',
 u'o: -110u: -110',
 u'SportsBetting.ag',
 u'+9.5-9.5MIN: -110CHA: -110',
 u'212.5o: -110u: -110',
 u'MIN: +405CHA: -500',
 u'+9.5-9.5',
 u'MIN: -

In [5]:
rotation_number = []
away_team_city = []
home_team_city = []
away_team_abbrv = []
home_team_abbrv = []
sportsbook_name = []
away_point_spread = []
home_point_spread = []
away_spread_payout = []
home_spread_payout = []
point_total = []
over_payout = []
under_payout = []
away_moneyline = []
home_moneyline = []

'''
# Example of single matchup in data set
[u'Oklahoma City at Milwaukee, 3:30 PM ET (824)', # 0 - rotation_number and
                                                  #     away/home_team city
 u'\xa0', # 1
 u'SPREAD', # 2
 u'TOTAL', # 3
 u'MONEY LINE', # 4
 u'Westgate', # 5 - sportsbook_name
 u'-8+8OKC: -105MIL: -115', # 6 - away/home_point_spread, away/home_team_abbrv,
                            #     and away/home_spread_payout
 u'218.5o: -110u: -110', # 7 - point_total and over/under_payout
 u'OKC: -360MIL: +295', # 8 - away/home_team_abbrv and away/home_moneyline
 u'-8+8', # 9 - away/home_point_spread
 u'OKC: -105MIL: -115', # 10 - away/home_team_abbrv and away/home_spread_payout
 u'218.5', # 11 - point_total
 u'o: -110u: -110', # 12 - over/under_payout
 u'PinnacleSports.com',
 u'-7.5+7.5OKC: -107MIL: -103',
 ... # etc.
 u'o: -110u: -110',
 u'Preview \xbb PickCenter  \xbb ',
 u'Golden State at LA Lakers, 3:30 PM ET (822)',
 u'\xa0',
'''

# Exceptions:
# 1) Spread table may say 'EVEN', so away/home spreads and away/home spread
# payouts are not shown
# 2) Total table may say 'N/A', so point total and over/under payouts
# are not shown
# 3) When Total table says, 'N/A', Money Line table will say '0' for both teams

# print len(all_data)

# 54 = 6 sportsbook_names (i.e. u'Westage', u'PinnacleSports.com', u'5Dimes.eu',
#                               u'BOVADA.lv', u'BETONLINE.ag', 'SportsBetting.ag')
#      * 8 data points per sportsbook
#      + 4 col headers (i.e.  u'\xa0', u'SPREAD', u'TOTAL', u'MONEY LINE')
#      + 1 matchup header (i.e. u'Oklahoma City at Milwaukee, 3:30 PM ET (824)')
#      + 1 "preview" line (i.e.  u'Preview \xbb PickCenter  \xbb ')
# for i in range(0, len(all_data), 54):
for i in range(len(all_data)):
    # use rotation_number as a pseudo index to find the rest of the data
    if re.search('\)', all_data[i][-1]):
        # print i
        # get all of the data associated with a single matchup
        # realizing that data is distributed in the same pattern
        # for each of 6 sportsbooks
        j = 0
        while j < len(sportsbooks):
            # use rotation_number as pseudo index
            # example of line to parse:
            # u'Oklahoma City at Milwaukee, 3:30 PM ET (824)'
            rotation_number.\
            append(all_data[i]\
                   [re.search('\(', all_data[i]).start():
                    ])
 
            # example of line to parse:
            # u'Oklahoma City at Milwaukee, 3:30 PM ET (824)'
            away_team_city.\
            append(all_data[i]\
                   [:\
                    re.search(' at', all_data[i]).start()])
 
            # example of line to parse:
            # u'Oklahoma City at Milwaukee, 3:30 PM ET (824)'
            home_team_city.\
            append(all_data[i]\
                   [re.search(' at ', all_data[i]).end():\
                    re.search(', ', all_data[i]).start()])
            
            # example of line to parse:
            # u'OKC: -105MIL: -115'
            away_team_abbrv.\
            append(all_data[i+j*8+10]\
                   [:\
                    re.search('[A-Z]: ', all_data[i+j*8+10]).start()+1])
            
            # example of line to parse:
            # u'OKC: -105MIL: -115'
            home_team_abbrv.\
            append(all_data[i+j*8+10]\
                   [re.search('[0-9][A-Z]', all_data[i+j*8+10]).start()+1:\
                    re.search('[0-9][A-Z]+:', all_data[i+j*8+10]).end()-1])
            
            # example of line to parse:
            # u'Westgate'
            sportsbook_name.append(all_data[i+j*8+5])
            
            # example of line to parse:
            # u'-8+8'
            # TODO: account for exception #1
            away_point_spread.\
            append(all_data[i+j*8+9]\
                   [:\
                    re.search('[0-9][+-]', all_data[i+j*8+9]).start()+1])
            
            # example of line to parse:
            # u'-8+8'
            # direct way to store home_spread
            # TODO: account for exception #1
            home_point_spread.\
            append(all_data[i+j*8+9]\
                   [re.search('[0-9][+-][0-9]', all_data[i+j*8+9]).start()+1:\
                    ])
            
            '''
            # indirect way to store home_spread
            if all_data[i+j*8+9][0]=='-':\
                home_point_spread.\
                append(\'+'+(all_data[i+j*8+9]\
                             [1:\
                              re.search('[0-9][+-]', all_data[i+j*8+9]).start()+1]))
            elif all_data[i][0]=='+':\
                home_point_spread.\
                append('-'+(all_data[i+j*8+9]\
                            [1:\
                             re.search('[0-9][+-]', all_data[i+j*8+9]).start()+1]))
            '''
            
            # example of line to parse: u'OKC: -105MIL: -115'
            away_spread_payout.\
            append(all_data[i+j*8+10]\
                  [re.search('[A-Z]+: ', all_data[i+j*8+10]).end():\
                   re.search('[0-9][A-Z]+:', all_data[i+j*8+10]).start()+1])
            
            # example of line to parse: u'OKC: -105MIL: -115'
            home_spread_payout.\
            append(all_data[j*8+10]\
                  [re.search('[0-9][A-Z]+: ', all_data[i+j*8+10]).end():\
                   ])
            
            # example of line to parse: u'218.5'
            # TODO: account for exception #2
            point_total.append(all_data[i+j*8+11])
            
            # example of line to parse: u'o: -110u: -110'
            # TODO: account for exception #2
            over_payout.\
            append(all_data[i+j*8+12]\
                  [re.search('o: ', all_data[i+j*8+12]).end():\
                   re.search('u', all_data[i+j*8+12]).start()])
            
            # example of line to parse: u'o: -110u: -110'
            # TODO: account for exception #2
            under_payout.\
            append(all_data[i+j*8+12]\
                  [re.search('u: ', all_data[i+j*8+12]).end():\
                   ])
            
            # example of line to parse: u'OKC: -360MIL: +295'
            # NOTE: away/home_moneyline may not be preceded by [+-]
            # (e.g. when away/home_moneyline is 0)
            away_moneyline.\
            append(all_data[i+j*8+8]\
                  [re.search('[A-Z]+: ', all_data[i+j*8+8]).end():\
                   re.search('[0-9][A-Z]+: ', all_data[i+j*8+8]).start()+1])
            
            # example of line to parse: u'OKC: -360MIL: +295'
            home_moneyline.\
            append(all_data[i+j*8+8]\
                  [re.search('[0-9][A-Z]+: ', all_data[i+j*8+8]).end():\
                   ])
            
            j = j + 1

all_matchup_data = []
            
for i in range(len(rotation_number)):
    # print rotation_number[i], away_team_city[i], home_team_city[i],
    print sportsbook_name[i],\
    away_team_abbrv[i], home_team_abbrv[i],\
    away_point_spread[i], home_point_spread[i],\
    away_spread_payout[i], home_spread_payout[i],\
    point_total[i], over_payout[i], under_payout[i],\
    away_moneyline[i], home_moneyline[i]
print

# TODO: put all matchup data into a single list
# TODO: put all matchup data into a single dict with
#       matchup_date-rotation_number-sportsbook_name as the key

AttributeError: 'NoneType' object has no attribute 'end'

In [6]:
all_data

[u'Minnesota at Charlotte, 7:00 PM ET (502)',
 u'\xa0',
 u'SPREAD',
 u'TOTAL',
 u'MONEY LINE',
 u'Westgate',
 u'+9.5-9.5MIN: -110CHA: -110',
 u'212o: -110u: -110',
 u'MIN: +405CHA: -525',
 u'+9.5-9.5',
 u'MIN: -110CHA: -110',
 u'212',
 u'o: -110u: -110',
 u'PinnacleSports.com',
 u'+9.5-9.5MIN: -105CHA: -105',
 u'212.5o: -106u: -104',
 u'MIN: +420CHA: -495',
 u'+9.5-9.5',
 u'MIN: -105CHA: -105',
 u'212.5',
 u'o: -106u: -104',
 u'5Dimes.eu',
 u'+9.5-9.5MIN: -110CHA: -110',
 u'212.5o: -110u: -110',
 u'MIN: +415CHA: -525',
 u'+9.5-9.5',
 u'MIN: -110CHA: -110',
 u'212.5',
 u'o: -110u: -110',
 u'BOVADA.lv',
 u'+9.5-9.5MIN: -115CHA: -105',
 u'N/A',
 u'MIN: 0CHA: 0',
 u'+9.5-9.5',
 u'MIN: -115CHA: -105',
 u'BETONLINE.ag',
 u'+9.5-9.5MIN: -110CHA: -110',
 u'212.5o: -110u: -110',
 u'MIN: +405CHA: -500',
 u'+9.5-9.5',
 u'MIN: -110CHA: -110',
 u'212.5',
 u'o: -110u: -110',
 u'SportsBetting.ag',
 u'+9.5-9.5MIN: -110CHA: -110',
 u'212.5o: -110u: -110',
 u'MIN: +405CHA: -500',
 u'+9.5-9.5',
 u'MIN: -