In [None]:
# Created by: Anthony ElHabr
# Purpose: Extract scores for every NBA team for a given season from espn.go.com
# Modified from: http://danielfrg.com/blog/2013/04/01/nba-scraping-data/
# Only need to run this script once

# import requests
from urllib2 import urlopen
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, date

# Season to scrape; the scraping cell files games played after July under
# YEAR - 1 and every other game under YEAR.
YEAR = 2012
# One row per team. The scraping cell reads the 'team_abbrv' and
# 'team_url_name' columns from this frame.
teams_df = pd.read_csv("nba-teams.csv", index_col=False)
# Placeholders, as filled in by the scraping cell:
#   {0} lower-cased team abbreviation, {1} YEAR, {2} team_url_name
url_template = "http://espn.go.com/nba/team/schedule/_/name/{0}/year/{1}/{2}"

# teams_df

In [None]:
def _parse_game_date(date_text, season_year):
    """Turn a schedule date such as 'Tue, Dec 27' into an 'MM/DD/YYYY' string.

    The schedule text carries no year, so months after July are assigned to
    ``season_year - 1`` and all other months to ``season_year``.

    If the text cannot be parsed, the date falls back to Feb 29 of
    ``season_year``. Presumably this exists because ``strptime`` assumes the
    year 1900 (not a leap year) when no year is given, so a leap-day game
    fails to parse. If ``season_year`` is not a leap year the fallback itself
    raises ``ValueError`` and the caller skips the row.
    """
    try:
        parsed = datetime.strptime(date_text, '%a, %b %d')
    except ValueError:
        game_day = date(season_year, 2, 29)
    else:
        calendar_year = season_year - 1 if parsed.month > 7 else season_year
        game_day = date(calendar_year, parsed.month, parsed.day)
    return game_day.strftime('%m/%d/%Y')


def _parse_schedule_row(columns, team_abbrv, season_year):
    """Extract one game record from the <td> cells of a schedule table row.

    Returns a 12-tuple in the order: game_id, season_yr, game_date,
    team_schedule, win_flag, wins_to_date, losses_to_date, home_flag,
    home_team, home_score, away_team, away_score.

    Raises whatever lookup error occurs (IndexError, TypeError,
    AttributeError, KeyError, ValueError, ...) when the row is not a game
    row, e.g. a class="colhead" header row. Nothing is returned in that case,
    so the caller never stores a partially parsed game.
    """
    opponent_cell = columns[1]
    result_cell = columns[2]

    parsed_game_id = result_cell.a['href'].split('?id=')[1]
    parsed_date = _parse_game_date(columns[0].text, season_year)

    won = result_cell.span.text == 'W'
    record = columns[3].text.split('-')
    wins, losses = record[0], record[1]

    # The cell reads 'vs' for a home game ('@' when the team is away).
    at_home = opponent_cell.li.text == 'vs'
    opponent_abbrv = opponent_cell.find_all('a')[1]['href'].split('/')[-2]
    if at_home:
        home_abbrv, away_abbrv = team_abbrv, opponent_abbrv
    else:
        home_abbrv, away_abbrv = opponent_abbrv, team_abbrv

    # split(' ') drops a possible trailing 'OT' label.
    scores = result_cell.a.text.split(' ')[0].split('-')
    first_score, second_score = scores[0], scores[1]
    # The first number is treated as the winner's score: the home team gets
    # it when it won at home, or when the scheduled team lost on the road.
    if at_home == won:
        home_points, away_points = first_score, second_score
    else:
        home_points, away_points = second_score, first_score

    return (parsed_game_id, season_year, parsed_date, team_abbrv,
            '1' if won else '0', wins, losses,
            '1' if at_home else '0',
            home_abbrv.upper(), home_points,
            away_abbrv.upper(), away_points)


start_all = datetime.now()

game_id = []
season_yr = []
game_date = []
team_schedule = []
win_flag = []
wins_to_date = []
losses_to_date = []
home_flag = []
home_team = []
home_score = []
away_team = []
away_score = []

# (team_abbrv, repr(error)) for every table row that was not stored, so that
# skipped rows can be inspected instead of disappearing silently.
skipped_rows = []

# Same order as the tuple returned by _parse_schedule_row.
_output_lists = (game_id, season_yr, game_date, team_schedule,
                 win_flag, wins_to_date, losses_to_date, home_flag,
                 home_team, home_score, away_team, away_score)

for index, team in teams_df.iterrows():
    start = datetime.now()

    team_abbrv = team['team_abbrv']
    url = url_template.format(team_abbrv.lower(), YEAR, team['team_url_name'])

    # Close the HTTP response once the page has been parsed.
    response = urlopen(url)
    try:
        soup = BeautifulSoup(response, 'html.parser')
    finally:
        response.close()

    schedule_table = soup.table
    if schedule_table is None:
        raise ValueError("No schedule table found at {0}".format(url))

    # ignore class="stathead" row
    for table_row in schedule_table.find_all('tr')[1:]:
        columns = table_row.find_all('td')

        # Rows that are not games (e.g. class="colhead") fail to parse and
        # are skipped. The row is parsed completely before anything is
        # appended, so all twelve lists always stay the same length.
        try:
            game_record = _parse_schedule_row(columns, team_abbrv, YEAR)
        except Exception as e:
            skipped_rows.append((team_abbrv, repr(e)))
            continue

        for target, value in zip(_output_lists, game_record):
            target.append(value)

    end = datetime.now()
    time_diff = end - start
    print("Finished getting scores for {0} in {1} s".format(team_abbrv, time_diff))

end_all = datetime.now()
time_diff_all = end_all - start_all
print("Finished getting all scores in {0} s".format(time_diff_all))

In [None]:
# Column-oriented results of the scraping cell. The order of this list fixes
# the field order inside each row record built below.
set_of_lists = [
    game_id, season_yr, game_date,
    team_schedule, win_flag,
    wins_to_date, losses_to_date, home_flag,
    home_team, away_team, home_score, away_score,
]

# Pivot the columns into one list per game; game_id determines the row count.
games_db_list = [
    [column[row_number] for column in set_of_lists]
    for row_number in range(len(game_id))
]

# games_db_list

In [None]:
# Column names, listed in the same order as the fields of each row record
# in games_db_list.
df_col_headers = [
    'game_id', 'season_yr', 'game_date',
    'team_schedule', 'win_flag',
    'wins_to_date', 'losses_to_date', 'home_flag',
    'home_team', 'away_team', 'home_score', 'away_score',
]

# One DataFrame row per scraped schedule entry.
scores_df = pd.DataFrame(data=games_db_list, columns=df_col_headers)

# scores_df

In [None]:
# Disabled alternative: the triple-quoted string below is never executed. It
# sketches building a de-duplicated, game_id-indexed frame from a dict.
# It could not run as written: home_team_score / away_team_score are not
# defined in this notebook (the scraping cell names them home_score and
# away_score).
# NOTE(review): the NOTE inside the string presumably means a dict does *not*
# produce the columns in the same order -- confirm before reviving this.
'''
# NOTE: dict does produce df in same order because it uses hashing to reorganize
# the data that it is given
games_dict = {'game_id': game_id,\
              'game_date': game_date,\
              'home_team': home_team,\
              'away_team': away_team,\
              'home_team_score': home_team_score,\
              'away_team_score': away_team_score}

# print len(game_id), len(game_date), len(home_team), len(away_team),\
# len(home_team_score), len(away_team_score)

games_df = pd.DataFrame(games_dict).drop_duplicates(subset='game_id').\
set_index('game_id')

# games_df.head
'''
# Trailing statement so the string above is not the cell's last expression
# (presumably to keep the notebook from echoing it as output).
pass

In [None]:
import os

# Write the season's scores to CSV, but never overwrite an existing file:
# a previous export has to be deleted by hand before a new one is created.
f = 'nba-season-scores-{0}.csv'.format(YEAR)
if os.path.isfile(f):
    print("Need to delete existing {0} before creating new one".format(f))
else:
    scores_df.to_csv(f, index=False)
    print("Creating new file '{0}'".format(f))