In [13]:
import basketball_reference_web_scraper.readers as br
import pandas as pd
import ast
import json

In [15]:
# Scrape the NBA schedule for 2015.
# Judging by the function name and by how the result is parsed in the next cell, the return value
# is a single *string* holding the encoded schedule - it is NOT a Python dict yet.
# A JSON string is just text; it only becomes a dict/list once it has been parsed
# (e.g. with `json.loads`, or with `ast.literal_eval` as done below).
# NOTE(review): presumably this call goes out to the network - TODO confirm before re-running often.
fifteen = br.return_json_encoded_schedule(2015)

In [26]:
# `ast.literal_eval` safely evaluates a string that contains a Python literal (dict, list, str,
# number, ...) and returns the corresponding object. It raises an error if the string is not a
# valid literal, e.g. when brackets are not properly opened and closed.
# NOTE(review): this only works while the scraped string is also a valid Python literal; JSON
# `true` / `false` / `null` would make it fail and `json.loads` would be needed - TODO confirm.
final_fifteen = ast.literal_eval(fifteen)['parsed_event_list']

# `json.dumps` ("dump s", s for string) serialises our data to a JSON string, which we write to file.
# The file is opened with 'w' (overwrite) rather than 'a' (append): in append mode every re-run of
# this cell adds another JSON document to the same file, and reading it back with `json.load`
# then fails with "Extra data" because a file may hold only one JSON document.
with open('fifteen.json', 'w') as _f:
    _f.write(json.dumps(final_fifteen) + '\n')

In [36]:
loaded_fifteen = []
# `json.load` (no trailing "s") parses JSON straight from an open file object,
# whereas `json.loads` would parse it from a string already held in memory.
with open('fifteen.json', 'r') as _f:
    loaded_schedule = json.load(_f)

# `loaded_schedule` is a list, but each entry is still a string (`unicode` on Python 2)
# holding one game, so every entry goes through `ast.literal_eval` to become a real dict.
loaded_fifteen.extend(map(ast.literal_eval, loaded_schedule))

In [50]:
# `pd.DataFrame` accepts a list of dictionaries directly: one row per dict, one column per key.
# `.assign` then swaps the string `start_time` column for proper datetimes parsed by `pd.to_datetime`.
df_fifteen = pd.DataFrame(loaded_fifteen).assign(
    start_time=lambda frame: pd.to_datetime(frame['start_time'])
)
# Preview the first five rows
df_fifteen.head()

Unnamed: 0,home_team_name,home_team_score,start_time,visiting_team_name,visiting_team_score
0,Atlanta Hawks,94,2015-10-28 00:00:00,Detroit Pistons,106
1,Chicago Bulls,97,2015-10-28 00:00:00,Cleveland Cavaliers,95
2,Golden State Warriors,111,2015-10-28 02:30:00,New Orleans Pelicans,95
3,Boston Celtics,112,2015-10-28 23:30:00,Philadelphia 76ers,95
4,Brooklyn Nets,100,2015-10-28 23:30:00,Chicago Bulls,115


In [58]:
# Get the unique dates on the NBA schedule (to get box scores!)
# `df_fifteen['start_time']` is a datetime column, so its `.dt` accessor exposes datetime
# attributes for every element at once.
# `.dt.date` keeps the YYYY-MM-DD date of each element and drops the HH:MM:SS part.
# It is the built-in equivalent of `df_fifteen['start_time'].map(lambda t: t.date())`, where
# `Series.map(FUNCTION)` applies FUNCTION to every element of the Series.
# (Plain Python lists have no `.map` method - use the built-in `map()` or a list comprehension.)
# `.unique()` belongs to pd.Series and returns an array holding each distinct date once.
fifteen_unique_dates = df_fifteen['start_time'].dt.date.unique()
# `print(...)` with parentheses runs on both Python 2 and Python 3;
# the bare `print x` statement is a syntax error on Python 3.
print(fifteen_unique_dates)

[datetime.date(2015, 10, 28) datetime.date(2015, 10, 29)
 datetime.date(2015, 10, 30) datetime.date(2015, 10, 31)
 datetime.date(2015, 11, 1) datetime.date(2015, 11, 2)
 datetime.date(2015, 11, 3) datetime.date(2015, 11, 4)
 datetime.date(2015, 11, 5) datetime.date(2015, 11, 6)
 datetime.date(2015, 11, 7) datetime.date(2015, 11, 8)
 datetime.date(2015, 11, 9) datetime.date(2015, 11, 10)
 datetime.date(2015, 11, 11) datetime.date(2015, 11, 12)
 datetime.date(2015, 11, 13) datetime.date(2015, 11, 14)
 datetime.date(2015, 11, 15) datetime.date(2015, 11, 16)
 datetime.date(2015, 11, 17) datetime.date(2015, 11, 18)
 datetime.date(2015, 11, 19) datetime.date(2015, 11, 20)
 datetime.date(2015, 11, 21) datetime.date(2015, 11, 22)
 datetime.date(2015, 11, 23) datetime.date(2015, 11, 24)
 datetime.date(2015, 11, 25) datetime.date(2015, 11, 26)
 datetime.date(2015, 11, 28) datetime.date(2015, 11, 29)
 datetime.date(2015, 11, 30) datetime.date(2015, 12, 1)
 datetime.date(2015, 12, 2) datetime.date

In [41]:
# `DataFrame.to_csv(FILE_PATH)` saves the DataFrame to disk as a CSV file.
# `index=True` is the default, spelled out here: the row index is written as the first column.
df_fifteen.to_csv('2015_schedule.csv', index=True)

In [43]:
# `pd.read_csv(FILE_PATH)` reads a CSV file back into a DataFrame.
# `index_col=0` tells pandas to use the first column of the CSV as the DataFrame index
# (that column is the index that `to_csv` wrote out when the file was saved).
# `parse_dates=['start_time']` converts the `start_time` column back into datetimes; a CSV only
# stores text, so without it the column would come back as plain strings.
pd.read_csv('2015_schedule.csv', index_col=0, parse_dates=['start_time'])

Unnamed: 0,home_team_name,home_team_score,start_time,visiting_team_name,visiting_team_score
0,Atlanta Hawks,94,2015-10-28 00:00:00+00:00,Detroit Pistons,106
1,Chicago Bulls,97,2015-10-28 00:00:00+00:00,Cleveland Cavaliers,95
2,Golden State Warriors,111,2015-10-28 02:30:00+00:00,New Orleans Pelicans,95
3,Boston Celtics,112,2015-10-28 23:30:00+00:00,Philadelphia 76ers,95
4,Brooklyn Nets,100,2015-10-28 23:30:00+00:00,Chicago Bulls,115
5,Detroit Pistons,92,2015-10-28 23:30:00+00:00,Utah Jazz,87
6,Houston Rockets,85,2015-10-29 00:00:00+00:00,Denver Nuggets,105
7,Los Angeles Lakers,111,2015-10-29 02:30:00+00:00,Minnesota Timberwolves,112
8,Memphis Grizzlies,76,2015-10-29 00:00:00+00:00,Cleveland Cavaliers,106
9,Miami Heat,104,2015-10-28 23:30:00+00:00,Charlotte Hornets,94


In [59]:
# Your tasks:
# 1. Scrape all of the schedules for 2012-present, use pd.concat() to combine them into one DataFrame, and save it as one CSV `schedules.csv`
# 2. You can get a set of unique dates in the NBA schedule as per above
# 3. Use that list of unique dates to get box scores for all players on that day
# 4. Combine all of the box scores for all years into one DataFrame/CSV `box_scores.csv`
#    (they are interchangeable - CSV is for saving to file, DF is for use in pandas)
# 5. Also get the season-level statistics for each player for each year and combine into `season_stats.csv`

In [60]:
# You should end up with 3 CSVs that contain all of the data from 2012-present
# `schedules.csv` contains a list of all the scheduled games
# `box_scores.csv` contains every player's per-game box score performance
# `season_stats.csv` contains every player's per-season performance

In [62]:
# For the machine-learning step:
# 1. We want to predict the likelihood that a team (OKC) wins on a per-game level
# 2. To do this, we generate features for our model that are valid on a per-game level, such as "Average Player 3PT% in Previous Season"
# 3. We can construct a naive model that just calculates the average of all the box-score stats of each player from last season
# 4. This can be our PoC that we complete by Thursday night at the latest

# Call me if you need help assembling the data per above. Let me know when we can start feature generation + ML