# Extract player info

In [None]:
from bs4 import BeautifulSoup
import urllib.request as url
import re

def find_player_id_per_year(years):
    """Scrape the ESPN scoring-per-game listing for every year in *years*.

    For each year the first listing page plus the pages at offsets
    41, 81, ..., 481 (the ``/count/<offset>`` URL suffix) are fetched, and
    every anchor whose href starts with the ESPN player-profile prefix is
    recorded.

    Args:
        years: iterable of integer years (e.g. ``[2018]``).

    Returns:
        dict keyed by the season label ``"<year-1>-<year>"``; each value maps
        the last path segment of a profile link to the segment before it
        (``read_players`` uses them as URL name and player id respectively).

    Raises:
        Whatever ``urllib.request.urlopen`` raises on a failed request.
    """
    # Compiled once instead of once per fetched page.
    profile_link = re.compile("^http://www.espn.com/nba/player/_/id")
    players_season = {}
    for year in years:
        players = {}
        url_year = "http://www.espn.com/nba/statistics/player/_/stat/scoring-per-game/sort/avgPoints/year/" + str(year) + "/qualified/false"
        page_urls = [url_year]
        page_urls.extend(url_year + "/count/" + str(offset) for offset in range(41, 482, 40))
        for url_page in page_urls:
            # Close the HTTP response as soon as the markup has been parsed.
            # NOTE(review): no parser is named, so BeautifulSoup picks whichever
            # one is installed, exactly as before.
            with url.urlopen(url_page) as html_page:
                soup = BeautifulSoup(html_page)
            for link in soup.find_all('a', attrs={'href': profile_link}):
                segments = link.get('href').split('/')
                players[segments[-1]] = segments[-2]
        players_season[str(year-1) + "-" + str(year)] = players
    return players_season

In [None]:
import pandas
def read_players(player_id_years, years):
    """Download the ESPN game-log tables for every known player and year.

    Args:
        player_id_years: mapping ``"<year-1>-<year>"`` -> {player name
            segment: player id}, as produced by ``find_player_id_per_year``.
        years: iterable of integer years to fetch.

    Returns:
        dict mapping player name segment -> {season label: list of
        DataFrames returned by ``pandas.read_html``}. Players whose page
        answers with an HTTP error for a year get no entry for that year.
    """
    gamelog_root = "http://www.espn.com/nba/player/gamelog/_/id/"
    player_info = {}
    for year in years:
        season = str(year-1)+"-"+str(year)
        season_players = player_id_years[season]
        for player in season_players:
            url_path = gamelog_root + season_players[player] + "/year/" + str(year) + "/" + player
            try:
                tables = pandas.read_html(url_path)
            except url.HTTPError:
                # Best effort: a player page that errors out is simply skipped.
                continue
            player_info.setdefault(player, {})[season] = tables
    return player_info
# http://www.espn.com/nba/player/stats/_/id/2994526/bryn-forbes

In [None]:
# Scrape player ids and game logs for 2018; kept as `test_player_info`.
player_id_years = find_player_id_per_year([2018])
test_player_info = read_players(player_id_years,[2018])
# Same scrape for 2013-2017; `player_id_years` is rebound, so the 2018 ids
# are no longer reachable through that name after this line.
player_id_years = find_player_id_per_year([2013,2014,2015,2016,2017])
train_player_info = read_players(player_id_years, [2013,2014,2015,2016,2017])

In [None]:
import pickle 
# Persist the raw scraped player tables so later cells can reload them
# without scraping again. The `player_data/` directory must already exist.
with open('player_data/test_player_info.pkl', 'wb') as fp:
    pickle.dump(test_player_info, fp)
with open('player_data/train_player_info.pkl', 'wb') as fp:
    pickle.dump(train_player_info, fp)

In [None]:
def player_data_processing(data):
    """Extract the 14 regular-season stat values for each player and season.

    For every ``data[player][year]`` the second scraped table (index 1) is
    searched, in its column 0, for the row labelled ``'REGULAR SEASON STATS'``.
    Columns 1..14 of the row directly below it are parsed into numbers; for a
    value containing ``'-'`` only the part after the last ``'-'`` is kept.

    Args:
        data: mapping player -> {season: list of DataFrames}, as built by
            ``read_players``.

    Returns:
        dict mapping player -> {season: list of 14 numbers}. Seasons whose
        table has fewer than 4 rows, lacks the marker row, or has the marker
        as its last row are skipped.

    Raises:
        ValueError / SyntaxError: when a cell is not a plain numeric literal.
    """
    # Local imports: the visible notebook never imports numpy, which made the
    # original body fail with NameError on `np`.
    import ast
    import numpy as np

    marker = 'REGULAR SEASON STATS'
    X = {}
    for player in data:
        for year in data[player]:
            frame = data[player][year][1]
            table = np.array(frame[0])
            if len(table) < 4:
                continue
            # flatnonzero copes with zero, one or several marker rows; the
            # first hit is used.
            hits = np.flatnonzero(table == marker)
            if hits.size == 0:
                continue
            index = int(hits[0])
            if index + 1 >= len(table):
                # Marker is the last row: there is no stats row to read.
                continue
            stat_row = frame[index+1:index+2]
            feature = []
            for i in range(1, 15):
                # The value is taken from the one-row Series' text rendering:
                # first line, last space-separated token.
                x = str(stat_row[i]).split('\n')[0].split(' ')[-1]
                if '-' in x:
                    x = x.split('-')[-1]
                # Scraped text is untrusted: literal_eval parses numeric
                # literals like eval did but cannot execute code.
                feature.append(ast.literal_eval(x))
            X.setdefault(player, {})[year] = feature
    return X

In [None]:
# Reload the raw player tables from the pickles written by the earlier cell.
with open('player_data/test_player_info.pkl', 'rb') as fp:
    test_player_info = pickle.load(fp)
with open('player_data/train_player_info.pkl', 'rb') as fp:
    train_player_info = pickle.load(fp)
    
# Reduce each player/season to its list of regular-season stat values.
test_processed_player_data = player_data_processing(test_player_info)
train_processed_player_data = player_data_processing(train_player_info)

# Persist the processed feature dictionaries next to the raw ones.
with open('player_data/test_processed_player_data.pkl', 'wb') as fp:
    pickle.dump(test_processed_player_data, fp)
with open('player_data/train_processed_player_data.pkl', 'wb') as fp:
    pickle.dump(train_processed_player_data, fp)

In [None]:
# Visual check of the processed 2013-2017 player features.
print(train_processed_player_data)

In [None]:
# Visual check of the processed 2018 player features.
print(test_processed_player_data)

# Extract team info

In [None]:
def find_teams_url(team_url):
    """Return the set of team-page links found on the page at *team_url*.

    Args:
        team_url: URL of a page listing the teams.

    Returns:
        set of href strings that start with
        ``http://www.espn.com/nba/team/_/name/``.

    Raises:
        Whatever ``urllib.request.urlopen`` raises on a failed request.
    """
    # Close the HTTP response once the markup has been parsed.
    with url.urlopen(team_url) as html_page:
        soup = BeautifulSoup(html_page)
    team_link = re.compile("^http://www.espn.com/nba/team/_/name/")
    # A set comprehension replaces the list that used to shadow the parameter.
    return {link.get('href') for link in soup.find_all('a', attrs={'href': team_link})}


def read_teams(team_url, years):
    """Scrape each team's regular-season schedule table for the given years.

    For every team link the schedule page of each year is read with
    ``pandas.read_html``. From the third table on that page (index 2), rows
    from label 2 onward and columns 0..2 are kept. The year is appended to
    the text in column 0: *year* when its second space-separated token is one
    of Jan..Jul, otherwise *year - 1*. Column 3 receives the season label
    ``"<year-1>-<year>"``.

    Args:
        team_url: iterable of team links as returned by ``find_teams_url``.
        years: iterable of integer years.

    Returns:
        dict mapping team name (the hyphenated last URL segment minus its
        final word, joined with spaces) to one DataFrame holding all fetched
        seasons with a fresh 0..n-1 index. Years answering with an HTTP error
        are skipped, and teams with no usable year are omitted.
    """
    first_half_months = ["Jan","Feb","Mar","Apr","May","Jun","Jul"]
    team_stat = {}
    for url_path in team_url:
        url_component = url_path.split('/_/')
        stat_url = url_component[0] + '/schedule/_/' + 'name/' + url_component[1].split('/')[1] + '/season/'
        frames = []
        for year in years:
            season_url = stat_url + str(year) + '/seasontype/2'
            try:
                tables = pandas.read_html(season_url)
            except url.HTTPError:
                # Best effort: the year is skipped instead of raising KeyError
                # later when its tables are looked up.
                tables = None
            print(season_url)
            if tables is None:
                continue
            # Work on an explicit copy: assigning through a slice of the
            # scraped table is unreliable and a no-op under copy-on-write.
            schedule = tables[2].loc[2:, 0:2].copy()
            before_half = schedule[0].str.split(" ", expand=True)[1].isin(first_half_months)
            year_suffix = before_half.map({True: str(year), False: str(year - 1)})
            schedule[0] = schedule[0] + ', ' + year_suffix
            schedule[3] = str(year - 1) + "-" + str(year)
            frames.append(schedule)
        if not frames:
            # pandas.concat raises ValueError on an empty list.
            continue
        result = pandas.concat(frames, ignore_index=True)

        team_name_component = url_component[1][5:].split('/')[1].split('-')[:-1]
        team_name = ' '.join(team_name_component)

        team_stat[team_name] = result
    return team_stat

def read_teams_total_and_players_per_year(team_url, years):
    """Scrape each team's stats page and keep its summary rows and player column.

    Args:
        team_url: iterable of team links as returned by ``find_teams_url``.
        years: iterable of integer years.

    Returns:
        dict mapping team name -> {``"<year-1>-<year>"``: (summary, players)}.
        ``summary`` stacks the last row of the page's first table on the last
        row of its second table. ``players`` is column 0 of the first table
        from its third row onward. Both are re-indexed from 0. Years
        answering with an HTTP error are left out.
    """
    team_total = {}
    for url_path in team_url:
        url_component = url_path.split('/_/')
        stat_url = url_component[0] + '/stats/_/' + 'name/' + url_component[1].split('/')[1] + '/year/'
        year_info = {}
        for year in years:
            page_url = stat_url + str(year)
            try:
                tables = pandas.read_html(page_url)
                first_table = tables[0]
                second_table = tables[1]
                last_rows = [first_table[len(first_table)-1:], second_table[len(second_table)-1:]]
                team_total_info = pandas.concat(last_rows).reset_index(drop=True)
                team_players = first_table[2:][0].reset_index(drop=True)
                season = str(year-1) + "-" + str(year)
                year_info[season] = (team_total_info, team_players)
            except url.HTTPError:
                pass
            # Progress trace, printed whether or not the fetch succeeded.
            print(page_url)

        slug_words = url_component[1][5:].split('/')[1].split('-')[:-1]
        team_total[' '.join(slug_words)] = year_info
    return team_total

In [None]:
# Collect the set of team page links from the ESPN teams index.
team_url = find_teams_url("http://www.espn.com/nba/teams")
# Schedule scrape, currently disabled. NOTE(review): the later cell that calls
# data_processing(train) / data_processing(test) needs these two names to exist.
# test = read_teams(team_url, [2018])
# train = read_teams(team_url, [2013,2014,2015,2016,2017])

In [None]:
# from datetime import datetime
def data_processing(test):
    """Build one feature vector per game from each team's schedule table.

    The first five games of every team only serve as history, so feature
    vectors start at row 5.

    Args:
        test: mapping team -> table indexable as ``table[column][row]`` with
            column 1 = opponent text (starting with ``'@'`` or a two-character
            prefix such as ``'vs'``), column 2 = result text starting with
            ``'L'`` for a loss, column 3 = season label (the layout produced
            by ``read_teams``).

    Returns:
        dict mapping team -> list of ``(info, outcome, season)`` tuples where
        ``info`` is ``[opponent, r1, r2, r3, r4, r5, home]``:
        opponent is lower-cased with the prefix removed; ``r1..r5`` are
        ``'1'``/``'0'`` for win/loss in the five games before this one, most
        recent first; ``home`` is ``'1'`` when the opponent text starts with
        ``'v'``. ``outcome`` is ``'0'`` for a loss, otherwise ``'1'``. Teams
        with fewer than six rows do not appear.
    """
    game_info = {}
    for team in test:
        schedule = test[team]
        for i in range(5, len(schedule[1])):
            matchup = schedule[1][i].strip()
            opp_info = matchup.lower()
            # '@' marks an away game; otherwise a two-character prefix is dropped.
            if opp_info[0] == '@':
                opp = opp_info[1:].strip()
            else:
                opp = opp_info[2:].strip()
            info = [opp]
            # Results of the five games before game i. The previous code
            # indexed 5-j, i.e. always games 4..0 regardless of i.
            for j in range(1, 6):
                info.append('0' if schedule[2][i - j][0] == 'L' else '1')
            # Home ('1') or away ('0') for the current game.
            info.append('1' if matchup[0] == 'v' else '0')
            outcome = '0' if schedule[2][i][0] == 'L' else '1'
            game_info.setdefault(team, []).append((info, outcome, schedule[3][i]))
    return game_info

In [None]:
def total_and_players_to_dict(data):
    """Flatten the scraped per-season team tables into plain lists.

    Args:
        data: mapping team -> {season: (totals, players)} as produced by
            ``read_teams_total_and_players_per_year``. ``totals`` is indexed
            as ``totals[column][row]``; ``players`` is a sequence of strings.

    Returns:
        ``(total_dict, team_players)``:
        ``total_dict[team][season]`` is a list holding ``totals[1][0]`` as
        is, then the string form of columns 4..13 of row 0, then the string
        form of columns 1..14 of row 1.
        ``team_players[team][season]`` is each player string cut before its
        last comma; strings without a comma are kept whole.
    """
    total_dict = {}
    team_players = {}
    for team in data:
        total_dict[team] = {}
        team_players[team] = {}
        for season in data[team]:
            totals = data[team][season][0]
            roster = data[team][season][1]
            # Team totals first.
            lst = [totals[1][0]]
            lst.extend(str(totals[col][0]) for col in range(4, 14))
            lst.extend(str(totals[col][1]) for col in range(1, 15))
            total_dict[team][season] = lst
            # Then the roster. rfind returns -1 when there is no comma, which
            # used to chop the last character off such entries.
            team_players[team][season] = [
                entry[:entry.rfind(',')] if ',' in entry else entry
                for entry in roster
            ]
    return total_dict, team_players

# Scrape team summary rows and player columns: 2018 separately from 2013-2017.
test_total_and_players = read_teams_total_and_players_per_year(team_url, [2018])
train_total_and_players = read_teams_total_and_players_per_year(team_url, [2013,2014,2015,2016,2017])

# Split each scrape into (team totals, roster per season) dictionaries.
test_total, test_team_players = total_and_players_to_dict(test_total_and_players)
train_total, train_team_players = total_and_players_to_dict(train_total_and_players)

In [None]:
# Persist the flattened team totals.
with open('team_data/test_total.pkl', 'wb') as fp:
    pickle.dump(test_total, fp)
with open('team_data/train_total.pkl', 'wb') as fp:
    pickle.dump(train_total, fp)

# Merge the rosters into one team -> {season: players} mapping. Each team gets
# a fresh dict, so `train_team_players` is no longer mutated through an alias,
# and a team missing from `test_team_players` no longer raises KeyError.
team_players = {
    team: {**seasons, **test_team_players.get(team, {})}
    for team, seasons in train_team_players.items()
}

with open('team_data/team_players.pkl', 'wb') as fp:
    pickle.dump(team_players, fp)

In [None]:
# Visual check of the merged team -> season -> players mapping.
print(team_players)

In [None]:
import pickle 

# NOTE(review): `train` and `test` are the read_teams(...) results; the cell
# that creates them is commented out in this notebook, so they must be defined
# before this cell runs.
all_games_train = data_processing(train)
all_games_test = data_processing(test)

with open('team_data/all_games_train.pkl', 'wb') as fp:
    pickle.dump(all_games_train, fp)
with open('team_data/all_games_test.pkl', 'wb') as fp:
    # Fixed: this file used to receive `all_games_train`.
    pickle.dump(all_games_test, fp)

# Extract game info

In [None]:
def read_game_info(team_url, years):
    """Collect the box-score tables of every game linked from the team schedules.

    Pass 1 reads each team's schedule page per year and gathers the distinct
    box-score URLs, built from the text after the last ``=`` of every game
    link. Pass 2 reads each box-score page with ``pandas.read_html``.

    Args:
        team_url: iterable of team links as returned by ``find_teams_url``.
        years: iterable of integer years.

    Returns:
        dict mapping ``"<year-1>-<year>"`` -> list with one entry per game,
        each entry being the list of DataFrames found on that box-score
        page. Pages answering with an HTTP error are skipped. Game order
        follows set iteration and is not defined.
    """
    # Compiled once instead of once per schedule page.
    game_link = re.compile("^http://www.espn.com/nba/game")
    games_url = {}
    for url_path in team_url:
        url_component = url_path.split('/_/')
        stat_url = url_component[0] + '/schedule/_/' + 'name/' + url_component[1].split('/')[1] + '/season/'
        for year in years:
            season_games = games_url.setdefault(year, set())
            try:
                # Close the HTTP response once the markup has been parsed.
                with url.urlopen(stat_url + str(year) + '/seasontype/2') as html_page:
                    soup = BeautifulSoup(html_page)
            except url.HTTPError:
                continue
            for link in soup.find_all('a', attrs={'href': game_link}):
                season_games.add("http://www.espn.com/nba/boxscore?gameId=" + link.get('href').split('=')[-1])
    games_info = {}
    for year, box_urls in games_url.items():
        season = str(year-1)+"-"+str(year)
        games_info[season] = []
        for url_path in box_urls:
            try:
                games_info[season].append(pandas.read_html(url_path))
            except url.HTTPError:
                # Best effort: a box score that errors out is skipped.
                pass

    return games_info

In [None]:
# Scrape and persist the raw 2018 box scores.
game_info = read_game_info(team_url, [2018])
with open('game_data/test_game_info_raw.pkl', 'wb') as fp:
    pickle.dump(game_info, fp)

# Same for 2013-2017.
train_game_info = read_game_info(team_url, [2013,2014,2015,2016,2017])
with open('game_data/train_game_info_raw.pkl', 'wb') as fp:
    pickle.dump(train_game_info, fp)
#example: http://www.espn.com/nba/boxscore?gameId=400974442

In [None]:
# Reload the raw box-score tables written by the scrape cell.
with open('game_data/test_game_info_raw.pkl', 'rb') as fp:
    test_game_info = pickle.load(fp)
with open('game_data/train_game_info_raw.pkl', 'rb') as fp:
    train_game_info = pickle.load(fp)

In [None]:
# df = test_game_info["2017-2018"][0][1]
# df.MIN = pandas.to_numeric(df.MIN, errors='coerce')
# df = df.sort_values(by=['MIN'], ascending=False)
# test_game_info["2017-2018"][0][1]

In [None]:
def _top_ten_by_minutes(box, roster):
    """Return the cleaned names of the ten rows of *box* with the most minutes."""
    # Coerced in place, as before; non-numeric MIN cells become NaN and sort last.
    box['MIN'] = pandas.to_numeric(box['MIN'], errors='coerce')
    ranked = box.sort_values(by=['MIN'], ascending=False)
    # Iterating a DataFrame yields its column labels, so the name cells are
    # read explicitly.
    # NOTE(review): assumes the player label is the first column — confirm
    # against a scraped box score.
    return [cleanName(name, roster) for name in ranked.iloc[:10, 0]]


def process_game_info(data, team_player, team_name_brev):
    """Turn raw box-score tables into ``[team1 players, team2 players, scores]``.

    Args:
        data: mapping season -> list of games, each game being the list of
            DataFrames scraped from one box-score page. Table 0 must have the
            columns ``'Unnamed: 0'`` and ``'T'``; tables 1 and 2 must have a
            ``MIN`` column.
        team_player: mapping team name -> roster, either a list of player
            names or a {season: list} dict (the ``team_players.pkl`` layout).
        team_name_brev: mapping from the label in ``'Unnamed: 0'`` to the team
            name used as key in *team_player*.

    Returns:
        dict mapping season -> list with one ``[team1_players, team2_players,
        scores]`` entry per game. Each player list holds up to ten names
        ordered by minutes, descending; ``scores`` is the tuple of the ``'T'``
        column.
    """
    games_per_season = {}
    for season in data:
        games_per_season[season] = []
        for df in data[season]:
            scores = tuple(df[0]['T'])
            lineups = []
            for side in (0, 1):
                team = team_name_brev[df[0]['Unnamed: 0'][side]]
                roster = team_player[team]
                if isinstance(roster, dict):
                    # Per-season rosters: use this season's list, not the
                    # dict whose iteration would yield season labels.
                    roster = roster.get(season, [])
                lineups.append(_top_ten_by_minutes(df[side + 1], roster))
            # Appended once per game. This used to sit outside the game loop,
            # keeping only the last game and failing on an empty season.
            games_per_season[season].append([lineups[0], lineups[1], scores])
    return games_per_season
            
def cleanName(name, team_players):
    """Map a scraped box-score name cell to the matching roster name.

    The cell is cut to roughly its first half, the result is split on ``'.'``
    into an initial and a surname, and the first roster entry that starts
    with the initial and whose second space-separated word equals the surname
    is returned.

    NOTE(review): the halving suggests the cell holds the short name twice,
    optionally followed by a 1-2 character position code (e.g.
    ``"K. IrvingK. IrvingPG"``) — confirm against the scraped data.

    Args:
        name: raw text of the name cell.
        team_players: iterable of full player names.

    Returns:
        The matching roster name, or the shortened cell text when nothing
        matches or the text contains no ``'.'``.
    """
    # The previous `while` loop never changed its index: it left the loop at
    # once when the last character was not lower-case and hung otherwise.
    if name and name[-1].islower():
        # No trailing code: take exactly half of the cell.
        cut = len(name) // 2
    else:
        # Same cut as before for every input that used to terminate.
        cut = max(len(name) - 1, 0) // 2
    name = name[:cut]
    cleaned = name.split('.')
    if len(cleaned) < 2:
        # No "Initial. Surname" shape to match on.
        return name
    initial = cleaned[0].strip()
    surname = cleaned[1].strip()
    for player in team_players:
        words = player.split(' ')
        # Single-word roster entries cannot match and must not raise.
        if player.startswith(initial) and len(words) > 1 and words[1] == surname:
            return player
    return name

In [None]:
# Load the lookup tables used by process_game_info.
# NOTE(review): nothing in this notebook writes team_abrv.pkl; it must come
# from elsewhere.
with open('team_data/team_abrv.pkl', 'rb') as fp:
    team_abrv = pickle.load(fp)
with open('team_data/team_players.pkl', 'rb') as fp:
    team_players = pickle.load(fp)

In [None]:
# Reduce the reloaded test box scores to per-game player lists and scores.
test_game_info_processed = process_game_info(test_game_info, team_players, team_abrv)

In [None]:
# Reduce the reloaded training box scores to per-game player lists and scores.
train_game_info_processed = process_game_info(train_game_info, team_players, team_abrv)