In [1]:
# import packages
import requests
import json
import pandas as pd
from pyquery import PyQuery as pq  # PyQuery我其實沒用到哈哈哈

# 流程
1. 查看該網站network
2. 找出取得資料的xhr
3. 使用GET請求來取得資料（JSON格式）
4. 解析JSON
5. 展示所得到的球員資料
6. 將球員資料儲存成DataFrame

## 爬蟲前置處理
* 由於這個網頁的內容是動態生成的，直接用requests抓取頁面的HTML拿不到球員數據；因此我們改為對網頁背後提供資料的XHR端點送出GET請求，直接取得JSON格式的資料

In [2]:
# One shared HTTP session so every request reuses the same headers and connections.
session = requests.Session()
# Send a browser-like User-Agent with every request made through this session.
# The attribute requests actually reads is `headers` (plural); assigning to
# `session.header` only creates an unused attribute and the header is never sent.
# The two literals below are joined by implicit concatenation, so the first one
# must end with a space to keep the User-Agent string well formed.
session.headers.update({
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36')
})

In [3]:
# Send a request to the given URL and return the decoded content.
def get_info(url, timeout=30):
    """Fetch ``url`` with a GET request on the shared ``session`` and return its JSON body.

    Parameters
    ----------
    url : str
        Address of the endpoint to request.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up; forwarded to
        ``session.get``. Defaults to 30 so a stalled server cannot hang the cell.

    Returns
    -------
    object
        Whatever ``response.json()`` decodes from the response body.

    Raises
    ------
    requests.HTTPError
        Raised by ``raise_for_status()`` when the server answers with a 4xx/5xx status.
    """
    response = session.get(url, timeout=timeout)
    # Surface HTTP errors directly instead of trying to JSON-decode an error page.
    response.raise_for_status()
    # Decode the body as JSON and hand it back to the caller.
    return response.json()

## 分別用以下兩url帶入可得到hitting和pitching的資料
* hitting
    * http://mlb.mlb.com/pubajax/wf/flow/stats.splayer?season=2019&sort_order=%27desc%27&sort_column=%27avg%27&stat_type=hitting&page_type=SortablePlayer&game_type=%27R%27&player_pool=QUALIFIER&season_type=ANY&sport_code=%27mlb%27&results=1000&recSP=1&recPP=200
* pitching
    * http://mlb.mlb.com/pubajax/wf/flow/stats.splayer?season=2019&sort_order=%27asc%27&sort_column=%27era%27&stat_type=pitching&page_type=SortablePlayer&game_type=%27R%27&player_pool=QUALIFIER&season_type=ANY&sport_code=%27mlb%27&results=1000&position=%271%27&recSP=1&recPP=100

In [4]:
# Fetch the hitting (batter) stats.
# The URL is split one query parameter per line; the adjacent literals are
# concatenated into exactly the same single string.
doc_json = get_info(
    "http://mlb.mlb.com/pubajax/wf/flow/stats.splayer"
    "?season=2019"
    "&sort_order=%27desc%27"
    "&sort_column=%27avg%27"
    "&stat_type=hitting"
    "&page_type=SortablePlayer"
    "&game_type=%27R%27"
    "&player_pool=QUALIFIER"
    "&season_type=ANY"
    "&sport_code=%27mlb%27"
    "&results=1000"
    "&recSP=1"
    "&recPP=200"
)

In [5]:
# checkpoint
# Inspect the raw structure of the JSON returned by the request.
doc_json

{'stats_sortable_player': {'copyRight': ' Copyright 2019 MLB Advanced Media, L.P.  Use of any content on this page acknowledges agreement to the terms posted here http://gdx.mlb.com/components/copyright.txt  ',
  'queryResults': {'recPP': '200',
   'created': '2019-06-25T04:53:09',
   'recSP': '1',
   'totalP': '1',
   'recs': '161',
   'totalSize': '161',
   'row': [{'gidp': '3',
     'sac': '0',
     'np': '1334',
     'name_display_first_last': 'Cody Bellinger',
     'pos': 'RF',
     'rank': '1',
     'tb': '190',
     'team_brief': 'Dodgers',
     'sport_id': '1',
     'name_display_last_init': 'Bellinger, C',
     'bb': '48',
     'avg': '.349',
     'slg': '.699',
     'ops': '1.143',
     'hbp': '1',
     'team_abbrev': 'LAD',
     'so': '49',
     'league_id': '104',
     'sf': '3',
     'team': 'lan',
     'league': 'NL',
     'cs': '5',
     'sb': '8',
     'go_ao': '0.52',
     'last_name': 'Bellinger',
     'player_id': '641355',
     'ibb': '10',
     'player_qualifier': 

In [6]:
# checkpoint
# Drill down level by level to the list of per-player records
# and look at the first player's record.
doc_json['stats_sortable_player']['queryResults']['row'][0]

{'gidp': '3',
 'sac': '0',
 'np': '1334',
 'name_display_first_last': 'Cody Bellinger',
 'pos': 'RF',
 'rank': '1',
 'tb': '190',
 'team_brief': 'Dodgers',
 'sport_id': '1',
 'name_display_last_init': 'Bellinger, C',
 'bb': '48',
 'avg': '.349',
 'slg': '.699',
 'ops': '1.143',
 'hbp': '1',
 'team_abbrev': 'LAD',
 'so': '49',
 'league_id': '104',
 'sf': '3',
 'team': 'lan',
 'league': 'NL',
 'cs': '5',
 'sb': '8',
 'go_ao': '0.52',
 'last_name': 'Bellinger',
 'player_id': '641355',
 'ibb': '10',
 'player_qualifier': '324',
 'team_id': '119',
 'go': '45',
 'hr': '25',
 'minimum_qualifier': '245',
 'gdp': '3',
 'name_display_roster': 'Bellinger, Cody',
 'qualifies': 'Y',
 'rbi': '61',
 'name_first': 'Cody',
 'bats': 'L',
 'xbh': '43',
 'g': '76',
 'd': '16',
 'team_name': 'Los Angeles Dodgers',
 'sport': 'MLB',
 'tpa': '324',
 'name_display_last_first': 'Bellinger, Cody',
 'h': '95',
 'obp': '.444',
 't': '2',
 'ao': '86',
 'r': '61',
 'ab': '272',
 'name_last': 'Bellinger'}

### 各數據對應key

球員數據層  
`doc_json['stats_sortable_player']['queryResults']['row']`

* 打者數據：
    * 打數
        * `'ab'`
    * 上壘率
        * `'obp'`
    * 聯盟平均上壘率
        * ??
    * 長打率
        * `'slg'`
    * 聯盟平均長打率
        * ??
    * 整體攻擊指數
        * `'ops'`  
---
* 註：`聯盟平均上壘率和聯盟平均長打率不屬於選手數據，所以不在所得的Data內；可能要直接在網站上找資料，或是拿所有選手的數據加總來計算`

In [7]:
# Walk down the nested response to the per-player records and keep them
# in a variable so later cells can reuse them.
query_results = doc_json['stats_sortable_player']['queryResults']
player_stats_json = query_results['row']

In [8]:
# One empty list per stat; they are filled per player and later become
# the DataFrame columns. Each name gets its own distinct list object.
player_list, ab_list, obp_list, slg_list, ops_list = ([] for _ in range(5))

In [10]:
# Print the stats we care about for one player.
def show_player_stats(index):
    """Print name, AB, OBP, SLG and OPS of the player at ``index`` in ``player_stats_json``."""
    record = player_stats_json[index]
    # (label, record key) pairs, in the order they are printed.
    fields = (
        ('Player: ', 'name_display_last_init'),
        ('AB    : ', 'ab'),
        ('OBP   : ', 'obp'),
        ('SLG   : ', 'slg'),
        ('OPS   : ', 'ops'),
    )
    for label, key in fields:
        print(label, record[key])
    
# Append one player's stats to the collecting lists.
def store_player_stats(index):
    """Append name, AB, OBP, SLG and OPS of the player at ``index`` to their respective lists."""
    record = player_stats_json[index]
    # (destination list, record key) pairs, in the order they are appended.
    destinations = (
        (player_list, 'name_display_last_init'),
        (ab_list, 'ab'),
        (obp_list, 'obp'),
        (slg_list, 'slg'),
        (ops_list, 'ops'),
    )
    for bucket, key in destinations:
        bucket.append(record[key])

In [11]:
# checkpoint
# Try it on the first player.
show_player_stats(0)

Player:  Bellinger, C
AB    :  272
OBP   :  .444
SLG   :  .699
OPS   :  1.143


In [12]:
# Show every player and collect their stats into the lists.
# Empty the lists in place first: store_player_stats() only appends, so
# re-running this cell would otherwise add every player a second time and
# the DataFrame built from the lists would contain duplicate rows.
for stat_list in (player_list, ab_list, obp_list, slg_list, ops_list):
    stat_list.clear()

for i in range(len(player_stats_json)):
    show_player_stats(i)
    store_player_stats(i)
    # Visual separator between players in the printed output.
    print('------------------------')

Player:  Bellinger, C
AB    :  272
OBP   :  .444
SLG   :  .699
OPS   :  1.143
------------------------
Player:  Yelich, C
AB    :  266
OBP   :  .435
SLG   :  .744
OPS   :  1.179
------------------------
Player:  McNeil, J
AB    :  242
OBP   :  .403
SLG   :  .488
OPS   :  .891
------------------------
Player:  LeMahieu, D
AB    :  293
OBP   :  .380
SLG   :  .495
OPS   :  .875
------------------------
Player:  Blackmon, C
AB    :  269
OBP   :  .376
SLG   :  .643
OPS   :  1.019
------------------------
Player:  Polanco, J
AB    :  301
OBP   :  .383
SLG   :  .542
OPS   :  .925
------------------------
Player:  Arenado, N
AB    :  296
OBP   :  .386
SLG   :  .588
OPS   :  .973
------------------------
Player:  Brantley, M
AB    :  295
OBP   :  .382
SLG   :  .512
OPS   :  .893
------------------------
Player:  Dahl, D
AB    :  254
OBP   :  .370
SLG   :  .524
OPS   :  .894
------------------------
Player:  Freeman, F
AB    :  308
OBP   :  .402
SLG   :  .601
OPS   :  1.003
---------------------

AB    :  293
OBP   :  .306
SLG   :  .512
OPS   :  .818
------------------------
Player:  Castro, S
AB    :  302
OBP   :  .268
SLG   :  .321
OPS   :  .589
------------------------
Player:  Pujols, A
AB    :  232
OBP   :  .309
SLG   :  .448
OPS   :  .758
------------------------
Player:  Dozier, B
AB    :  234
OBP   :  .318
SLG   :  .436
OPS   :  .754
------------------------
Player:  Smith, M
AB    :  239
OBP   :  .306
SLG   :  .364
OPS   :  .670
------------------------
Player:  Duggar, S
AB    :  248
OBP   :  .277
SLG   :  .343
OPS   :  .620
------------------------
Player:  Gardner, B
AB    :  248
OBP   :  .317
SLG   :  .440
OPS   :  .756
------------------------
Player:  Nunez, R
AB    :  270
OBP   :  .291
SLG   :  .463
OPS   :  .754
------------------------
Player:  Panik, J
AB    :  249
OBP   :  .314
SLG   :  .321
OPS   :  .636
------------------------
Player:  Pederson, J
AB    :  215
OBP   :  .328
SLG   :  .558
OPS   :  .886
------------------------
Player:  Calhoun, K
AB    :  

In [13]:
# Create an empty DataFrame; its columns are filled in afterwards.
players_dataframe = pd.DataFrame()

In [14]:
# Attach each collected list to the DataFrame under its column name,
# in the order the columns should appear.
for column_name, column_values in (
    ('Player', player_list),
    ('AB', ab_list),
    ('OBP', obp_list),
    ('SLG', slg_list),
    ('OPS', ops_list),
):
    players_dataframe[column_name] = column_values

In [15]:
# checkpoint
# Display the current contents of the DataFrame.
players_dataframe

Unnamed: 0,Player,AB,OBP,SLG,OPS
0,"Bellinger, C",272,.444,.699,1.143
1,"Yelich, C",266,.435,.744,1.179
2,"McNeil, J",242,.403,.488,.891
3,"LeMahieu, D",293,.380,.495,.875
4,"Blackmon, C",269,.376,.643,1.019
5,"Polanco, J",301,.383,.542,.925
6,"Arenado, N",296,.386,.588,.973
7,"Brantley, M",295,.382,.512,.893
8,"Dahl, D",254,.370,.524,.894
9,"Freeman, F",308,.402,.601,1.003
