# Web Scraper of RealGM.com

### Import libraries

In [12]:
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import re

### Fetch and parse the HTML page

In [13]:
# Site-relative path of the first page of the stats listing.
URL = '/nba/stats/2018/Averages/All/points/All/desc/1/Regular_Season'
# Build the request from URL so the path is defined in exactly one place;
# the timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get('https://basketball.realgm.com{}'.format(URL), timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content,'html.parser')

##### Another solution
#response = urlopen('https://basketball.realgm.com{}'.format(URL))
#soup = BeautifulSoup(response,'html.parser')

### Create a pandas DataFrame with the column headers from the URL

In [14]:
# Walk table -> thead -> first tr to reach the header row, then use the text
# of each <th> cell as a column name of an initially empty DataFrame.
stats_table = soup.find('table', attrs={'class': 'tablesaw compact'})
head = stats_table.find('thead').find('tr')
trnames = head.find_all('th')
names = [header_cell.text for header_cell in trnames]
df = pd.DataFrame(columns=names)

In [15]:
# Sanity check: at this point df should have the scraped column names and no rows.
print(df)

Empty DataFrame
Columns: [#, Player, Team, GP, MPG, FGM, FGA, FG%, 3PM, 3PA, 3P%, FTM, FTA, FT%, TOV, PF, ORB, DRB, RPG, APG, SPG, BPG, PPG]
Index: []

[0 rows x 23 columns]


## To avoid scraping the same page twice, we keep a set that records every visited URL.

In real life, most web pages contain only a portion of the information, so we have to click through to the next page to find more. We create a set called 'pages' to record every URL we visit. Once a page has been visited, it is recorded in 'pages', so we will not scrape it again.

In [16]:
# Registry of visited paths, seeded with the start page so it is never revisited.
pages = {URL}

##### Define the scraper function

In [17]:
def getLinks(myUrl):
    """Scrape one stats page into ``df`` and recurse into unvisited pages.

    Downloads ``https://basketball.realgm.com`` + ``myUrl``, appends the data
    rows of the ``tablesaw compact`` table to the global ``df``, then follows
    (depth-first) every link inside the ``main-container`` div whose href
    matches the listing pattern and is not yet in the global ``pages`` set.

    Args:
        myUrl: Site-relative path of the page to scrape, e.g.
            '/nba/stats/2018/Averages/All/points/All/desc/1/Regular_Season'.

    Returns:
        None. Results accumulate in the globals ``df`` and ``pages``; a
        separator line and each newly discovered path are printed.
    """
    global pages
    global df
    # Parse inside the context manager so the HTTP response is always closed.
    with urlopen('https://basketball.realgm.com{}'.format(myUrl)) as html:
        bs = BeautifulSoup(html,'html.parser')
    table = bs.find('table',{'class':'tablesaw compact'}).find('tbody')

    # Collect the whole page first and concatenate once: DataFrame.append was
    # removed in pandas 2.0, and appending row by row copies the frame each time.
    rows = []
    for tr in table.find_all('tr'):
        row = [td.text for td in tr.find_all('td')]
        if row:  # a <tr> without <td> cells carries no data for the frame
            rows.append(row)
    if rows:
        df = pd.concat([df, pd.DataFrame(rows, columns=names)], ignore_index=True)

    # re.compile builds a pattern for the listing pages; ".*" matches any
    # characters in the page-number position of the path.
    link_pattern = re.compile('^(/nba/stats/2018/Averages/All/points/All/desc/.*/Regular_Season)')
    # find_all(href=pattern) only returns tags that have a matching href,
    # so the attribute can be read directly.
    for link in bs.find('div',{'class':'main-container'}).find_all('a', href=link_pattern):
        newPage = link.attrs['href']
        if newPage not in pages:
            print('-'*20)
            print(newPage)
            # Mark as visited before recursing so the page is not queued twice.
            pages.add(newPage)
            getLinks(newPage)




##### Now pass in the first URL of the NBA player stats; the function automatically collects all data from this page and the pages that follow.

In [18]:
# Start at the first page; getLinks recurses into the linked pages on its own.
getLinks(URL)

--------------------
/nba/stats/2018/Averages/All/points/All/desc/2/Regular_Season
--------------------
/nba/stats/2018/Averages/All/points/All/desc/3/Regular_Season
--------------------
/nba/stats/2018/Averages/All/points/All/desc/4/Regular_Season
--------------------
/nba/stats/2018/Averages/All/points/All/desc/5/Regular_Season
--------------------
/nba/stats/2018/Averages/All/points/All/desc/6/Regular_Season


##### Save the DataFrame as a CSV file

In [19]:
# Show the accumulated rows, then write them to disk without the index column.
print(df)
df.to_csv('2017-2018 NBA Stats.csv',index=False)

       #                 Player Team  GP   MPG   FGM   FGA   FG%  3PM   3PA  \
0      1           James Harden  HOU  72  35.4   9.0  20.1  .449  3.7  10.0   
1      2          Anthony Davis  NOP  75  36.4  10.4  19.5  .534  0.7   2.2   
2      3           LeBron James  CLE  82  36.9  10.5  19.3  .542  1.8   5.0   
3      4         Damian Lillard  POR  73  36.6   8.5  19.4  .439  3.1   8.6   
4      5  Giannis Antetokounmpo  MIL  75  36.8   9.9  18.7  .529  0.6   1.9   
5      6          Stephen Curry  GSW  51  32.0   8.4  16.9  .495  4.2   9.8   
6      7           Kevin Durant  GSW  68  34.2   9.3  18.0  .516  2.5   6.1   
7      8      Russell Westbrook  OKC  80  36.4   9.5  21.1  .449  1.2   4.1   
8      9       DeMarcus Cousins  NOP  48  36.2   8.5  18.0  .470  2.2   6.1   
9     10           Devin Booker  PHX  54  34.5   8.4  19.5  .432  2.7   7.1   
10    11           Kyrie Irving  BOS  60  32.2   8.9  18.1  .491  2.8   6.8   
11    12      LaMarcus Aldridge  SAS  75  33.4   9.2

Done!