## The purpose: use BeautifulSoup,splinter, and pandas to scape the info from www.nba.com

## and then, I cleaned the data in pandas and save it as ".csv"

In [1]:
# Dependencies
from bs4 import BeautifulSoup
import requests

In [2]:
import pandas as pd

In [3]:
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist

In [4]:
# https://splinter.readthedocs.io/en/latest/drivers/chrome.html
!which chromedriver

In [5]:
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)

In [6]:
# URL of page to be scraped
url = 'http://www.nba.com/players'
browser.visit(url)

In [7]:
# Retrieve page with the requests module
######## not use "response = requests.get(url)""
# but use this below
html = browser.html

# Create BeautifulSoup object; parse with 'html.parser'
######## similar to requests - use "soup = BeautifulSoup(response.text, 'html.parser')""

soup = BeautifulSoup(html, 'html.parser')

In [8]:
# Examine the results, then determine element that contains sought info
#print(soup.prettify())

In [8]:
# results are returned as an iterable list
results = soup.find_all('a', class_='playerList')

In [9]:
# count the final results 
len(results)

490

In [10]:
# save the scraped data into lists
headshot_src_list = []
name_list = []
number_in_team_list = []
position_list = []
team_list = []

for result in results:
    ### pay attention to all the path way to reach the target value
    headshot_src_short = result.find('img')['data-src']
    headshot_src = 'http:' + headshot_src_short
    headshot_src_list.append(headshot_src)
    #print(headshot_src_list)
    
    name = result.find('span', class_='name-label').text
    name_list.append(name)
    #print(name_list)
    
    number_in_team = result.p.find_all('span')[0].text
    number_in_team_list.append(number_in_team)
    #print(number_in_team_list)
        
    position = result.p.find_all('span')[1].text
    position_list.append(position)
    #print(position_list)
    
    team = result.p.find_all('span')[2].text
    team_list.append(team)

In [25]:
# save all the lists as a dataframe
recent_player_comma = pd.DataFrame({'name':name_list,\
                           'number_in_team':number_in_team_list,\
                           'team':team_list,\
                           'headshot_src':headshot_src_list})

In [26]:
recent_player_comma.head()

Unnamed: 0,name,number_in_team,team,headshot_src
0,"Adams, Jaylen",#10,Hawks,http://ak-static.cms.nba.com/wp-content/upload...
1,"Adams, Steven",#12,Thunder,http://ak-static.cms.nba.com/wp-content/upload...
2,"Adebayo, Bam",#13,Heat,http://ak-static.cms.nba.com/wp-content/upload...
3,"Adel, Deng",#32,Cavaliers,http://ak-static.cms.nba.com/wp-content/upload...
4,"Aldridge, LaMarcus",#12,Spurs,http://ak-static.cms.nba.com/wp-content/upload...


In [27]:
recent_player_comma['number_in_team'] = recent_player_comma['number_in_team'].str.replace('\#', '')

In [28]:
recent_player_comma.head()

Unnamed: 0,name,number_in_team,team,headshot_src
0,"Adams, Jaylen",10,Hawks,http://ak-static.cms.nba.com/wp-content/upload...
1,"Adams, Steven",12,Thunder,http://ak-static.cms.nba.com/wp-content/upload...
2,"Adebayo, Bam",13,Heat,http://ak-static.cms.nba.com/wp-content/upload...
3,"Adel, Deng",32,Cavaliers,http://ak-static.cms.nba.com/wp-content/upload...
4,"Aldridge, LaMarcus",12,Spurs,http://ak-static.cms.nba.com/wp-content/upload...


In [29]:
# new data frame with split value columns 
#new = recent_player_comma["name"].str.split(",",1, expand = True) 

recent_player_comma['lastName'], recent_player_comma['firstName'] = recent_player_comma['name'].str.split(',', 1).str

In [30]:
recent_player_comma.head()

Unnamed: 0,name,number_in_team,team,headshot_src,lastName,firstName
0,"Adams, Jaylen",10,Hawks,http://ak-static.cms.nba.com/wp-content/upload...,Adams,Jaylen
1,"Adams, Steven",12,Thunder,http://ak-static.cms.nba.com/wp-content/upload...,Adams,Steven
2,"Adebayo, Bam",13,Heat,http://ak-static.cms.nba.com/wp-content/upload...,Adebayo,Bam
3,"Adel, Deng",32,Cavaliers,http://ak-static.cms.nba.com/wp-content/upload...,Adel,Deng
4,"Aldridge, LaMarcus",12,Spurs,http://ak-static.cms.nba.com/wp-content/upload...,Aldridge,LaMarcus


In [21]:
# Dropping old Name columns 
recent_player = recent_player_comma.drop(columns =["name"], inplace = True) 

In [22]:
recent_player.head()

AttributeError: 'NoneType' object has no attribute 'head'

In [None]:
recent_player.to_csv("recent_players.csv")