In [20]:
#Imports
import requests # Request module
import pandas as pd # Data Wrangling
import numpy as np # Data Wrangling
from bs4 import BeautifulSoup #Web sraping module
import sys

In [2]:
def get_headers(soup):
    '''This function get's the column names to use for the data frame.'''
    headers = []
    
    #Get rounds header
    rounds = soup.find_all(class_="rounds hidden-small hidden-medium")[0].get_text()
    headers.append(rounds)
    
    #Get other headers
    stat_headers = soup.find_all(class_="col-stat hidden-small hidden-medium")
    for header in stat_headers:
        headers.append(header.get_text())
    
    return headers

In [3]:
#Get Players
def get_players(soup):
    '''This function takes the beautiful soup created and uses it to gather player names from the specified stats page.'''
    
    player_list = []
    
    #Get player as html tags
    players = soup.select('td a')[1:] #Use 1 beacuse first line of all tables is not useful.
    #Loop through list
    for player in players:
        player_list.append(player.get_text())
    
    return player_list

In [4]:
#Get Stats
def get_stats(soup, categories):
    '''This function takes the soup created before and the number of categories needed to generate this'''
    
    #Finds all tags with class specified and puts into a list
    stats = soup.find_all(class_="hidden-small hidden-medium")
    
    #Initialize stats list
    stat_list = []
    
    #Loop through 
    for i in range(0, len(stats)-categories+1, categories):
        temp_list = []
        for j in range(categories):
            temp_list.append(stats[i + j].get_text())
        stat_list.append(temp_list)
            
    return stat_list

In [29]:
def make_dataframe(url, categories):
        
    ##Create soup object from url.
    response = requests.get(url)
    text = response.text
    soup = BeautifulSoup(text, 'lxml')
    
    #1. Get Headers
    headers = get_headers(soup)
    
    #2. Get Players
    players = get_players(soup)
    
    #3. Get Stats
    stats = get_stats(soup, categories)
    
    #4. Make stats dictionary.
    stats_dictionary = stats_dict(players, stats)
    
    #Make dataframe
    frame = pd.DataFrame(stats_dictionary, index = headers).T
    
    #Reset index
    frame = frame.reset_index()
    
    #For each Dataframe, change index column to 'NAME'
    frame = frame.rename(index = str, columns = {'index': 'NAME'})
    return frame

In [42]:
def stats_dict(players, stats):
        '''This function takes two lists, players and stats, 
        and creates a dictionary with the player being the key 
        and the stats as the vales (as a list)'''
    
        #initialize player dictionary
        player_dict = {}
    
        #Loop through player list
        for i, player in enumerate(players):
            try:
                player_dict[player] = stats[i]
            except(IndexError):
                print(i,player)
            
    
        return player_dict

In [43]:
years = [str(i) for i in range(2014, 2019)]
for year in years:
    print(year)
    #Fedex cup points
    fcp = make_dataframe("https://www.pgatour.com/stats/stat.02671.y{}.html".format(year), 6)[['NAME', 'POINTS']]
    
    #Top 10's and wins
    top10 = make_dataframe("https://www.pgatour.com/stats/stat.138.y{}.html".format(year), 5)[['NAME', 'TOP 10', '1ST']]

    #Scoring statistics, keep rounds from this page as it most accurately reflects total rounds player completed in season.
    scoring = make_dataframe("https://www.pgatour.com/stats/stat.120.y{}.html".format(year), 5)[['NAME', 'ROUNDS', 'AVG']]
    scoring = scoring.rename(columns={'AVG':'SCORING'})

    #Driving Distance
    drivedistance = make_dataframe("https://www.pgatour.com/stats/stat.101.y{}.html".format(year), 4)[['NAME', 'AVG.']]
    #Rename Columns
    drivedistance = drivedistance.rename(columns = {'AVG.':'DRIVE_DISTANCE'})

    #Driving Accuracy
    driveacc = make_dataframe("https://www.pgatour.com/stats/stat.102.y{}.html".format(year), 4)[['NAME', '%']]
    #Change column name from % to FWY %
    driveacc = driveacc.rename(columns = {'%': "FWY_%"})

    #Greens in Regulation.
    gir = make_dataframe("https://www.pgatour.com/stats/stat.103.y{}.html".format(year), 5)[['NAME', '%']]
    #Change column name from % to GIR %
    gir = gir.rename(columns = {'%': "GIR_%"})

    #Strokes gained putting
    sg_putting = make_dataframe("https://www.pgatour.com/stats/stat.02564.{}.html".format(year), 4)[['NAME', 'AVERAGE']]
    #Change name of average column
    sg_putting = sg_putting.rename(columns = {'AVERAGE': 'SG_P'})

    #Strokes gained tee to green
    sg_teetogreen = make_dataframe("https://www.pgatour.com/stats/stat.02674.y{}.html".format(year), 6)[['NAME', 'AVERAGE']]
    #Change name of average column
    sg_teetogreen = sg_teetogreen.rename(columns = {'AVERAGE' : 'SG_TTG'})

    #sg total
    sg_total = make_dataframe("https://www.pgatour.com/stats/stat.02675.y{}.html".format(year), 6)[['NAME', 'AVERAGE']]
    sg_total = sg_total.rename(columns = {'AVERAGE':'SG_T'})
    
    #Get Dataframes into list.
    data_frames = [drivedistance, driveacc, gir, sg_putting, sg_teetogreen, sg_total]
    
    #Merge all Dataframes together
    df_one = pd.DataFrame()
    df_one = scoring
    for df in data_frames:
        df_one = pd.merge(df_one, df, on='NAME')
        
    

    #merge fex ex cup points
    df_one = pd.merge(df_one, fcp, how='outer', on='NAME')
    #Merge top 10's
    df_one = pd.merge(df_one, top10, how='outer', on='NAME')
    
    #Only get people who's scoring average isn't null.
    df_one = df_one.loc[df_one['SCORING'].isnull() == False]  
    
    #Add year column
    df_one['Year'] = year
    
    #Concat dataframe to overall dataframe
    
    if year == '2010':
        df_total = pd.DataFrame()
        df_total = pd.concat([df_total, df_one], axis=0)
    else:
        df_total = pd.concat([df_total, df_one], axis=0)

2014
215 Mark Calcavecchia
216 Patrick Cantlay
217 Tiger Woods
218 Tag Ridings
219 Lee Janzen
220 Joe Ogilvie
221 Glen Day
222 Steve Flesch
223 Harrison Frazar
224 Justin Bolli
225 José María Olazábal
226 Alex Cejka
227 Dudley Hart
228 Marc Turnesa
229 Chris Smith
230 Sandy Lyle
231 Billy Mayfair
232 Edward Loar
233 Len Mattiace
234 Larry Mize
235 Kent Jones
236 Kris Blanks
237 D.J. Trahan
238 Chez Reavie
239 Benjamín Alvarado
240 Nathan Green
241 Ben Kohles
242 Vaughn Taylor
243 Paul Goydos
244 Arjun Atwal
245 Todd Hamilton
246 Fabián Gómez
247 Fred Funk
248 Daniel Chopra
249 Jesper Parnevik
250 Marco Dawson
251 Alex Aragon
252 Scott McCarron
253 Paul Stankowski
254 Scott Verplank
255 Chris DiMarco
256 Brett Quigley
257 Bobby Gates


ValueError: Shape of passed values is (6, 215), indices imply (5, 215)