# CSE 482 Project: An Analysis of PGA TOUR Statistics
### Tyler Rozwadowski | CSE 482

## Data Collection

In [40]:
# Import all the modules needed for scraping
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

The following functions scrape statistics from the PGA TOUR website.

In [41]:
def get_headers(soup):
    '''
    Collect the column headers for a stats table.

    The first header is the text of the first element whose class is
    "rounds hidden-small hidden-medium"; the rest are the texts of every
    element whose class is "col-stat hidden-small hidden-medium", in
    document order.

    Raises IndexError if no "rounds" element is present.
    '''
    rounds_cell = soup.find_all(class_="rounds hidden-small hidden-medium")[0]
    stat_cells = soup.find_all(class_="col-stat hidden-small hidden-medium")

    return [rounds_cell.get_text()] + [cell.get_text() for cell in stat_cells]

def get_players(soup):
    '''
    Return the player names found on a stats page.

    Names are the texts of the anchors matched by the CSS selector "td a".
    The first match is skipped (the original author noted that index 0
    did not work).
    '''
    links = soup.select("td a")
    return [link.get_text() for link in links[1:]]

def get_stats(soup, categories):
    '''
    Group the stat cells of a page into one row per player.

    Every element whose class is "hidden-small hidden-medium" is read in
    document order and split into consecutive chunks of `categories`
    texts. A trailing chunk with fewer than `categories` cells is dropped.
    '''
    cells = soup.find_all(class_="hidden-small hidden-medium")
    # Only start a row where a full `categories`-wide chunk still fits.
    row_starts = range(0, len(cells) - categories + 1, categories)

    return [
        [cell.get_text() for cell in cells[start:start + categories]]
        for start in row_starts
    ]

def make_dict(players, stats):
    '''Map each player name to the stats row at the same position.

        `players[i]` becomes the key for `stats[i]`. If a name occurs more
        than once, the later row wins. Raises IndexError when `stats` has
        fewer entries than `players`.'''
    return {name: stats[position] for position, name in enumerate(players)}
    
def make_dataframe(url, categories, timeout=30):
    '''Scrape one stats page into a DataFrame with one row per player.

    Parameters
    ----------
    url : str
        Address of the stats page to download.
    categories : int
        Number of stat cells that make up one player's row; passed to
        get_stats. It should match the number of column headers on the page.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout a stalled connection would hang the notebook.

    Returns
    -------
    pandas.DataFrame
        A 'NAME' column holding the player names, followed by one column
        per header returned by get_headers.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status, instead of silently
        parsing an error page.
    '''

    # Download the page and fail loudly on 4xx/5xx responses.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Create the soup object: document we're parsing, parser
    soup = BeautifulSoup(response.text, 'lxml')

    headers = get_headers(soup)
    players = get_players(soup)
    stats = get_stats(soup, categories)

    stats_dict = make_dict(players, stats)
    # Built with players as columns and headers as rows, then transposed so
    # each player becomes a row.
    frame = pd.DataFrame(stats_dict, index = headers).T
    # Move the player names out of the index into a regular column.
    frame = frame.reset_index()
    frame = frame.rename(index = str, columns = {'index': 'NAME'})

    return frame

Now that we have all our data scraping functions, we can use them to collect the data. 

In [42]:
# Seasons to scrape, as strings; the range end (2011) is exclusive.
years = list(map(str, range(2010, 2011)))

In [6]:
# URL pattern shared by every stat page: stat id first, then season year.
STAT_URL = "https://www.pgatour.com/stats/stat.{}.{}.html"

# One merged frame per season. They are concatenated once after the loop, so
# the result does not depend on the first season being '2010'.
season_frames = []

for year in years:
    print("Collecting data for: " + year)

    # FedEx Cup points
    fcp = make_dataframe(STAT_URL.format("02671", year), 6)[['NAME', 'POINTS']]
    # Top 10s and wins
    top10 = make_dataframe(STAT_URL.format("138", year), 5)[['NAME', 'TOP 10', '1ST']]

    # Scoring statistics, keep rounds from this page as it most accurately
    # reflects total rounds player completed in season.
    scoring = make_dataframe(STAT_URL.format("120", year), 5)[['NAME', 'ROUNDS', 'AVG']]
    scoring = scoring.rename(columns={'AVG': 'SCORING'})

    # Total money
    totalmoney = make_dataframe(STAT_URL.format("109", year), 3)[['NAME', 'MONEY']]

    # Driving distance
    drivedistance = make_dataframe(STAT_URL.format("101", year), 4)[['NAME', 'AVG.']]
    drivedistance = drivedistance.rename(columns={'AVG.': 'DRIVE_DISTANCE'})

    # Driving accuracy (fairways hit)
    driveacc = make_dataframe(STAT_URL.format("102", year), 4)[['NAME', '%']]
    driveacc = driveacc.rename(columns={'%': "FWY_%"})

    # Greens in regulation
    gir = make_dataframe(STAT_URL.format("103", year), 5)[['NAME', '%']]
    gir = gir.rename(columns={'%': "GIR_%"})

    # Strokes gained: putting
    sg_putting = make_dataframe(STAT_URL.format("02564", year), 4)[['NAME', 'AVERAGE']]
    sg_putting = sg_putting.rename(columns={'AVERAGE': 'SG_P'})

    # Strokes gained: tee to green
    sg_teetogreen = make_dataframe(STAT_URL.format("02674", year), 6)[['NAME', 'AVERAGE']]
    sg_teetogreen = sg_teetogreen.rename(columns={'AVERAGE': 'SG_TTG'})

    # Strokes gained: total
    sg_total = make_dataframe(STAT_URL.format("02675", year), 6)[['NAME', 'AVERAGE']]
    sg_total = sg_total.rename(columns={'AVERAGE': 'SG_T'})

    # Per-round stats: inner-merged onto the scoring frame, so only players
    # present on every one of these pages are kept.
    data_frames = [drivedistance, driveacc, gir, sg_putting, sg_teetogreen, sg_total]

    df_one = scoring
    # The loop variable is deliberately not called `df`, so re-running this
    # cell cannot rebind the notebook-level `df` used by later cells.
    for stat_df in data_frames:
        df_one = pd.merge(df_one, stat_df, on='NAME')

    # Season totals: outer-merged so a player missing from one of these pages
    # keeps his row (with NaN in the missing columns).
    df_one = pd.merge(df_one, fcp, how='outer', on='NAME')
    df_one = pd.merge(df_one, top10, how='outer', on='NAME')
    df_one = pd.merge(df_one, totalmoney, how='outer', on='NAME')

    # Only keep players whose scoring average isn't null. The copy makes the
    # result an independent frame so adding a column below is not a write
    # into a slice of the merged frame.
    df_one = df_one.loc[df_one['SCORING'].notnull()].copy()

    # Add year column
    df_one['Year'] = year

    season_frames.append(df_one)

# Stack all seasons; fall back to an empty frame when `years` is empty.
df_total = pd.concat(season_frames, axis=0) if season_frames else pd.DataFrame()

Collecting data for: 2010


In [39]:
# Work with the collected data under the shorter name `df`. This is a plain
# assignment, not a copy: `df` and `df_total` refer to the same DataFrame, so
# in-place changes made through either name are visible through both.
df = df_total
# Preview the first rows.
df.head()

Unnamed: 0,NAME,ROUNDS,SCORING,DRIVE_DISTANCE,FWY_%,GIR_%,SG_P,SG_TTG,SG_T,POINTS,TOP 10,1ST,MONEY,Year
0,Matt Kuchar,97,69.606,286.9,67.89,69.36,0.648,0.827,1.461,2728,11,1,4910477,2010
1,Steve Stricker,73,69.66,282.9,68.5,68.29,0.437,1.383,1.818,2028,9,2,4190235,2010
2,Retief Goosen,75,69.718,291.4,64.79,65.96,0.679,0.917,1.598,1360,10,0,3218089,2010
3,Paul Casey,64,69.72,294.2,61.31,68.68,0.812,0.587,1.411,2250,7,0,3613194,2010
4,Jim Furyk,76,69.828,276.0,71.01,67.12,0.402,1.159,1.564,2980,7,3,4809622,2010


In [22]:
# Report the frame's dimensions as (rows, columns).
print(df.shape)

(192, 14)


In [11]:
# Save the collected data to output.csv (a relative path).
df.to_csv(r'output.csv')

## Data Preprocessing

Now we can go through a sequence of steps to preprocess or "clean" the collected data.

In [23]:
# List each column's non-null count and dtype before any cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 192 entries, 0 to 191
Data columns (total 14 columns):
NAME              192 non-null object
ROUNDS            192 non-null object
SCORING           192 non-null object
DRIVE_DISTANCE    192 non-null object
FWY_%             192 non-null object
GIR_%             192 non-null object
SG_P              192 non-null object
SG_TTG            192 non-null object
SG_T              192 non-null object
POINTS            192 non-null object
TOP 10            165 non-null object
1ST               165 non-null object
MONEY             192 non-null object
Year              192 non-null object
dtypes: object(14)
memory usage: 22.5+ KB


In [16]:
# Summarise the object-dtype columns: count, number of unique values,
# most frequent value (top) and how often it occurs (freq).
df_total.describe(include=['O'])

Unnamed: 0,NAME,ROUNDS,SCORING,DRIVE_DISTANCE,FWY_%,GIR_%,SG_P,SG_TTG,SG_T,POINTS,TOP 10,1ST,MONEY,Year
count,192,192,192.0,192.0,192.0,192.0,192.0,192.0,192.0,192,165,165.0,192,192
unique,192,57,185.0,143.0,179.0,167.0,183.0,186.0,182.0,182,11,4.0,192,1
top,Alex Cejka,84,70.954,286.9,64.19,66.67,-0.36,0.243,0.572,314,1,,642543,2010
freq,1,8,3.0,4.0,2.0,5.0,2.0,2.0,2.0,2,45,133.0,1,192


In [26]:
# Remove dollar signs and commas from the MONEY column.
# regex=False makes every pattern a literal string: '$' is a regex anchor,
# and the default value of `regex` differs between pandas versions.
df['MONEY'] = (
    df['MONEY']
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
)

# Remove commas from the POINTS column
df['POINTS'] = df['POINTS'].str.replace(',', '', regex=False)

In [27]:
# Convert the stat columns from strings to numbers.
# NAME, TOP 10, 1ST and Year are not converted here.
numeric_cols = [
    'ROUNDS', 'SCORING', 'DRIVE_DISTANCE', 'FWY_%', 'GIR_%',
    'SG_P', 'SG_TTG', 'SG_T', 'POINTS', 'MONEY',
]
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric)

In [35]:
# Check data for missing values
# NaN cells anywhere in the frame.
missing_vals = df.isnull().sum().sum()
# Empty-string cells in both result columns. The previous version counted
# '1ST' twice and never looked at 'TOP 10'.
blank_vals = (df['TOP 10'] == '').sum() + (df['1ST'] == '').sum()

print("The data has {} null values and {} blank entries".format(missing_vals, blank_vals))

The data has 0 null values and 0 blank entries


In [36]:
# Impute with 0s in 1st and Top 10 columns
# Only these two columns are touched, so a NaN in a numeric column is never
# replaced by the string '0'. The fill value is a string because both
# columns hold strings at this point.
count_cols = ['TOP 10', '1ST']
df[count_cols] = df[count_cols].fillna('0').replace('', '0')

# Check data for missing values
missing_vals = df.isnull().sum().sum()
# Blank entries are counted in both columns (previously '1ST' was counted twice).
blank_vals = (df[count_cols] == '').sum().sum()

print("The data has {} null values and {} blank entries".format(missing_vals, blank_vals))

The data has 0 null values and 0 blank entries


In [37]:
# Re-check non-null counts and dtypes after cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 192 entries, 0 to 191
Data columns (total 14 columns):
NAME              192 non-null object
ROUNDS            192 non-null int64
SCORING           192 non-null float64
DRIVE_DISTANCE    192 non-null float64
FWY_%             192 non-null float64
GIR_%             192 non-null float64
SG_P              192 non-null float64
SG_TTG            192 non-null float64
SG_T              192 non-null float64
POINTS            192 non-null int64
TOP 10            192 non-null object
1ST               192 non-null object
MONEY             192 non-null int64
Year              192 non-null object
dtypes: float64(7), int64(3), object(4)
memory usage: 22.5+ KB


In [38]:
# Preview the first rows of the cleaned data.
df.head()

Unnamed: 0,NAME,ROUNDS,SCORING,DRIVE_DISTANCE,FWY_%,GIR_%,SG_P,SG_TTG,SG_T,POINTS,TOP 10,1ST,MONEY,Year
0,Matt Kuchar,97,69.606,286.9,67.89,69.36,0.648,0.827,1.461,2728,11,1,4910477,2010
1,Steve Stricker,73,69.66,282.9,68.5,68.29,0.437,1.383,1.818,2028,9,2,4190235,2010
2,Retief Goosen,75,69.718,291.4,64.79,65.96,0.679,0.917,1.598,1360,10,0,3218089,2010
3,Paul Casey,64,69.72,294.2,61.31,68.68,0.812,0.587,1.411,2250,7,0,3613194,2010
4,Jim Furyk,76,69.828,276.0,71.01,67.12,0.402,1.159,1.564,2980,7,3,4809622,2010


## Association Mining

Now that all of our data is collected and preprocessed, we can begin to analyze it.