# Group 26: Introduction to Social Data Science (ISDS), exam project 2023

## Can We Predict If a PGA Tour Player Won a Tournament and Their Earnings based on educational background?

## <a id='TOC'>Table of Contents</a>
<ol>
<li><a href='#section_1'>Data Collection: Accessing stats on the PGA Tour website</a></li>
<li><a href='#section_2'>Predicting winners with machine learning classification models</a></li>
<li><a href='#section_3'>Predicting earnings per tournament based on college background</a></li>
</ol>

## 1. <a id='section_1'>Data Collection: Accessing stats on the PGA Tour website </a>
<a href='#TOC'>Back to table of Contents</a>

### Importing Packages 

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns

### Fetching data via API

In [None]:
def available_stats(category: int):
    """
    Retrieve available statistics within a specific category.

    Sends the ``StatDetails`` GraphQL query to the PGA Tour orchestrator
    endpoint and returns one entry of the ``statCategories`` list found in
    the response.

    Parameters:
        category (int): The index of the category for which to retrieve statistics.

    Returns:
        dict: Information about the statistics within the specified category.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
        IndexError: If ``category`` is not a valid index into the returned categories.
    """
    X_API_KEY = "da2-gsrx5bibzbb4njvhl7t37wqyl4"
    payload = {
        "operationName": "StatDetails",
        "variables": {
            "tourCode": "R",
            # NOTE(review): the query declares $statId as String! but an int is
            # sent here; get_data sends strings. Confirm the API coerces it.
            "statId": 2,
            "year": 2023,
            "eventQuery": None
        },
        "query": (
            "query StatDetails($tourCode: TourCode!, $statId: String!, $year: Int, $eventQuery: StatDetailEventQuery) {\n"
            "  statDetails(\n"
            "    tourCode: $tourCode\n"
            "    statId: $statId\n"
            "    year: $year\n"
            "    eventQuery: $eventQuery\n"
            "  ) {\n"
            "    tourCode\n"
            "    year\n"
            "    displaySeason\n"
            "    statId\n"
            "    statType\n"
            "    tournamentPills {\n"
            "      tournamentId\n"
            "      displayName\n"
            "    }\n"
            "    yearPills {\n"
            "      year\n"
            "      displaySeason\n"
            "    }\n"
            "    statTitle\n"
            "    statDescription\n"
            "    tourAvg\n"
            "    lastProcessed\n"
            "    statHeaders\n"
            "    statCategories {\n"
            "      category\n"
            "      displayName\n"
            "      subCategories {\n"
            "        displayName\n"
            "        stats {\n"
            "          statId\n"
            "          statTitle\n"
            "        }\n"
            "      }\n"
            "    }\n"
            "    rows {\n"
            "      ... on StatDetailsPlayer {\n"
            "        __typename\n"
            "        playerId\n"
            "        playerName\n"
            "        country\n"
            "        countryFlag\n"
            "        rank\n"
            "        rankDiff\n"
            "        rankChangeTendency\n"
            "        stats {\n"
            "          statName\n"
            "          statValue\n"
            "          color\n"
            "        }\n"
            "      }\n"
            "      ... on StatDetailTourAvg {\n"
            "        __typename\n"
            "        displayName\n"
            "        value\n"
            "      }\n"
            "    }\n"
            "  }\n"
            "}"
        )
    }

    req = requests.post(
        "https://orchestrator.pgatour.com/graphql",
        json=payload,
        headers={"x-api-key": X_API_KEY, 'name': 'Simon Knobelauch Hansen', 'email': 'Rfv228@alumni.ku.dk'},
    )
    # Fail with a clear HTTP error instead of an opaque JSON/KeyError further down
    # (same check that get_data performs).
    req.raise_for_status()
    stats_in_category = req.json()["data"]["statDetails"]["statCategories"][category]
    return stats_in_category

In [None]:
# NOTEBOOK INFORMATION:
# -----------------------------------------------------------------------------
# The results computed from this notebook are used in the project
# "Can We Predict If a PGA Tour Player Won a Tournament in That Year and Their Earnings
# based on educational background?". They were sourced from a run on the 21st of August 2023.
# Due to ongoing golf events in the 2023 PGA Tour, the season 2023 dataset on which the results are based will change slightly if the notebook is run later.
# -----------------------------------------------------------------------------

As an example, we can retrieve the available stats in SCORING (category 5). There are 9 categories in total (0-8).

In [None]:
# Display the entry at index 5 of the statCategories list returned by the API.
available_stats(category=5)

We move on to building a function which can retrieve and merge the golf stats we want: 

In [None]:
def _extract_stat_value(stats, position):
    """Return the cleaned ``statValue`` at ``position`` of a row's stats list, or None."""
    # Rows without a stats list (or with fewer entries than headers) yield None.
    if isinstance(stats, list) and len(stats) > position:
        return str(stats[position]['statValue']).replace(",", "").replace("$", "")
    return None


def get_data(id_list: list):
    """
    Retrieve and merge statistics data based on a list of stat IDs.

    For every stat ID one ``StatDetails`` request is sent. Each stat header of
    the response becomes a column named ``"<statTitle>(<header>)"`` whose values
    are the stat values as strings with "," and "$" removed. The per-stat tables
    are outer-merged on ``playerId``, ``playerName`` and ``country``.

    Parameters:
        id_list (list): List of stat IDs to retrieve data for.

    Returns:
        pd.DataFrame: Merged DataFrame containing statistics data. If no stat
        returned any rows (or ``id_list`` is empty), an empty DataFrame with
        only the three key columns is returned.

    Raises:
        requests.HTTPError: If a request answers with an HTTP error status.
    """
    X_API_KEY = "da2-gsrx5bibzbb4njvhl7t37wqyl4"
    YEAR = 2023 # Change this to the year you want to retrieve data for
    key_columns = ["playerId", "playerName", "country"]
    query = (
        "query StatDetails($tourCode: TourCode!, $statId: String!, $year: Int, $eventQuery: StatDetailEventQuery) {\n"
        "  statDetails(\n"
        "    tourCode: $tourCode\n"
        "    statId: $statId\n"
        "    year: $year\n"
        "    eventQuery: $eventQuery\n"
        "  ) {\n"
        "    tourCode\n"
        "    year\n"
        "    displaySeason\n"
        "    statId\n"
        "    statType\n"
        "    tournamentPills {\n"
        "      tournamentId\n"
        "      displayName\n"
        "    }\n"
        "    yearPills {\n"
        "      year\n"
        "      displaySeason\n"
        "    }\n"
        "    statTitle\n"
        "    statDescription\n"
        "    tourAvg\n"
        "    lastProcessed\n"
        "    statHeaders\n"
        "    statCategories {\n"
        "      category\n"
        "      displayName\n"
        "      subCategories {\n"
        "        displayName\n"
        "        stats {\n"
        "          statId\n"
        "          statTitle\n"
        "        }\n"
        "      }\n"
        "    }\n"
        "    rows {\n"
        "      ... on StatDetailsPlayer {\n"
        "        __typename\n"
        "        playerId\n"
        "        playerName\n"
        "        country\n"
        "        countryFlag\n"
        "        rank\n"
        "        rankDiff\n"
        "        rankChangeTendency\n"
        "        stats {\n"
        "          statName\n"
        "          statValue\n"
        "          color\n"
        "        }\n"
        "      }\n"
        "      ... on StatDetailTourAvg {\n"
        "        __typename\n"
        "        displayName\n"
        "        value\n"
        "      }\n"
        "    }\n"
        "  }\n"
        "}"
    )

    df = None  # stays None until the first non-empty stat has been fetched

    for stat_id in id_list:
        payload = {
            "operationName": "StatDetails",
            "variables": {
                "tourCode": "R",
                "statId": stat_id,
                "year": YEAR,
                "eventQuery": None
            },
            "query": query,
        }
        page = requests.post(
            "https://orchestrator.pgatour.com/graphql",
            json=payload,
            headers={"x-api-key": X_API_KEY, 'name': 'Simon Knobelauch Hansen', 'email': 'Rfv228@alumni.ku.dk'},
        )
        page.raise_for_status()
        # Parse the response body once instead of on every column access.
        stat_details = page.json()["data"]["statDetails"]

        df_temp = pd.DataFrame(stat_details["rows"])
        if df_temp.empty: # Some stats have no data, giving empty dataframe, which is skipped
            continue

        col_names = list(key_columns)
        for position, header in enumerate(stat_details["statHeaders"]):
            col_name = stat_details["statTitle"] + "(" + header + ")"
            df_temp[col_name] = df_temp.stats.apply(_extract_stat_value, args=(position,))
            col_names.append(col_name)
        df_temp = df_temp[col_names]

        # The first non-empty stat initializes the result; later ones are merged onto it.
        if df is None:
            df = df_temp
        else:
            df = pd.merge(df, df_temp, on=key_columns, how="outer")

    if df is None:
        return pd.DataFrame(columns=key_columns)
    return df

Now, define a list of stat IDs that we want to include in our dataset. We provide an example below. Note, the length of the dataframe must equal the number of unique Player IDs to ensure that each row corresponds to one player. 

In [None]:
# Stat IDs to download and merge into one table.
desired_stats = [
    "02675", "02567", "02568", "02569", "02564",
    "101", "130", "402", "014", "108",
    "103", "300", "154", "138", "213",
]
df = get_data(id_list=desired_stats)

# Sanity check: the row count should match the number of distinct player IDs.
n_rows = len(df)
n_unique_players = len(df.playerId.unique())
print("Length of df (%s) should be the amount of unique player ids which is: %s" % (n_rows, n_unique_players))
print(df.columns)

In [None]:
# Display the merged stats table.
df

In [None]:
# Columns to remove from the merged stats table
columns_to_drop = [
    'SG: Off-the-Tee(Measured Rounds)',
    'SG: Approach the Green(Measured Rounds)',
    'SG: Around-the-Green(Measured Rounds)',
    'SG: Putting(Measured Rounds)',
    'Scrambling(Par or Better)',
    'Scrambling(Missed GIR)',
    'Overall Putting Average(# of Putts)',
    'Overall Putting Average(# of Holes)',
    'Scoring Average (Actual)(Total Strokes)',
    'Scoring Average (Actual)(Total Rounds)',
    'Greens in Regulation Percentage(Greens Hit)',
    'Greens in Regulation Percentage(# Holes)',
    'Greens in Regulation Percentage(Relative/Par)',
    'Top 10 Finishes(1st)',
    'Top 10 Finishes(2nd)',
    'Top 10 Finishes(3rd)',
    'Hit Fairway Percentage(Possible Fairways)',
    'Hit Fairway Percentage(Relative to Par)',
    'Hit Fairway Percentage(Fairways Hit)',
    'Driving Distance(Total Drives)',
    'Driving Distance(Total Distance)',
    'SG: Total(Total SG:T)',
    'SG: Off-the-Tee(Avg)',
    'SG: Approach the Green(Avg)',
    'SG: Around-the-Green(Avg)',
]

# Drop the specified columns
df = df.drop(columns=columns_to_drop)

# Replace NaN with 0 in Top 10 and convert to int.
# The result is assigned back instead of using fillna(inplace=True) on a column
# selection, which does not modify df when pandas copy-on-write is active.
df['Top 10 Finishes(Top 10)'] = df['Top 10 Finishes(Top 10)'].fillna(0).astype(int)

# Percentage columns: strip the trailing "%" and convert to float
for pct_column in ('Scrambling(%)', 'Greens in Regulation Percentage(%)', 'Hit Fairway Percentage(%)'):
    df[pct_column] = df[pct_column].str.rstrip('%').astype(float)

# Loop through the remaining object columns (skipping the first three) and convert
# them to numbers; values that cannot be parsed become NaN. The cast to Int64 is
# attempted and, if it fails, the numeric column is kept as is (errors='ignore').
for column in df.select_dtypes(include=['object']).iloc[:, 3:].columns:
    df[column] = pd.to_numeric(df[column], errors='coerce').astype('Int64', errors='ignore')

# Drop rows with NaN values
df = df.dropna()

In [None]:
# Inspect the dtype of every column after cleaning.
column_types = df.dtypes
column_types

In [None]:
# Summary statistics of the cleaned stats table.
df.describe()

Next up, we want to access the biographies of each player available on the website. We then want to merge the information retrieved from the bios on to the dataframe containing the desired stats for each player. 

In [None]:
# API key for authentication
X_API_KEY = "da2-gsrx5bibzbb4njvhl7t37wqyl4"

# Define the payload for the GraphQL query
payload = {
    "operationName": "PlayerDirectory",
    "variables": {
        "tourCode": "R"
    },
    "query": """
        query PlayerDirectory($tourCode: TourCode!, $active: Boolean) {
            playerDirectory(tourCode: $tourCode, active: $active) {
                tourCode
                players {
                    id
                    isActive
                    firstName
                    lastName
                    shortName
                    displayName
                    alphaSort
                    country
                    countryFlag
                    headshot
                    playerBio {
                        id
                        age
                        education
                        turnedPro
                    }
                }
            }
        }
    """
}

# Send a POST request to the GraphQL API
req = requests.post(
    "https://orchestrator.pgatour.com/graphql",
    json=payload,
    headers={"x-api-key": X_API_KEY}
)
# Stop on HTTP errors instead of failing later with an opaque JSON/KeyError
req.raise_for_status()

# Build one row per player from the "playerBio" entry of each player
players = req.json()["data"]["playerDirectory"]["players"]
df_player_bio = pd.DataFrame([player["playerBio"] for player in players])

# Merge playerBio data with existing DataFrame using playerId as the key
df = pd.merge(df, df_player_bio, left_on="playerId", right_on="id", how="left")
df = df.drop(columns=['id'])
df['age'] = df['age'].astype(int)
df['turnedPro'] = df['turnedPro'].astype(int)
df['Winner_dummy'] = df['Victory Leaders(Victories)'].apply(lambda x: 1 if x > 0 else 0) #Winner dummy
df['Career Earnings(Money)'] = df['Career Earnings(Money)'].astype(float)

# Placeholder strings that are treated as missing values
non_values = ['non', 'na', 'n/a', 'unknown', 'None']

# "education": turn placeholders into NaN, then label all missing values "outside the US"
df['education'] = df['education'].mask(df['education'].isin(non_values))
df['education'] = df['education'].fillna('outside the US')

# "turnedPro": turn placeholders into NaN, then label missing values "Amateur".
# NOTE(review): turnedPro was already cast to int above, so neither step can
# change anything here (the cast would have raised on a placeholder or NaN).
# Decide whether this cleaning should run before the cast instead.
df['turnedPro'] = df['turnedPro'].mask(df['turnedPro'].isin(non_values))
df['turnedPro'] = df['turnedPro'].fillna('Amateur')

df

In [None]:
# Summary statistics after the biography columns have been merged in.
df.describe()

In [None]:
# Variables for which descriptive statistics are reported
variables_of_interest = [
    'SG: Total(Avg)',
    'SG: Total(Measured Rounds)',
    'Driving Distance(Avg)',
    'Victory Leaders(Victories)',
    'Top 10 Finishes(Top 10)',
    'Money per Event Leaders(Money per event)',
]

# Descriptive statistics of the selection, rounded to one decimal
stats = df[variables_of_interest].describe()
rounded_stats = stats.round(1)

rounded_stats

In [None]:
# Plot style used for the figures
sns.set(style="whitegrid")

# Histogram with KDE overlay of SG: Total(Avg), drawn on an explicit figure/axes pair
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x="SG: Total(Avg)", kde=True, ax=hist_ax)

hist_ax.set_title("Distribution Plot of SG Total(Avg)")
hist_ax.set_xlabel("SG: Total(Avg)")
hist_ax.set_ylabel("Frequency")

# Save to disk, then display
hist_fig.savefig('SG_Total(Avg)_dist.png')
plt.show()

In [None]:
# Calculate the correlation matrix.
# numeric_only=True restricts it to numeric columns; without it, pandas >= 2.0
# raises on the string columns (player name, country, education).
correlation_matrix = df.corr(numeric_only=True)

# Set up the figure
plt.figure(figsize=(18, 14))  # Adjust the figure size as needed

# Create a heatmap of the correlation matrix, colour scale fixed to [-1, 1]
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)

plt.title("Correlation Heatmap of Variables")
plt.savefig('correlation_heatmap.png')
plt.show()

In [None]:
# Correlation of every numeric column with the number of victories.
# numeric_only=True is required on pandas >= 2.0, where string columns raise.
df.corr(numeric_only=True)['Victory Leaders(Victories)']

In [None]:
# Swarm plot of SG: Total(Avg) split by the winner dummy (0 = non-winner, 1 = winner).
# NOTE(review): catplot is figure-level, so `ax` holds the returned grid object
# rather than a matplotlib Axes; the meaning of `size=3` depends on the
# seaborn version in use — confirm.
ax = sns.catplot(
    x="Winner_dummy",
    y="SG: Total(Avg)",
    kind="swarm",
    data=df,
    size=3,
    palette="Set1"  # "Set1" colour palette
)

# Set axis labels (the x label is left empty; tick labels are set below)
ax.set(xlabel='', ylabel='SG: Total(Avg)')

# Customize the aesthetics
sns.despine()  # Remove spines
plt.grid(True, axis="y", linestyle="--", alpha=0.7)  # Add horizontal grid lines
plt.xticks([0, 1], ["Non-winners", "Winners"])  # Replace the 0/1 tick labels with names
plt.savefig("SG_Total_avg_scatter.png", dpi=300)  # Save the plot
# Show the plot
plt.show()

In [None]:
# Swarm plot of age by winner dummy, coloured by country.
# The trailing semicolon suppresses the notebook's text output of the last expression.
ax = sns.catplot(x="Winner_dummy", y="age", kind="swarm", hue="country", data=df, size=3)
ax.set(xlabel='Winners vs non-winners', ylabel='age');

In [None]:
# Count the number of players for each education value.
# NOTE: despite its name, `country_counts` holds counts per education, not per country.
country_counts = df['education'].value_counts()

print(country_counts)

In [None]:
# Violin plot of SG: Total(Avg) for each value of the winner dummy.
# The trailing semicolon suppresses the notebook's text output of the last expression.
ax = sns.violinplot(x='Winner_dummy', y='SG: Total(Avg)', data=df)
ax.set(xlabel='Non-winner vs winners', ylabel='SG: Total(Avg)');

In [None]:
# Check the dtypes of the two columns used in the regression plot below
for checked_column in ('SG: Total(Avg)', 'Career Earnings(Money)'):
    print(df[checked_column].dtype)

In [None]:
# lmplot is a figure-level function: it creates its own figure, sized through
# `height` and `aspect`. A preceding plt.figure() call would only leave an
# extra, empty figure behind, so none is created here.
ax = sns.lmplot(x='SG: Total(Avg)', y='Career Earnings(Money)', data=df, height=6, aspect=1.2)

# Set labels for the plot
ax.set(xlabel='SG: Total(Avg)', ylabel='Career Earnings(Money)')

# Show the plot
plt.show()

In [None]:
# Histograms and pairwise scatter plots for the selected variables.
# `size` is the deprecated name of `height`; where seaborn still accepts it, it
# overrides `height`, so the previous `height=4, size=2` rendered with height 2.
# Only that effective value is passed now.
sns.pairplot(df, vars=['SG: Total(Avg)', 'Career Earnings(Money)', 'Winner_dummy', 'age'], height=2);

In [None]:
# Bar plot of how many players have each number of top-10 finishes,
# drawn on an explicitly created 8x6 figure
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='Top 10 Finishes(Top 10)', data=df, palette='rocket', ax=ax)

# Axis labels and title
ax.set(xlabel='Top 10 Finishes', ylabel='Count')
ax.set_title('Distribution of Top 10 Finishes')

# Show the plot
plt.show()

## 2. <a id='section_2'>Predicting winners with machine learning models: Classification </a>
<a href='#TOC'>Back to table of Contents</a>

The goal of this section is to use machine learning methods to predict if a player has won a tournament or not. For this purpose, we use a range of machine learning methods based on supervised learning to find the model with the best prediction accuracy. 

I.e. the goal is to learn a prediction rule for labelled data. Our target is categorical; the player has either won or not won a tournament. 

### Imports 

In [None]:
# import packages and modules from SciKitLearn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import learning_curve
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

### Prepare data for classification with partitioning

In [None]:
# Columns excluded from the feature matrix: identifiers, text columns, the
# target itself and the victory/money-per-event columns
non_feature_columns = [
    'playerId',
    'Winner_dummy',
    'playerName',
    'country',
    'education',
    'Victory Leaders(Victories)',
    'Money per Event Leaders(Money per event)',
]

# Features (X) and target (y)
X = df.drop(columns=non_feature_columns)
y = df['Winner_dummy']

# First split: development (2/3) vs. test (1/3)
X_dev, X_test, y_dev, y_test = train_test_split(
    X, y, test_size=1/3, random_state=42
)

# Second split: development halved into train and validation (1/3 of all data each)
X_train, X_val, y_train, y_val = train_test_split(
    X_dev, y_dev, test_size=1/2, random_state=42
)

### Logistic regression classification

We start out testing a plain logistic regression model, which gives us an accuracy of 80.9% on the test data

In [None]:
# Plain logistic regression with default settings, fitted on the training data
clf = LogisticRegression(random_state=42)
clf.fit(X_train, y_train)

# Predictions for the training and the test set
y_hat_train = clf.predict(X_train)
y_hat_test = clf.predict(X_test)

# Share of correct predictions on each set
accuracy_train = accuracy_score(y_train, y_hat_train)
accuracy_test = accuracy_score(y_test, y_hat_test)

print(f"Accuracy on test data: {accuracy_test:.3f}")
print(f"Accuracy on training data: {accuracy_train:.3f}")

# Per-class metrics and ROC AUC (based on the predicted probability of class 1)
report = classification_report(y_test, y_hat_test)
winner_probability = clf.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, winner_probability)

print("Classification Report:")
print(report)
print(f"ROC AUC score: {roc_auc:.3f}")

Logistic Regression classifier, Lasso (L1) regularization and Ridge (L2) regularization

In [None]:
# Pipeline: polynomial expansion -> standardization -> L1-penalized logistic regression
# (set penalty='l2' for Ridge regularization)
l1_logreg = LogisticRegression(random_state=42, penalty = 'l1', C=1.0, solver='saga')
pipe_lr = make_pipeline(PolynomialFeatures(include_bias=True), StandardScaler(), l1_logreg)

# Fit on the training data
pipe_lr.fit(X_train, y_train)

# Predictions on both sets
y_pred_train = pipe_lr.predict(X_train)
y_pred_test = pipe_lr.predict(X_test)

# Evaluation metrics
accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
report = classification_report(y_test, y_pred_test)
roc_auc = roc_auc_score(y_test, pipe_lr.predict_proba(X_test)[:, 1])

print(f"Accuracy on test data: {accuracy_test:.3f}")
print(f"Accuracy on training data: {accuracy_train:.3f}")

print("Classification Report:")
print(report)

print(f"ROC AUC score: {roc_auc:.3f}")

K-fold cross validation to optimize hyperparameter C

In [None]:
# Grid of candidate values passed to LogisticRegression as C
lambdas = np.linspace(0.1, 300, 60)

# Setup: 10 folds over the development data, materialized once so that every
# candidate value is evaluated on the same folds
kfolds = KFold(n_splits=10)
folds = list(kfolds.split(X_dev, y_dev))

# Outer loop: candidate values
accCV = []
for lambda_ in lambdas:

    # Inner loop: folds
    accCV_ = []
    for train_idx, val_idx in folds:

        # Fold-local names are used on purpose: reusing X_train / y_train / X_val /
        # y_val would overwrite the train/validation split made earlier, and later
        # cells that fit on X_train would silently train on the last fold instead.
        X_fold_train, y_fold_train = X_dev.iloc[train_idx], y_dev.iloc[train_idx]
        X_fold_val, y_fold_val = X_dev.iloc[val_idx], y_dev.iloc[val_idx]

        # Train model and compute accuracy on validation fold
        pipe_lrCV = make_pipeline(PolynomialFeatures(),
                                  StandardScaler(),
                                  LogisticRegression(random_state=42, penalty='l1', C=lambda_, solver='saga'))  # change to "penalty = 'l2' for l2 regularization"
        pipe_lrCV.fit(X_fold_train, y_fold_train)
        accCV_.append(accuracy_score(y_fold_val, pipe_lrCV.predict(X_fold_val)))

    # Store result
    accCV.append(accCV_)

# Convert to DataFrame: one row per candidate value, one column per fold
lambdaCV = pd.DataFrame(accCV, index=lambdas)

In [None]:
# Average the accuracy over the folds and keep the best-scoring candidate
mean_accuracy_per_candidate = lambdaCV.mean(axis=1)
optimal_lambda = mean_accuracy_per_candidate.nlargest(1)
print("Optimal lambda and optimal accuracy: ", optimal_lambda)

Training the model with the optimal hyperparameter

In [None]:
# Best candidate = highest mean accuracy across folds; its index label is the C value
optimal_lambda = lambdaCV.mean(axis=1).nlargest(1)
best_C = optimal_lambda.index[0]

# Re-estimate the pipeline with the selected C on the entire development set
tuned_logreg = LogisticRegression(C=best_C, random_state=42, penalty='l1', solver='saga')
pipe_lrCV = make_pipeline(PolynomialFeatures(), StandardScaler(), tuned_logreg)
pipe_lrCV.fit(X_dev, y_dev)

# Compare test accuracy of the untuned and the tuned pipeline
models = {'LogReg': pipe_lr, 'LogRegCV': pipe_lrCV}
for name, model in models.items():
    test_predictions = model.predict(X_test)
    score = accuracy_score(test_predictions, y_test)
    print(name, round(score, 3))

Learning curve

In [None]:
# Learning curve: 10-fold CV accuracy of the tuned pipeline for ten training-set sizes
train_sizes, train_scores, test_scores = learning_curve(
    estimator=pipe_lrCV,
    X=X_dev,
    y=y_dev,
    train_sizes=np.linspace(0.1, 1.0, 10),
    scoring='accuracy',
    cv=10,
    n_jobs=1,
)

# Mean and standard deviation across folds for every training-set size
train_mean = train_scores.mean(axis=1)
train_std = train_scores.std(axis=1)
test_mean = test_scores.mean(axis=1)
test_std = test_scores.std(axis=1)

# One line plus a +/- one standard deviation band per curve
curve_specs = [
    (train_mean, train_std, dict(color='blue', marker='o', markersize=5, label='training accuracy')),
    (test_mean, test_std, dict(color='green', linestyle='--', marker='s', markersize=5, label='validation accuracy')),
]
for curve_mean, curve_std, line_kwargs in curve_specs:
    plt.plot(train_sizes, curve_mean, **line_kwargs)
    plt.fill_between(train_sizes, curve_mean + curve_std, curve_mean - curve_std,
                     alpha=0.15, color=line_kwargs['color'])

plt.grid(True)
plt.xlabel('Number of training samples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.60, 1.03])
plt.savefig('learning_curve.png', dpi=300)
plt.show()

Validation curve

In [None]:
# Validation curve: 10-fold CV accuracy of the tuned pipeline over a log-spaced grid of C
param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
train_scores, test_scores = validation_curve(
    estimator=pipe_lrCV,
    X=X_dev,
    y=y_dev,
    param_name='logisticregression__C',
    param_range=param_range,
    cv=10,
)

# Mean and standard deviation across folds for every value of C
train_mean = train_scores.mean(axis=1)
train_std = train_scores.std(axis=1)
test_mean = test_scores.mean(axis=1)
test_std = test_scores.std(axis=1)

# One line plus a +/- one standard deviation band per curve
curve_specs = [
    (train_mean, train_std, dict(color='blue', marker='o', markersize=5, label='training accuracy')),
    (test_mean, test_std, dict(color='green', linestyle='--', marker='s', markersize=5, label='validation accuracy')),
]
for curve_mean, curve_std, line_kwargs in curve_specs:
    plt.plot(param_range, curve_mean, **line_kwargs)
    plt.fill_between(param_range, curve_mean + curve_std, curve_mean - curve_std,
                     alpha=0.15, color=line_kwargs['color'])

plt.grid(True)
plt.xscale('log')
plt.legend(loc='lower right')
plt.xlabel('Parameter C')
plt.ylabel('Accuracy')
plt.ylim([0.7, 1.0])
plt.savefig('validation_curve.png', dpi=300)
plt.show()

Random forest model 

In [None]:
# Random forest with 20 gini-criterion trees.
# NOTE(review): X_train / y_train are whatever these names were last bound to
# by earlier cells — confirm this is the intended training split.
forest = RandomForestClassifier(
    n_estimators=20,
    criterion='gini',
    n_jobs=1,
    random_state=1,
)
forest.fit(X_train, y_train)

# Test-set predictions and their accuracy
y_pred = forest.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")

## 3. <a id='section_3'>Predicting earnings per event based on college background</a>
<a href='#TOC'>Back to table of Contents</a>

### Importing packages

In [None]:
# Import packages relevant for this section
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import RFE
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

### Preparing dataframe for regression

Make dataframe and define features (X) and target (y)

In [None]:
# Work on a separate frame without rows containing NaN values
earning_df = df.dropna()

# Regression target: money earned per event
Y = earning_df['Money per Event Leaders(Money per event)']

# Everything that is not used as a feature: identifiers, text columns, the
# target and the other columns listed here
columns_to_drop = [
    'Winner_dummy',
    'playerId',
    'playerName',
    'country',
    'education',
    'Victory Leaders(Victories)',
    'Money per Event Leaders(Money per event)',
    'Career Earnings(Money)',
    'Money per Event Leaders(Total money)',
    'Top 10 Finishes(Top 10)',
    'SG: Total(Total SG:P)',
]
X = earning_df.drop(columns=columns_to_drop)

### Setting up a linear regression function 

Making a function that partitions the data into training and test data and then fits the model 

In [None]:
def linear_reg(X, Y):
    """
    Fit an ordinary least squares regression and print its fit statistics.

    The data is split into training and test sets with a fixed random state,
    the model is fitted on the training part, and R-squared on both parts
    plus the coefficients and intercept are printed.

    Parameters:
        X: Feature matrix.
        Y: Target values.

    Returns:
        None. All results are printed.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state = 10)
    clf = LinearRegression().fit(X_train, y_train)
    # No explicit predict() call is needed: score() predicts internally.

    print('R-Squared on training set: {:.3f}'
          .format(clf.score(X_train, y_train)))
    print('R-Squared on test set {:.3f}'
          .format(clf.score(X_test, y_test)))

    print('linear model coeff (w):\n{}'
         .format(clf.coef_))
    print('linear model intercept (b): {:.3f}'
         .format(clf.intercept_))

### Defining linear regression with regularization

In [None]:
def linear_reg(X, Y, regularization='none', alpha=1.0):
    """
    Fit a linear regression, optionally regularized, and print its fit statistics.

    Parameters:
        X (pd.DataFrame): Feature matrix. Every column is converted with
            pd.to_numeric; values that cannot be parsed become NaN.
        Y: Target values.
        regularization (str): 'none' (ordinary least squares), 'lasso' or 'ridge'.
        alpha (float): Regularization strength for 'lasso' and 'ridge';
            ignored for 'none'.

    Returns:
        None. R-squared on the training and test set, the coefficients and the
        intercept are printed.

    Raises:
        ValueError: If ``regularization`` is not one of the supported values.
    """
    # Validate first so that an unsupported value fails with a clear message
    # instead of an unbound-variable error after the split.
    supported = ('none', 'lasso', 'ridge')
    if regularization not in supported:
        raise ValueError(
            "regularization must be one of {}, got {!r}".format(supported, regularization)
        )

    # Convert any non-numeric data to numeric, handling errors with 'coerce'
    X = X.apply(pd.to_numeric, errors='coerce')

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=10)

    if regularization == 'none':
        clf, label = LinearRegression(), 'linear'
    elif regularization == 'lasso':
        clf, label = Lasso(alpha=alpha, max_iter=100000000), 'lasso'
    else:
        clf, label = Ridge(alpha=alpha), 'ridge'
    clf.fit(X_train, y_train)

    r2_train = clf.score(X_train, y_train)
    r2_test = clf.score(X_test, y_test)

    print('R-Squared on training set: {:.3f}'.format(r2_train))
    print('R-Squared on test set: {:.3f}'.format(r2_test))

    # `label` names the model type in the output
    print('{} model coeff (w):\n{}'.format(label, clf.coef_))
    print('{} model intercept (b): {:.3f}'.format(label, clf.intercept_))
Now, you can use the modified X and Y in your linear_reg function as before

In [None]:
# Run regression
# Fit and report the unregularized linear model on the earnings features.
linear_reg(X, Y)  # Without regularization

In [None]:
# Run regression 
# Same features and target, fitted with Lasso and the default alpha of 1.0.
linear_reg(X, Y, regularization='lasso')  # With Lasso regularization

### Introducing polynomial features 

In [None]:
# Creating degree-2 polynomial features to improve R-Squared
poly = PolynomialFeatures(2)
poly = poly.fit(X)
poly_earning = poly.transform(X)
print(poly_earning.shape)

# Creating a DataFrame with the polynomial features.
# get_feature_names_out replaces get_feature_names, which was removed in
# scikit-learn 1.2.
poly_earning = pd.DataFrame(poly_earning, columns=poly.get_feature_names_out(X.columns))

Run regression

In [None]:
# Unregularized linear regression on the polynomial features.
linear_reg(poly_earning, Y)

Polynomial regression with ridge regularization 

In [None]:
def linear_reg_ridge(X, Y, al):
    """
    Fit a ridge regression with strength ``al`` and print its fit statistics.

    The data is split into training and test sets with a fixed random state.
    R-squared on both parts, the coefficients and the intercept are printed;
    nothing is returned.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=10)

    ridge_model = Ridge(alpha=al)
    ridge_model.fit(X_train, y_train)

    train_r2 = ridge_model.score(X_train, y_train)
    test_r2 = ridge_model.score(X_test, y_test)

    print('(poly deg 2 + ridge) R-squared score (training): {:.3f}'.format(train_r2))
    print('(poly deg 2 + ridge) R-squared score (test): {:.3f}'.format(test_r2))
    print('(poly deg 2 + ridge) linear model coeff (w):\n{}'.format(ridge_model.coef_))
    print('(poly deg 2 + ridge) linear model intercept (b): {:.3f}'.format(ridge_model.intercept_))

In [None]:
# Ridge regression on the polynomial features with alpha = 1.
linear_reg_ridge(poly_earning, Y, al = 1)

In [None]:
# Ridge regression on the polynomial features with alpha = 100.
linear_reg_ridge(poly_earning, Y, al = 100)

### Cross validation 

In [None]:
def cross_val(X, Y, alpha=100, cv=5):
    """
    Cross-validate a ridge regression and print the fold scores.

    Parameters:
        X: Feature matrix.
        Y: Target values.
        alpha (float): Ridge regularization strength (default 100).
        cv (int): Number of cross-validation folds (default 5).

    Returns:
        None. The mean score with a +/- two standard deviation band and the
        individual fold scores are printed.
    """
    # cross_val_score fits a fresh copy of the estimator on every fold, so no
    # separate train/test split or fit is needed beforehand.
    clf = Ridge(alpha=alpha)
    scores = cross_val_score(clf, X, Y, cv=cv)

    # With no `scoring` argument the estimator's own score method is used,
    # which is R-squared for Ridge, so the output is labelled accordingly.
    print("R-squared: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    print(scores)

In [None]:
# Cross-validate the ridge model on the polynomial features.
cross_val(poly_earning, Y)

### Fetching the regression output for use in the project 

In [None]:
def find_earning_with_names(X, Y, education):
    """
    Predict earnings with a ridge model and return the rows of one education group.

    A Ridge model is fitted on the training part of a fixed-seed split and then
    used to predict for every row of ``X``. The predictions are joined with the
    features and with the ``education`` and ``playerName`` columns of the global
    ``df``.

    Parameters:
        X (pd.DataFrame): Feature matrix, one row per player.
        Y (pd.Series): Target values, row-aligned with ``X`` by position. Its
            index labels are used to look the players up in ``df``.
        education (str): Education value to filter on.

    Returns:
        pd.DataFrame: The features, the predictions (in the column labelled
        ``0``), ``education`` and ``playerName`` for players whose education
        equals ``education``.
    """
    X_train, _, y_train, _ = train_test_split(X, Y, random_state=10)
    clf = Ridge().fit(X_train, y_train)

    # Give the predictions the index of X so the column-wise concat joins each
    # prediction to its own feature row. The unnamed Series becomes column 0.
    y_pred = pd.Series(clf.predict(X), index=X.index)
    pred_data = pd.concat([X, y_pred], axis=1)

    # X may carry a fresh 0..n-1 index while Y keeps the row labels of df, so
    # the names are looked up through Y's labels and then re-labelled like X.
    row_labels = Y.index if isinstance(Y, pd.Series) else X.index
    names = df.loc[row_labels, ['education', 'playerName']]
    names.index = X.index
    pred_name = pd.concat([pred_data, names], axis=1)

    selected_earnings = pred_name.loc[pred_name['education'] == education]
    return selected_earnings

# Select the predicted earnings of players whose education is "outside the US".
# NOTE: at this point `mean_predicted_earning` holds the filtered DataFrame
# returned by find_earning_with_names, not a mean, so the print below shows
# the individual rows.
education_category = "outside the US"
mean_predicted_earning = find_earning_with_names(X=poly_earning, Y=Y, education=education_category)
print(f"Mean predicted earning for education '{education_category}': {mean_predicted_earning}")

In [None]:
# Actual mean money per event for players educated outside the US
is_outside_us = df['education'] == 'outside the US'
mean_money_per_event = df.loc[is_outside_us, 'Money per Event Leaders(Money per event)'].mean()

print("Mean of actual Money per Event:", mean_money_per_event)

# Mean of the model's predictions for the same group; the predictions are in
# the column labelled 0 of the frame returned by find_earning_with_names
predicted_outside_us = mean_predicted_earning[mean_predicted_earning['education'] == 'outside the US']
mean_predicted_earning = predicted_outside_us[0.].mean()

print("Mean predicted earnings:", mean_predicted_earning)