# March Madness 2023 - Data Extraction

#### Creator: Weston Mauz
Project Start: 09/28/22

### Project Index

1. Problem Statement

2. **Data Extraction**

3. Exploratory Data Analysis

4. Feature Engineering

5. Feature Selection

6. Model Selection

7. Model tuning

8. Model serving (if applicable)

### File Index

1. Load Libraries

2. Load Files & Define Constants

3. Define Functions

4. Acquire & Write Team Data

5. Acquire & Write March Madness Results

6. Assemble & Write March Madness Dataframe

### 1) Load Libraries

In [124]:
# Pandas for dataframes
import pandas as pd

# Request for Network interaction
import requests

# Beautiful Soup for Data extraction
from bs4 import BeautifulSoup

# Import Team Class
from Classes.team import Team

# Pickle import for data serialization and deserialization
import pickle

# csv files from reading and writing
import csv

# Nan for dictionary creation
import math

### 2) Load Files & Define Constants

In [153]:
# Read Pickle File as dictionary
teams_dictionary_file = open("Data/teams_dictionary.pkl", "rb")
teams_dictionary = pickle.load(teams_dictionary_file)

# Load in keys (team names) csv as dataframe
teams_keys = pd.read_csv('Data/team_names.csv') 
team_keys_list = list(teams_keys['Team Name'])

# Load in stat labels csv as dataframe (then convert to list)
all_labels_df = pd.read_csv("Data/stat_labels.csv")
stat_labels_list = list(all_labels_df['Stat Label'])

# Load in keys (team names) csv as dataframe
results_csv = pd.read_csv('Data/Results/madness_results_confirmed.csv') 

# Data Dictionary to contain all data for all teams for all years -- Formatted
data_dictionary = {}

# Site link
base_url = "https://www.sports-reference.com"

### 3) Define Functions

##### Scrape Website for Team Data for a given year

- Input: Target Year
- Return: Data Dictionary of year stats for all teams

In [126]:
def extract_stats(year):
    
    for team in range(len(team_keys_list)):

        team_key = team_keys_list[team]

        stat_dictionary = {}

        print(team_key)

        # Set up Soup URL for data parsing
        if year in teams_dictionary[team_key].stat_years:
            team_year_key = teams_dictionary[team_key].stat_link_dict[year]
        else:
            continue
        
        year_url = base_url + team_year_key
        page = requests.get(year_url)
        year_page = BeautifulSoup(page.content, 'html.parser')

        # Extract data
        per_game_table = year_page.select('[id="schools_per_game"] tbody tr td')
        stat_values = [stat.get_text() for stat in per_game_table]

        stat_dictionary[year] = stat_values

        data_dictionary[f'{team_key}'] = stat_dictionary
        
    return(data_dictionary)

# Website Scraping
# Pulls team data from webpate and returns as a dictionary of data
# Strings are raw & unfiltered
# Input is the year in string form (ex. '2020-21')

##### Format Stats 

- Input: List of stat data (unformatted)
- Return: List of stat data (formatted)
- This function is run through format_data

In [127]:
def format_stats(team_stats):
    count = 0
    for string in team_stats:
        if string.endswith('th'):
            string = string[:-2]
        if string.endswith('rd'):
            string = string[:-2]
        if string.endswith('nd'):
            string = string[:-2]
        if string.endswith('st'):
            string = string[:-2]
        if string != '':
            team_stats[count] = float(string)
        else:
            team_stats[count] = -1
        
        team_stats[count] = string

        count +=1
        
    return team_stats

# Stat formatting nested function
# Nests into format_data(data) function
# Input is stat list as List type

##### Format Data

- Input: Data dictionary (unformatted)
- Return: Data dictionary (formatted)
- This function calls format_stats

- strings -> floats
- Remove ordinal indicators (th | rd | st | nd)

In [128]:
def format_data(data):
    for team in data.keys():
        for year_key in data[team].keys():
            format_stats(data[team][year_key])

# Stat Formatting
# imput data is dictionary type


##### Transfer Data 
- data_dictionary --> teams_dictionary

In [129]:
def data_transfer(data_dict):
    for team in data_dict.keys():
        for year in data_dict[team].keys():
            teams_dictionary[team].stat_year_data[year] = data_dict[team][year]

##### Write formatted data dictionary

- Input: data_dictionary & target_year
- Return: Data/Yearly-Data/{target_year}.csv

In [130]:
def write_year_dict(dict, year):
    # create a binary pickle file for main dataframe export 
    f = open(f"Data/Yearly-Data/{year}_data_dictionary.pkl","wb")

    # write the python object (dict) to pickle file
    pickle.dump(dict,f)

    # close file
    f.close()

##### Write Completed Teams Dictionary

- Input: Teams Dictionary
- Output: None
- Teams dictionary -> pkl file: teams_dictionary.pkl

In [131]:
def write_team_dict(team_dict):
        # create a binary pickle file for main dataframe export 
    f = open(f"Data/teams_dictionary.pkl","wb")

    # write the python object (dict) to pickle file
    pickle.dump(team_dict,f)

    # close file
    f.close()

##### Retrieve Results Data into Dictionary

- Input: Results csv file: 'Data/Results/madness_results_confirmed.csv'
- Output: Results data dictionary
- Writes dictionary to pkl file: result_dictionary.pkl

In [151]:
def retrieve_madness_results(results):

    # Data Dictionary for all results
    results_data_dictionary = {}

    year_array = results.columns[1:]

    for team in range(len(results['Team Name'])):

        team_name = results['Team Name'][team]

        rankings_dictionary = {}

        for year in year_array:
            ranking = results[year][team]
            rankings_dictionary[year] = ranking

        results_data_dictionary[team_name] = rankings_dictionary

    f = open(f"Data/result_dictionary.pkl","wb")

    # write the python object (dict) to pickle file
    pickle.dump(results_data_dictionary,f)

    # close file
    f.close()
    
    return results_data_dictionary

##### Compile Madness dataframe

- Input: Target Year, Data Labels, Results Dictionary
- Returns: Dataframe (formatted for analysis)
- Writes dataframe to csv: "Data/Madness/{year}.csv"

In [156]:
def df_year_compiler(year, data_labels, results_dictionary):

    madness_teams = []

    for key in results_dictionary:
        result = results_dictionary[key][year]
        if not math.isnan(result):
            madness_teams.append(key)
    
    year_df = pd.DataFrame(columns=data_labels)

    for team in madness_teams:

        if year in teams_dictionary[team].stat_years:
            team_stats = teams_dictionary[team].stat_year_data[year]
        else:
            continue

        #Final data format for dataframe upload
        team_name = teams_dictionary[team].name
        team_id = teams_dictionary[team].team_id
        madness_result = results_dictionary[team][year]

        team_info = [team_name, team_id, year, madness_result]

        all_data = team_info + team_stats

        if len(all_data) == len(data_labels):
            year_df.loc[len(year_df.index)] = all_data

    year_df.to_csv(f"Data/Madness/{year}.csv", index=False)
    return(year_df)

### 4) Acquire Team Data

In [159]:
# Evaluation Year
target_year = '2021-22'

In [134]:
# Dictionary of raw year data for each team in a given year
data_dict = extract_stats(target_year)

# Runtime ~ 5-10 minutes

Abilene Christian Wildcats
Air Force Falcons
Akron Zips
Alabama A&M Bulldogs
Alabama Crimson Tide
Alabama State Hornets
Albany (NY) Great Danes
Alcorn State Braves
Allegheny Gators
American Eagles
Amherst Lord Jeffs
Appalachian State Mountaineers
Arizona State Sun Devils
Arizona Wildcats
Arkansas Razorbacks
Arkansas State Red Wolves
Arkansas-Pine Bluff Golden Lions
Armstrong Pirates
Army Black Knights
Auburn Tigers
Augusta State Jaguars
Augustana (IL) Vikings
Austin Peay Governors
Baker University Wildcats
Baldwin-Wallace Yellow Jackets
Ball State Cardinals
Baltimore Super Bees
Baylor Bears
Bellarmine Knights
Belmont Bruins
Beloit Buccaneers
Bethune-Cookman Wildcats
Binghamton Bearcats
Birmingham-Southern Panthers
Bloomsburg Huskies
Boise State Broncos
Boston College Eagles
Boston University Terriers
Bowling Green State Falcons
Bradley Braves
Brigham Young College 
Brigham Young Cougars
Brooklyn Bulldogs
Brown Bears
Bryant Bulldogs
Bucknell Bison
Buffalo Bulls
Butler Bulldogs
Cal Poly 

In [135]:
# Format Data in dictionary
format_data(data_dict)

In [136]:
# Write Dictionary to pkl file
write_year_dict(data_dict, target_year)

In [137]:
# Transfer data dictionary to teams_dictionary
data_transfer(data_dict)

In [138]:
# Write teams_dictionary to pkl file
write_team_dict(teams_dictionary)

### 5) March Madness Data Acquisition

In [157]:
# Read results file from csv & translate to dictionary
# Write to pkl file
results_dict = retrieve_madness_results(results_csv)

### 6) March Madness Dataframe Assembly and Writing

In [160]:
# Labels for final dataframe export
team_data_labels = ['Team Name', 'Team ID', 'Test Year', 'Madness Result'] + stat_labels_list

# Compile March Madness dataframe & write to csv file
df_year_compiler(year= target_year, data_labels= team_data_labels, results_dictionary=results_dict)

Unnamed: 0,Team Name,Team ID,Test Year,Madness Result,g,mp_per_g,fg_per_g,fga_per_g,fg_pct,fg2_per_g,...,opp_ft_pct_rank,opp_orb_per_g_rank,opp_drb_per_g_rank,opp_trb_per_g_rank,opp_ast_per_g_rank,opp_stl_per_g_rank,opp_blk_per_g_rank,opp_tov_per_g_rank,opp_pf_per_g_rank,opp_pts_per_g_rank
0,Michigan Wolverines,236,2021-22,3.0,34,40.0,26.9,57.6,.467,20.6,...,282,48,23,12,263,46,204,13,107,188
1,Duke Blue Devils,110,2021-22,5.0,39,40.1,29.7,60.4,.491,21.8,...,59,312,43,143,348,119,199,41,91,134
2,Marquette Golden Eagles,220,2021-22,1.0,32,40.3,26.9,59.8,.450,18.3,...,107,353,315,350,336,270,193,257,60,251
3,Ohio State Buckeyes,298,2021-22,2.0,32,40.3,25.6,54.4,.471,17.8,...,270,247,10,50,152,144,38,17,230,132
4,Indiana Hoosiers,173,2021-22,1.0,35,40.4,25.8,56.8,.454,19.9,...,341,115,149,119,285,210,73,122,235,84
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,Vermont Catamounts,437,2021-22,1.0,34,40.1,27.5,55.9,.492,18.8,...,63,3,42,3,315,9,97,55,126,12
64,Virginia Tech Hokies,442,2021-22,1.0,36,40.1,25.9,55.0,.471,17.1,...,316,66,19,14,269,167,150,111,47,24
65,Wright State Raiders,480,2021-22,1.0,36,40.0,27.4,59.4,.461,20.9,...,176,237,77,124,256,310,130,204,277,253
66,Wyoming Cowboys,481,2021-22,0.5,34,40.6,25.1,54.9,.457,17.0,...,221,86,119,83,50,25,65,38,336,62
