# March Madness 2023 -- Data Exploration

#### Creator: Weston Mauz
Project Start: 09/28/22

### Project Index

1. Problem Statement

2. **Data Extraction**

3. Exploratory Data Analysis

4. Feature Engineering

5. Feature Selection

6. Model Selection

7. Model tuning

8. Model serving (if applicable)

### File Index

1. Library Imports

2. Team Name Extraction

3. Assign Team IDs

4. Collect Team Home Web Page Links

5. Collect Team Year Page Links

6. Locate & Write Stat Labels

### 1. Library Imports

In [54]:
# Pandas for dataframes
import pandas as pd

# Request for Network interaction
import requests

# Beautiful Soup for Data extraction
from bs4 import BeautifulSoup

# Import Team Class
from team import Team

# Pickle import for data serialization and deserialization
import pickle

### 2. Team Name Extraction

The first step is to pull every school name from the Sports Reference schools index page.

In [55]:
# Scrape the school index page for every team's anchor tag and display name

# Site root and the path of the page that lists every school
base_url = "https://www.sports-reference.com"
school_url = "/cbb/schools/"

# Fetch the index page. The timeout stops a stalled connection from hanging
# the notebook, and raise_for_status() fails loudly on a 4xx/5xx response
# instead of silently parsing an error page into an empty team list.
page = requests.get(base_url + school_url, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# Each school name is an <a> inside the body rows of the element whose id is
# "div_schools"; keep the tags themselves because their hrefs are used later
name_tags = soup.select('[id="div_schools"] tbody tr td a')
team_names = [nt.get_text() for nt in name_tags]

In [56]:
# Wrap the scraped names in a single-column dataframe and display it
team_names_df = pd.DataFrame({'Team Name': team_names})
team_names_df

Unnamed: 0,Team Name
0,Abilene Christian Wildcats
1,Air Force Falcons
2,Akron Zips
3,Alabama A&M Bulldogs
4,Alabama Crimson Tide
...,...
480,Wright State Raiders
481,Wyoming Cowboys
482,Xavier Musketeers
483,Yale Bulldogs


In [57]:
# Persist the team names (used as lookup keys later) without the row index
team_names_csv_path = "Data/team_names.csv"
team_names_df.to_csv(team_names_csv_path, index=False)

### 3. Assign Team IDs
Each team's ID is its row index in the team-name table, stored as a string.

In [58]:
# Reload the saved team names and pull the name column out as a plain list
teams_keys = pd.read_csv('Data/team_names.csv')
team_keys_list = teams_keys['Team Name'].tolist()

In [59]:
# Build one string ID per team: the team's position in the name list
team_ids = list(map(str, range(len(team_keys_list))))
team_ids

['0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '30',
 '31',
 '32',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '40',
 '41',
 '42',
 '43',
 '44',
 '45',
 '46',
 '47',
 '48',
 '49',
 '50',
 '51',
 '52',
 '53',
 '54',
 '55',
 '56',
 '57',
 '58',
 '59',
 '60',
 '61',
 '62',
 '63',
 '64',
 '65',
 '66',
 '67',
 '68',
 '69',
 '70',
 '71',
 '72',
 '73',
 '74',
 '75',
 '76',
 '77',
 '78',
 '79',
 '80',
 '81',
 '82',
 '83',
 '84',
 '85',
 '86',
 '87',
 '88',
 '89',
 '90',
 '91',
 '92',
 '93',
 '94',
 '95',
 '96',
 '97',
 '98',
 '99',
 '100',
 '101',
 '102',
 '103',
 '104',
 '105',
 '106',
 '107',
 '108',
 '109',
 '110',
 '111',
 '112',
 '113',
 '114',
 '115',
 '116',
 '117',
 '118',
 '119',
 '120',
 '121',
 '122',
 '123',
 '124',
 '125',
 '126',
 '127',
 '128',
 '129',
 '130',
 '131',
 '132',
 '133',
 '134',
 '135',
 '136',
 '137',
 '138'

In [60]:
# Attach the generated ID strings as a new 'Team ID' column and display the result
team_names_df['Team ID'] = team_ids
team_names_df

Unnamed: 0,Team Name,Team ID
0,Abilene Christian Wildcats,0
1,Air Force Falcons,1
2,Akron Zips,2
3,Alabama A&M Bulldogs,3
4,Alabama Crimson Tide,4
...,...,...
480,Wright State Raiders,480
481,Wyoming Cowboys,481
482,Xavier Musketeers,482
483,Yale Bulldogs,483


### 4. Collect Team Page Links

Using the anchor tags collected above (`name_tags`), the program now extracts the link to each team's individual web page.

In [61]:
# Pull the site-relative URL (the href attribute) out of every team anchor tag
team_links = [anchor['href'] for anchor in name_tags]

# Store each link in a new 'Team Link' column, row-aligned with the team it
# was scraped alongside, then display the result
team_names_df['Team Link'] = team_links
team_names_df

Unnamed: 0,Team Name,Team ID,Team Link
0,Abilene Christian Wildcats,0,/cbb/schools/abilene-christian/
1,Air Force Falcons,1,/cbb/schools/air-force/
2,Akron Zips,2,/cbb/schools/akron/
3,Alabama A&M Bulldogs,3,/cbb/schools/alabama-am/
4,Alabama Crimson Tide,4,/cbb/schools/alabama/
...,...,...,...
480,Wright State Raiders,480,/cbb/schools/wright-state/
481,Wyoming Cowboys,481,/cbb/schools/wyoming/
482,Xavier Musketeers,482,/cbb/schools/xavier/
483,Yale Bulldogs,483,/cbb/schools/yale/


### 5. Navigate & Collect team links for each year

##### Assign all collected data to Team object & create dictionary of all teams

In [62]:
# Empty mapping that the scraping loop fills with one entry per team name
teams_dictionary = dict()

In [63]:
# Scrape every team's home page, record which seasons have stats and the link
# to each season page, then store it all in a Team object keyed by team name.
#
# One Session is shared across the loop so connections to the site are pooled
# and reused instead of being re-opened for each of the several hundred
# requests.
# NOTE(review): the site may rate-limit rapid scraping -- confirm its bot
# policy before re-running this cell.
with requests.Session() as session:
    team_rows = zip(
        team_names_df['Team Name'],
        team_names_df['Team ID'],
        team_names_df['Team Link'],
    )
    for team_name, team_id, team_link in team_rows:
        # Fail loudly on a timeout or a 4xx/5xx response; otherwise an error
        # page would be parsed as a team with no seasons and stored silently.
        school_page = session.get(base_url + team_link, timeout=30)
        school_page.raise_for_status()
        school_page_scrape = BeautifulSoup(school_page.content, 'html.parser')

        # Anchor tags found in the "season" column of the page's tables
        stat_year = school_page_scrape.select('table tbody tr td[data-stat="season"] a')

        # Season labels in page order, plus a label -> season-page link map
        stat_year_labels = [year.get_text() for year in stat_year]
        stat_year_dict = {year.get_text(): year['href'] for year in stat_year}

        # Created fresh per team so no two Team objects share the same dict
        stat_year_data = {}

        team_object = Team(team_name, team_id, team_link, stat_year_labels, stat_year_dict, stat_year_data)
        teams_dictionary[team_name] = team_object

# Runtime was about 4 minutes when originally run

In [64]:
# Save the dictionary of Team objects as a binary pickle file so the Python
# objects can be serialized now and deserialized later.

# The context manager closes the file even if pickling raises part-way through
with open("Data/teams_dictionary.pkl", "wb") as f:
    pickle.dump(teams_dictionary, f)

### 6. Locate & Write all Stat Labels to CSV

In [65]:
# Reload the saved scraping results from disk

# NOTE: pickle.load can execute arbitrary code, so only load a pickle file
# that this project wrote itself. The context manager closes the handle once
# the dictionary has been read (the original left it open).
with open("Data/teams_dictionary.pkl", "rb") as file_to_read:
    teams_dictionary = pickle.load(file_to_read)

# Load in keys (team names) csv as dataframe
teams_keys = pd.read_csv('Data/team_names.csv')

In [50]:
# Fetch one team's season page so the layout of its stats tables can be inspected

base_url = "https://www.sports-reference.com"

# Pick one sample team (row 350) and the first season listed for it.
# The name is read from the reloaded teams_keys frame so this section depends
# only on the files loaded above, not on dataframes built in earlier sections.
test_team = teams_keys['Team Name'][350]
year_keys = teams_dictionary[test_team].stat_years
test_year_key = teams_dictionary[test_team].stat_link_dict[year_keys[0]]

url = base_url + test_year_key

# Connect to the page; fail loudly on a timeout or a 4xx/5xx response rather
# than parsing an error page
page = requests.get(url, timeout=30)
page.raise_for_status()

year_page = BeautifulSoup(page.content, 'html.parser')

In [51]:
# Every <td> cell in the body rows of the element with id "schools_per_game"
per_game_table = year_page.select('[id="schools_per_game"] tbody tr td')

# Cut the flat cell list into four consecutive runs of 23 cells: team values,
# team ranks, opponent values, opponent ranks.
# NOTE(review): assumes each table row holds exactly 23 <td> cells -- confirm
# against the page if the site layout changes.
cells_per_row = 23
(
    per_game_table_team,
    per_game_table_team_rank,
    per_game_table_opp,
    per_game_table_opp_rank,
) = (
    per_game_table[start:start + cells_per_row]
    for start in range(0, 4 * cells_per_row, cells_per_row)
)

In [52]:
# Read the data-stat attribute of every cell as its label, then rename the
# labels of three of the four rows (presumably so each row's labels are distinct).

# Raw labels for the four rows, extracted in the same order as the table rows
(
    per_game_labels_team,
    per_game_labels_team_rank,
    per_game_labels_opp,
    per_game_labels_opp_rank,
) = (
    [cell['data-stat'] for cell in row_cells]
    for row_cells in (
        per_game_table_team,
        per_game_table_team_rank,
        per_game_table_opp,
        per_game_table_opp_rank,
    )
)

# Team rank row: every label gains a '_rank' suffix
per_game_labels_team_rank = [name + '_rank' for name in per_game_labels_team_rank]

# Opponent row: only the first two labels gain an 'opp_' prefix
for position in (0, 1):
    per_game_labels_opp[position] = 'opp_' + per_game_labels_opp[position]

# Opponent rank row: 'opp_' prefix on the first two labels, then '_rank' on all
for position in (0, 1):
    per_game_labels_opp_rank[position] = 'opp_' + per_game_labels_opp_rank[position]
per_game_labels_opp_rank = [name + '_rank' for name in per_game_labels_opp_rank]

In [53]:
# Join the four label lists (team, team rank, opponent, opponent rank) into
# one list and write it out as a csv

all_labels = [
    *per_game_labels_team,
    *per_game_labels_team_rank,
    *per_game_labels_opp,
    *per_game_labels_opp_rank,
]
# Sanity check: total number of labels (a bare expression; the value is unused)
len(all_labels)

all_labels_df = pd.DataFrame({'Stat Label': all_labels})
# Sanity check: number of distinct labels, for comparison with the total above
all_labels_df['Stat Label'].nunique()

# No index argument is passed, so pandas writes the row index as the first column
all_labels_df.to_csv("Data/stat_labels.csv")