# Birthday Paradox

In probability, the birthday paradox is the unexpected answer to the question *"How many people would need to be in a room for there to be a 50% chance that at least two of them share the same birthday?"* We might think that there would need to be a large number of people, given that there are 365 days in a year, but according to the birthday paradox only **23** people are needed for the probability of at least two people sharing a birthday to reach 50%.

This project aims to explore this phenomenon by examining the birthdays of NBA players. First, I use Selenium to scrape the players' information from the web and pandas to clean it. Then, I use Excel to examine repeated birthdays by team, division, and conference to explore how the number of repeated birthdays changes as the group grows in size.

## Importing Packages

In [1]:
import pandas as pd 

# for webscraping 
from selenium import webdriver
from selenium.webdriver.common.by import By 
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException

## Getting the Roster Link for Every Team

In [2]:
# Scrape the NBA teams page and collect, per division, the division name,
# the team names and the links to the team pages.
driver = webdriver.Chrome(options = Options())
url = 'https://www.nba.com/teams'

# each entry: [division name, list of team names, list of team-page links]
info = []

# try/finally guarantees the browser is closed even when a lookup below raises
# (e.g. NoSuchElementException after a change in the page layout); without it
# every failed run leaves an orphaned Chrome process behind
try:
    driver.get(url)

    driver.implicitly_wait(2)

    body = driver.find_element(By.XPATH, '/html/body/div[1]/div[2]/div[2]/div[2]/section/div/div[2]')

    # one element per division
    for p in body.find_elements(By.CLASS_NAME, 'TeamDivisions_division__u3KUS'):
        driver.implicitly_wait(2)
        division = p.find_element(By.CLASS_NAME, 'TeamDivisions_divisionName__KFlSk')
        team = [item.text for item in p.find_elements(By.XPATH, './/a[1]')]
        links = [item.get_attribute('href') for item in p.find_elements(By.XPATH, './/div/div/a[1]')]
        # the same XPath also matches the 'Profile' anchors; drop their text so
        # only team names remain
        team2 = [t for t in team if t != 'Profile']
        # keep only hrefs containing 'team'; get_attribute returns None for an
        # anchor without an href, which would make the 'in' test raise TypeError
        links2 = [l for l in links if l and 'team' in l]

        # adding this division's name, teams and links to our list
        d = [division.text, team2, links2]
        info.append(d)
finally:
    driver.quit()

In [3]:
# 'df_list' will contain a list of dataframes, one per division
df_list = []

# this will create a dataframe from the name, team list and link list of each division
for division, team, link in info:
    # named 'division_data' rather than 'dict': rebinding 'dict' would shadow the
    # builtin for every later cell in the same kernel session
    division_data = {'Division': division, 'Team': team, 'Link': link}
    # the scalar division name is broadcast to every row; pandas raises
    # ValueError if the team and link lists differ in length
    df = pd.DataFrame(division_data)
    df_list.append(df)

In [4]:
# stack the per-division frames row-wise into one table with a fresh 0..n-1 index
all_teams = pd.concat(objs=df_list, axis=0, ignore_index=True)
all_teams

Unnamed: 0,Division,Team,Link
0,ATLANTIC,Boston Celtics,https://www.nba.com/team/1610612738/celtics
1,ATLANTIC,Brooklyn Nets,https://www.nba.com/team/1610612751/nets
2,ATLANTIC,New York Knicks,https://www.nba.com/team/1610612752/knicks
3,ATLANTIC,Philadelphia 76ers,https://www.nba.com/team/1610612755/sixers
4,ATLANTIC,Toronto Raptors,https://www.nba.com/team/1610612761/raptors
5,CENTRAL,Chicago Bulls,https://www.nba.com/team/1610612741/bulls
6,CENTRAL,Cleveland Cavaliers,https://www.nba.com/team/1610612739/cavaliers
7,CENTRAL,Detroit Pistons,https://www.nba.com/team/1610612765/pistons
8,CENTRAL,Indiana Pacers,https://www.nba.com/team/1610612754/pacers
9,CENTRAL,Milwaukee Bucks,https://www.nba.com/team/1610612749/bucks


## Getting the Players' Info for Every Team

In [5]:
# Visit every team page and read division, team, player name and birthday
# from each row of the first table on the page.
driver = webdriver.Chrome(options = Options())

# one entry per team: a list of [division, team, name, birthday] rows,
# where name and birthday are lists of element texts
all_players = []

# try/finally guarantees the browser is closed even when a page fails to load
# or a lookup raises part-way through the loop; without it every failed run
# leaves an orphaned Chrome process behind
try:
    for index, row in all_teams.iterrows():
        link = row['Link']
        driver.get(link)

        driver.implicitly_wait(3)

        # identical for every row of this team, so read once per team
        division = row['Division']
        team = row['Team']

        players = []

        # first <table> element on the team page
        tbody = driver.find_element(By.XPATH, '(//table)[1]' )

        for tr in tbody.find_elements(By.XPATH, './/tr'):
            driver.implicitly_wait(2)
            name = [item.text for item in tr.find_elements(By.CSS_SELECTOR, 'a')]
            # sixth cell of the row is used as the birthday
            birthday = [item.text for item in tr.find_elements(By.XPATH, './/td[6]')]
            player_info = [division, team, name, birthday]
            players.append(player_info)

        # drop the first <tr> (presumably the table's header row -- TODO confirm)
        all_players.append(players[1:])
finally:
    driver.quit()

In [6]:
# flatten the per-team lists into a single list of player rows
ap = [player_row for roster in all_players for player_row in roster]

## Creating a DF of All Players

In [8]:
def _join_scraped_text(cell):
    """Return one scraped table cell as a plain string.

    A list (or tuple) of texts is joined with ', '; an empty list becomes ''.
    Any other value is converted with str(). Joining the elements keeps the
    text intact, whereas turning the list into its repr and stripping
    brackets and quotes leaves stray quote characters inside the value as
    soon as the list holds more than one element.
    """
    if isinstance(cell, (list, tuple)):
        return ', '.join(str(part) for part in cell)
    return str(cell)


# division name as it appears in the 'Division' column -> conference
DIVISION_TO_CONFERENCE = {
    'ATLANTIC': 'Eastern',
    'CENTRAL': 'Eastern',
    'SOUTHEAST': 'Eastern',
    'NORTHWEST': 'Western',
    'SOUTHWEST': 'Western',
    'PACIFIC': 'Western',
}

NBA_Players = pd.DataFrame(ap, columns=['Division', 'Team', 'Player', 'Birthday'])

# unwrap the scraped lists into plain strings
for column in ('Player', 'Birthday'):
    NBA_Players[column] = NBA_Players[column].map(_join_scraped_text)

# creating new 'Conference' column; divisions missing from the mapping keep
# the empty-string default
NBA_Players['Conference'] = NBA_Players['Division'].map(DIVISION_TO_CONFERENCE).fillna('')

NBA_Players

Unnamed: 0,Division,Team,Player,Birthday,Conference
0,ATLANTIC,Boston Celtics,Hayden Gray,"MAY 11, 2003",Eastern
1,ATLANTIC,Boston Celtics,RJ Luis Jr.,"NOV 27, 2002",Eastern
2,ATLANTIC,Boston Celtics,Jayson Tatum,"MAR 03, 1998",Eastern
3,ATLANTIC,Boston Celtics,Anfernee Simons,"JUN 08, 1999",Eastern
4,ATLANTIC,Boston Celtics,Jaylen Brown,"OCT 24, 1996",Eastern
...,...,...,...,...,...
595,SOUTHWEST,San Antonio Spurs,Charles Bassey,"OCT 28, 2000",Western
596,SOUTHWEST,San Antonio Spurs,Julian Champagnie,"JUN 29, 2001",Western
597,SOUTHWEST,San Antonio Spurs,Harrison Barnes,"MAY 30, 1992",Western
598,SOUTHWEST,San Antonio Spurs,Lindy Waters III,"JUL 28, 1997",Western


## Exporting as CSV

In [9]:
# write the cleaned player table next to the notebook, without the row index
NBA_Players.to_csv(path_or_buf='NBA_Players.csv', index=False)