In [4]:
# Required libraries
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px
import seaborn as sns


# Project Requirement 2 - Importing. Retrieve data online using SQL, APIs, or web scraping

# Basketball URL to be scraped
url = "https://www.basketball-reference.com/playoffs/"

# Earliest season to keep (starting from the Larry Bird/Magic Johnson era)
first_year = 1980

# Collect the HTML data. The context manager closes the HTTP response as soon
# as the document has been parsed instead of leaving the connection open.
with urlopen(url) as html:
    # Create Beautiful Soup object from HTML
    soup = BeautifulSoup(html, features="lxml")

# Use get_text() to extract the headers into a list.
# The second <tr> on the page holds the column names.
headers = [th.get_text() for th in soup.find_all('tr', limit=2)[1].find_all('th')]

# Get rows from table (everything after the two header rows)
rows = soup.find_all('tr')[2:]

# Build one list per season: [year, <td> values...].
# - Rows without any <td> cell are spacer/repeated-header rows; they are
#   skipped by content rather than by hard-coded position, so a layout shift
#   on the page cannot make us drop a real season.
# - The Year column is not a <td> (it had to be added separately to line up
#   with `headers`), so it is read from the row's own <th> cell instead of
#   being counted down from a fixed starting year, which mislabels every row
#   once a new season is published.
#   NOTE(review): assumes each data row starts with a <th> holding the
#   4-digit season year - confirm against the live page.
rows_data = []
for row in rows:
    cells = [td.get_text() for td in row.find_all('td')]
    if not cells:
        continue

    year_cell = row.find('th')
    year_text = year_cell.get_text().strip() if year_cell is not None else ''
    if not year_text.isdecimal():
        continue

    year = int(year_text)
    if year < first_year:
        # Seasons are expected newest-first, so everything from here on is
        # older than the era of interest.
        break

    rows_data.append([year, *cells])

# Create the dataframe
nba_finals = pd.DataFrame(rows_data, columns=headers)

# Extract the Year, Champion & Runner Up information from the dataframe
nba_top2 = nba_finals[['Year', 'Champion', 'Runner-Up']]

# Print only the Year, Champion & Runner Up
print(nba_top2)

# Export full dataframe to a CSV
nba_finals.to_csv("nba_finals_history.csv", index=False)

# Export dataframe of Finalists to a CSV
nba_top2.to_csv("nba_winners_runners-up.csv", index=False)


# Project Requirement 2 - Importing. Import data from a flat file (.csv, .xls, .xlsx, .txt, etc.)
player_file = 'player_data.csv'
player_data = pd.read_csv(player_file)

# Project Requirement 3 - Preparation. Create pandas DataFrame
# `dataframeName` is a plain attribute attached to the frame; it is kept for
# any later code that reads it.
player_data.dataframeName = player_file
nRow, nCol = player_data.shape
print(f'There are {nRow} rows and {nCol} columns in {player_file}')


# Lookup table read from Height_in_inches.csv; it is joined to the players on
# the shared 'height' column below.
inches_file = 'Height_in_inches.csv'
inches_data = pd.read_csv(inches_file)
inches_data.dataframeName = inches_file
nRow1, nCol1 = inches_data.shape
# Report the file that was actually read (the message used to name a
# non-existent 'inches_data.csv').
print(f'There are {nRow1} rows and {nCol1} columns in {inches_file}')

# Project Requirement 3 - Preparation. Merge DataFrames
# validate='many_to_one' makes pandas raise a MergeError if a 'height' value
# appears more than once in inches_data, so a bad lookup table cannot silently
# duplicate player rows.
# The result is bound to a name so it can be reused after this cell.
player_heights = player_data.merge(inches_data, on='height', validate='many_to_one')

# Trailing expression: displays the merged frame when run as a notebook cell.
player_heights


    Year               Champion               Runner-Up
0   2022  Golden State Warriors          Boston Celtics
1   2021        Milwaukee Bucks            Phoenix Suns
2   2020     Los Angeles Lakers              Miami Heat
3   2019        Toronto Raptors   Golden State Warriors
4   2018  Golden State Warriors     Cleveland Cavaliers
5   2017  Golden State Warriors     Cleveland Cavaliers
6   2016    Cleveland Cavaliers   Golden State Warriors
7   2015  Golden State Warriors     Cleveland Cavaliers
8   2014      San Antonio Spurs              Miami Heat
9   2013             Miami Heat       San Antonio Spurs
10  2012             Miami Heat   Oklahoma City Thunder
11  2011       Dallas Mavericks              Miami Heat
12  2010     Los Angeles Lakers          Boston Celtics
13  2009     Los Angeles Lakers           Orlando Magic
14  2008         Boston Celtics      Los Angeles Lakers
15  2007      San Antonio Spurs     Cleveland Cavaliers
16  2006             Miami Heat        Dallas Ma

Unnamed: 0,name,year_start,year_end,position,height,weight,birth_date,college,inches
0,Alaa Abdelnaby,1991,1995,F-C,06-Oct,240.0,"June 24, 1968",Duke University,82
1,Bam Adebayo,2018,2018,C-F,06-Oct,243.0,"July 18, 1997",University of Kentucky,82
2,Henry Akin,1967,1969,C-F,06-Oct,225.0,"July 31, 1944",Morehead State University,82
3,Furkan Aldemir,2015,2015,F-C,06-Oct,240.0,"August 9, 1991",,82
4,Jarrett Allen,2018,2018,F-C,06-Oct,234.0,"April 21, 1998",University of Texas at Austin,82
...,...,...,...,...,...,...,...,...,...
4544,Priest Lauderdale,1997,1998,C,07-Apr,325.0,"August 31, 1973",Central State University,88
4545,Ralph Sampson,1984,1992,C-F,07-Apr,228.0,"July 7, 1960",University of Virginia,88
4546,Rik Smits,1989,2000,C,07-Apr,250.0,"August 23, 1966",Marist College,88
4547,Mel Hirsch,1947,1947,G,05-Jun,165.0,"July 31, 1921",Brooklyn College,66
