In [166]:
from bs4 import BeautifulSoup
import requests

import numpy as np
import pandas as pd
import re

import importlib
import utils
importlib.reload(utils)

from utils import get_parsed, wiki_pedia_base_url

In [2]:
url = 'https://en.wikipedia.org/wiki/2002_Miami_Dolphins_season'
response = requests.get(url)
text = response.text

In [3]:
soup = BeautifulSoup(text, 'html.parser')

The main page for NFL

In [170]:
NFL_MainPage_url = 'https://en.wikipedia.org/wiki/National_Football_League'

In [171]:
NFL_MainPage_url_parsed = get_parsed(NFL_MainPage_url)

In [172]:
NFL_teams_table = NFL_MainPage_url_parsed.body.findAll(id='Teams')[0].parent.findNext('table', 'wikitable')

Using find_all('tr') instead of children can remove the '\n' elements

In [173]:
NFL_teams_table_list = NFL_teams_table.tbody.find_all('tr')
len(NFL_teams_table_list)

36

In [174]:
columns_name_no = NFL_teams_table_list[0]
columns_name = []
for th in columns_name_no.children:
    if th != '\n':
        column_text = th.text
        loc = column_text.find('[')
        columns_name.append(column_text[:loc])
columns_name = ['Conference'] + columns_name
columns_name = columns_name + [col + '_url' for col in columns_name[:5]]
columns_name

['Conference',
 'Division',
 'Club',
 'City',
 'Stadium',
 'Capacity',
 'Coordinates',
 'First season',
 'Head coach',
 'Conference_url',
 'Division_url',
 'Club_url',
 'City_url',
 'Stadium_url']

In [175]:
def process_one_row(tr_content):
    main_cols = []
    url_cols = []
    for idd, th in enumerate(tr_content.find_all('td')):
        if idd < 3:
            url_cols.append(th.a['href'])
        main_cols.append(th.text.strip())
    return (main_cols, url_cols)

Conference One and Two

In [176]:
data = []

# conference_one
conference_one = NFL_teams_table_list[1].text.strip()
conference_one_url = NFL_teams_table_list[1].a['href']
loc_start = 2
for i in range(4):
    # deal with first club in this division
    loc_first_club_this_division = loc_start + i * 4
    tr_content = NFL_teams_table_list[loc_first_club_this_division]
    
    th = tr_content.find('th')
    division = th.text
    division_url = th.a['href']
    
    main_cols, url_cols = process_one_row(tr_content)
    
    data.append([conference_one, division] + main_cols + [conference_one_url, division_url] + url_cols)
    
    # the rest three clubs
    for j in range(1, 4):
        loc_j_club_this_division = loc_start + i * 4 + j
        tr_content = NFL_teams_table_list[loc_j_club_this_division]
        main_cols, url_cols = process_one_row(tr_content)
        data.append([conference_one, division] + main_cols + [conference_one_url, division_url] + url_cols)

# conference_one
conference_two = NFL_teams_table_list[18].text.strip()
conference_two_url = NFL_teams_table_list[18].a['href']
loc_start = 19
for i in range(4):
    # deal with first club in this division
    loc_first_club_this_division = loc_start + i * 4
    tr_content = NFL_teams_table_list[loc_first_club_this_division]
    
    th = tr_content.find('th')
    division = th.text
    division_url = th.a['href']
    
    main_cols, url_cols = process_one_row(tr_content)
    data.append([conference_two, division] + main_cols + [conference_two_url, division_url] + url_cols)
    
    # the rest three clubs
    for j in range(1, 4):
        loc_j_club_this_division = loc_start + i * 4 + j
        tr_content = NFL_teams_table_list[loc_j_club_this_division]
        main_cols, url_cols = process_one_row(tr_content)
        data.append([conference_two, division] + main_cols + [conference_two_url, division_url] + url_cols)



In [177]:
data[0]

['American Football Conference',
 'East\n',
 'Buffalo Bills',
 'Orchard Park, New York',
 'Bills Stadium',
 '71,608',
 '.mw-parser-output .geo-default,.mw-parser-output .geo-dms,.mw-parser-output .geo-dec{display:inline}.mw-parser-output .geo-nondefault,.mw-parser-output .geo-multi-punct{display:none}.mw-parser-output .longitude,.mw-parser-output .latitude{white-space:nowrap}42°46′26″N 78°47′13″W\ufeff / \ufeff42.774°N 78.787°W\ufeff / 42.774; -78.787\ufeff (Buffalo Bills)',
 '1960 (AFL), 1970 (NFL)',
 'Sean McDermott',
 '/wiki/American_Football_Conference',
 '/wiki/AFC_East',
 '/wiki/Buffalo_Bills',
 '/wiki/Orchard_Park_(town),_New_York',
 '/wiki/Bills_Stadium']

In [178]:
data_df = pd.DataFrame(data, columns=columns_name)

In [179]:
data_df.shape

(32, 14)

In [180]:
test_string = data_df.loc[0, 'Coordinates']
test_string

'.mw-parser-output .geo-default,.mw-parser-output .geo-dms,.mw-parser-output .geo-dec{display:inline}.mw-parser-output .geo-nondefault,.mw-parser-output .geo-multi-punct{display:none}.mw-parser-output .longitude,.mw-parser-output .latitude{white-space:nowrap}42°46′26″N 78°47′13″W\ufeff / \ufeff42.774°N 78.787°W\ufeff / 42.774; -78.787\ufeff (Buffalo Bills)'

In [181]:
re.findall('[0-9]+\.[0-9]+°[NS]', test_string)

['42.774°N']

In [182]:
def get_lat(x):
    lat = re.findall('[0-9]+\.[0-9]+°[NS]', x)
    return lat[0]
def get_long(x):
    lat = re.findall('[0-9]+\.[0-9]+°[WE]', x)
    return lat[0]
data_df['Lat'] = data_df['Coordinates'].apply(get_lat)
data_df['Long'] = data_df['Coordinates'].apply(get_long)

In [183]:
data_df.head(6)

Unnamed: 0,Conference,Division,Club,City,Stadium,Capacity,Coordinates,First season,Head coach,Conference_url,Division_url,Club_url,City_url,Stadium_url,Lat,Long
0,American Football Conference,East\n,Buffalo Bills,"Orchard Park, New York",Bills Stadium,71608,".mw-parser-output .geo-default,.mw-parser-outp...","1960 (AFL), 1970 (NFL)",Sean McDermott,/wiki/American_Football_Conference,/wiki/AFC_East,/wiki/Buffalo_Bills,"/wiki/Orchard_Park_(town),_New_York",/wiki/Bills_Stadium,42.774°N,78.787°W
1,American Football Conference,East\n,Miami Dolphins,"Miami Gardens, Florida",Hard Rock Stadium,64767,25°57′29″N 80°14′20″W﻿ / ﻿25.958°N 80.239°W﻿ /...,"1966 (AFL), 1970 (NFL)",Brian Flores,/wiki/American_Football_Conference,/wiki/AFC_East,/wiki/Miami_Dolphins,"/wiki/Miami_Gardens,_Florida",/wiki/Hard_Rock_Stadium,25.958°N,80.239°W
2,American Football Conference,East\n,New England Patriots,"Foxborough, Massachusetts",Gillette Stadium,65878,42°05′28″N 71°15′50″W﻿ / ﻿42.091°N 71.264°W﻿ /...,"1960 (AFL), 1970 (NFL)",Bill Belichick,/wiki/American_Football_Conference,/wiki/AFC_East,/wiki/New_England_Patriots,"/wiki/Foxborough,_Massachusetts",/wiki/Gillette_Stadium,42.091°N,71.264°W
3,American Football Conference,East\n,New York Jets,"East Rutherford, New Jersey",MetLife Stadium[C],82500,40°48′50″N 74°04′26″W﻿ / ﻿40.814°N 74.074°W﻿ /...,"1960 (AFL), 1970 (NFL)",Vacant,/wiki/American_Football_Conference,/wiki/AFC_East,/wiki/New_York_Jets,"/wiki/East_Rutherford,_New_Jersey",/wiki/MetLife_Stadium,40.814°N,74.074°W
4,American Football Conference,North\n,Baltimore Ravens,"Baltimore, Maryland",M&T Bank Stadium,71008,39°16′41″N 76°37′23″W﻿ / ﻿39.278°N 76.623°W﻿ /...,1996[D],John Harbaugh,/wiki/American_Football_Conference,/wiki/AFC_North,/wiki/Baltimore_Ravens,/wiki/Baltimore,/wiki/M%26T_Bank_Stadium,39.278°N,76.623°W
5,American Football Conference,North\n,Cincinnati Bengals,"Cincinnati, Ohio",Paul Brown Stadium,65515,39°05′42″N 84°30′58″W﻿ / ﻿39.095°N 84.516°W﻿ /...,"1968 (AFL), 1970 (NFL)",Zac Taylor,/wiki/American_Football_Conference,/wiki/AFC_North,/wiki/Cincinnati_Bengals,/wiki/Cincinnati,/wiki/Paul_Brown_Stadium,39.095°N,84.516°W


In [169]:
data_df.to_pickle('all_teams_data.pkl')