In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import re
import numpy as np
from bs4 import BeautifulSoup
import requests
import time

### Teams

In [2]:
# File from Kaggle, all possible team spellings (to get to the TeamID)
#spellings = pd.read_csv('MDataFiles_Stage1/MTeamSpellings.csv', encoding = 'ISO-8859-1')
spellings = pd.read_csv('TeamSpellings.csv', encoding = 'ISO-8859-1')
spellings.head()

Unnamed: 0,TeamNameSpelling,TeamID
0,a&m-corpus chris,1394
1,a&m-corpus christi,1394
2,abilene chr,1101
3,abilene christian,1101
4,abilene-christian,1101


In [19]:
# Put the team names in the same format (lowercase no punctuation) for joins
spellings['TeamNameSpelling'] = spellings['TeamNameSpelling'].str.replace('[^a-zA-Z&.()\' ]+',' ').str.lower()
spellings['TeamNameSpelling'] = spellings['TeamNameSpelling'].str.replace('[^a-z&.()\' ]+','')

### Helper Functions and Variables

In [4]:
# last days before tournament starts, these are the days we want to scrape to avoid any leakage
last_days = {2008: '2008-03-19',
             2009: '2009-03-18',
             2010: '2010-03-17',
            2011: '2011-03-16',
            2012: '2012-03-14',
            2013: '2013-03-20',
            2014: '2014-03-19',
            2015: '2015-03-18',
            2016: '2016-03-16',
            2017: '2017-03-15',
            2018: '2018-03-14',
            2019: '2019-03-20'}

# seasons to scrape, 2008-2019
seasons = [2008 + i for i in range(12)]

In [5]:
nans = lambda df: df[df.isnull().any(axis=1)]  # Function to print out rows with null values

In [60]:
# returns name of team that is in the spellings csv
def fix_name(row):
    if row['Team'] == 'st marys':
        return 'st marys ca'
    elif row['Team'] == 'wins salem' or row['Team'] == 'winston salem st.':
        return 'winston salem'
    elif row['Team'] == 'w virginia':
        return 'west virginia'
    elif row['Team'] == 'n carolina':
        return 'north carolina'
    elif row['Team'] == 'tx christian':
        return 'tcu'
    elif row['Team'] == 'va tech':
        return 'virginia tech'
    elif row['Team'] == 'miss state':
        return 'mississippi st'
    elif row['Team'] == 'st bonavent':
        return 'st bonaventure'
    elif row['Team'] == 'loyola chi':
        return 'loyola chicago'
    elif row['Team'] == 's methodist':
        return 'smu'
    elif row['Team'] == 'n mex state':
        return 'new mexico st'
    elif row['Team'] == 's carolina':
        return 'south carolina'
    elif row['Team'] == 'boston col':
        return 'boston college'
    elif row['Team'] == 'e tenn st':
        return 'etsu'
    elif row['Team'] == 'nc grnsboro':
        return 'unc greensboro'
    elif row['Team'] == 'central fl':
        return 'ucf'
    elif row['Team'] == 'utah val st':
        return 'utah valley state'
    elif row['Team'] == 'northeastrn':
        return 'northeastern'
    elif row['Team'] == 'ga tech':
        return 'georgia tech'
    elif row['Team'] == 'col charlestn':
        return 'college of charleston'
    elif row['Team'] == 'st josephs':
        return 'st josephs pa'
    elif row['Team'] == 'u penn':
        return 'penn'
    elif row['Team'] == 'ste f austin':
        return 'stephen f austin'
    elif row['Team'] == 'fla gulf cst':
        return 'florida gulf coast'
    elif row['Team'] == 'grd canyon':
        return 'grand canyon'
    elif row['Team'] == 'tx arlington':
        return 'ut arlington'
    elif row['Team'] == 'n iowa':
        return 'northern iowa'
    elif row['Team'] == 'la tech':
        return 'louisiana tech'
    elif row['Team'] == 'wm & mary':
        return 'william & mary'
    elif row['Team'] == 'jksnville st':
        return 'jacksonville st'
    elif row['Team'] == 'app state':
        return 'appalachian st'
    elif row['Team'] == 'san fransco':
        return 'san francisco'
    elif row['Team'] == 'e washingtn':
        return 'eastern washington'
    elif row['Team'] == 'geo wshgtn':
        return 'george washington'
    elif row['Team'] == 'u mass':
        return 'umass'
    elif row['Team'] == 'maryland bc':
        return 'umbc'
    elif row['Team'] == 'wash state':
        return 'washington st'
    elif row['Team'] == 'tx san ant':
        return 'utsa'
    elif row['Team'] == 'st fran (pa)' or row['Team'] == 'st. francis pa':
        return 'st francis pa'
    elif row['Team'] == 'miami oh':
        return 'miami ohio'
    elif row['Team'] == 'geo mason':
        return 'george mason'
    elif row['Team'] == 'wi milwkee':
        return 'milwaukee'
    elif row['Team'] == 'tn state':
        return 'tennessee st'
    elif row['Team'] == 'tn tech':
        return 'tennessee tech'
    elif row['Team'] == 'nc wilmgton':
        return 'unc wilmington'
    elif row['Team'] == 's alabama':
        return 'south alabama'
    elif row['Team'] == 'lg beach st':
        return 'long beach st'
    elif row['Team'] == 'james mad':
        return 'james madison'
    elif row['Team'] == 'sam hous st':
        return 'sam houston st'
    elif row['Team'] == 'cs bakersfld' or row['Team'] == 'cal st. bakersfield':
        return 'cal state bakersfield'
    elif row['Team'] == 'loyola mymt':
        return 'loyola marymount'
    elif row['Team'] == 's mississippi':
        return 'southern miss'
    elif row['Team'] == 'bowling grn':
        return 'bowling green'
    elif row['Team'] == 'tx el paso':
        return 'utep'
    elif row['Team'] == 'n hampshire':
        return 'new hampshire'
    elif row['Team'] == 'rob morris':
        return 'robert morris'
    elif row['Team'] == 'wi grn bay':
        return 'green bay'
    elif row['Team'] == 'charl south':
        return 'charleston southern'
    elif row['Team'] == 'abl christian':
        return 'abilene christian'
    elif row['Team'] == 'gard webb':
        return 'gardner webb'
    elif row['Team'] == 'tx pan am':
        return 'texas pan american'
    elif row['Team'] == 'se missouri' or row['Team'] == 'southeast missouri st.':
        return 'se missouri st'
    elif row['Team'] == 'neb omaha':
        return 'omaha'
    elif row['Team'] == 's florida':
        return 'south florida'
    elif row['Team'] == 'mass lowell':
        return 'umass lowell'
    elif row['Team'] == 'e carolina':
        return 'east carolina'
    elif row['Team'] == 'tx a&m cc' or row['Team'] == 'texas a&m corpus chris':
        return 'a&m corpus chris'
    elif row['Team'] == 's utah':
        return 'southern utah'
    elif row['Team'] == 'n florida':
        return 'north florida'
    elif row['Team'] == 'sacred hrt':
        return 'sacred heart'
    elif row['Team'] == 'st fran (ny)':
        return 'st francis ny'
    elif row['Team'] == 'ar lit rock':
        return 'arkansas little rock'
    elif row['Team'] == 'beth cook':
        return 'bethune cookman'
    elif row['Team'] == 'sac state':
        return 'sacramento st'
    elif row['Team'] == 'siu edward':
        return 'southern illinois'
    elif row['Team'] == 'youngs st':
        return 'youngstown st'
    elif row['Team'] == 'nw state':
        return 'northwestern st'
    elif row['Team'] == 'cal st nrdge':
        return 'cal state northridge'
    elif row['Team'] == 'ark pine bl':
        return 'arkansas pine bluff'
    elif row['Team'] == 'va military':
        return 'vmi'
    elif row['Team'] == 'incar word':
        return 'incarnate word'
    elif row['Team'] == 'n arizona':
        return 'northern arizona' 
    elif row['Team'] == 's car state':
        return 'south carolina state'
    elif row['Team'] == 'nw st':
        return 'northwestern st'
    elif row['Team'] == 'miss val st' or row['Team'] == 'mississippi valley st.':
        return 'mississippi valley state'
    elif row['Team'] == 'maryland es':
        return 'umes'
    elif row['Team'] == 'alab a&m':
        return 'alabama a&m' 
    elif row['Team'] == 'n alabama':
        return 'north alabama'
    elif row['Team'] == 'la lafayette':
        return 'louisiana lafayette'
    elif row['Team'] == 'grambling st':
        return 'grambling state'
    elif row['Team'] == 'ut rio grande valley':
        return 'texas rio grande valley'
    else:
        return row['Team']

### Teamrank Ratings

Scraping teamrankings.com for their CBB Ratings

In [24]:
# scrape each season's ratings
season_list = []
team_list = []
rating_list = []
for season in seasons:
    time.sleep(5)
    teamrank_url = 'https://www.teamrankings.com/ncaa-basketball/ranking/predictive-by-other?date=' + last_days[season]
    teamrank_page = requests.get(teamrank_url)
    teamrank_soup = BeautifulSoup(teamrank_page.content, 'lxml')
    teamrank_rows = teamrank_soup.select('tbody tr')
    for row in teamrank_rows:
        anchor = row.select('.nowrap')[0].select('a')
        if not anchor: # this if statement is neccessary due to teamrank
            continue
        season_list.append(season)
        team_list.append(anchor[0].get_text())
        rating_list.append(row.find_all('td')[2].get_text())  # magic number
teamrank_temp = pd.DataFrame({'Season': season_list, 'Team': team_list, 'TeamrankRating': rating_list})

In [31]:
teamrank = teamrank_temp.copy()
teamrank.head()

Unnamed: 0,Season,Team,TeamrankRating
0,2008,Kansas,32.4
1,2008,N Carolina,29.4
2,2008,Memphis,28.7
3,2008,Duke,28.5
4,2008,UCLA,28.2


In [32]:
teamrank.tail()

Unnamed: 0,Season,Team,TeamrankRating
4158,2019,Alcorn State,-17.7
4159,2019,Miss Val St,-18.3
4160,2019,Maryland ES,-19.7
4161,2019,Delaware St,-21.6
4162,2019,Chicago St,-21.7


In [33]:
# Put the team names in the same format (lowercase no punctuation) for joins
teamrank['Team'] = teamrank['Team'].str.replace('[^a-zA-Z&.()\' ]+',' ').str.lower()
teamrank['Team'] = teamrank['Team'].str.replace('[^a-z&.()\' ]+','')

In [34]:
# fix the names in order to join to get the team id
teamrank['Team'] = teamrank.apply(fix_name, axis = 1)

In [35]:
teamrank_teams = pd.merge(teamrank, spellings, how = 'left', left_on = 'Team', right_on = 'TeamNameSpelling')
nans(teamrank_teams)

Unnamed: 0,Season,Team,TeamrankRating,TeamNameSpelling,TeamID
335,2008,california san diego,1.6,,
404,2008,dixie state,-3.3,,
801,2009,california san diego,2.7,,
1328,2010,california san diego,-0.7,,


The above teams were not in division 1

In [67]:
teamrank_teams = teamrank_teams[['TeamID', 'Season', 'TeamrankRating']].drop_duplicates()
teamrank_teams.head()

Unnamed: 0,TeamID,Season,TeamrankRating
0,1242.0,2008,32.4
1,1314.0,2008,29.4
3,1272.0,2008,28.7
4,1181.0,2008,28.5
5,1417.0,2008,28.2


### Trank Ratings

Getting Trank ratings from barttorvik.com

In [54]:
trank = pd.read_csv('http://barttorvik.com/teamslicejson.php?year=2008&csv=1&type=R', header = None)
trank = trank[[0, 1, 2, 3, 26]]
trank.columns = ['Team', 'OE', 'DE', 'TrankRating', 'Tempo']
trank['Season'] = 2008
for season in seasons:
    time.sleep(3)
    if season != 2008:
        trank_url = 'http://barttorvik.com/teamslicejson.php?year=' + str(season) + '&csv=1&type=R'
        trank_temp = pd.read_csv(trank_url, header = None)
        trank_temp = trank_temp[[0, 1, 2, 3, 26]]
        trank_temp.columns = ['Team', 'OE', 'DE', 'TrankRating', 'Tempo']
        trank_temp['Season'] = season
        trank = pd.concat([trank, trank_temp])
trank = trank.reset_index(drop = True)
trank.head()

Unnamed: 0,Team,OE,DE,TrankRating,Tempo,Season
0,Jackson St.,91.259939,107.681226,0.129789,72.4,2008
1,Mississippi,112.21789,98.272341,0.821427,71.1,2008
2,TCU,98.968705,99.941178,0.471918,69.0,2008
3,Albany,101.159577,103.145314,0.444343,65.3,2008
4,Wyoming,99.388001,102.575619,0.410223,70.5,2008


In [55]:
trank.tail()

Unnamed: 0,Team,OE,DE,TrankRating,Tempo,Season
4172,Fordham,96.800091,102.817876,0.333237,66.2,2019
4173,Alabama St.,93.767566,110.787449,0.12807,68.4,2019
4174,Wichita St.,105.897571,96.538612,0.743471,69.2,2019
4175,Oakland,107.71395,108.870026,0.469346,69.7,2019
4176,Boise St.,107.634044,102.400989,0.639494,66.9,2019


In [56]:
# Put the team names in the same format (lowercase no punctuation) for joins
trank['Team'] = trank['Team'].str.replace('[^a-zA-Z&.()\' ]+',' ').str.lower()
trank['Team'] = trank['Team'].str.replace('[^a-z&.()\' ]+','')

In [62]:
# fix the names in order to join to get the team id
trank['Team'] = trank.apply(fix_name, axis = 1)

In [63]:
trank_copy = trank.copy()
trank_teams = pd.merge(trank_copy, spellings, how = 'left', left_on = 'Team', right_on = 'TeamNameSpelling')
nans(trank_teams)

Unnamed: 0,Team,OE,DE,TrankRating,Tempo,Season,TeamNameSpelling,TeamID


In [69]:
trank_teams = trank_teams[['TeamID', 'Season', 'TrankRating', 'OE', 'DE', 'Tempo']].drop_duplicates()
trank_teams.head()

Unnamed: 0,TeamID,Season,TrankRating,OE,DE,Tempo
0,1238,2008,0.129789,91.259939,107.681226,72.4
1,1279,2008,0.821427,112.21789,98.272341,71.1
2,1395,2008,0.471918,98.968705,99.941178,69.0
3,1107,2008,0.444343,101.159577,103.145314,65.3
4,1461,2008,0.410223,99.388001,102.575619,70.5


In [73]:
teams = pd.merge(teamrank_teams, trank_teams, how = 'inner', on = ['Season', 'TeamID'])
teams.head()

Unnamed: 0,TeamID,Season,TeamrankRating,TrankRating,OE,DE,Tempo
0,1242.0,2008,32.4,0.981585,120.970641,85.610492,69.5
1,1314.0,2008,29.4,0.957196,120.240748,91.770372,75.1
2,1272.0,2008,28.7,0.969683,113.254181,83.789458,70.7
3,1181.0,2008,28.5,0.960742,117.213494,88.761128,73.7
4,1417.0,2008,28.2,0.966422,116.350781,86.87395,66.2
