In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

tr_features = pd.read_csv('train_features.csv')
tr_labels = pd.read_csv('train_labels.csv')

val_features = pd.read_csv('val_features.csv')
val_labels = pd.read_csv('val_labels.csv')

te_features = pd.read_csv('test_features.csv')
te_labels = pd.read_csv('test_labels.csv')

In [3]:
rf1 = RandomForestClassifier(n_estimators=5, max_depth=10)
rf1.fit(tr_features, tr_labels.values.ravel())

rf2 = RandomForestClassifier(n_estimators=100, max_depth=10)
rf2.fit(tr_features, tr_labels.values.ravel())

rf3 = RandomForestClassifier(n_estimators=100, max_depth=None)
rf3.fit(tr_features, tr_labels.values.ravel())

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [4]:
for mdl in [rf1, rf2, rf3]:
    y_pred = mdl.predict(val_features)
    accuracy = round(accuracy_score(val_labels, y_pred), 3)
    precision = round(precision_score(val_labels, y_pred), 3)
    recall = round(recall_score(val_labels, y_pred), 3)
    print('MAX DEPTH: {} / # OF EST: {} -- A: {} / P: {} / R: {}'.format(mdl.max_depth,
                                                                         mdl.n_estimators,
                                                                         accuracy,
                                                                         precision,
                                                                         recall))

MAX DEPTH: 10 / # OF EST: 5 -- A: 0.729 / P: 0.68 / R: 0.773
MAX DEPTH: 10 / # OF EST: 100 -- A: 0.823 / P: 0.765 / R: 0.886
MAX DEPTH: None / # OF EST: 100 -- A: 0.802 / P: 0.736 / R: 0.886


In [5]:
y_pred = rf3.predict(te_features)
accuracy = round(accuracy_score(te_labels, y_pred), 3)
precision = round(precision_score(te_labels, y_pred), 3)
recall = round(recall_score(te_labels, y_pred), 3)
print('MAX DEPTH: {} / # OF EST: {} -- A: {} / P: {} / R: {}'.format(rf2.max_depth,
                                                                     rf2.n_estimators,
                                                                     accuracy,
                                                                     precision,
                                                                     recall))

MAX DEPTH: 10 / # OF EST: 100 -- A: 0.781 / P: 0.778 / R: 0.824


In [25]:
stats = pd.read_csv('statsfor2020to2021.csv')
stats.head()

Unnamed: 0,3P%,2P%,DRB,AST,STL,BLK,TOV,PF,Playoffs
0,22.0,28.0,28.0,18.0,4.0,15.5,3.5,7.0,0
1,13.0,31.0,31.0,25.0,10.5,29.0,22.0,5.5,1
2,9.0,29.0,14.5,2.0,28.5,20.0,12.5,24.5,1
3,29.0,7.0,20.5,1.0,3.0,30.0,5.0,23.0,0
4,24.0,13.0,29.0,10.0,8.0,15.5,17.0,27.0,1


In [26]:
testerset = stats.drop('Playoffs', axis=1)
testerset.head()

Unnamed: 0,3P%,2P%,DRB,AST,STL,BLK,TOV,PF
0,22.0,28.0,28.0,18.0,4.0,15.5,3.5,7.0
1,13.0,31.0,31.0,25.0,10.5,29.0,22.0,5.5
2,9.0,29.0,14.5,2.0,28.5,20.0,12.5,24.5
3,29.0,7.0,20.5,1.0,3.0,30.0,5.0,23.0
4,24.0,13.0,29.0,10.0,8.0,15.5,17.0,27.0


In [52]:
y_pred = rf2.predict(testerset)

In [60]:
y_pred

array([1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [54]:
predictions = pd.DataFrame(y_pred.T, columns = ['Playoffs'])

In [55]:
predictions

Unnamed: 0,Playoffs
0,1
1,1
2,1
3,1
4,1
5,1
6,0
7,0
8,1
9,1


In [30]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import re
import requests

In [56]:
site = 'https://www.basketball-reference.com/leagues/NBA_{}.html'
tableID = 'team-stats-per_game'

comm = re.compile("<!--|-->")
url= site.format(2020)
file = "{}.csv"

html = requests.get(url).text
cleaned_soup = BeautifulSoup(re.sub("<!--|-->","", str(html)),'lxml')

tableStats = cleaned_soup.find('table', {'id':tableID})

headers = [th.getText() for th in tableStats.findAll('tr')[0].findAll('th')]
headers = headers[1:]
rows = tableStats.findAll('tr')[1:]
teams = [[td.getText() for td in rows[i].findAll('td')]
            for i in range(len(rows))]

teams = pd.DataFrame(teams, columns = headers)
teams= teams[['Team']]

In [45]:
teams.head()

Unnamed: 0,Team
0,Dallas Mavericks
1,Milwaukee Bucks*
2,Houston Rockets*
3,Portland Trail Blazers
4,Los Angeles Clippers*


In [57]:
teams['Playoffs'] = predictions

In [58]:
teams

Unnamed: 0,Team,Playoffs
0,Dallas Mavericks,1.0
1,Milwaukee Bucks*,1.0
2,Houston Rockets*,1.0
3,Portland Trail Blazers,1.0
4,Los Angeles Clippers*,1.0
5,New Orleans Pelicans,1.0
6,Washington Wizards,0.0
7,Phoenix Suns,0.0
8,Memphis Grizzlies,1.0
9,Miami Heat*,1.0


In [61]:
teams.to_csv("2020playoffteams.csv",index=False)