In [16]:
import streamlit as st
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Load the raw table; the file is semicolon-delimited and is decoded as Latin-1.
nba_data = pd.read_csv("data/NBA_Regular_Season.csv", sep = ";", encoding = 'latin-1')

# Integer encoding applied to the position codes; its keys also define which
# positions are kept in the dataset.
positions = {
    'PG':1,
    'SG':2,
    'SF':3,
    'PF':4,
    'C':5
}

# One row per 'Rk': keep the first Player/Pos encountered and average the stats
# over all rows sharing that 'Rk'.
# NOTE(review): presumably 'Rk' identifies a player and repeats when the file
# holds several rows for him — confirm against the data source.
new_data = (
    nba_data[['Rk', 'Player', 'Pos', 'PTS', 'AST', 'TRB']]
    .groupby('Rk', as_index=False)
    .agg({
        'Player': 'first',
        'Pos': 'first',
        'PTS': 'mean',
        'AST': 'mean',
        'TRB': 'mean'
    })
)

# Drop rows whose position is not one of the codes that can be encoded.
new_data = new_data[new_data['Pos'].isin(list(positions))].reset_index(drop = True)

# Names must match the 'Player' column exactly as it is read from the CSV.
# NOTE(review): the '?' characters in "Luka Don?i?" / "Nikola Joki?" look like
# they mirror how those names appear in the file — confirm before changing them.
all_star_players = [
    "Tyrese Haliburton", "Damian Lillard", "Giannis Antetokounmpo", "Jayson Tatum", "Joel Embiid",
    "Jalen Brunson", "Tyrese Maxey", "Donovan Mitchell", "Trae Young", "Paolo Banchero", "Scottie Barnes", "Jaylen Brown",
    "Julius Randle", "Bam Adebayo", "Luka Don?i?", "Shai Gilgeous-Alexander", "Kevin Durant", "LeBron James", "Nikola Joki?",
    "Devin Booker", "Stephen Curry", "Anthony Edwards", "Paul George", "Kawhi Leonard", "Karl-Anthony Towns", "Anthony Davis",
]
# The label column is sized from the list so the two can never drift apart.
all_star_dict = {"Player": all_star_players, "All-Star": [1] * len(all_star_players)}

all_star_data = pd.DataFrame(all_star_dict)

# Report All-Star names that match no player row: they would otherwise be
# silently left unlabeled.
unmatched_all_stars = sorted(set(all_star_players) - set(new_data['Player']))
if unmatched_all_stars:
    print(f"Warning: All-Star names not found in the player table: {unmatched_all_stars}")

# Left merge: every player row is kept and labeled, but an unmatched All-Star
# name can no longer create a phantom row (all-zero stats, unmappable position,
# All-Star = 1) the way an outer merge would.
# Only the stat and label columns are zero-filled; rows are ordered by player
# name so the row order (and any seeded split of this frame) is deterministic.
final_dataset = (
    pd.merge(new_data, all_star_data, how = "left", on = "Player")
    .fillna({'PTS': 0, 'AST': 0, 'TRB': 0, 'All-Star': 0})
    .sort_values('Player', kind = 'stable')
    .reset_index(drop = True)
)
final_dataset['Rk'] = final_dataset['Rk'].astype(int)
final_dataset['All-Star'] = final_dataset['All-Star'].astype(int)

# Replace the position codes with their integer encoding.
final_dataset['Pos'] = final_dataset['Pos'].map(positions)



In [17]:
# Display the assembled dataset as this cell's output.
final_dataset

Unnamed: 0,Rk,Player,Pos,PTS,AST,TRB,All-Star
0,183,A.J. Green,2,4.5,0.5,1.1,0
1,290,A.J. Lawson,2,3.2,0.5,1.2,0
2,190,AJ Griffin,3,2.4,0.3,0.9,0
3,178,Aaron Gordon,4,13.9,3.5,6.5,0
4,221,Aaron Holiday,1,6.6,1.8,1.6,0
...,...,...,...,...,...,...,...
554,289,Zach LaVine,2,19.5,3.9,5.2,0
555,466,Zavier Simpson,1,6.0,3.6,2.9,0
556,375,Zeke Nnaji,4,3.2,0.6,2.2,0
557,559,Ziaire Williams,3,8.2,1.5,3.5,0


In [21]:
# Predictor columns and the binary target for the All-Star classifier.
features = ['Pos', 'PTS', 'AST', 'TRB']
X = final_dataset[features]
y = final_dataset['All-Star']

# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,
                                                                random_state = 99)

# A single hand-written stat line to score, in the same order as `features`.
user_data = [[positions['PG'], 29.5, 10.0, 10.2]]

model = LogisticRegression()
model.fit(X_train, y_train)

# The model was fitted on a DataFrame, so the query is passed as a DataFrame
# with the same column names: this avoids scikit-learn's "X does not have valid
# feature names" warning and ties each value to its column by name.
user_frame = pd.DataFrame(user_data, columns = features)

# predict_proba returns one row per sample with columns ordered as
# model.classes_; index 1 is the probability of the second class (label 1 when
# both labels are present in the training split).
prob = model.predict_proba(user_frame)
print(prob[0][1])

0.9947959713718099




In [23]:
# The integers 1 through 10 (the default step of range is 1).
list(range(1, 11))

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [24]:
# Preview the first rows of the assembled dataset.
final_dataset.head()

Unnamed: 0,Rk,Player,Pos,PTS,AST,TRB,All-Star
0,183,A.J. Green,2,4.5,0.5,1.1,0
1,290,A.J. Lawson,2,3.2,0.5,1.2,0
2,190,AJ Griffin,3,2.4,0.3,0.9,0
3,178,Aaron Gordon,4,13.9,3.5,6.5,0
4,221,Aaron Holiday,1,6.6,1.8,1.6,0


In [32]:
# Print the name of every column that holds exactly two distinct non-null values.
for column_name, distinct_count in final_dataset.nunique().items():
    if distinct_count == 2:
        print(column_name)

All-Star
