# Can you predict which NBA players will make the "All-Star" team?

## Part one: Clean the data

#### Import libraries

In [None]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
import pickle

import seaborn as sns
import pandas_profiling
from pandas_profiling import ProfileReport
import numba
import matplotlib
import matplotlib.pyplot as plt

from tpot import TPOTClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

#### Load 'player_data.csv' 

In [None]:
# Read the raw player data from a CSV file in the working directory.
player_data = pd.read_csv("player_data.csv")
# Show (rows, columns) as the cell output.
player_data.shape

In [None]:
# Preview the first rows to check that the file parsed as expected.
player_data.head()

#### Drop the first column

In [None]:
# Discard the leading column (presumably a saved row index - confirm against the CSV).
first_column = player_data.columns[0]
cl_data = player_data.drop(columns=[first_column])
# Show the new (rows, columns) to confirm exactly one column was removed.
cl_data.shape

In [None]:
# Preview the first rows after dropping the leading column.
cl_data.head()

#### Remove rows with null values in the "Player" column

In [None]:
# Count missing values per column (column-wise sum is the default axis).
cl_data.isna().sum()

In [None]:
# Rows without a player name cannot be matched to All-Star appearances,
# so drop every row where 'Player' is missing.
required_columns = ['Player']
cl_data = cl_data.dropna(axis='index', how='any', subset=required_columns)
# Re-check the per-column null counts after the drop.
cl_data.isna().sum()

#### Replace null values with 0's in the '3P%' column

In [None]:
# Replace missing three-point percentages with 0; all other columns are left untouched.
# NOTE(review): presumably a missing '3P%' means no three-point attempts - confirm.
cl_data = cl_data.fillna({'3P%': 0})
# Re-check the per-column null counts after the fill.
cl_data.isna().sum()

#### Remove rows for players who started their careers before the 1980 season

In [None]:
# The goal is to exclude *players* whose careers began before 1980, not just
# the pre-1980 seasons. Filtering on `Year < 1980` row by row would keep the
# later seasons of players who debuted earlier, so compare each player's
# earliest season in the data instead.
# NOTE(review): players are identified by name only; distinct players who
# share a name are treated as one - confirm this is acceptable.
first_season = cl_data.groupby('Player')['Year'].transform('min')
# Negating `< 1980` (rather than testing `>= 1980`) keeps rows whose first
# season is unknown (NaN), matching how a plain `Year < 1980` filter treats
# missing years.
cl_data = cl_data[~(first_season < 1980)]

In [None]:
# Display the cleaned frame to inspect the result of the year filter.
cl_data

#### Load in the 'all_star_appearances.pickle' file and use this data to create a column called 'all_star' that indicates whether or not a player made the All-Star team for a given year
- This file is a dictionary in which the keys are players who've made an All-Star team in their careers
- The values are all the years that the corresponding player made an All-Star team

#### Make sure this column is binary where 1 = Made All-Star team, and 0 = Did not make All-Star team

In [None]:
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# come from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open("all_star_appearances.pickle", "rb") as pickle_file:
    asa_dict = pickle.load(pickle_file)
# Display the mapping of player -> All-Star years.
asa_dict

In [None]:
# Approach adapted from:
# https://stackoverflow.com/questions/42232728/pandas-creating-a-dataframe-from-a-dictionary
# Build one column per player holding that player's All-Star years. Each
# collection of years becomes its own Series so that players with different
# numbers of appearances can share a single frame.
appearance_columns = {}
for player_name, years in asa_dict.items():
    appearance_columns[player_name] = Series(list(years))
asa_df = pd.DataFrame(appearance_columns)
asa_df

In [None]:
# Approach adapted from:
# https://stackoverflow.com/questions/60333701/pandas-remove-index-after-stacking
# Reshape from wide (one column per player) to long (one row per player/year).
stacked_years = asa_df.stack()
# Drop the outer (row position) index level so only the player name remains.
years_by_player = stacked_years.reset_index(level=0, drop=True)
# Move the player name out of the index; the stacked values go in column 'year'.
asa_new = years_by_player.reset_index(name='year')
asa_new

In [None]:
# Every row in this frame is an All-Star appearance, so flag all of them with 1.
asa_new = asa_new.assign(all_star=1)
# Rename positionally so the key columns match the names used in the player data.
asa_new.columns = ["Player", "Year", "all_star"]
asa_new

In [None]:
# Guard against a (Player, Year) pair appearing more than once in the
# All-Star table, which would duplicate season rows in the left join.
all_star_keys = asa_new.drop_duplicates(subset=["Player", "Year"])
# Left join keeps every player-season; unmatched seasons get NaN in 'all_star'.
JL_DATA = cl_data.merge(all_star_keys, on=["Player", "Year"], how="left")
# Unmatched seasons are non-All-Star seasons. Cast to int so the label is a
# true 0/1 column rather than the float column the NaN fill leaves behind.
JL_DATA["all_star"] = JL_DATA["all_star"].fillna(0).astype(int)
JL_DATA

In [None]:
#df = pd.DataFrame.from_dict(asa_dict, orient="index")
#df = pd.DataFrame(asa_dict.items())

In [None]:
#pd.read_pickle("all_star_appearances.pickle")

## Part 2: Exploratory data analysis

#### Generate descriptive stats for the features

In [None]:
# Summary statistics for the columns pandas treats as numeric.
JL_DATA.describe()

In [None]:
# Column dtypes and non-null counts.
JL_DATA.info()

In [None]:
# If the data set is not too big, I use this all-in-one tool to get a better
# understanding of the data.
JL_DATA.profile_report()

In [None]:
# Save the profiling report in an easy-to-read HTML format.
profile = JL_DATA.profile_report(title='Pandas Profiling Report')
# The file is written to the current working directory.
profile.to_file("NBA data profiling.html")

#### Create a corr map of the features

In [None]:
# Correlate only numeric (and boolean) columns. JL_DATA also holds string
# columns (Player, Pos, Tm); recent pandas versions raise on those instead
# of silently skipping them, so select the usable columns explicitly.
numeric_data = JL_DATA.select_dtypes(include=["number", "bool"])
corr = numeric_data.corr()
# Diverging palette centred on 0 so positive and negative correlations are
# visually distinct; the colour scale is fixed to the full [-1, 1] range.
ax = sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)
# Rotate the x tick labels so long feature names do not overlap.
# The trailing semicolon suppresses the notebook's echo of the return value.
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);

In [None]:
# Pearson correlation (the pandas default), drawn large enough to annotate
# every cell with its coefficient.
plt.figure(figsize=(20,20))
# Correlate only numeric (and boolean) columns. JL_DATA also holds string
# columns (Player, Pos, Tm); recent pandas versions raise on those instead
# of silently skipping them, so select the usable columns explicitly.
corr = JL_DATA.select_dtypes(include=["number", "bool"]).corr()
sns.heatmap(corr, annot=True, cmap=sns.diverging_palette(20, 220, n=200))
plt.show()

#### Graph the distribution of the 'Age' feature

In [None]:
# Author's interpretation: the graph below makes sense - the younger you are,
# the more athletic you are. However, the correlation maps above show that age
# is not very strongly correlated with the other features.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases - confirm
# the installed version before relying on it.
sns.distplot(JL_DATA.Age)

## Part 3: Classification

### Build a classifier that predicts whether or not a player makes an All-Star team based on their stats for that season

In [None]:
# Pos and Tm are dropped because they are assumed not to affect a player's
# personal stat line much; Player is an identifier and all_star is the target.
non_feature_columns = ["all_star", "Player", "Pos", "Tm"]
X = JL_DATA.drop(columns=non_feature_columns)
y = JL_DATA["all_star"]

# Fixed 75/25 split with a fixed seed so results are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75, test_size=0.25, random_state=42
)

In [None]:
# The data set is small, so an AutoML search finishes quickly.
tpot = TPOTClassifier(
    generations=5,
    population_size=50,
    verbosity=2,
    random_state=42,
)
tpot.fit(X_train, y_train)
# Report the fitted pipeline's score on the held-out split.
print(tpot.score(X_test, y_test))
#tpot.export('tpot_all-star_pipeline.py')

In [None]:
# The classes are imbalanced, so search with the macro-averaged F1 metric
# instead of the default scoring.
tpot2 = TPOTClassifier(
    generations=5,
    population_size=50,
    verbosity=2,
    scoring="f1_macro",
    random_state=42,
)
tpot2.fit(X_train, y_train)
# Report the fitted pipeline's score on the held-out split.
print(tpot2.score(X_test, y_test))
#tpot.export('tpot_all-star_pipeline.py')

In [None]:
# Rebuild the feature matrix, this time keeping the remaining non-numeric
# columns; only the target and the player identifier are removed.
excluded_columns = ["all_star", "Player"]
X = JL_DATA.drop(columns=excluded_columns)
y = JL_DATA["all_star"]

# Same 75/25 split and seed as the earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75, test_size=0.25, random_state=42
)

In [None]:
# Positional indices of every column whose dtype is not float; these are the
# columns passed to CatBoost as categorical features.
is_non_float = X.dtypes != float
categ_features_index = np.flatnonzero(is_non_float)

In [None]:
# Hold out part of the training data for model selection. With
# use_best_model=True CatBoost keeps the iteration that scores best on
# eval_set; using the test set for that and then scoring on the same test set
# would make the reported score optimistically biased.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42
)

model = CatBoostClassifier(
    eval_metric='F1',
    use_best_model=True,
    verbose=200,
    random_seed=42,
)
model.fit(
    X_fit, y_fit,
    cat_features=categ_features_index,  # non-float columns are treated as categorical
    eval_set=(X_val, y_val),
    plot=True,
)
# Score on the untouched test split.
model.score(X_test, y_test)

In [None]:
# Quick alternative test: a deliberately tiny CatBoost model (5 iterations).

clf = CatBoostClassifier(
    iterations=5,
    learning_rate=0.1,
    #loss_function='CrossEntropy'
)

# X still contains the non-numeric columns (only 'all_star' and 'Player' were
# dropped), so they must be declared as categorical; without cat_features
# CatBoost cannot parse the string values as numeric features.
# NOTE(review): the test split doubles as eval_set here, so the score from
# this quick check is not an unbiased estimate.
clf.fit(
    X_train, y_train,
    cat_features=categ_features_index,
    eval_set=(X_test, y_test),
    verbose=False
)

print(f'CatBoost model is fitted: {clf.is_fitted()}')
print('CatBoost model parameters:')
print(clf.get_params())

In [None]:
# Score of the quick CatBoost model on the held-out split.
clf.score(X_test, y_test)