In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
# NOTE(review): this also hides genuine chained-assignment mistakes in later cells.
pd.options.mode.chained_assignment = None 

# Use seaborn's "whitegrid" style for every figure drawn below.
sns.set_style("whitegrid")

# Render matplotlib figures inline, beneath the cell that creates them.
%matplotlib inline

# Import Data

In [None]:
import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# Let's take a look at the Datasets

In [None]:
# Load ranking.csv and preview its first rows.
# `ranking` is not referenced again in the cells shown in this notebook.
ranking = pd.read_csv("/kaggle/input/nba-games/ranking.csv")
ranking.head()

In [None]:
# Load players.csv and list its column names (not referenced again in the cells shown here).
players = pd.read_csv("/kaggle/input/nba-games/players.csv")
players.columns

In [None]:
# Load games_details.csv and list its column names (not referenced again in the cells shown here).
game_details = pd.read_csv("/kaggle/input/nba-games/games_details.csv")
game_details.columns

# Feature Engineering (Games)

In [None]:
# Load games.csv; every later cell in this notebook derives its data from `games`.
games = pd.read_csv("/kaggle/input/nba-games/games.csv")
games.head()

In [None]:
# Summarise column dtypes and non-null counts.
games.info()

In [None]:
# Duplicates and unnecessary columns.
# `columns=` is used instead of a positional axis (`..., 1`): pandas 2.0 made every
# DataFrame.drop argument after `labels` keyword-only, so the positional form raises TypeError.
games.drop(columns=['GAME_STATUS_TEXT', 'TEAM_ID_home', 'TEAM_ID_away', "GAME_ID"], inplace=True)

In [None]:
# Give the raw columns the shorter names used throughout the rest of the notebook.
games.rename(
    mapper={
        # game metadata
        "GAME_DATE_EST": "date",
        "SEASON": "season",
        "HOME_TEAM_ID": "home_team",
        "VISITOR_TEAM_ID": "away_team",
        "HOME_TEAM_WINS": "home_W",
        # home box score
        "PTS_home": "home_pts",
        "FG_PCT_home": "home_fg%",
        "FT_PCT_home": "home_ft%",
        "FG3_PCT_home": "home_3p%",
        "AST_home": "home_ast",
        "REB_home": "home_reb",
        # away box score
        "PTS_away": "away_pts",
        "FG_PCT_away": "away_fg%",
        "FT_PCT_away": "away_ft%",
        "FG3_PCT_away": "away_3p%",
        "AST_away": "away_ast",
        "REB_away": "away_reb",
    },
    axis="columns",
    inplace=True,
)

In [None]:
# Summary statistics with one row per column; the constant-looking `count` column is dropped.
# `columns=` replaces the positional axis argument that pandas 2.0 no longer accepts.
games.describe().transpose().drop(columns='count')

# Feature Engineering (Teams)

In [None]:
# Preview `games` after the column drop and rename above.
games.head()

In [None]:
# Load teams.csv; the next cell reads its TEAM_ID and ABBREVIATION columns.
teams = pd.read_csv("/kaggle/input/nba-games/teams.csv")
teams.head()

In [None]:
# Replace the numeric team ids in `games` with team abbreviations.
# A single dict lookup per column replaces the per-team masked `.loc` assignment, which
# wrote strings into an integer column (deprecated in recent pandas releases).
team_ids = teams[['TEAM_ID', 'ABBREVIATION']]
id_to_abbreviation = dict(zip(team_ids['TEAM_ID'], team_ids['ABBREVIATION']))

for side in ('home_team', 'away_team'):
    # Series.replace leaves ids without a match untouched, as the masked assignment did.
    games[side] = games[side].replace(id_to_abbreviation)

In [None]:
games['date']= pd.to_datetime(games['date'])
games['month'] = games['date'].apply(lambda date:date.month)
games['year'] = games['date'].apply(lambda date:date.year)

In [None]:
games.head()

# Sacramento Kings Analysis

In [None]:
# Keep only the games Sacramento played, at home or away.
# .copy() makes `sac_games` an independent frame: later cells add and drop columns on it,
# which would otherwise trigger SettingWithCopyWarning (currently silenced at the top of
# the notebook).
involves_sac = (games['home_team'] == 'SAC') | (games['away_team'] == 'SAC')
sac_games = games[involves_sac].copy()
sac_games.head()

In [None]:
# Count the missing values in each column of the Sacramento subset.
sac_games.isnull().sum()

In [None]:
# Show the rows that contain at least one missing value.
sac_games[sac_games.isna().any(axis=1)]

In [None]:
# Drop every row with a missing value. Rebinding the name (instead of inplace=True) avoids
# mutating a frame that was produced by filtering `games`.
sac_games = sac_games.dropna()

# A bare expression in the middle of a cell is never displayed, so print the check explicitly.
print(sac_games.isnull().sum())
print(sac_games.shape)

In [None]:
def is_home(home_tm, team='SAC'):
    """Return 1 when `team` is the home side of a game, otherwise 0.

    Parameters
    ----------
    home_tm : str
        Abbreviation of the home team.
    team : str, default 'SAC'
        Abbreviation of the team being analysed. The default preserves the
        original Sacramento-only behaviour.

    Returns
    -------
    int
        1 if `home_tm` equals `team`, else 0.
    """
    return 1 if home_tm == team else 0

def opponent(home_tm, away_tm, team='SAC'):
    """Return the abbreviation of the side `team` is playing against.

    Parameters
    ----------
    home_tm : str
        Abbreviation of the home team.
    away_tm : str
        Abbreviation of the away team.
    team : str, default 'SAC'
        Abbreviation of the team being analysed. The default preserves the
        original Sacramento-only behaviour.

    Returns
    -------
    str or int
        The other team's abbreviation, or 0 when `team` is on neither side.
    """
    if home_tm == team:
        return away_tm
    if away_tm == team:
        return home_tm
    # `team` did not take part in this game.
    return 0

def sac_wins(home_team, home_pts, away_pts):
    """Return 1 when the analysed team outscored its opponent, otherwise 0.

    Parameters
    ----------
    home_team : int
        Home flag, despite the name: 1 when the analysed team is the home
        side, any other value when it is the away side.
    home_pts : number
        Points scored by the home side.
    away_pts : number
        Points scored by the away side.

    Returns
    -------
    int
        1 for a win, 0 otherwise. Equal scores count as "not a win" for both
        the home and the away case; previously an away tie was reported as a win.
    """
    if home_team == 1:
        team_pts, opp_pts = home_pts, away_pts
    else:
        team_pts, opp_pts = away_pts, home_pts
    return 1 if team_pts > opp_pts else 0

In [None]:
# Derive the Sacramento-centric columns: home flag, opponent abbreviation, and win flag.
sac_games['is_home'] = sac_games['home_team'].apply(is_home)
sac_games['vs'] = [
    opponent(home, away)
    for home, away in zip(sac_games['home_team'], sac_games['away_team'])
]
sac_games['sac_W'] = [
    sac_wins(flag, home_score, away_score)
    for flag, home_score, away_score in zip(
        sac_games['is_home'], sac_games['home_pts'], sac_games['away_pts']
    )
]

In [None]:
def set_attr(is_home, attr_home, attr_away):
    """Pick the home-side value when `is_home` equals 1, otherwise the away-side value."""
    if is_home == 1:
        return attr_home
    return attr_away

In [None]:
# Build the Sacramento ("sac_*") and opponent ("vs_*") version of every box-score stat.
# One vectorised Series.where per column replaces twelve copy-pasted row-wise apply() calls;
# the columns are created in the same order as before (all sac_* first, then all vs_*).
box_score_stats = ['pts', 'ast', 'reb', 'fg%', 'ft%', '3p%']
sac_is_home = sac_games['is_home'] == 1

# Sacramento's numbers: the home_* column when they host, the away_* column otherwise.
for stat in box_score_stats:
    sac_games[f'sac_{stat}'] = sac_games[f'home_{stat}'].where(sac_is_home, sac_games[f'away_{stat}'])

# The opponent's numbers are the mirror image.
for stat in box_score_stats:
    sac_games[f'vs_{stat}'] = sac_games[f'away_{stat}'].where(sac_is_home, sac_games[f'home_{stat}'])

In [None]:
# Drop the home/away columns now that their sac_*/vs_* equivalents exist.
# `columns=` replaces the positional axis argument that pandas 2.0 no longer accepts.
sac_games.drop(
    columns=[
        'home_team', 'away_team', 'season',
        'home_pts', 'home_fg%', 'home_ft%', 'home_3p%', 'home_ast', 'home_reb',
        'away_pts', 'away_fg%', 'away_ft%', 'away_3p%', 'away_ast', 'away_reb',
        'home_W',
    ],
    inplace=True,
)

In [None]:
sac_games

In [None]:
sac_games.corr()

In [None]:
# Compute the numeric correlation matrix once and reuse it for both outputs.
# numeric_only=True keeps corr() from failing on the non-numeric columns (pandas 2.0+).
corr_matrix = sac_games.corr(numeric_only=True)

# Correlation of every numeric column with the win flag, strongest positive first.
# Printed explicitly because a bare expression in the middle of a cell is not displayed.
print(corr_matrix['sac_W'].sort_values(ascending=False))

plt.figure(figsize=(12,8))
sns.heatmap(corr_matrix, annot=True, cmap='viridis')

In [None]:
# Bar chart of each numeric column's correlation with the win flag, lowest first.
# numeric_only=True keeps corr() from failing on the non-numeric columns (pandas 2.0+).
plt.figure(figsize=(12,8))
sac_games.corr(numeric_only=True)['sac_W'].sort_values().plot(kind='bar')

In [None]:
# Count plot: number of games per is_home value, split by the sac_W flag.
plt.figure(figsize=(12, 8))
sns.countplot(data=sac_games, x='is_home', hue='sac_W')

In [None]:
plt.figure(figsize=(12,8))
sns.distplot(sac_games['sac_pts'],color="Purple")

In [None]:
plt.figure(figsize=(12,8))
sns.distplot(sac_games['sac_ast'],color="Purple")

In [None]:
plt.figure(figsize=(12,8))
sns.distplot(sac_games['sac_reb'],color="Purple")

In [None]:
plt.figure(figsize=(12,8))
sns.distplot(sac_games['sac_fg%'],color="Purple")

In [None]:
plt.figure(figsize=(12,8))
sns.distplot(sac_games['sac_ft%'],color="Purple")

In [None]:
plt.figure(figsize=(12,8))
sns.distplot(sac_games['sac_3p%'],color="Purple")

In [None]:
plt.figure(figsize=(12,8))
sns.distplot(sac_games['vs_pts'])

In [None]:
plt.figure(figsize=(12,8))
sns.distplot(sac_games['vs_ast'])

In [None]:
plt.figure(figsize=(12,8))
sns.distplot(sac_games['vs_reb'])

In [None]:
plt.figure(figsize=(12,8))
sns.distplot(sac_games['vs_fg%'])

In [None]:
plt.figure(figsize=(12,8))
sns.distplot(sac_games['vs_ft%'])

In [None]:
plt.figure(figsize=(12,8))
sns.distplot(sac_games['vs_3p%'])

In [None]:
# Games played against each opponent, most frequent first.
# The series is passed as x=: seaborn 0.12+ treats the first positional argument as `data`.
plt.figure(figsize=(12,8))
sns.countplot(x=sac_games['vs'], order=sac_games['vs'].value_counts().index)

In [None]:
g = sns.FacetGrid(sac_games,hue="sac_W",palette='coolwarm',height=6,aspect=2)
g = g.map(plt.hist,'sac_pts',bins=20,alpha=0.7)
g.add_legend()

In [None]:
sns.lmplot('sac_pts','vs_pts',data=sac_games,hue='sac_W')

In [None]:
g = sns.FacetGrid(sac_games,hue="sac_W",palette='coolwarm',height=6,aspect=2)
g = g.map(plt.hist,'sac_ast',bins=20,alpha=0.7)
g.add_legend()

In [None]:
sns.lmplot('sac_ast','vs_ast',data=sac_games,hue='sac_W')

In [None]:
g = sns.FacetGrid(sac_games,hue="sac_W",palette='coolwarm',height=6,aspect=2)
g = g.map(plt.hist,'sac_reb',bins=20,alpha=0.7)
g.add_legend()

In [None]:
sns.scatterplot(x='sac_reb',y='vs_reb',data=sac_games,hue='sac_W')

In [None]:
g = sns.FacetGrid(sac_games,hue="sac_W",palette='coolwarm',height=6,aspect=2)
g = g.map(plt.hist,'sac_fg%',bins=20,alpha=0.7)
g.add_legend()

In [None]:
sns.scatterplot(x='sac_fg%',y='vs_fg%',data=sac_games,hue='sac_W')

In [None]:
g = sns.FacetGrid(sac_games,hue="sac_W",palette='coolwarm',height=6,aspect=2)
g = g.map(plt.hist,'sac_ft%',bins=20,alpha=0.7)
g.add_legend()

In [None]:
sns.scatterplot(x='sac_ft%',y='vs_ft%',data=sac_games,hue='sac_W')

In [None]:
g = sns.FacetGrid(sac_games,hue="sac_W",palette='coolwarm',height=6,aspect=2)
g = g.map(plt.hist,'sac_3p%',bins=20,alpha=0.7)
g.add_legend()

In [None]:
sns.scatterplot(x='sac_3p%',y='vs_3p%',data=sac_games,hue='sac_W')

In [None]:
# Pairwise scatter plots of the columns that remain after the listed ones are removed.
# `columns=` replaces the positional axis argument that pandas 2.0 no longer accepts.
sns.pairplot(sac_games.drop(columns=['sac_pts', 'date', 'month', 'year', 'vs']))

# Predicting Points per Game (Regression Analysis)

In [None]:
sac_games.head()

In [None]:
X = sac_games.drop(['sac_pts', 'date', 'month', 'year', 'vs'],1)
y = sac_games['sac_pts']

In [None]:
X

In [None]:
y

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the games for evaluation; the fixed seed keeps the split repeatable.
(X_train, X_test, y_train, y_test) = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
)

## Linear Regression for PPG

In [None]:
# Linear Regression (Estimate how many ppg the Kings have)
from sklearn.linear_model import LinearRegression
# Fit ordinary least squares on the training split; the trailing name displays the estimator.
lm = LinearRegression().fit(X_train, y_train)
lm

In [None]:
# Intercept first, then one coefficient per feature: the change in predicted points for a
# one-unit increase in that feature while the other features are held fixed.
print(lm.intercept_)
coeff_df = pd.DataFrame({'Coefficient': lm.coef_}, index=X.columns)
coeff_df.round(2)

Interpreting the coefficients:

* Holding all other features fixed, if the Kings are playing at home, they score 1.3 more points per game.
* Holding all other features fixed, if the Kings win the game, they score 8.47 more points per game.
* Holding all other features fixed, for every assist the Kings make, that translates into an increase of 0.38 points per game.
* Holding all other features fixed, for every rebound the Kings make, that translates into an increase of 0.4 points per game.
* Holding all other features fixed, a one-unit increase in FG% translates into an increase of 8.67 points per game.
* Holding all other features fixed, a one-unit increase in FT% translates into an increase of 1.5 points per game.
* Holding all other features fixed, a one-unit increase in 3P% translates into an increase of 1.1 points per game.


In [None]:
# Predictions
predictions = lm.predict(X_test)
plt.scatter(y_test,predictions)

In [None]:
sns.distplot((y_test-predictions),bins=50);

In [None]:
from sklearn import metrics
print('Linear Regression MAE:', metrics.mean_absolute_error(y_test, predictions))
print('Linear Regression MSE:', metrics.mean_squared_error(y_test, predictions))
print('Linear Regression RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))
print('Linear Regression R2:', round(r2_score(y_test, predictions), 2))

## Decision Tree Regressor for PPG

In [None]:
from sklearn.tree import DecisionTreeRegressor

dtree = DecisionTreeRegressor().fit(X_train, y_train)
dtree_predict = dtree.predict(X_test)
plt.scatter(y_test,dtree_predict)

In [None]:
sns.distplot((y_test-dtree_predict),bins=50);

In [None]:
from sklearn import metrics
print('Decision Tree MAE:', metrics.mean_absolute_error(y_test, dtree_predict))
print('Decision Tree MSE:', metrics.mean_squared_error(y_test, dtree_predict))
print('Decision Tree RMSE:', np.sqrt(metrics.mean_squared_error(y_test, dtree_predict)))
print('Decision Tree R2:', round(r2_score(y_test, dtree_predict), 2))

In [None]:
# Random Forest Regressor (Estimate of how much a house is being sold based on features)
    # Regressor works by predicting the the average of all decision trees.
from sklearn.ensemble import RandomForestRegressor

regressor = RandomForestRegressor(n_estimators=100, max_depth=5)
regressor.fit(X_train, y_train)

regressor_predict = regressor.predict(X_test)

# Graph Random Forest
plt.scatter(y_test,regressor_predict)

In [None]:
sns.distplot((y_test-regressor_predict),bins=50);

In [None]:
from sklearn import metrics
print('Random Forest MAE:', metrics.mean_absolute_error(y_test, regressor_predict))
print('Random Forest MSE:', metrics.mean_squared_error(y_test, regressor_predict))
print('Random Forest RMSE:', np.sqrt(metrics.mean_squared_error(y_test, regressor_predict)))
print('Random Forest R2:', round(r2_score(y_test, regressor_predict), 2) )

## XGBoost Regressor for PPG

In [None]:
import xgboost as xgb
from sklearn.metrics import r2_score, auc

In [None]:
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)

In [None]:
print('XGB MAE:', metrics.mean_absolute_error(y_test, xgb_pred))
print('XGB MSE:', metrics.mean_squared_error(y_test, xgb_pred))
print('XGB RMSE:', np.sqrt(metrics.mean_squared_error(y_test, xgb_pred)))
print('XGB R2:', round(r2_score(y_test, xgb_pred), 2) )

In [None]:
plt.scatter(y_test,xgb_pred)

In [None]:
sns.distplot((y_test-xgb_pred),bins=50);

## Catboost Regressor for PPG

In [None]:
from catboost import CatBoostRegressor
cat_model = CatBoostRegressor(iterations=10,
                          learning_rate=.3,
                          depth=10)

cat_model.fit(X_train, y_train)
cat_pred = model.predict(X_test)

In [None]:
print('Cat MAE:', metrics.mean_absolute_error(y_test, cat_pred))
print('Cat MSE:', metrics.mean_squared_error(y_test, cat_pred))
print('Cat RMSE:', np.sqrt(metrics.mean_squared_error(y_test, cat_pred)))
print('Cat R2:', round(r2_score(y_test, cat_pred), 2) )

In [None]:
plt.scatter(y_test,cat_pred)

In [None]:
sns.distplot((y_test-cat_pred),bins=50);

## AdaBoost Regressor for PPG

In [None]:
from sklearn.ensemble import AdaBoostRegressor

ada_model = AdaBoostRegressor(random_state=0, n_estimators=100)
ada_model.fit(X_train, y_train)
ada_pred = ada_model.predict(X_test)

In [None]:
print('Ada MAE:', metrics.mean_absolute_error(y_test, ada_pred))
print('Ada MSE:', metrics.mean_squared_error(y_test, ada_pred))
print('Ada RMSE:', np.sqrt(metrics.mean_squared_error(y_test, ada_pred)))
print('Ada R2:', round(r2_score(y_test, ada_pred), 2) )

In [None]:
plt.scatter(y_test,ada_pred)

In [None]:
sns.distplot((y_test-ada_pred),bins=50);

# Predicting Wins and Losses (Classification Analysis)

In [None]:
# Features for the win/loss classifiers: everything except the target and the
# date/opponent descriptors. `columns=` replaces the positional axis argument that
# pandas 2.0 no longer accepts.
# NOTE(review): X still contains sac_pts and vs_pts, and sac_W was derived by comparing the
# two scores, so the classifiers can read the result straight off them. Confirm this is intended.
X = sac_games.drop(columns=['sac_W', 'date', 'month', 'year', 'vs'])
# Target: 1 when Sacramento won the game, 0 otherwise.
y = sac_games['sac_W']

## Logistic Regression for Wins and Losses

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the games for evaluation; the fixed seed keeps the split repeatable.
(X_train, X_test, y_train, y_test) = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# Standardise the features with statistics learned from the training split only,
# then apply the same transformation to both splits.
scaler = StandardScaler().fit(X_train)
scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [None]:
# Logistic Regression (0 OR 1: Predict if the Kings won or lost)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Fit logistic regression (default settings) on the standardised training features.
logmodel = LogisticRegression().fit(scaled_X_train, y_train)

# Predictions
log_pred = logmodel.predict(scaled_X_test)

# Evaluation: classification report first, then the confusion matrix.
print(classification_report(y_true=y_test, y_pred=log_pred))
print(confusion_matrix(y_true=y_test, y_pred=log_pred))

## Decision Tree Classifier for Wins and Losses

In [None]:
# Decision Tree (0 or 1: predict if the Kings won or lost)
from sklearn.tree import DecisionTreeClassifier
# random_state fixes the randomness in the tree's split selection so reruns give the same report.
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train,y_train)
dtree_pred = dtree.predict(X_test)

print(classification_report(y_test,dtree_pred))
print(confusion_matrix(y_test,dtree_pred))

In [None]:
# Random Forest Classifier (0 or 1: predict if the Kings won or lost)
from sklearn.ensemble import RandomForestClassifier
# random_state fixes the bootstrap/feature sampling so reruns give the same report.
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train,y_train)
rfc_pred = rfc.predict(X_test)

print(classification_report(y_test,rfc_pred))
print(confusion_matrix(y_test,rfc_pred))

## SVC with Grid Search Wins and Losses

In [None]:
# SVC (0 OR 1: predict if the Kings won or lost)
# Baseline model with default hyper-parameters, kept for comparison with the tuned one.
from sklearn.svm import SVC
svc_model = SVC()
svc_model.fit(X_train,y_train)
svc_model_pred = svc_model.predict(X_test)

# Grid Search CV over C and gamma for an RBF kernel.
# refit=True retrains the best combination on the whole training split, so `grid`
# can be used directly for prediction afterwards.
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.1,1, 10, 100, 1000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']}
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=3)
grid.fit(X_train,y_train)
# Printed explicitly: bare expressions in the middle of a cell are never displayed.
print(grid.best_params_)
print(grid.best_estimator_)
grid_predictions = grid.predict(X_test)

In [None]:
print(classification_report(y_test,grid_predictions))
print(confusion_matrix(y_test,grid_predictions))

In [None]:
print(classification_report(y_test,svc_model_pred))
print(confusion_matrix(y_test,svc_model_pred))