# 3 Pointers Made against shooting.xlsx

### Import packages

In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
%matplotlib inline
from scipy.stats import pearsonr
import itertools

# Lift the cap on displayed columns so wide frames are shown in full
pd.options.display.max_columns = None

### Set working directory

In [None]:
# Folder the notebook switches into before reading any data
project_dir = '/Users/tyler/OneDrive/Documents/Python/NBA'

# Report the directory the notebook started in
cwd = os.getcwd()
print(f'Directory: {cwd}')

# Switch to the project folder and report again to confirm the change
os.chdir(project_dir)
cwd = os.getcwd()
print(f'Directory: {cwd}')

## Exploratory Data Analysis

### Import data

In [None]:
# Read the game totals, then discard the 'Unnamed: 0' column
# (presumably an index saved alongside the data - verify against the CSV)
df = pd.read_csv('backend/data/totals/game_totals.csv')
df = df.drop(columns=['Unnamed: 0'])

### Basic exploration

In [None]:
# Print column names, dtypes and non-null counts
df.info()

In [None]:
# Preview five randomly selected rows
df.sample(5)

In [None]:
# Parse the 'date' column into datetimes
df['date'] = pd.to_datetime(df['date'])

# Replace the 'team' flag with a team name: the home team where the flag is
# truthy, the visitor otherwise
# NOTE(review): assumes 'team' arrives as a home/away indicator - confirm against the CSV.
# Re-running this cell after 'team' holds names would mark every row as the home team.
home_flag = df['team']
df['team'] = np.where(home_flag, df['home'], df['visitor'])

df.info()

## Dataframe of team's latest performance

In [None]:
# Return latest date team played before the given date
def last_date(team, date):
    """Return the date of the game `team` played immediately before `date`.

    The team's games are read from the module-level ``df`` (columns 'team'
    and 'date') and ordered by date.

    Returns None when `date` is the team's earliest game in ``df``.
    Raises IndexError when the team has no game on `date`.
    """
    team_games = df[df['team'] == team].sort_values(by='date').reset_index()
    position = team_games.loc[team_games['date'] == date].index[0]
    if position == 0:
        # No earlier game to look back to
        return None
    return team_games['date'].iloc[position - 1]

# Date of the previous game for every (team, date) row
df['last_date'] = df.apply(lambda row: last_date(row['team'], row['date']), axis=1)

# X and y column names to merge on
y_cols = ['date', 'visitor', 'home', 'team', '3p', 'last_date']
x_cols = ['date', 'team', 'fg', 'fga', 'fg_perc', '2p', '2pa', '2p_perc',
          '3p', '3pa', '3p_perc', 'efg_perc', 'ast', 'ast_perc']

y = df[y_cols]
X = df[x_cols]

# Dataframe of target (3pt made by each team, suffixed '_y') and of variables
# (that team's stats from its previous game, unsuffixed)
last_game = pd.merge(y, X, left_on=['last_date', 'team'], right_on=['date', 'team'], how='left', suffixes=('_y', ''))
last_game = last_game.drop(['last_date', 'date'], axis=1)

# Drop a game entirely when any of its team rows has a missing value.
# Dropping only the incomplete row would leave the other team's row behind, and
# the "totals" below would then describe a single team instead of the whole game.
game_keys = ['date_y', 'visitor', 'home']
has_gap = last_game.isna().any(axis=1)
incomplete_game = has_gap.groupby([last_game[key] for key in game_keys]).transform('any')
last_game = last_game[~incomplete_game]

# Totals for target and variables: counting stats are summed across the teams,
# percentage stats are averaged
agg_spec = {'3p_y': 'sum'}
agg_spec.update({col: 'mean' if col.endswith('_perc') else 'sum'
                 for col in x_cols if col not in ('date', 'team')})
last_game = last_game.groupby(game_keys).aggregate(agg_spec)

## Dataframe of team's last 5 performances

In [None]:
# Return the five latest dates team played before the given date
def last_5_date(team, date):
    """Return the dates of the five games `team` played before `date`.

    The team's games are read from the module-level ``df`` (columns 'team'
    and 'date') and ordered by date. The result is a 5-tuple ordered from the
    most recent previous game to the oldest.

    Returns a tuple of five None values when the team has fewer than five
    earlier games. Raises IndexError when the team has no game on `date`.
    """
    window = 5
    team_games = df[df['team'] == team].sort_values(by='date').reset_index()
    position = team_games.loc[team_games['date'] == date].index[0]
    if position < window:
        return (None,) * window
    game_dates = team_games['date']
    return tuple(game_dates.iloc[position - back] for back in range(1, window + 1))

# One suffix per look-back game: '_1' is the most recent previous game
weeks = ['_1', '_2', '_3', '_4', '_5']

# Look up each row's five previous game dates, then spread the tuple over
# df['date_1'] .. df['date_5']
df['dates'] = df.apply(lambda row: last_5_date(row['team'], row['date']), axis=1)
for position, week in enumerate(weeks):
    df['date' + week] = df['dates'].apply(lambda dates, position=position: dates[position])

# X and y column names to merge on
y_cols = df.columns
x_cols = ['date', 'team', 'fg', 'fga', 'fg_perc', '2p', '2pa', '2p_perc',
          '3p', '3pa', '3p_perc', 'efg_perc', 'ast', 'ast_perc']
stats = [col for col in x_cols if col not in ('date', 'team')]

last_5_games = df[y_cols].copy()
last_5_games['target'] = last_5_games['3p']
X = df[x_cols]

# Dataframe of target (3pt made by each team) and of variables (last 5 games stats for each team);
# each merge appends one earlier game's stats, suffixed with that game's position
for week in weeks:
    last_5_games = pd.merge(last_5_games, X, left_on=['date' + week, 'team'], right_on=['date', 'team'], how='left', suffixes=('', week))

# Remove the current game's own stats and the helper date columns
drop_cols = ['quarter'] + stats + ['dates'] + ['date' + week for week in weeks]
last_5_games = last_5_games.drop(drop_cols, axis=1)

# Drop a game entirely when any of its team rows has a missing value.
# Dropping only the incomplete row would leave the other team's row behind, and
# the "totals" below would then describe a single team instead of the whole game.
game_keys = ['date', 'visitor', 'home']
has_gap = last_5_games.isna().any(axis=1)
incomplete_game = has_gap.groupby([last_5_games[key] for key in game_keys]).transform('any')
last_5_games = last_5_games[~incomplete_game].copy()

# Take average of last 5 games
for stat in stats:
    last_5_games[stat] = last_5_games[[stat + week for week in weeks]].mean(axis=1)

drop_cols = [stat + week for stat, week in itertools.product(stats, weeks)]
last_5_games = last_5_games.drop(drop_cols, axis=1)

# Totals for target and variables: counting stats are summed across the teams,
# percentage stats are averaged
agg_spec = {'target': 'sum'}
agg_spec.update({stat: 'mean' if stat.endswith('_perc') else 'sum' for stat in stats})
last_5_games = last_5_games.groupby(game_keys).aggregate(agg_spec)

## Dataframe of team's last 10 performances

In [None]:
# Return the ten latest dates team played before the given date
def last_10_date(team, date):
    """Return the dates of the ten games `team` played before `date`.

    The team's games are read from the module-level ``df`` (columns 'team'
    and 'date') and ordered by date. The result is a 10-tuple ordered from the
    most recent previous game to the oldest.

    Returns a tuple of ten None values when the team has fewer than ten
    earlier games. Raises IndexError when the team has no game on `date`.
    """
    window = 10
    team_games = df[df['team'] == team].sort_values(by='date').reset_index()
    position = team_games.loc[team_games['date'] == date].index[0]
    if position < window:
        return (None,) * window
    game_dates = team_games['date']
    return tuple(game_dates.iloc[position - back] for back in range(1, window + 1))

# One suffix per look-back game: '_1' is the most recent previous game
weeks = ['_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10']

# Look up each row's ten previous game dates, then spread the tuple over
# df['date_1'] .. df['date_10']
df['dates'] = df.apply(lambda row: last_10_date(row['team'], row['date']), axis=1)
for position, week in enumerate(weeks):
    df['date' + week] = df['dates'].apply(lambda dates, position=position: dates[position])

# X and y column names to merge on
y_cols = df.columns
x_cols = ['date', 'team', 'fg', 'fga', 'fg_perc', '2p', '2pa', '2p_perc',
          '3p', '3pa', '3p_perc', 'efg_perc', 'ast', 'ast_perc']
stats = [col for col in x_cols if col not in ('date', 'team')]

last_10_games = df[y_cols].copy()
last_10_games['target'] = last_10_games['3p']
X = df[x_cols]

# Dataframe of target (3pt made by each team) and of variables (last 10 games stats for each team);
# each merge appends one earlier game's stats, suffixed with that game's position
for week in weeks:
    last_10_games = pd.merge(last_10_games, X, left_on=['date' + week, 'team'], right_on=['date', 'team'], how='left', suffixes=('', week))

# Remove the current game's own stats and the helper date columns
drop_cols = ['quarter'] + stats + ['dates'] + ['date' + week for week in weeks]
last_10_games = last_10_games.drop(drop_cols, axis=1)

# Drop a game entirely when any of its team rows has a missing value.
# Dropping only the incomplete row would leave the other team's row behind, and
# the "totals" below would then describe a single team instead of the whole game.
game_keys = ['date', 'visitor', 'home']
has_gap = last_10_games.isna().any(axis=1)
incomplete_game = has_gap.groupby([last_10_games[key] for key in game_keys]).transform('any')
last_10_games = last_10_games[~incomplete_game].copy()

# Take average of last 10 games
for stat in stats:
    last_10_games[stat] = last_10_games[[stat + week for week in weeks]].mean(axis=1)

drop_cols = [stat + week for stat, week in itertools.product(stats, weeks)]
last_10_games = last_10_games.drop(drop_cols, axis=1)

# Totals for target and variables: counting stats are summed across the teams,
# percentage stats are averaged
agg_spec = {'target': 'sum'}
agg_spec.update({stat: 'mean' if stat.endswith('_perc') else 'sum' for stat in stats})
last_10_games = last_10_games.groupby(game_keys).aggregate(agg_spec)

## Dataframe of team's last 15 performances

In [None]:
# Return the fifteen latest dates team played before the given date
def last_15_date(team, date):
    """Return the dates of the fifteen games `team` played before `date`.

    The team's games are read from the module-level ``df`` (columns 'team'
    and 'date') and ordered by date. The result is a 15-tuple ordered from the
    most recent previous game to the oldest.

    Returns a tuple of fifteen None values when the team has fewer than
    fifteen earlier games. Raises IndexError when the team has no game on
    `date`.
    """
    window = 15
    team_games = df[df['team'] == team].sort_values(by='date').reset_index()
    position = team_games.loc[team_games['date'] == date].index[0]
    if position < window:
        return (None,) * window
    game_dates = team_games['date']
    return tuple(game_dates.iloc[position - back] for back in range(1, window + 1))

# Look up each row's fifteen previous game dates, then spread the tuple over
# df['date_1'] .. df['date_15'] (date_1 is the most recent previous game)
df['dates'] = df.apply(lambda row: last_15_date(row['team'], row['date']), axis=1)
for offset in range(15):
    df[f'date_{offset + 1}'] = df['dates'].apply(lambda dates, offset=offset: dates[offset])

### Unweighted

In [None]:
# X and y column names to merge on
y_cols = df.columns
x_cols = ['date', 'team', 'fg', 'fga', 'fg_perc', '2p', '2pa', '2p_perc',
          '3p', '3pa', '3p_perc', 'efg_perc', 'ast', 'ast_perc']
stats = [col for col in x_cols if col not in ('date', 'team')]

# One suffix per look-back game: '_1' is the most recent previous game.
# Relies on df['date_1'] .. df['date_15'] already being present.
weeks = ['_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10', '_11', '_12', '_13', '_14', '_15']

last_15_games_unweighted = df[y_cols].copy()
last_15_games_unweighted['target'] = last_15_games_unweighted['3p']
X = df[x_cols]

# Dataframe of target (3pt made by each team) and of variables (last 15 games stats for each team);
# each merge appends one earlier game's stats, suffixed with that game's position
for week in weeks:
    last_15_games_unweighted = pd.merge(last_15_games_unweighted, X, left_on=['date' + week, 'team'], right_on=['date', 'team'], how='left', suffixes=('', week))

# Remove the current game's own stats and the helper date columns
drop_cols = ['quarter'] + stats + ['dates'] + ['date' + week for week in weeks]
last_15_games_unweighted = last_15_games_unweighted.drop(drop_cols, axis=1)

# Drop a game entirely when any of its team rows has a missing value.
# Dropping only the incomplete row would leave the other team's row behind, and
# the "totals" below would then describe a single team instead of the whole game.
game_keys = ['date', 'visitor', 'home']
has_gap = last_15_games_unweighted.isna().any(axis=1)
incomplete_game = has_gap.groupby([last_15_games_unweighted[key] for key in game_keys]).transform('any')
last_15_games_unweighted = last_15_games_unweighted[~incomplete_game].copy()

# Take average of last 15 games
for stat in stats:
    last_15_games_unweighted[stat] = last_15_games_unweighted[[stat + week for week in weeks]].mean(axis=1)

drop_cols = [stat + week for stat, week in itertools.product(stats, weeks)]
last_15_games_unweighted = last_15_games_unweighted.drop(drop_cols, axis=1)

# Totals for target and variables: counting stats are summed across the teams,
# percentage stats are averaged
agg_spec = {'target': 'sum'}
agg_spec.update({stat: 'mean' if stat.endswith('_perc') else 'sum' for stat in stats})
last_15_games_unweighted = last_15_games_unweighted.groupby(game_keys).aggregate(agg_spec)

### Weighted

In [None]:
# X and y column names to merge on
y_cols = df.columns
x_cols = ['date', 'team', 'fg', 'fga', 'fg_perc', '2p', '2pa', '2p_perc',
          '3p', '3pa', '3p_perc', 'efg_perc', 'ast', 'ast_perc']
stats = [col for col in x_cols if col not in ('date', 'team')]

# One suffix per look-back game: '_1' is the most recent previous game.
# Relies on df['date_1'] .. df['date_15'] already being present.
weeks = ['_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10', '_11', '_12', '_13', '_14', '_15']

last_15_games_weighted = df[y_cols].copy()
last_15_games_weighted['target'] = last_15_games_weighted['3p']
X = df[x_cols]

# Dataframe of target (3pt made by each team) and of variables (last 15 games stats for each team);
# each merge appends one earlier game's stats, suffixed with that game's position
for week in weeks:
    last_15_games_weighted = pd.merge(last_15_games_weighted, X, left_on=['date' + week, 'team'], right_on=['date', 'team'], how='left', suffixes=('', week))

# Remove the current game's own stats and the helper date columns
drop_cols = ['quarter'] + stats + ['dates'] + ['date' + week for week in weeks]
last_15_games_weighted = last_15_games_weighted.drop(drop_cols, axis=1)

# Drop a game entirely when any of its team rows has a missing value.
# Dropping only the incomplete row would leave the other team's row behind, and
# the "totals" below would then describe a single team instead of the whole game.
game_keys = ['date', 'visitor', 'home']
has_gap = last_15_games_weighted.isna().any(axis=1)
incomplete_game = has_gap.groupby([last_15_games_weighted[key] for key in game_keys]).transform('any')
last_15_games_weighted = last_15_games_weighted[~incomplete_game].copy()

# Take weighted average of last 15 games: games 1-5 get weight 1, games 6-10
# weight 2, games 11-15 weight 3
# NOTE(review): '_1' is the most recent game, so this gives the OLDEST games the
# largest weight - confirm that is intended rather than the reverse.
multiples = {0: 1, 1: 2, 2: 3}
weights = {week: multiples[(int(week.strip('_')) - 1) // 5] for week in weeks}
# Divide by the total weight, not the number of games, so the result stays on
# the same scale as a single game's stat
total_weight = sum(weights.values())
for stat in stats:
    weighted_sum = sum(last_15_games_weighted[stat + week] * weight for week, weight in weights.items())
    last_15_games_weighted[stat] = weighted_sum / total_weight

drop_cols = [stat + week for stat, week in itertools.product(stats, weeks)]
last_15_games_weighted = last_15_games_weighted.drop(drop_cols, axis=1)

# Totals for target and variables: counting stats are summed across the teams,
# percentage stats are averaged
agg_spec = {'target': 'sum'}
agg_spec.update({stat: 'mean' if stat.endswith('_perc') else 'sum' for stat in stats})
last_15_games_weighted = last_15_games_weighted.groupby(game_keys).aggregate(agg_spec)

## Correlations of performances (last 1, last 5, last 10, last 15 unweighted and weighted)

In [None]:
# Every feature frame to correlate, as (label for the 'last' column, frame,
# name of the target column in that frame), in reporting order
windows = [
    ('15_weighted', last_15_games_weighted, 'target'),    # last 15 games, weighted
    ('15_unweighted', last_15_games_unweighted, 'target'),  # last 15 games, unweighted
    (10, last_10_games, 'target'),                        # last 10 games
    (5, last_5_games, 'target'),                          # last 5 games
    (1, last_game, '3p_y'),                               # last game only
]

# Pearson correlation (and its p-value) of every column against 3pt made.
# Rows are collected in a list and turned into a frame once, because
# DataFrame.append was removed in pandas 2.0.
rows = []
for label, frame, target_col in windows:
    for col in frame:
        corr_p = pearsonr(frame[target_col], frame[col])
        rows.append({'last': label, 'stat': col, 'corr': round(corr_p[0], 2), 'p-value': round(corr_p[1], 2)})

corr_df = pd.DataFrame(rows, columns=['last', 'stat', 'corr', 'p-value'])

# Print each correlation, one table per stat with one row per look-back window
for stat in stats:
    print(f'Stat: {stat}')
    print(corr_df[corr_df['stat'] == stat].set_index(['last']).drop(['stat'], axis=1))
    print('\n')