# 1. Initializations

## 1.1 General imports

In [None]:
### data management
import pandas as pd
import numpy as np

### régression
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression, ElasticNetCV
from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict, cross_val_score
from sklearn.feature_selection import f_regression
from sklearn.metrics import mean_squared_error

### graphical matplotlib basics
import matplotlib.pyplot as plt
# for jupyter notebook management
%matplotlib inline

### graphical seaborn basics
import seaborn as sns

## 1.2 General dataframe functions

In [None]:
import smartcheck.dataframe_common as dfc

## 1.3 General regression functions

In [None]:
# None

# 2. Loading and Data Quality

## 2.1 Loading of data sets and general exploration

In [None]:
# Load the raw NBA dataset through the project's config-driven loader
# (dataset key 'nba_data', comma-separated fields).
df_nba_raw = dfc.load_dataset_from_config('nba_data', sep=',')

# isinstance() is already False for None, so a single check covers both cases.
# NOTE(review): df_nba is only assigned inside this branch; if loading fails,
# the following cells will stop on a NameError.
if isinstance(df_nba_raw, pd.DataFrame):
    # Log a general overview of the frame, then its duplicates / missing values.
    dfc.log_general_info(df_nba_raw)
    nb_first, nb_total = dfc.detect_and_log_duplicates_and_missing(df_nba_raw)

    # presumably the two counters differ when some rows are duplicated more
    # than once - TODO confirm against the helper's documentation
    if nb_first != nb_total:
        print(dfc.duplicates_index_map(df_nba_raw))

    # Work on a copy so the raw frame is never modified.
    df_nba = df_nba_raw.copy()
    display(df_nba.head())

In [None]:
# Restrict to numeric columns once, then derive both summaries from that subset.
nba_numeric = df_nba.select_dtypes(include=np.number)

# Descriptive statistics and pairwise correlations of the numeric columns.
df_nba_desc = nba_numeric.describe()
df_nba_cr = nba_numeric.corr()

display(df_nba_desc)
display(df_nba_cr)

## 2.2 Data quality refinement

In [None]:
# Backup of the loaded data and duplicate-row removal.
# The backup is taken from df_nba_raw (df_nba is a plain copy of it at this
# point) instead of from df_nba itself, so re-running this cell after the
# cleaning cells below does not replace the backup with transformed data.
df_nba_orig = df_nba_raw.copy()
# Remove fully identical rows; the backup keeps them.
df_nba = df_nba_orig.drop_duplicates()

In [None]:
# Rework of the index, removal of unwanted columns/rows and encoding of the position.

# Columns excluded from the analysis.
columns_drop = ['fg', 'fga', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'ft', 'fta', 'season_end']
df_nba = df_nba.drop(columns=columns_drop)

# Use "<player> - <team id>" as the row label.
df_nba.index = df_nba['player'] + " - " + df_nba['bref_team_id']

# Drop every row holding at least one missing value, then the rows whose
# position is 'G' (treated as an invalid value by this notebook).
df_nba = df_nba.dropna(how='any')
df_nba = df_nba[df_nba['pos'] != 'G']

# One-hot encode the position. The dummy columns share df_nba's index, so they
# are concatenated side by side rather than joined: an index join would
# multiply rows whenever a "<player> - <team id>" label is not unique.
pl_pos_matrix = pd.get_dummies(df_nba['pos'], prefix='pos')
df_nba = pd.concat([df_nba, pl_pos_matrix], axis=1)

# The identifying / categorical columns are no longer needed once encoded.
df_nba = df_nba.drop(columns=['season', 'player', 'bref_team_id', 'pos'])


In [None]:
# Structure of the cleaned frame (dtypes, non-null counts).
df_nba.info()

# Recompute the summaries on the numeric columns of the cleaned frame.
nba_numeric = df_nba.select_dtypes(include=np.number)
df_nba_desc = nba_numeric.describe()
df_nba_cr = nba_numeric.corr()

display(df_nba_desc)
display(df_nba_cr)

# 2. Data Regression

## 2.1 General Analysis variable/target Separation

In [None]:
# Split the frame into the variable to predict (target) and the explanatory
# variables (features).
target = df_nba['pts']
# 'pts' is the target itself; the 'pos_SG' dummy column is left out as well.
data = df_nba.drop(columns=['pts', 'pos_SG'])

In [None]:
# Split into training data (80 %) and test data; the fixed random_state keeps
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    train_size=0.8,
    random_state=101,
)

## 2.2 Linear Regression (univariée)

In [None]:
# Standardise the explanatory variables of both splits.
# The scaler is fitted once, on the training data only, and the same fitted
# parameters are then applied to the training and the test data.
scaler = preprocessing.StandardScaler().fit(X_train)

# transform() returns a bare NumPy array: rebuild DataFrames with the original
# row labels and column names instead of assigning into the frames returned by
# train_test_split (which relies on positional column pairing and may raise
# SettingWithCopyWarning).
X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

In [None]:
# Correlation heatmap (seaborn) over every column of df_nba, i.e. the
# explanatory variables and the 'pts' target.
fig, ax = plt.subplots(figsize=(13, 13))
sns.heatmap(df_nba.corr(), annot=True, cmap="RdBu_r", center=0, ax=ax)
fig.tight_layout();

In [None]:
# Define and train the univariate model: 'pts' explained by the single
# feature 'mp'. fit() returns the estimator itself, so creation and training
# can be chained.
regLR = LinearRegression().fit(X_train[['mp']], y_train)

In [None]:
# Evaluate the univariate model ('mp' only) on the training and test data.
# The two R² lines are labelled train/test so they can be told apart, matching
# the labels used for the multivariate model further down.
print("Score R² train:", regLR.score(X_train[['mp']], y_train))
print("Score R² test:", regLR.score(X_test[['mp']], y_test))

# Predictions on both splits, used for the mean squared error.
y_train_pred = regLR.predict(X_train[['mp']])
y_test_pred = regLR.predict(X_test[['mp']])
print("Score MSE train:", mean_squared_error(y_train, y_train_pred))
print("Score MSE test:", mean_squared_error(y_test, y_test_pred))

In [None]:
# Univariate statistical test of each explanatory variable against the target,
# computed on the whole data set (data / target), not only the training split.
# NB: this shows correlation only; it proves neither causality nor importance.
f_statistics, p_values = f_regression(data, target)
for position, column in enumerate(data.columns):
    f = f_statistics[position]
    p = p_values[position]
    print (f"[{column}]\n [F-Stat : {f.round(2)}] [P-Value : {p.round(6)}]")

## 2.3 Linear Regression (multivariée)

### 2.3.1 Affinage par Elastic Net
- Combine les avantages de Lasso et Ridge

In [None]:
# Multivariate linear model with Elastic Net regularisation; alpha and
# l1_ratio are both selected by 8-fold cross-validation over the grids below.
regLR_multi_EN = ElasticNetCV(
    l1_ratio=(0.1, 0.25, 0.5, 0.7, 0.75, 0.8, 0.85, 0.9, 0.99),
    alphas=(0.001, 0.01, 0.02, 0.025, 0.05, 0.1, 0.25, 0.5, 0.8, 1.0),
    cv=8,
    max_iter=15000 # higher iteration cap; the alternative is: warnings.filterwarnings('ignore')
)
regLR_multi_EN.fit(X_train, y_train)

# Report both hyper-parameters chosen by the search, not only alpha.
print("Alpha retenu par cross validation:", regLR_multi_EN.alpha_)
print("L1 ratio retenu par cross validation:", regLR_multi_EN.l1_ratio_)

# R² and mean squared error on the training and test data.
print("Score R² train:", regLR_multi_EN.score(X_train, y_train))
print("Score R² test:", regLR_multi_EN.score(X_test, y_test))
y_train_pred = regLR_multi_EN.predict(X_train)
y_test_pred  = regLR_multi_EN.predict(X_test)
print("Score MSE train:", mean_squared_error(y_train, y_train_pred))
print("Score MSE test:", mean_squared_error(y_test, y_test_pred))

In [None]:
# Fitted parameters of the multivariate model: the intercept, then one
# (feature name, coefficient rounded to 2 decimals) row per column.
print("l'intercept calculé par le modèle:", regLR_multi_EN.intercept_)
coeff_rows = [
    (feature, float(weight.round(2)))
    for feature, weight in zip(X_test.columns, regLR_multi_EN.coef_)
]
df_coeff = pd.DataFrame(coeff_rows)
print("les coeff du modèle multivarié:",df_coeff)

In [None]:
# Side-by-side view of observed points and (rounded) predicted points for the
# first 10 rows of the test data; y_test_pred holds the latest predictions
# computed above.
observed_vs_predicted = pd.DataFrame(
    {
        'points_observés': y_test,
        'points_predits': np.round(y_test_pred),
    },
    index=X_test.index,
)
display(observed_vs_predicted.head(10))