## Model Training Trial 1 

- The first model implementation uses Logistic Regression


## 📈 DB Visualization

Here we list all the tables, their shapes, and columns
The following cells verify the table contents and save them as html files for easier inspections

In [None]:
import sqlite3
import pandas as pd

# Connect to SQLite DB
conn = sqlite3.connect("../data/sqlite/nba_mvp.db")

# ---------------------------
# 1. List all tables
# ---------------------------
print("📋 Tables in database:")
tables = pd.read_sql(
    "SELECT name FROM sqlite_master WHERE type='table';", conn
)
print(tables)

# ---------------------------
# 2. View table row/column counts
# ---------------------------
def get_table_info(table_name):
    df = pd.read_sql(f"SELECT * FROM {table_name} LIMIT 5;", conn)
    row_count = pd.read_sql(f"SELECT COUNT(*) as rows FROM {table_name};", conn).iloc[0]["rows"]
    print(f"\n🧾 Table: {table_name}")
    print(f"   ➤ Rows: {row_count}")
    print(f"   ➤ Columns: {len(df.columns)}")
    print(f"   ➤ Column names: {list(df.columns)}")

# Inspect key tables
for t in tables["name"]:
    get_table_info(t)

table_count = pd.read_sql("SELECT COUNT(*) as count FROM sqlite_master WHERE type='table';", conn)
print("🧮 Number of tables in the database:", table_count.iloc[0]['count'])
# Close connection
conn.close()



📋 Tables in database:
                  name
0  player_season_stats
1        player_id_map
2  engineered_features
3             mvp_list
4    final_player_data

🧾 Table: player_season_stats
   ➤ Rows: 1150
   ➤ Columns: 23
   ➤ Column names: ['RK', 'Name', 'POS', 'GP', 'MIN', 'PTS', 'FGM', 'FGA', 'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'REB', 'AST', 'STL', 'BLK', 'TO', 'DD2', 'TD3', 'season']

🧾 Table: player_id_map
   ➤ Rows: 287
   ➤ Columns: 2
   ➤ Column names: ['player_id', 'player_name']

🧾 Table: engineered_features
   ➤ Rows: 1150
   ➤ Columns: 28
   ➤ Column names: ['RK', 'Name', 'POS', 'GP', 'MIN', 'PTS', 'FGM', 'FGA', 'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'REB', 'AST', 'STL', 'BLK', 'TO', 'DD2', 'TD3', 'season', 'team_name', 'player_name', 'player_id', 'team_win_pct', 'team_rank']

🧾 Table: mvp_list
   ➤ Rows: 24
   ➤ Columns: 2
   ➤ Column names: ['Year', 'Player']

🧾 Table: final_player_data
   ➤ Rows: 1150
   ➤ Columns: 29
   ➤ Column names: ['RK', 'Na

In [2]:
import sqlite3

# Connect to the database
conn = sqlite3.connect("../data/sqlite/nba_mvp.db")
cursor = conn.cursor()

# Query column names from final_player_data
cursor.execute("PRAGMA table_info(final_player_data);")
columns_info = cursor.fetchall()

# Extract just the column names into a list
column_names = [col[1] for col in columns_info]
 
# Print the result
print(column_names)

# Close connection
conn.close()


['RK_x', 'Name', 'POS_x', 'GP_x', 'MIN_x', 'PTS_x', 'FGM_x', 'FGA_x', 'FG%_x', '3PM_x', '3PA_x', '3P%_x', 'FTM_x', 'FTA_x', 'FT%_x', 'REB_x', 'AST_x', 'STL_x', 'BLK_x', 'TO_x', 'DD2_x', 'TD3_x', 'season', 'RK_y', 'POS_y', 'GP_y', 'MIN_y', 'PTS_y', 'FGM_y', 'FGA_y', 'FG%_y', '3PM_y', '3PA_y', '3P%_y', 'FTM_y', 'FTA_y', 'FT%_y', 'REB_y', 'AST_y', 'STL_y', 'BLK_y', 'TO_y', 'DD2_y', 'TD3_y', 'team_name', 'player_name', 'player_id', 'team_win_pct', 'team_rank']


In [7]:
import sqlite3
import pandas as pd

# Connect to the database
conn = sqlite3.connect("../data/sqlite/nba_mvp.db")

# Read a few rows from the table
df = pd.read_sql("SELECT * FROM final_player_data LIMIT 5;", conn)
df.to_csv("../data/final_player_data.csv", index=False)


# Display the result
print(df)

# Close the connection
conn.close()


  RK                 Name POS  GP   MIN   PTS   FGM   FGA   FG%  3PM  ...  \
0  -     Allen IversonPHI  SG  60  43.7  31.4  11.1  27.8  39.8  1.3  ...   
1  -  Shaquille O'NealLAL   C  67  36.1  27.2  10.6  18.3  57.9  0.0  ...   
2  -       Paul PierceBOS  SF  82  40.3  26.1   8.6  19.5  44.2  2.6  ...   
3  -     Tracy McGradyORL  SG  76  38.3  25.6   9.4  20.9  45.1  1.4  ...   
4  -         Tim DuncanSA   C  82  40.6  25.5   9.3  18.3  50.8  0.0  ...   

   BLK   TO DD2 TD3 season team_name       player_name player_id team_win_pct  \
0  0.2  4.0   4   1   2002       PHI     Allen Iverson         7     0.524390   
1  2.0  2.6  40   0   2002       LAL  Shaquille O'Neal       253     0.707317   
2  1.0  2.9  17   0   2002       BOS       Paul Pierce       234          NaN   
3  1.0  2.5  24   1   2002       ORL     Tracy McGrady       270          NaN   
4  2.5  3.2  67   0   2002        SA        Tim Duncan       265          NaN   

  team_rank  
0       4.0  
1       2.0  
2       

Saving html

In [8]:
import pandas as pd

# Load CSV
df = pd.read_csv("../data/final_player_data.csv")

# Convert to HTML and save
html_table = df.to_html(index=False)
with open("../data/cleaned_final_player_data.html", "w", encoding="utf-8") as f:
    f.write(html_table)

print("✅ HTML table saved to final_player_data.html")


✅ HTML table saved to final_player_data.html


In [11]:
import sqlite3
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report

# Connect and load data
conn = sqlite3.connect("../data/sqlite/nba_mvp.db")
df = pd.read_sql("SELECT * FROM final_player_data", conn)
conn.close()

print("Shape:", df.shape)
print("Columns:", df.columns.tolist())


Shape: (1150, 29)
Columns: ['RK', 'Name', 'POS', 'GP', 'MIN', 'PTS', 'FGM', 'FGA', 'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'REB', 'AST', 'STL', 'BLK', 'TO', 'DD2', 'TD3', 'season', 'team_name', 'player_name', 'player_id', 'team_win_pct', 'team_rank', 'is_mvp']


Define Features and Target

In [12]:
features = [
    'GP', 'MIN', 'PTS', 'FGM', 'FGA', 'FG%', '3PM',
    '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'REB', 'AST', 'STL', 'BLK', 'TO', 
    'DD2', 'TD3'
]

target = "is_mvp"

Data Cleaning and Splitting

In [20]:
df_model = df[features + [target] + ["season", "player_id", "Name"]].dropna()

# Split full DataFrame so we still have season info
train_df, test_df = train_test_split(df_model, test_size=0.2, random_state=42, stratify=df_model[target])

# Extract X and y
X_train = train_df[features]
y_train = train_df[target]

X_test = test_df[features]
y_test = test_df[target]


In [21]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


Train Logistic Regression Model

In [22]:
model = LogisticRegression(max_iter=10000)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,10000


In [23]:
# Predict probability of being MVP
test_df["mvp_prob"] = model.predict_proba(X_test)[:, 1]


In [24]:
# Sort by season and predicted probability
season_rankings = (
    test_df.sort_values(["season", "mvp_prob"], ascending=[True, False])
    .groupby("season")
    .head(5)  # Top 5 MVP candidates per season
)

# Display a few
season_rankings[["season", "Name", "mvp_prob", "is_mvp"]].head(10)


Unnamed: 0,season,Name,mvp_prob,is_mvp
1,2002,Shaquille O'NealLAL,0.004589,0
28,2002,Sam CassellMIL,0.002384,0
35,2002,Nick Van ExelDAL/DEN,0.000136,0
36,2002,Eddie JonesMIA,6.8e-05,0
41,2002,Bonzi WellsPOR,5.5e-05,0
82,2003,Jason KiddNJ,0.01057,0
53,2003,Shaquille O'NealLAL,0.00838,0
89,2003,Steve NashDAL,0.004911,0
66,2003,Shawn MarionPHX,0.001496,0
93,2003,Jason TerryATL,0.00066,0


In [28]:
# Filter rows where is_mvp == 1
mvp_rows = season_rankings[season_rankings['is_mvp'] == 1]

# Count how many MVPs are in the ranking results
mvp_count = len(mvp_rows)

print(f"🏆 Number of actual MVPs in the ranked test set: {mvp_count}")
mvp_rows


🏆 Number of actual MVPs in the ranked test set: 4


Unnamed: 0,GP,MIN,PTS,FGM,FGA,FG%,3PM,3PA,3P%,FTM,...,STL,BLK,TO,DD2,TD3,is_mvp,season,player_id,Name,mvp_prob
232,79,35.4,18.8,6.8,13.4,51.2,1.9,4.3,43.9,3.3,...,0.8,0.2,3.5,43,1,1,2006,261,Steve NashPHX,0.12135
600,81,38.5,32.0,10.5,20.8,50.3,2.4,6.1,39.1,8.7,...,1.3,0.7,3.5,27,3,1,2014,169,Kevin DurantOKC,0.447108
800,72,35.4,30.4,9.0,20.1,44.9,3.7,10.0,36.7,8.7,...,1.8,0.7,4.4,31,4,1,2018,133,James HardenHOU,0.065779
1050,66,34.6,33.1,11.0,20.1,54.8,1.0,3.0,33.0,10.0,...,1.0,1.7,3.4,39,1,1,2023,148,Joel EmbiidPHI,0.398481


In [29]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Track predictions
true_labels = []
pred_labels = []

for season, group in season_rankings.groupby("season"):
    # Predict: player with highest prob
    predicted_idx = group["mvp_prob"].idxmax()
    predicted_mvp = group.loc[predicted_idx]

    # Ground truth: actual MVP
    actual_mvp = group[group["is_mvp"] == 1]

    # Store 1 if prediction was correct, else 0
    true_labels.append(1)  # one MVP per season
    pred_labels.append(1 if predicted_mvp["is_mvp"] == 1 else 0)

# Metrics
accuracy = sum([t == p for t, p in zip(true_labels, pred_labels)]) / len(true_labels)
precision = precision_score(true_labels, pred_labels)
recall = recall_score(true_labels, pred_labels)
f1 = f1_score(true_labels, pred_labels)

print(f"🎯 Accuracy: {accuracy:.2f}")
print(f"🔍 Precision: {precision:.2f}")
print(f"📈 Recall: {recall:.2f}")
print(f"🏅 F1 Score: {f1:.2f}")


🎯 Accuracy: 0.13
🔍 Precision: 1.00
📈 Recall: 0.13
🏅 F1 Score: 0.23


Evaluate Model

In [30]:
# Assuming 'model' is your trained logistic regression model
# and X_train.columns gives the feature names

import pandas as pd

coefficients = model.coef_[0]
feature_names = X_train.columns

# Create DataFrame of feature weights
weights_df = pd.DataFrame({
    "Feature": feature_names,
    "Weight": coefficients
}).sort_values(by="Weight", ascending=False)

print(weights_df)


   Feature    Weight
13     AST  0.990969
12     REB  0.759421
15     BLK  0.545415
6      3PM  0.526505
2      PTS  0.465149
16      TO  0.303264
3      FGM  0.256028
0       GP  0.208779
11     FT%  0.116196
10     FTA  0.094497
5      FG%  0.043429
8      3P%  0.019422
4      FGA  0.003015
17     DD2 -0.041650
18     TD3 -0.151542
14     STL -0.191844
1      MIN -0.206543
9      FTM -0.448534
7      3PA -0.494897


View Feature Weights