In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/draft-guide-by-sam-vecenie/draft_guide_data.csv


In [2]:
import pandas as pd
import numpy as np
import time

In [3]:
pd.set_option('display.max_rows', None)

## Read Data

In [4]:
df = pd.read_csv('/kaggle/input/draft-guide-by-sam-vecenie/draft_guide_data.csv')
print(df.shape)
df.head(1)

(288, 13)


Unnamed: 0,RANK,PLAYER,SCHOOL/TEAM,POS,AGE,HT,WING,TIER,STRENGTHS,WEAKNESSES,SUMMARY,TIER_DESCRIP,year
0,1,LaMelo Ball,Illawarra Hawks,G,19,6-6,6-9,5,Everything starts with Ball’s elite-level feel...,"The defense isn’t a sure thing, though, becaus...",Ball should enter the NBA as one of the most c...,All-Star Upside,2020


In [5]:
# Add Strengths and Weaknesses
df['full_text'] = df[['STRENGTHS', 'WEAKNESSES']].apply(lambda x: x[0] + ' ' + x[1], axis=1)
df.head(1)

Unnamed: 0,RANK,PLAYER,SCHOOL/TEAM,POS,AGE,HT,WING,TIER,STRENGTHS,WEAKNESSES,SUMMARY,TIER_DESCRIP,year,full_text
0,1,LaMelo Ball,Illawarra Hawks,G,19,6-6,6-9,5,Everything starts with Ball’s elite-level feel...,"The defense isn’t a sure thing, though, becaus...",Ball should enter the NBA as one of the most c...,All-Star Upside,2020,Everything starts with Ball’s elite-level feel...


## Preprocessing

In [36]:
test = df.sample(50, random_state=42)
# test = df[df['year'] == 2023]
test.sort_values('TIER', ascending=False).head()

Unnamed: 0,RANK,PLAYER,SCHOOL/TEAM,POS,AGE,HT,WING,TIER,STRENGTHS,WEAKNESSES,SUMMARY,TIER_DESCRIP,year,full_text
140,3,Cam Whitmore,Villanova,W,18,6-6,6-9,5,"Elite intersection of size, frame, athleticism...",Whitmore has all the physical tools to be an N...,Whitmore is a pure upside play. If you buy int...,All-Star Upside,2023,"Elite intersection of size, frame, athleticism..."
60,1,Cade Cunningham,Oklahoma State,G/W,19,6-8,7-1,5,Cunningham has elite-level size as a lead crea...,"Cunningham is a good athlete, but not a great ...",Cunningham is one of the more complete prospec...,All-Star Upside,2021,Cunningham has elite-level size as a lead crea...
216,4,Jaden Ivey,Purdue,G,20,6-4,,5,Was a consensus four-star recruit who was a to...,I don’t think I’d call Ivey a negative in term...,Ivey is all about how much you value athletic ...,All-Star Upside,2022,Was a consensus four-star recruit who was a to...
143,6,Jarace Walker,Houston,F,19,6-7,7-3,4,Walker has strong physical measurements for an...,How high you are on Walker probably depends on...,Walker is a fascinating player. It’s easy to i...,High-Leverage Starters,2023,Walker has strong physical measurements for an...
144,7,Taylor Hendricks,UCF,W/F,19,6-8,7-1,4,Hendricks is a good athlete with a strong fram...,"The main concern is that, for a potential top-...",I love Hendricks going to a team that is well-...,High-Leverage Starters,2023,Hendricks is a good athlete with a strong fram...


In [45]:
train = df[~df.index.isin(test.index)]
train.shape

(238, 14)

In [46]:
train.groupby('TIER')['PLAYER'].count()

TIER
1    97
2    79
3    34
4    15
5    13
Name: PLAYER, dtype: int64

In [47]:
# copy_dict = {3: 2, 4: 4, 5: 6} # 2023 split
copy_dict = {3: 2, 4: 5, 5: 5}

for i in range(3, 6):
    temp = train[train['TIER'] == i].copy()
    for j in range(copy_dict[i]):
        train = pd.concat([train, temp])
    train = train.reset_index().drop('index', axis=1)

In [48]:
train.groupby('TIER')['PLAYER'].count()

TIER
1     97
2     79
3    102
4     90
5     78
Name: PLAYER, dtype: int64

## Preprocessing


In [49]:
import torch
from torch.nn import functional as F
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)
from datasets import Dataset

In [50]:
MODEL = 'bert-base-uncased'
CONFIG = {
    "weight_decay": 0.01,
    "num_train_epochs": 10.0,
    "load_best_model_at_end": True,
    "metric_for_best_model": 'RMSE',
}

In [51]:
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [52]:
model.classifier = torch.nn.Linear(
    model.classifier.in_features,
    1
)

In [53]:
def preprocess_function(examples):
    tokens = tokenizer(examples['full_text'], 
                       truncation=True, 
                       max_length=512)
    if 'TIER' in examples:
        return {**tokens, "labels": [[float(x)] for x in examples['TIER']]}
    else:
        return tokens

In [54]:
train_dataset = (
    Dataset.from_pandas(train[['PLAYER', 'full_text', 'TIER']])
    .map(preprocess_function, batched=True)
    .remove_columns(['TIER'])
    .shuffle(seed=42)
    .train_test_split(seed=42)
)

test_dataset = (
    Dataset.from_pandas(test[['PLAYER', 'full_text', 'TIER']])
    .map(preprocess_function, batched=True)
    .remove_columns(['TIER'])
    .shuffle(seed=42)
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [55]:
example = train_dataset['train'][0]
example['PLAYER'], example['labels']

('Onyeka Okongwu', [4.0])

## Modeling

In [56]:
# from sklearn.metrics import mean_squared_error

In [57]:
def MCRMSE(y_pred, y_true):
    """mean columnwise root mean squared error"""
    colwise_root_mse = np.sqrt(
        np.power(y_true - y_pred, 2).mean(axis=0)
    )
    mean_colwise_root_mse = colwise_root_mse.mean()
    return mean_colwise_root_mse

In [58]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        # replace with regression loss
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        loss = F.mse_loss(outputs.get("logits"), labels)
        return (loss, outputs) if return_outputs else loss
    
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    return {"MCRMSE": MCRMSE(predictions, labels)}

In [59]:
os.environ["WANDB_DISABLED"] = "true"

torch.cuda.empty_cache()

In [60]:
run_name = f"fpell-{int(time.time())}"

print("=" * 50)
print(f"Starting run: {run_name}")
print("=" * 50)

training_args = TrainingArguments(
    run_name=run_name,
    output_dir="./results",
    save_total_limit=5,
    evaluation_strategy="steps",
    eval_steps=50,
    logging_steps=50,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    **CONFIG,
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset["train"],
    eval_dataset=train_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Starting run: fpell-1692113895


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Mcrmse
50,2.6497,3.377226,1.837723
100,1.3799,0.95577,0.977635
150,0.5451,0.576258,0.759117
200,0.2712,0.308505,0.555432
250,0.1544,0.411367,0.641379
300,0.097,0.298624,0.546465
350,0.0636,0.31588,0.562032
400,0.0543,0.26792,0.51761


TrainOutput(global_step=420, training_loss=0.6225582867860794, metrics={'train_runtime': 201.2686, 'train_samples_per_second': 16.595, 'train_steps_per_second': 2.087, 'total_flos': 878783034593280.0, 'train_loss': 0.6225582867860794, 'epoch': 10.0})

In [61]:
temp = trainer.predict(train_dataset['train'])
MCRMSE(temp[0], temp[1])

0.16275415

In [62]:
temp = trainer.predict(train_dataset['test'])
MCRMSE(temp[0], temp[1])

0.56923604

In [63]:
temp = trainer.predict(test_dataset)
MCRMSE(temp[0], temp[1])

0.91311663

In [64]:
data = []

for i in range(len(test_dataset)):
    data.append([test_dataset[i]['PLAYER'], 
                 test_dataset[i]['labels'][0],
                 temp[0][i][0]])
    
res = pd.DataFrame(data, columns=['PLAYER', 'TIER', 'pred'])
res['diff'] = res['TIER'] - res['pred']
res.sort_values('diff', ascending=False)

Unnamed: 0,PLAYER,TIER,pred,diff
43,Taylor Hendricks,4.0,1.643427,2.356573
18,Cade Cunningham,5.0,3.069614,1.930386
29,Jaden Ivey,5.0,3.150998,1.849002
24,Cam Whitmore,5.0,3.425466,1.574534
26,Deni Avdija,4.0,2.745518,1.254482
5,Dereck Lively II,3.0,1.873166,1.126834
19,Franz Wagner,3.0,1.880722,1.119278
47,Dariq Whitehead,2.0,1.031641,0.968359
0,Killian Hayes,4.0,3.091611,0.908389
10,Jarace Walker,4.0,3.122586,0.877414
