In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/draft-guide-by-sam-vecenie/draft_guide_data.csv


In [2]:
import pandas as pd
import numpy as np
import time
from tqdm.auto import tqdm

In [3]:
pd.set_option('display.max_rows', None)

## Read Data

In [4]:
df = pd.read_csv('/kaggle/input/draft-guide-by-sam-vecenie/draft_guide_data.csv')
print(df.shape)
df.head(1)

(288, 13)


Unnamed: 0,RANK,PLAYER,SCHOOL/TEAM,POS,AGE,HT,WING,TIER,STRENGTHS,WEAKNESSES,SUMMARY,TIER_DESCRIP,year
0,1,LaMelo Ball,Illawarra Hawks,G,19,6-6,6-9,5,Everything starts with Ball’s elite-level feel...,"The defense isn’t a sure thing, though, becaus...",Ball should enter the NBA as one of the most c...,All-Star Upside,2020


In [5]:
# Add Strengths and Weaknesses
df['full_text'] = df[['STRENGTHS', 'WEAKNESSES']].apply(lambda x: x[0] + ' ' + x[1], axis=1)
df.head(1)

Unnamed: 0,RANK,PLAYER,SCHOOL/TEAM,POS,AGE,HT,WING,TIER,STRENGTHS,WEAKNESSES,SUMMARY,TIER_DESCRIP,year,full_text
0,1,LaMelo Ball,Illawarra Hawks,G,19,6-6,6-9,5,Everything starts with Ball’s elite-level feel...,"The defense isn’t a sure thing, though, becaus...",Ball should enter the NBA as one of the most c...,All-Star Upside,2020,Everything starts with Ball’s elite-level feel...


In [6]:
df['TIER'] = df['TIER'] - 1

## Preprocessing

In [7]:
# test = df.sample(50, random_state=42)
df_2023 = df[df['year'] == 2023]
# test.sort_values('TIER', ascending=False).head()

In [8]:
val = df_2023.sample(40, random_state=42)
test = df_2023[~df_2023.index.isin(val.index)]
val.shape, test.shape

((40, 14), (35, 14))

In [9]:
train = df[~df.index.isin(df_2023.index)]\
        .reset_index().drop('index', axis=1)
train.shape

(213, 14)

In [10]:
# data augmentation
train_copy = train.copy()
train_copy['full_text'] = train_copy[['STRENGTHS', 'WEAKNESSES']].apply(lambda x: x[1] + ' ' + x[0], axis=1)
train = pd.concat([train, train_copy])\
          .reset_index().drop('index', axis=1)
train.shape

(426, 14)

In [11]:
train_count_dict = train.groupby('TIER')['PLAYER'].count().to_dict()
train_count_dict

{0: 144, 1: 162, 2: 68, 3: 30, 4: 22}

In [12]:
max_n = max(train_count_dict.values())

for i in range(5):
    diff = max_n - train_count_dict[i]
    if diff != 0:
        random_sample = train[train['TIER'] == i].sample(diff, 
                                                         random_state=42,
                                                         replace=True)
        train = pd.concat([train, random_sample])\
                  .reset_index().drop('index', axis=1)

In [13]:
train.groupby('TIER')['PLAYER'].count().to_dict()

{0: 162, 1: 162, 2: 162, 3: 162, 4: 162}

In [14]:
labels = df['TIER_DESCRIP'].unique()
id2label = {row[0]: row[1]
            for row in df[['TIER', 'TIER_DESCRIP']].drop_duplicates().values}
label2id = {row[1]: row[0]
            for row in df[['TIER', 'TIER_DESCRIP']].drop_duplicates().values}

## Preprocessing


In [15]:
import torch
from torch.nn import functional as F
from transformers import (
    AdamW,
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    get_scheduler,
    TrainingArguments,
    Trainer
)
from datasets import Dataset

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [16]:
MODEL = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification\
        .from_pretrained(MODEL, 
                         id2label=id2label,
                         label2id=label2id)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [17]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [18]:
def preprocess_function(examples):
    tokens = tokenizer(examples['full_text'], 
                       truncation=True, 
                       max_length=512)
    if 'TIER' in examples:
        labels_matrix = [[0.0 if examples['TIER'][i] != j else 1.0
                          for j in range(len(labels))]
                         for i in range(len(examples['TIER']))]
        return {**tokens, "labels": labels_matrix}
    else:
        return tokens

In [19]:
train_dataset = (
    Dataset.from_pandas(train[['PLAYER', 'full_text', 'TIER']])
    .map(preprocess_function, batched=True)
    .remove_columns(['TIER'])
    .shuffle(seed=42)
)

val_dataset = (
    Dataset.from_pandas(val[['PLAYER', 'full_text', 'TIER']])
    .map(preprocess_function, batched=True)
    .remove_columns(['TIER'])
    .shuffle(seed=42)
)

test_dataset = (
    Dataset.from_pandas(test[['PLAYER', 'full_text', 'TIER']])
    .map(preprocess_function, batched=True)
    .remove_columns(['TIER'])
    .shuffle(seed=42)
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [20]:
example = train_dataset[0]
example['PLAYER'], example['labels']

('Kessler Edwards', [0.0, 1.0, 0.0, 0.0, 0.0])

## Modeling

In [21]:
from sklearn.metrics import f1_score, accuracy_score, log_loss, mean_squared_error
from sklearn.metrics import classification_report

In [22]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        # replace with regression loss
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss = F.cross_entropy(F.softmax(logits), labels)
        return (loss, outputs) if return_outputs else loss
    
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = F.softmax(torch.tensor(logits))
    labels_adj = [np.argmax(l) for l in labels]
    predictions_adj = [np.sum([j * p[j] for j in range(5)]) for p in predictions]
    return {"RMSE": log_loss(labels, predictions)}

In [23]:
os.environ["WANDB_DISABLED"] = "true"

torch.cuda.empty_cache()

In [24]:
CONFIG = {
    "weight_decay": 0.01,
    "num_train_epochs": 20.0,
    "load_best_model_at_end": True,
    "metric_for_best_model": 'eval_loss',
    "lr_scheduler_type": 'cosine_with_restarts',
}

In [25]:
run_name = f"fpell-{int(time.time())}"

print("=" * 50)
print(f"Starting run: {run_name}")
print("=" * 50)

training_args = TrainingArguments(
    run_name=run_name,
    output_dir="./results",
    save_total_limit=5,
    evaluation_strategy="steps",
    eval_steps=50,
    logging_steps=50,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    **CONFIG,
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Starting run: fpell-1692299957


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  loss = F.cross_entropy(F.softmax(logits), labels)


Step,Training Loss,Validation Loss,Rmse
50,1.6082,1.684392,2.091438
100,1.6122,1.543905,1.406828
150,1.6131,1.592554,1.564471
200,1.6145,1.618742,1.660774
250,1.6109,1.613146,1.628839
300,1.6102,1.619589,1.662499
350,1.6104,1.602047,1.582334
400,1.6117,1.605593,1.594971
450,1.612,1.612702,1.632167
500,1.6124,1.612608,1.626051


  predictions = F.softmax(torch.tensor(logits))
  loss = F.cross_entropy(F.softmax(logits), labels)
  predictions = F.softmax(torch.tensor(logits))
  loss = F.cross_entropy(F.softmax(logits), labels)
  predictions = F.softmax(torch.tensor(logits))
  loss = F.cross_entropy(F.softmax(logits), labels)
  predictions = F.softmax(torch.tensor(logits))
  loss = F.cross_entropy(F.softmax(logits), labels)
  predictions = F.softmax(torch.tensor(logits))
  loss = F.cross_entropy(F.softmax(logits), labels)
  predictions = F.softmax(torch.tensor(logits))
  loss = F.cross_entropy(F.softmax(logits), labels)
  predictions = F.softmax(torch.tensor(logits))
  loss = F.cross_entropy(F.softmax(logits), labels)
  predictions = F.softmax(torch.tensor(logits))
  loss = F.cross_entropy(F.softmax(logits), labels)
  predictions = F.softmax(torch.tensor(logits))
  loss = F.cross_entropy(F.softmax(logits), labels)
  predictions = F.softmax(torch.tensor(logits))
  loss = F.cross_entropy(F.softmax(logits), labels)


TrainOutput(global_step=2040, training_loss=1.525042730219224, metrics={'train_runtime': 941.4943, 'train_samples_per_second': 17.207, 'train_steps_per_second': 2.167, 'total_flos': 4262513907916800.0, 'train_loss': 1.525042730219224, 'epoch': 20.0})

In [26]:
temp = trainer.predict(train_dataset)
log_loss(temp[1], F.softmax(torch.tensor(temp[0])))

  loss = F.cross_entropy(F.softmax(logits), labels)


  predictions = F.softmax(torch.tensor(logits))
  log_loss(temp[1], F.softmax(torch.tensor(temp[0])))


0.7643030398933989

In [27]:
temp = trainer.predict(val_dataset)
log_loss(temp[1], F.softmax(torch.tensor(temp[0])))

  loss = F.cross_entropy(F.softmax(logits), labels)


  predictions = F.softmax(torch.tensor(logits))
  log_loss(temp[1], F.softmax(torch.tensor(temp[0])))


1.663374484099032

In [28]:
data = []

for i in range(len(val_dataset)):
    data.append([val_dataset[i]['PLAYER'], 
                 temp[1][i],
                 F.softmax(torch.tensor(temp[0][i]))])
    
res = pd.DataFrame(data, columns=['PLAYER', 'TIER', 'pred'])
res['label'] = res['TIER'].apply(lambda x: np.argmax(x))
res['pred_label'] = res['pred'].apply(lambda x: np.sum([i*x[i] for i in range(5)]))
res['diff'] = np.abs(res['label'] - res['pred_label'])

res['log_loss'] = res[['TIER', 'pred']].apply(lambda x: log_loss(x[0], x[1]), 
                                              axis=1)
res.sort_values('diff', ascending=False)


  F.softmax(torch.tensor(temp[0][i]))])


Unnamed: 0,PLAYER,TIER,pred,label,pred_label,diff,log_loss
15,Isaiah Wong,"[1.0, 0.0, 0.0, 0.0, 0.0]","[tensor(0.0107), tensor(0.0215), tensor(0.0369...",0,3.017234,3.017234,1.270397
10,Jordan Walsh,"[1.0, 0.0, 0.0, 0.0, 0.0]","[tensor(0.0253), tensor(0.0355), tensor(0.0893...",0,2.843919,2.843919,1.071304
34,Brandon Miller,"[0.0, 0.0, 0.0, 0.0, 1.0]","[tensor(0.1838), tensor(0.1262), tensor(0.4605...",4,1.777184,2.222816,0.869414
38,Brice Sensabaugh,"[0.0, 1.0, 0.0, 0.0, 0.0]","[tensor(0.0060), tensor(0.0143), tensor(0.0221...",1,3.180345,2.180345,1.158276
4,Gregory “G.G.” Jackson,"[0.0, 1.0, 0.0, 0.0, 0.0]","[tensor(0.0077), tensor(0.0166), tensor(0.0277...",1,3.081936,2.081936,1.16743
3,Drew Timme,"[1.0, 0.0, 0.0, 0.0, 0.0]","[tensor(0.1555), tensor(0.1146), tensor(0.4441...",0,1.907292,1.907292,0.578213
35,Kobe Bufkin,"[0.0, 0.0, 1.0, 0.0, 0.0]","[tensor(0.8757), tensor(0.0668), tensor(0.0483...",2,0.193037,1.806963,1.03875
19,Andre Jackson,"[0.0, 1.0, 0.0, 0.0, 0.0]","[tensor(0.0300), tensor(0.0395), tensor(0.1068...",1,2.801796,1.801796,0.965103
2,Jordan Hawkins,"[0.0, 1.0, 0.0, 0.0, 0.0]","[tensor(0.0310), tensor(0.0403), tensor(0.1099...",1,2.793668,1.793668,0.958785
13,Julian Strawther,"[1.0, 0.0, 0.0, 0.0, 0.0]","[tensor(0.2124), tensor(0.1361), tensor(0.4625...",0,1.665477,1.665477,0.503846


In [29]:
temp = trainer.predict(test_dataset)
log_loss(temp[1], F.softmax(torch.tensor(temp[0])))

  loss = F.cross_entropy(F.softmax(logits), labels)


  predictions = F.softmax(torch.tensor(logits))
  log_loss(temp[1], F.softmax(torch.tensor(temp[0])))


1.4006093021717154

In [30]:
data = []

for i in range(len(test_dataset)):
    data.append([test_dataset[i]['PLAYER'], 
                 temp[1][i],
                 F.softmax(torch.tensor(temp[0][i]))])
    
res = pd.DataFrame(data, columns=['PLAYER', 'TIER', 'pred'])
res['label'] = res['TIER'].apply(lambda x: np.argmax(x))
res['pred_label'] = res['pred'].apply(lambda x: np.sum([i*x[i] for i in range(5)]))
res['diff'] = np.abs(res['label'] - res['pred_label'])

res['log_loss'] = res[['TIER', 'pred']].apply(lambda x: log_loss(x[0], x[1]), 
                                              axis=1)
res.sort_values('diff', ascending=False)


  F.softmax(torch.tensor(temp[0][i]))])


Unnamed: 0,PLAYER,TIER,pred,label,pred_label,diff,log_loss
11,Terquavion Smith,"[1.0, 0.0, 0.0, 0.0, 0.0]","[tensor(0.0131), tensor(0.0240), tensor(0.0459...",0,2.97137,2.97137,1.238021
21,Jalen Wilson,"[1.0, 0.0, 0.0, 0.0, 0.0]","[tensor(0.0204), tensor(0.0311), tensor(0.0719...",0,2.889243,2.889243,1.131056
29,Taylor Hendricks,"[0.0, 0.0, 0.0, 1.0, 0.0]","[tensor(0.7250), tensor(0.1329), tensor(0.1145...",3,0.449609,2.550391,1.070361
19,Ausar Thompson,"[0.0, 0.0, 0.0, 1.0, 0.0]","[tensor(0.6549), tensor(0.1512), tensor(0.1564...",3,0.583404,2.416596,0.978338
16,Adam Flagler,"[1.0, 0.0, 0.0, 0.0, 0.0]","[tensor(0.1451), tensor(0.1095), tensor(0.4296...",0,1.964887,1.964887,0.593584
8,Jett Howard,"[0.0, 1.0, 0.0, 0.0, 0.0]","[tensor(0.0698), tensor(0.0678), tensor(0.2454...",1,2.47586,1.47586,0.782828
10,Omari Moore,"[1.0, 0.0, 0.0, 0.0, 0.0]","[tensor(0.2971), tensor(0.1564), tensor(0.4216...",0,1.400863,1.400863,0.412301
28,Jaylen Martin,"[1.0, 0.0, 0.0, 0.0, 0.0]","[tensor(0.2999), tensor(0.1569), tensor(0.4194...",0,1.393835,1.393835,0.409576
0,Tristan Vukčević,"[1.0, 0.0, 0.0, 0.0, 0.0]","[tensor(0.3000), tensor(0.1569), tensor(0.4196...",0,1.393011,1.393011,0.40952
31,Oscar Tshiebwe,"[1.0, 0.0, 0.0, 0.0, 0.0]","[tensor(0.4276), tensor(0.1699), tensor(0.3222...",0,1.071776,1.071776,0.301438
