In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/draft-guide-by-sam-vecenie/draft_guide_data.csv


In [2]:
import pandas as pd
import numpy as np
import time
from tqdm.auto import tqdm

In [3]:
pd.set_option('display.max_rows', None)

## Read Data

In [4]:
df = pd.read_csv('/kaggle/input/draft-guide-by-sam-vecenie/draft_guide_data.csv')
print(df.shape)
df.head(1)

(288, 13)


Unnamed: 0,RANK,PLAYER,SCHOOL/TEAM,POS,AGE,HT,WING,TIER,STRENGTHS,WEAKNESSES,SUMMARY,TIER_DESCRIP,year
0,1,LaMelo Ball,Illawarra Hawks,G,19,6-6,6-9,5,Everything starts with Ball’s elite-level feel...,"The defense isn’t a sure thing, though, becaus...",Ball should enter the NBA as one of the most c...,All-Star Upside,2020


In [5]:
# Add Strengths and Weaknesses
df['full_text'] = df[['STRENGTHS', 'WEAKNESSES']].apply(lambda x: x[0] + ' ' + x[1], axis=1)
df.head(1)

Unnamed: 0,RANK,PLAYER,SCHOOL/TEAM,POS,AGE,HT,WING,TIER,STRENGTHS,WEAKNESSES,SUMMARY,TIER_DESCRIP,year,full_text
0,1,LaMelo Ball,Illawarra Hawks,G,19,6-6,6-9,5,Everything starts with Ball’s elite-level feel...,"The defense isn’t a sure thing, though, becaus...",Ball should enter the NBA as one of the most c...,All-Star Upside,2020,Everything starts with Ball’s elite-level feel...


In [6]:
df['TIER'] = df['TIER'] - 1

## Preprocessing

In [7]:
# test = df.sample(50, random_state=42)
df_2023 = df[df['year'] == 2023]
# test.sort_values('TIER', ascending=False).head()

In [8]:
val = df_2023.sample(40, random_state=42)
test = df_2023[~df_2023.index.isin(val.index)]
val.shape, test.shape

((40, 14), (35, 14))

In [9]:
train = df[~df.index.isin(df_2023.index)]\
        .reset_index().drop('index', axis=1)
train.shape

(213, 14)

In [10]:
# data augmentation
train_copy = train.copy()
train_copy['full_text'] = train_copy[['STRENGTHS', 'WEAKNESSES']].apply(lambda x: x[1] + ' ' + x[0], axis=1)
train = pd.concat([train, train_copy])\
          .reset_index().drop('index', axis=1)
train.shape

(426, 14)

In [11]:
train_count_dict = train.groupby('TIER')['PLAYER'].count().to_dict()
train_count_dict

{0: 144, 1: 162, 2: 68, 3: 30, 4: 22}

In [12]:
max_n = max(train_count_dict.values())

for i in range(5):
    diff = max_n - train_count_dict[i]
    if diff != 0:
        random_sample = train[train['TIER'] == i].sample(diff, 
                                                         random_state=42,
                                                         replace=True)
        train = pd.concat([train, random_sample])\
                  .reset_index().drop('index', axis=1)

In [13]:
train.groupby('TIER')['PLAYER'].count().to_dict()

{0: 162, 1: 162, 2: 162, 3: 162, 4: 162}

In [14]:
labels = df['TIER_DESCRIP'].unique()
id2label = {row[0]: row[1]
            for row in df[['TIER', 'TIER_DESCRIP']].drop_duplicates().values}
label2id = {row[1]: row[0]
            for row in df[['TIER', 'TIER_DESCRIP']].drop_duplicates().values}

## Preprocessing


In [15]:
import torch
from torch.nn import functional as F
from transformers import (
    AdamW,
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    get_scheduler,
    TrainingArguments,
    Trainer
)
from datasets import Dataset

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [16]:
MODEL = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification\
        .from_pretrained(MODEL, 
                         id2label=id2label,
                         label2id=label2id)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [18]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [19]:
def preprocess_function(examples):
    tokens = tokenizer(examples['full_text'], 
                       truncation=True, 
                       max_length=512)
    if 'TIER' in examples:
        labels_matrix = [[0.0 if examples['TIER'][i] != j else 1.0
                          for j in range(len(labels))]
                         for i in range(len(examples['TIER']))]
        return {**tokens, "labels": labels_matrix}
    else:
        return tokens

In [20]:
train_dataset = (
    Dataset.from_pandas(train[['PLAYER', 'full_text', 'TIER']])
    .map(preprocess_function, batched=True)
    .remove_columns(['TIER'])
    .shuffle(seed=42)
)

val_dataset = (
    Dataset.from_pandas(val[['PLAYER', 'full_text', 'TIER']])
    .map(preprocess_function, batched=True)
    .remove_columns(['TIER'])
    .shuffle(seed=42)
)

test_dataset = (
    Dataset.from_pandas(test[['PLAYER', 'full_text', 'TIER']])
    .map(preprocess_function, batched=True)
    .remove_columns(['TIER'])
    .shuffle(seed=42)
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [21]:
example = train_dataset[0]
example['PLAYER'], example['labels']

('Kessler Edwards', [0.0, 1.0, 0.0, 0.0, 0.0])

## Modeling

In [22]:
from sklearn.metrics import f1_score, accuracy_score, log_loss, mean_squared_error
from sklearn.metrics import classification_report

In [23]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        # replace with regression loss
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss = F.cross_entropy(F.softmax(logits), labels)
        return (loss, outputs) if return_outputs else loss
    
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = F.softmax(torch.tensor(logits))
    labels_adj = [np.argmax(l) for l in labels]
    predictions_adj = [np.sum([j * p[j] for j in range(5)]) for p in predictions]
    return {"RMSE": log_loss(labels, predictions)}

In [24]:
os.environ["WANDB_DISABLED"] = "true"

torch.cuda.empty_cache()

In [25]:
CONFIG = {
    "weight_decay": 0.01,
    "num_train_epochs": 10.0,
    "load_best_model_at_end": True,
    "metric_for_best_model": 'eval_loss',
    "lr_scheduler_type": 'cosine_with_restarts',
}

In [26]:
run_name = f"fpell-{int(time.time())}"

print("=" * 50)
print(f"Starting run: {run_name}")
print("=" * 50)

training_args = TrainingArguments(
    run_name=run_name,
    output_dir="./results",
    save_total_limit=5,
    evaluation_strategy="steps",
    eval_steps=50,
    logging_steps=50,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    **CONFIG,
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Starting run: fpell-1692283429


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  loss = F.cross_entropy(F.softmax(logits), labels)


Step,Training Loss,Validation Loss,Rmse
50,1.5902,1.624614,1.806699
100,1.5328,1.510649,1.577423
150,1.4992,1.563524,1.671481
200,1.4582,1.59664,1.72606
250,1.3864,1.478005,1.579279
300,1.408,1.569886,1.885075
350,1.33,1.518916,1.826413
400,1.2569,1.511157,1.872226
450,1.2106,1.524825,1.973627
500,1.1633,1.490005,2.055536


  predictions = F.softmax(torch.tensor(logits))
  loss = F.cross_entropy(F.softmax(logits), labels)
  predictions = F.softmax(torch.tensor(logits))
  loss = F.cross_entropy(F.softmax(logits), labels)
  predictions = F.softmax(torch.tensor(logits))
  loss = F.cross_entropy(F.softmax(logits), labels)
  predictions = F.softmax(torch.tensor(logits))
  loss = F.cross_entropy(F.softmax(logits), labels)
  predictions = F.softmax(torch.tensor(logits))
  loss = F.cross_entropy(F.softmax(logits), labels)
  predictions = F.softmax(torch.tensor(logits))
  loss = F.cross_entropy(F.softmax(logits), labels)
  predictions = F.softmax(torch.tensor(logits))
  loss = F.cross_entropy(F.softmax(logits), labels)
  predictions = F.softmax(torch.tensor(logits))
  loss = F.cross_entropy(F.softmax(logits), labels)
  predictions = F.softmax(torch.tensor(logits))
  loss = F.cross_entropy(F.softmax(logits), labels)
  predictions = F.softmax(torch.tensor(logits))
  loss = F.cross_entropy(F.softmax(logits), labels)


TrainOutput(global_step=1020, training_loss=1.1870407366285136, metrics={'train_runtime': 475.2026, 'train_samples_per_second': 17.045, 'train_steps_per_second': 2.146, 'total_flos': 2131256953958400.0, 'train_loss': 1.1870407366285136, 'epoch': 10.0})

In [27]:
temp = trainer.predict(train_dataset)
log_loss(temp[1], F.softmax(torch.tensor(temp[0])))

  loss = F.cross_entropy(F.softmax(logits), labels)


  predictions = F.softmax(torch.tensor(logits))
  log_loss(temp[1], F.softmax(torch.tensor(temp[0])))


0.5141349176457469

In [28]:
temp = trainer.predict(val_dataset)
log_loss(temp[1], F.softmax(torch.tensor(temp[0])))

  loss = F.cross_entropy(F.softmax(logits), labels)


  predictions = F.softmax(torch.tensor(logits))
  log_loss(temp[1], F.softmax(torch.tensor(temp[0])))


2.0555358235067267

In [29]:
data = []

for i in range(len(val_dataset)):
    data.append([val_dataset[i]['PLAYER'], 
                 temp[1][i],
                 F.softmax(torch.tensor(temp[0][i]))])
    
res = pd.DataFrame(data, columns=['PLAYER', 'TIER', 'pred'])
res['label'] = res['TIER'].apply(lambda x: np.argmax(x))
res['pred_label'] = res['pred'].apply(lambda x: np.sum([i*x[i] for i in range(5)]))
res['diff'] = np.abs(res['label'] - res['pred_label'])

res['log_loss'] = res[['TIER', 'pred']].apply(lambda x: log_loss(x[0], x[1]), 
                                              axis=1)
res.sort_values('diff', ascending=False)


  F.softmax(torch.tensor(temp[0][i]))])


Unnamed: 0,PLAYER,TIER,pred,label,pred_label,diff,log_loss
27,Victor Wembanyama,"[0.0, 0.0, 0.0, 0.0, 1.0]","[tensor(0.7119), tensor(0.2676), tensor(0.0171...",4,0.313782,3.686218,1.585793
34,Brandon Miller,"[0.0, 0.0, 0.0, 0.0, 1.0]","[tensor(0.7032), tensor(0.2568), tensor(0.0372...",4,0.340951,3.659049,1.626323
28,Amen Thompson,"[0.0, 0.0, 0.0, 0.0, 1.0]","[tensor(0.1694), tensor(0.4854), tensor(0.3389...",4,1.185596,2.814404,1.384418
6,Jarace Walker,"[0.0, 0.0, 0.0, 1.0, 0.0]","[tensor(0.8302), tensor(0.1211), tensor(0.0087...",3,0.295974,2.704026,1.582781
16,Jaylen Clark,"[1.0, 0.0, 0.0, 0.0, 0.0]","[tensor(0.0082), tensor(0.0624), tensor(0.9162...",0,1.93729,1.93729,1.473004
13,Julian Strawther,"[1.0, 0.0, 0.0, 0.0, 0.0]","[tensor(0.0199), tensor(0.0836), tensor(0.8887...",0,1.88675,1.88675,1.241328
17,Nadir Hifi,"[1.0, 0.0, 0.0, 0.0, 0.0]","[tensor(0.0129), tensor(0.1056), tensor(0.8707...",0,1.882588,1.882588,1.303264
7,Mike Miles Jr.,"[1.0, 0.0, 0.0, 0.0, 0.0]","[tensor(0.0310), tensor(0.1158), tensor(0.8459...",0,1.832067,1.832067,1.095154
15,Isaiah Wong,"[1.0, 0.0, 0.0, 0.0, 0.0]","[tensor(0.0281), tensor(0.1367), tensor(0.8266...",0,1.818207,1.818207,1.095795
4,Gregory “G.G.” Jackson,"[0.0, 1.0, 0.0, 0.0, 0.0]","[tensor(0.0058), tensor(0.1192), tensor(0.0210...",1,2.737038,1.737038,0.800229


In [30]:
temp = trainer.predict(test_dataset)
log_loss(temp[1], F.softmax(torch.tensor(temp[0])))

  loss = F.cross_entropy(F.softmax(logits), labels)


  predictions = F.softmax(torch.tensor(logits))
  log_loss(temp[1], F.softmax(torch.tensor(temp[0])))


1.4478618176831581

In [31]:
data = []

for i in range(len(test_dataset)):
    data.append([test_dataset[i]['PLAYER'], 
                 temp[1][i],
                 F.softmax(torch.tensor(temp[0][i]))])
    
res = pd.DataFrame(data, columns=['PLAYER', 'TIER', 'pred'])
res['label'] = res['TIER'].apply(lambda x: np.argmax(x))
res['pred_label'] = res['pred'].apply(lambda x: np.sum([i*x[i] for i in range(5)]))
res['diff'] = np.abs(res['label'] - res['pred_label'])

res['log_loss'] = res[['TIER', 'pred']].apply(lambda x: log_loss(x[0], x[1]), 
                                              axis=1)
res.sort_values('diff', ascending=False)


  F.softmax(torch.tensor(temp[0][i]))])


Unnamed: 0,PLAYER,TIER,pred,label,pred_label,diff,log_loss
20,Scoot Henderson,"[0.0, 0.0, 0.0, 0.0, 1.0]","[tensor(0.0431), tensor(0.7755), tensor(0.1065...",4,1.219463,2.780537,1.363497
29,Taylor Hendricks,"[0.0, 0.0, 0.0, 1.0, 0.0]","[tensor(0.7886), tensor(0.1779), tensor(0.0309...",3,0.249289,2.750711,1.780604
19,Ausar Thompson,"[0.0, 0.0, 0.0, 1.0, 0.0]","[tensor(0.0645), tensor(0.5663), tensor(0.3556...",3,1.320293,1.679707,1.160513
11,Terquavion Smith,"[1.0, 0.0, 0.0, 0.0, 0.0]","[tensor(0.1331), tensor(0.4393), tensor(0.4210...",0,1.304435,1.304435,0.629695
16,Adam Flagler,"[1.0, 0.0, 0.0, 0.0, 0.0]","[tensor(0.0868), tensor(0.7317), tensor(0.1702...",0,1.109731,1.109731,0.791578
10,Omari Moore,"[1.0, 0.0, 0.0, 0.0, 0.0]","[tensor(0.1912), tensor(0.6729), tensor(0.1311...",0,0.950913,0.950913,0.583439
6,Tosan Evbuomwan,"[1.0, 0.0, 0.0, 0.0, 0.0]","[tensor(0.1553), tensor(0.7776), tensor(0.0568...",0,0.923342,0.923342,0.686937
34,Nick Smith Jr.,"[0.0, 1.0, 0.0, 0.0, 0.0]","[tensor(0.9349), tensor(0.0532), tensor(0.0068...",1,0.086357,0.913643,1.135529
28,Jaylen Martin,"[1.0, 0.0, 0.0, 0.0, 0.0]","[tensor(0.1989), tensor(0.7115), tensor(0.0823...",0,0.900828,0.900828,0.590215
33,Kobe Brown,"[0.0, 1.0, 0.0, 0.0, 0.0]","[tensor(0.9254), tensor(0.0616), tensor(0.0065...",1,0.100037,0.899963,1.079301
