In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/draft-guide-by-sam-vecenie/draft_guide_data.csv


In [2]:
import pandas as pd
import numpy as np
import time

In [3]:
pd.set_option('display.max_rows', None)

## Read Data

In [4]:
df = pd.read_csv('/kaggle/input/draft-guide-by-sam-vecenie/draft_guide_data.csv')
print(df.shape)
df.head(1)

(288, 13)


Unnamed: 0,RANK,PLAYER,SCHOOL/TEAM,POS,AGE,HT,WING,TIER,STRENGTHS,WEAKNESSES,SUMMARY,TIER_DESCRIP,year
0,1,LaMelo Ball,Illawarra Hawks,G,19,6-6,6-9,5,Everything starts with Ball’s elite-level feel...,"The defense isn’t a sure thing, though, becaus...",Ball should enter the NBA as one of the most c...,All-Star Upside,2020


In [5]:
# Add Strengths and Weaknesses
df['full_text'] = df[['STRENGTHS', 'WEAKNESSES']].apply(lambda x: x[0] + ' ' + x[1], axis=1)
df.head(1)

Unnamed: 0,RANK,PLAYER,SCHOOL/TEAM,POS,AGE,HT,WING,TIER,STRENGTHS,WEAKNESSES,SUMMARY,TIER_DESCRIP,year,full_text
0,1,LaMelo Ball,Illawarra Hawks,G,19,6-6,6-9,5,Everything starts with Ball’s elite-level feel...,"The defense isn’t a sure thing, though, becaus...",Ball should enter the NBA as one of the most c...,All-Star Upside,2020,Everything starts with Ball’s elite-level feel...


## Preprocessing

In [6]:
df_2023 = df[df['year'] == 2023]

In [7]:
val = df_2023.sample(40, random_state=42)
test = df_2023[~df_2023.index.isin(val.index)]
val.shape, test.shape

((40, 14), (35, 14))

In [8]:
train = df[~df.index.isin(df_2023.index)]\
        .reset_index().drop('index', axis=1)
train.shape

(213, 14)

In [9]:
# # data augmentation
train_copy = train.copy()
train_copy['full_text'] = train_copy[['STRENGTHS', 'WEAKNESSES']].apply(lambda x: x[1] + ' ' + x[0], axis=1)
train = pd.concat([train, train_copy])\
          .reset_index().drop('index', axis=1)
train.shape

(426, 14)

In [10]:
train_count_dict = train.groupby('TIER')['PLAYER'].count().to_dict()
train_count_dict

{1: 144, 2: 162, 3: 68, 4: 30, 5: 22}

In [11]:
max_n = max(train_count_dict.values())

for i in range(1, 6):
    diff = max_n - train_count_dict[i]
    if diff != 0:
        random_sample = train[train['TIER'] == i].sample(diff, 
                                                         random_state=42,
                                                         replace=True)
        train = pd.concat([train, random_sample])\
                  .reset_index().drop('index', axis=1)

In [12]:
train.groupby('TIER')['PLAYER'].count().to_dict()

{1: 162, 2: 162, 3: 162, 4: 162, 5: 162}

## Preprocessing


In [13]:
import torch
from torch.nn import functional as F
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)
from datasets import Dataset

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [14]:
MODEL = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.

In [15]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [16]:
model.classifier = torch.nn.Linear(
    model.classifier.in_features,
    1
)

In [17]:
def preprocess_function(examples):
    tokens = tokenizer(examples['full_text'], 
                       truncation=True, 
                       max_length=512)
    if 'TIER' in examples:
        return {**tokens, "labels": [[float(x)] for x in examples['TIER']]}
    else:
        return tokens

In [18]:
train_dataset = (
    Dataset.from_pandas(train[['PLAYER', 'full_text', 'TIER']])
    .map(preprocess_function, batched=True)
    .remove_columns(['TIER'])
    .shuffle(seed=42)
)

val_dataset = (
    Dataset.from_pandas(val[['PLAYER', 'full_text', 'TIER']])
    .map(preprocess_function, batched=True)
    .remove_columns(['TIER'])
    .shuffle(seed=42)
)

test_dataset = (
    Dataset.from_pandas(test[['PLAYER', 'full_text', 'TIER']])
    .map(preprocess_function, batched=True)
    .remove_columns(['TIER'])
    .shuffle(seed=42)
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [19]:
example = train_dataset[0]
example['PLAYER'], example['labels']

('Kessler Edwards', [2.0])

## Modeling

In [20]:
from sklearn.metrics import mean_squared_error

In [21]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        # replace with regression loss
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        loss = F.mse_loss(outputs.get("logits"), labels)
        return (loss, outputs) if return_outputs else loss

In [22]:
os.environ["WANDB_DISABLED"] = "true"
torch.cuda.empty_cache()

In [23]:
CONFIG = {
    "weight_decay": 0.1,
    "num_train_epochs": 10.0,
    "load_best_model_at_end": True,
    "metric_for_best_model": 'eval_loss',
    "lr_scheduler_type": 'cosine_with_restarts',
}

In [24]:
run_name = f"fpell-{int(time.time())}"

print("=" * 50)
print(f"Starting run: {run_name}")
print("=" * 50)

training_args = TrainingArguments(
    run_name=run_name,
    output_dir="./results",
    save_total_limit=5,
    evaluation_strategy="steps",
    eval_steps=50,
    logging_steps=50,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    **CONFIG,
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
#     compute_metrics=compute_metrics,
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Starting run: fpell-1692299923


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
50,2.9033,3.823922
100,1.8188,2.186365
150,1.0261,1.142339
200,0.5522,1.844909
250,0.3156,1.313893
300,0.2602,1.231441
350,0.2619,1.317054
400,0.1341,1.089298
450,0.1483,1.462833
500,0.0953,1.320471


TrainOutput(global_step=1020, training_loss=0.3898024567786385, metrics={'train_runtime': 246.0857, 'train_samples_per_second': 32.915, 'train_steps_per_second': 4.145, 'total_flos': 1072966793932800.0, 'train_loss': 0.3898024567786385, 'epoch': 10.0})

In [25]:
temp = trainer.predict(train_dataset)
mean_squared_error([x[0] for x in temp[1]],
                   [x[0] for x in temp[0]])

0.04369183

In [26]:
data = []

for i in range(len(val_dataset)):
    data.append([train_dataset[i]['PLAYER'], 
                 train_dataset[i]['labels'][0],
                 temp[0][i][0]])
    
res = pd.DataFrame(data, columns=['PLAYER', 'TIER', 'pred'])
res['diff'] = res['TIER'] - res['pred']
res.sort_values('diff', ascending=False).drop_duplicates()

Unnamed: 0,PLAYER,TIER,pred,diff
3,Aleksej Pokusevski,3.0,2.512928,0.487072
0,Kessler Edwards,2.0,1.515437,0.484563
28,Malachi Flynn,2.0,1.690561,0.309439
33,Jaden Springer,3.0,2.753875,0.246125
21,Deni Avdija,4.0,3.779682,0.220318
10,Jeremiah Robinson-Earl,2.0,1.843329,0.156671
24,Daniel Oturu,2.0,1.860315,0.139685
1,Collin Gillespie,1.0,0.865169,0.134831
30,Desmond Bane,2.0,1.87012,0.12988
25,Christian Koloko,2.0,1.876794,0.123206


In [27]:
temp = trainer.predict(val_dataset)
mean_squared_error([x[0] for x in temp[1]],
                   [x[0] for x in temp[0]])

1.3204714

In [28]:
data = []

for i in range(len(val_dataset)):
    data.append([val_dataset[i]['PLAYER'], 
                 val_dataset[i]['labels'][0],
                 temp[0][i][0]])
    
res = pd.DataFrame(data, columns=['PLAYER', 'TIER', 'pred'])
res['diff'] = res['TIER'] - res['pred']
res.sort_values('diff', ascending=False)

Unnamed: 0,PLAYER,TIER,pred,diff
34,Brandon Miller,5.0,2.749944,2.250056
28,Amen Thompson,5.0,3.111078,1.888922
27,Victor Wembanyama,5.0,3.487209,1.512791
33,Dereck Lively II,3.0,1.688365,1.311635
6,Jarace Walker,4.0,2.760077,1.239923
35,Kobe Bufkin,3.0,2.13213,0.86787
18,Anthony Black,4.0,3.241787,0.758213
23,Ąžuolas Tubelis,1.0,0.957458,0.042542
29,Jaime Jaquez Jr.,2.0,1.990971,0.009029
37,Mojave King,1.0,1.054657,-0.054657


In [29]:
temp = trainer.predict(test_dataset)
mean_squared_error([x[0] for x in temp[1]],
                   [x[0] for x in temp[0]])

0.9600954

In [30]:
data = []

for i in range(len(test_dataset)):
    data.append([test_dataset[i]['PLAYER'], 
                 test_dataset[i]['labels'][0],
                 temp[0][i][0]])
    
res = pd.DataFrame(data, columns=['PLAYER', 'TIER', 'pred'])
res['diff'] = res['TIER'] - res['pred']
res.sort_values('diff', ascending=False)

Unnamed: 0,PLAYER,TIER,pred,diff
19,Ausar Thompson,4.0,1.571417,2.428583
29,Taylor Hendricks,4.0,2.630101,1.369899
20,Scoot Henderson,5.0,3.667526,1.332474
32,Cam Whitmore,5.0,3.845805,1.154195
3,Jalen Hood-Schifino,3.0,2.048153,0.951847
34,Nick Smith Jr.,2.0,1.128261,0.871739
14,Gradey Dick,3.0,2.1479,0.8521
33,Kobe Brown,2.0,1.381361,0.618639
26,Colby Jones,2.0,1.42532,0.57468
30,Bilal Coulibaly,3.0,2.582159,0.417841
