In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/draft-guide-by-sam-vecenie/draft_guide_data.csv


In [2]:
import pandas as pd
import numpy as np
import time

In [3]:
pd.set_option('display.max_rows', None)

## Read Data

In [4]:
df = pd.read_csv('/kaggle/input/draft-guide-by-sam-vecenie/draft_guide_data.csv')
print(df.shape)
df.head(1)

(288, 13)


Unnamed: 0,RANK,PLAYER,SCHOOL/TEAM,POS,AGE,HT,WING,TIER,STRENGTHS,WEAKNESSES,SUMMARY,TIER_DESCRIP,year
0,1,LaMelo Ball,Illawarra Hawks,G,19,6-6,6-9,5,Everything starts with Ball’s elite-level feel...,"The defense isn’t a sure thing, though, becaus...",Ball should enter the NBA as one of the most c...,All-Star Upside,2020


In [5]:
# Add Strengths and Weaknesses
df['full_text'] = df[['STRENGTHS', 'WEAKNESSES']].apply(lambda x: x[0] + ' ' + x[1], axis=1)
df.head(1)

Unnamed: 0,RANK,PLAYER,SCHOOL/TEAM,POS,AGE,HT,WING,TIER,STRENGTHS,WEAKNESSES,SUMMARY,TIER_DESCRIP,year,full_text
0,1,LaMelo Ball,Illawarra Hawks,G,19,6-6,6-9,5,Everything starts with Ball’s elite-level feel...,"The defense isn’t a sure thing, though, becaus...",Ball should enter the NBA as one of the most c...,All-Star Upside,2020,Everything starts with Ball’s elite-level feel...


## Preprocessing

In [6]:
df_2023 = df[df['year'] == 2023]

In [7]:
val = df_2023.sample(40, random_state=42)
test = df_2023[~df_2023.index.isin(val.index)]
val.shape, test.shape

((40, 14), (35, 14))

In [8]:
train = df[~df.index.isin(df_2023.index)]\
        .reset_index().drop('index', axis=1)
train.shape

(213, 14)

In [9]:
# # # data augmentation
# train_copy = train.copy()
# train_copy['full_text'] = train_copy[['STRENGTHS', 'WEAKNESSES']].apply(lambda x: x[1] + ' ' + x[0], axis=1)
# train = pd.concat([train, train_copy])\
#           .reset_index().drop('index', axis=1)
# train.shape

In [10]:
train_count_dict = train.groupby('TIER')['PLAYER'].count().to_dict()
train_count_dict

{1: 72, 2: 81, 3: 34, 4: 15, 5: 11}

In [11]:
max_n = max(train_count_dict.values())

for i in range(1, 6):
    diff = max_n - train_count_dict[i]
    if diff != 0:
        random_sample = train[train['TIER'] == i].sample(diff, 
                                                         random_state=42,
                                                         replace=True)
        train = pd.concat([train, random_sample])\
                  .reset_index().drop('index', axis=1)

In [12]:
train.groupby('TIER')['PLAYER'].count().to_dict()

{1: 81, 2: 81, 3: 81, 4: 81, 5: 81}

## Preprocessing


In [13]:
import torch
from torch.nn import functional as F
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)
from datasets import Dataset

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [14]:
MODEL = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifier.bias', 'pre_classifier.

In [15]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [16]:
model.classifier = torch.nn.Linear(
    model.classifier.in_features,
    1
)

In [17]:
def preprocess_function(examples):
    tokens = tokenizer(examples['full_text'], 
                       truncation=True, 
                       max_length=512)
    if 'TIER' in examples:
        return {**tokens, "labels": [[float(x)] for x in examples['TIER']]}
    else:
        return tokens

In [18]:
train_dataset = (
    Dataset.from_pandas(train[['PLAYER', 'full_text', 'TIER']])
    .map(preprocess_function, batched=True)
    .remove_columns(['TIER'])
    .shuffle(seed=42)
)

val_dataset = (
    Dataset.from_pandas(val[['PLAYER', 'full_text', 'TIER']])
    .map(preprocess_function, batched=True)
    .remove_columns(['TIER'])
    .shuffle(seed=42)
)

test_dataset = (
    Dataset.from_pandas(test[['PLAYER', 'full_text', 'TIER']])
    .map(preprocess_function, batched=True)
    .remove_columns(['TIER'])
    .shuffle(seed=42)
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [19]:
example = train_dataset[0]
example['PLAYER'], example['labels']

('Dalano Banton', [1.0])

In [20]:
train_dataset

Dataset({
    features: ['PLAYER', 'full_text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 405
})

## Modeling

In [21]:
from sklearn.metrics import mean_squared_error

In [22]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        # replace with regression loss
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        loss = F.mse_loss(outputs.get("logits"), labels)
        return (loss, outputs) if return_outputs else loss

In [23]:
os.environ["WANDB_DISABLED"] = "true"
torch.cuda.empty_cache()

In [24]:
CONFIG = {
    "weight_decay": 0.01,
    "num_train_epochs": 15.0,
    "load_best_model_at_end": True,
    "metric_for_best_model": 'eval_loss',
    "lr_scheduler_type": 'cosine_with_restarts',
}

In [25]:
run_name = f"fpell-{int(time.time())}"

print("=" * 50)
print(f"Starting run: {run_name}")
print("=" * 50)

training_args = TrainingArguments(
    run_name=run_name,
    output_dir="./results",
    save_total_limit=5,
    evaluation_strategy="steps",
    eval_steps=50,
    logging_steps=50,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    **CONFIG,
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
#     compute_metrics=compute_metrics,
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Starting run: fpell-1692302693


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
50,2.3847,0.921684
100,1.1302,1.012831
150,0.3862,2.106628
200,0.266,0.963449
250,0.1612,1.450636
300,0.1029,1.056129
350,0.0756,1.233763
400,0.0999,1.07664
450,0.0499,1.206737
500,0.0361,1.106209


TrainOutput(global_step=765, training_loss=0.31707072207351134, metrics={'train_runtime': 182.4703, 'train_samples_per_second': 33.293, 'train_steps_per_second': 4.192, 'total_flos': 804725095449600.0, 'train_loss': 0.31707072207351134, 'epoch': 15.0})

In [26]:
temp = trainer.predict(train_dataset)
mean_squared_error([x[0] for x in temp[1]],
                   [x[0] for x in temp[0]])
print(len(temp[0]))

405


In [27]:
df[['TIER', 'TIER_DESCRIP']].drop_duplicates()

Unnamed: 0,TIER,TIER_DESCRIP
0,5,All-Star Upside
3,4,High-Leverage Starters
11,3,Upside Swings
17,2,Rotation Players
49,1,Second Round and Two-Ways


In [28]:
data = []

for i in range(len(train_dataset)):
    data.append([train_dataset[i]['PLAYER'], 
                 train_dataset[i]['labels'][0],
                 temp[0][i][0]])
    
res = pd.DataFrame(data, columns=['PLAYER', 'TIER', 'pred'])
res['diff'] = res['TIER'] - res['pred']
res.sort_values('diff', ascending=False).drop_duplicates()

Unnamed: 0,PLAYER,TIER,pred,diff
360,Christian Braun,2.0,1.49359,0.50641
147,Jaden McDaniels,2.0,1.568535,0.431465
81,Onyeka Okongwu,4.0,3.631358,0.368642
118,Josh Minott,2.0,1.64435,0.35565
290,Killian Tillie,2.0,1.649742,0.350258
297,Isaiah Jackson,2.0,1.652121,0.347879
225,Rokas Jokubaitis,2.0,1.65718,0.34282
14,Zeke Nnaji,2.0,1.69514,0.30486
6,Chris Duarte,3.0,2.709676,0.290324
217,Jabari Walker,2.0,1.711774,0.288226


In [29]:
temp = trainer.predict(val_dataset)
mean_squared_error([x[0] for x in temp[1]],
                   [x[0] for x in temp[0]])

1.1062092

In [30]:
data = []

for i in range(len(val_dataset)):
    data.append([val_dataset[i]['PLAYER'], 
                 val_dataset[i]['labels'][0],
                 temp[0][i][0]])
    
res = pd.DataFrame(data, columns=['PLAYER', 'TIER', 'pred'])
res['diff'] = res['TIER'] - res['pred']
res.sort_values('diff', ascending=False)

Unnamed: 0,PLAYER,TIER,pred,diff
34,Brandon Miller,5.0,2.424003,2.575997
28,Amen Thompson,5.0,3.226198,1.773802
6,Jarace Walker,4.0,2.647186,1.352814
35,Kobe Bufkin,3.0,2.024734,0.975266
27,Victor Wembanyama,5.0,4.051089,0.948911
33,Dereck Lively II,3.0,2.143542,0.856458
18,Anthony Black,4.0,3.582228,0.417772
29,Jaime Jaquez Jr.,2.0,1.647206,0.352794
0,Dariq Whitehead,2.0,1.647875,0.352125
31,Keyonte George,2.0,1.701699,0.298301


In [31]:
temp = trainer.predict(test_dataset)
mean_squared_error([x[0] for x in temp[1]],
                   [x[0] for x in temp[0]])

0.9674589

In [32]:
data = []

for i in range(len(test_dataset)):
    data.append([test_dataset[i]['PLAYER'], 
                 test_dataset[i]['labels'][0],
                 temp[0][i][0]])
    
res = pd.DataFrame(data, columns=['PLAYER', 'TIER', 'pred'])
res['diff'] = res['TIER'] - res['pred']
res.sort_values('diff', ascending=False)

Unnamed: 0,PLAYER,TIER,pred,diff
32,Cam Whitmore,5.0,2.566945,2.433055
29,Taylor Hendricks,4.0,1.781561,2.218439
20,Scoot Henderson,5.0,3.229052,1.770948
19,Ausar Thompson,4.0,2.314439,1.685561
14,Gradey Dick,3.0,1.948847,1.051153
34,Nick Smith Jr.,2.0,1.156706,0.843294
3,Jalen Hood-Schifino,3.0,2.261622,0.738378
30,Bilal Coulibaly,3.0,2.465594,0.534406
27,Julian Phillips,2.0,1.49875,0.50125
33,Kobe Brown,2.0,1.745871,0.254129
