In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/draft-guide-by-sam-vecenie/draft_guide_data.csv


In [2]:
import pandas as pd
import numpy as np
import time

In [3]:
pd.set_option('display.max_rows', None)

## Read Data

In [4]:
df = pd.read_csv('/kaggle/input/draft-guide-by-sam-vecenie/draft_guide_data.csv')
print(df.shape)
df.head(1)

(288, 13)


Unnamed: 0,RANK,PLAYER,SCHOOL/TEAM,POS,AGE,HT,WING,TIER,STRENGTHS,WEAKNESSES,SUMMARY,TIER_DESCRIP,year
0,1,LaMelo Ball,Illawarra Hawks,G,19,6-6,6-9,5,Everything starts with Ball’s elite-level feel...,"The defense isn’t a sure thing, though, becaus...",Ball should enter the NBA as one of the most c...,All-Star Upside,2020


In [5]:
# Add Strengths and Weaknesses
df['full_text'] = df[['STRENGTHS', 'WEAKNESSES']].apply(lambda x: x[0] + ' ' + x[1], axis=1)
# df['full_text'] = df['WEAKNESSES']
df.head(1)

Unnamed: 0,RANK,PLAYER,SCHOOL/TEAM,POS,AGE,HT,WING,TIER,STRENGTHS,WEAKNESSES,SUMMARY,TIER_DESCRIP,year,full_text
0,1,LaMelo Ball,Illawarra Hawks,G,19,6-6,6-9,5,Everything starts with Ball’s elite-level feel...,"The defense isn’t a sure thing, though, becaus...",Ball should enter the NBA as one of the most c...,All-Star Upside,2020,Everything starts with Ball’s elite-level feel...


In [6]:
# df['full_text_word_count'] = df['full_text'].apply(lambda x: x.split(' ')).apply(len)
# df.head(5)

In [7]:
# df['full_text_word_count'].hist()

## Preprocessing

#### Create Chunks

In [8]:
def create_chunks(full_text):
    words = full_text.split(' ')
    n_words = len(words)
    chunks = [' '.join(words[i: i+400])
              for i in range(0, n_words, 400)]
    return chunks

In [9]:
df['chunks'] = df['full_text'].apply(create_chunks)

#### Train/Test Split

In [10]:
df_2023 = df[df['year'] == 2023]

In [11]:
def convert_to_chunk_df(df):
    data = []
    cols = df.columns
    for row in df.values:
        chunks = row[-1]
        for chunk in chunks:
            if len(chunk.split(' ')) > 50:
                new_row = list(row[:-1])
                new_row.append(chunk)
                data.append(new_row)
    return pd.DataFrame(data, columns=cols)

In [12]:
val = df_2023.sample(40, random_state=42)
test = df_2023[~df_2023.index.isin(val.index)]
print(val.shape, test.shape)
val = convert_to_chunk_df(val)
test = convert_to_chunk_df(test)
print(val.shape, test.shape)

(40, 15) (35, 15)
(131, 15) (105, 15)


In [13]:
train = df[~df.index.isin(df_2023.index)]\
        .reset_index().drop('index', axis=1)
print(train.shape)
train = convert_to_chunk_df(train)
print(train.shape)

(213, 15)
(513, 15)


In [14]:
# # # data augmentation
# train_copy = train.copy()
# train_copy['full_text'] = train_copy[['STRENGTHS', 'WEAKNESSES']].apply(lambda x: x[1] + ' ' + x[0], axis=1)
# train = pd.concat([train, train_copy])\
#           .reset_index().drop('index', axis=1)
# train.shape

In [15]:
train_count_dict = train.groupby('TIER')['PLAYER'].count().to_dict()
train_count_dict

{1: 142, 2: 200, 3: 93, 4: 43, 5: 35}

In [16]:
# max_n = max(train_count_dict.values())

# for i in range(1, 6):
#     diff = max_n - train_count_dict[i]
#     if diff != 0:
#         random_sample = train[train['TIER'] == i].sample(diff, 
#                                                          random_state=42,
#                                                          replace=True)
#         train = pd.concat([train, random_sample])\
#                   .reset_index().drop('index', axis=1)

In [17]:
train.groupby('TIER')['PLAYER'].count().to_dict()

{1: 142, 2: 200, 3: 93, 4: 43, 5: 35}

## Preprocessing


In [18]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)
from datasets import Dataset

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [19]:
MODEL = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight', 'classifier.we

In [20]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [21]:
model.classifier = torch.nn.Linear(
    model.classifier.in_features,
    1
)

In [22]:
def preprocess_function(examples):
    tokens = tokenizer(examples['chunks'],
                       truncation=True,
                       padding=True,
                       max_length=512)
    if 'TIER' in examples:
        return {**tokens, "labels": [[float(x)] 
                                     for x in examples['TIER']]}
    else:
        return tokens

In [23]:
train_dataset = (
    Dataset.from_pandas(train[['PLAYER', 'chunks', 'TIER']])
    .map(preprocess_function, batched=True)
    .remove_columns(['TIER'])
    .shuffle(seed=42)
)

val_dataset = (
    Dataset.from_pandas(val[['PLAYER', 'chunks', 'TIER']])
    .map(preprocess_function, batched=True)
    .remove_columns(['TIER'])
    .shuffle(seed=42)
)

test_dataset = (
    Dataset.from_pandas(test[['PLAYER', 'chunks', 'TIER']])
    .map(preprocess_function, batched=True)
    .remove_columns(['TIER'])
    .shuffle(seed=42)
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [24]:
example = train_dataset[0]
example['PLAYER'], example['labels'], len(example['input_ids'])


('Juhann Begarin', [1.0], 512)

In [25]:
set([len(row) for row in train_dataset['input_ids']])

{512}

## Modeling

In [26]:
from sklearn.metrics import mean_squared_error

In [27]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        # replace with regression loss
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        loss = F.mse_loss(outputs.get("logits"), labels)
        return (loss, outputs) if return_outputs else loss

In [28]:
os.environ["WANDB_DISABLED"] = "true"
torch.cuda.empty_cache()

In [29]:
CONFIG = {
    "weight_decay": 0.01,
    "num_train_epochs": 15.0,
    "load_best_model_at_end": True,
    "metric_for_best_model": 'eval_loss',
    "lr_scheduler_type": 'cosine_with_restarts',
}

In [30]:
run_name = f"fpell-{int(time.time())}"

print("=" * 50)
print(f"Starting run: {run_name}")
print("=" * 50)

training_args = TrainingArguments(
    run_name=run_name,
    output_dir="./results",
    save_total_limit=5,
    evaluation_strategy="steps",
    eval_steps=50,
    logging_steps=50,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    **CONFIG,
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
#     compute_metrics=compute_metrics,
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Starting run: fpell-1693928010


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
50,1.6127,1.713259
100,1.4571,1.790517
150,1.1098,2.005685
200,1.3299,1.529956
250,1.0188,1.413503
300,0.5439,1.431196
350,0.3312,1.988594
400,0.3279,1.411167
450,0.1986,1.425188
500,0.1707,1.388666


TrainOutput(global_step=975, training_loss=0.442028209123856, metrics={'train_runtime': 246.9658, 'train_samples_per_second': 31.158, 'train_steps_per_second': 3.948, 'total_flos': 1019318454236160.0, 'train_loss': 0.442028209123856, 'epoch': 15.0})

In [31]:
temp = trainer.predict(train_dataset)
mean_squared_error([x[0] for x in temp[1]],
                   [x[0] for x in temp[0]])

0.13562678

In [32]:
df[['TIER', 'TIER_DESCRIP']].drop_duplicates()

Unnamed: 0,TIER,TIER_DESCRIP
0,5,All-Star Upside
3,4,High-Leverage Starters
11,3,Upside Swings
17,2,Rotation Players
49,1,Second Round and Two-Ways


In [33]:
data = []

for i in range(len(train_dataset)):
    data.append([train_dataset[i]['PLAYER'], 
                 train_dataset[i]['labels'][0],
                 temp[0][i][0]])
    
res = pd.DataFrame(data, columns=['PLAYER', 'TIER', 'pred'])\
        .groupby(['PLAYER', 'TIER'])['pred'].mean().reset_index()
res['diff'] = res['TIER'] - res['pred']
print(mean_squared_error(res['TIER'], res['pred']))
res.sort_values('diff', ascending=False).drop_duplicates()

0.08300765623121671


Unnamed: 0,PLAYER,TIER,pred,diff
114,Justin Lewis,2.0,1.26819,0.73181
166,Quentin Grimes,2.0,1.320271,0.679729
105,Jordan Nwora,2.0,1.32734,0.67266
55,Grant Riller,2.0,1.368478,0.631522
41,Day’Ron Sharpe,2.0,1.379236,0.620764
63,Isaiah Jackson,2.0,1.397361,0.602639
191,Trey Murphy III,3.0,2.407991,0.592009
178,Santi Aldama,2.0,1.416319,0.583681
170,Robert Woodard,2.0,1.429242,0.570758
145,Miles McBride,3.0,2.461694,0.538306


In [34]:
temp = trainer.predict(val_dataset)
mean_squared_error([x[0] for x in temp[1]],
                   [x[0] for x in temp[0]])

1.3886658

In [35]:
data = []

for i in range(len(val_dataset)):
    data.append([val_dataset[i]['PLAYER'], 
                 val_dataset[i]['labels'][0],
                 temp[0][i][0]])
    
res = pd.DataFrame(data, columns=['PLAYER', 'TIER', 'pred'])\
        .groupby(['PLAYER', 'TIER'])['pred'].mean().reset_index()
res['diff'] = res['TIER'] - res['pred']
print(mean_squared_error(res['TIER'], res['pred']))
res.sort_values('diff', ascending=False).drop_duplicates()

0.9200509045320399


Unnamed: 0,PLAYER,TIER,pred,diff
4,Brandon Miller,5.0,1.973369,3.026631
17,Jarace Walker,4.0,2.281921,1.718079
1,Amen Thompson,5.0,3.38026,1.61974
38,Victor Wembanyama,5.0,3.70465,1.29535
3,Anthony Black,4.0,2.960854,1.039146
10,Dereck Lively II,3.0,2.093897,0.906103
27,Leonard Miller,3.0,2.325156,0.674844
23,Keyonte George,2.0,1.518262,0.481738
7,Cason Wallace,3.0,2.519022,0.480978
24,Kobe Bufkin,3.0,2.532893,0.467107


In [36]:
temp = trainer.predict(test_dataset)
mean_squared_error([x[0] for x in temp[1]],
                   [x[0] for x in temp[0]])

1.4046191

In [37]:
data = []

for i in range(len(test_dataset)):
    data.append([test_dataset[i]['PLAYER'], 
                 test_dataset[i]['labels'][0],
                 temp[0][i][0]])
    
res = pd.DataFrame(data, columns=['PLAYER', 'TIER', 'pred'])\
        .groupby(['PLAYER', 'TIER'])['pred'].mean().reset_index()
res['diff'] = res['TIER'] - res['pred']
print(mean_squared_error(res['TIER'], res['pred']))
res.sort_values('diff', ascending=False).drop_duplicates()

0.9478805109799278


Unnamed: 0,PLAYER,TIER,pred,diff
6,Cam Whitmore,5.0,2.360109,2.639891
27,Scoot Henderson,5.0,2.847514,2.152486
30,Taylor Hendricks,4.0,2.134745,1.865255
2,Ausar Thompson,4.0,2.230933,1.769067
4,Bilal Coulibaly,3.0,2.225444,0.774556
14,Jalen Hood-Schifino,3.0,2.340293,0.659707
19,Julian Phillips,2.0,1.417579,0.582421
12,Gradey Dick,3.0,2.520135,0.479865
20,Kobe Brown,2.0,1.686419,0.313581
9,Colby Jones,2.0,1.778601,0.221399
