# Jutsu Classifier


In [1]:
!pip install transformers[torch]
!pip install datasets
!pip install --upgrade pandas
!pip install evaluate
!pip install accelerate -U

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch->transformers[torch])
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.

In [2]:
import torch
import pandas as pd
from bs4 import BeautifulSoup
from sklearn import preprocessing
from sklearn.utils.class_weight import compute_class_weight
from datasets import Dataset
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
import evaluate
import numpy as np
from sklearn.metrics import classification_report
import torch
from torch import nn


In [9]:
# Configuration for the Hugging Face fine-tuning run.
# The trailing `#@param {...}` markers are kept verbatim: they appear to be Colab
# form-field annotations, so their exact text should not be reformatted.

# Location of the scraped jutsu data; it is read further down with
# `pd.read_json(..., lines=True)`, i.e. as a JSON Lines file.
# NOTE(review): this is an absolute Google Drive path, so Drive presumably has to be
# mounted before this notebook runs — no mount cell is visible here; confirm.
data_path = "/content/drive/MyDrive/Colab Notebooks/2024 Data Science Projects/Naruto/jutsus.jsonl" #@param {type:"string"}
# Column names for the model input text and the class label in the prepared frame.
text_column_name = "text" #@param {type:"string"}
label_column_name = "jutsu" #@param {type:"string"}

# Checkpoint name used to load both the tokenizer and the classification model.
model_name = "distilbert-base-uncased" #@param {type:"string"}
# Fraction of rows held out by `train_test_split`.
test_size = 0.2 #@param {type:"number"}
# Size of the classification head; must equal the number of distinct label values.
num_labels = 3 #@param {type:"number"}

# 'cuda' when torch can see a GPU, otherwise 'cpu'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [10]:
# Load the jutsu records; `lines=True` parses the file as JSON Lines
# (one JSON object per line). The last expression previews the first rows.
df_jus = pd.read_json(path_or_buf=data_path, lines=True)
df_jus.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description
0,10 Hit Combo,Taijutsu,Lars punches the opponent before striking them...
1,Air Sand Protective Wall,Ninjutsu,This air defence technique creates a giant shi...
2,All-Killing Ash Bones,"Kekkei Mōra, Ninjutsu","A certain-kill technique, the user hardens the..."
3,Air Raid Shot,Ninjutsu,"Kankurō's puppet, Karasu, soars into the air w..."
4,All is Suffering,"Kekkei Genkai, Ninjutsu","Using the giant statue as a medium, each of th..."


In [11]:
# A row can list several comma-separated types; flatten to one stripped type per entry.
jutsu_types = df_jus['jutsu_type'].str.split(',').explode().str.strip()

# Distinct type names, in order of first appearance.
unique_jutsus = jutsu_types.unique()
print(unique_jutsus)

# How often each type occurs (the name is rebound to the counts Series, as before).
unique_jutsus = jutsu_types.value_counts()
print(unique_jutsus)

['Taijutsu' 'Ninjutsu' 'Kekkei Mōra' 'Kekkei Genkai' 'Hiden' 'Genjutsu'
 'Cooperation Ninjutsu' 'Kinjutsu' 'Shurikenjutsu' 'Clone Techniques'
 'Fūinjutsu' 'Kenjutsu' 'Chakra Flow' 'Dōjutsu' 'Fighting Style'
 'Bukijutsu' 'Senjutsu' 'Collaboration Techniques' 'Medical Ninjutsu'
 'Regeneration Techniques' 'General skill' 'Juinjutsu' ''
 'Barrier Ninjutsu' 'Chakra Absorption Techniques'
 'Scientific Ninja Tool Techniques' 'Shinjutsu' 'Space–Time Ninjutsu'
 'Reincarnation Ninjutsu' 'Nintaijutsu' 'Jujutsu' 'Kyūjutsu'
 'Fighting style' 'Space-Time Ninjutsu' 'Kekkei Tōta' 'Ninshū']
jutsu_type
Ninjutsu                            2222
Taijutsu                             629
Kekkei Genkai                        550
Hiden                                303
Dōjutsu                              215
Cooperation Ninjutsu                 202
Kenjutsu                             177
Space–Time Ninjutsu                  147
Fūinjutsu                            130
Bukijutsu                            12

In [12]:
#using 3 main categories of Jutsu to simplify model
def simplify_jutsu(jutsu):
    """Collapse a raw ``jutsu_type`` value into one of three coarse classes.

    The check is a substring match applied in priority order
    Genjutsu > Taijutsu > Ninjutsu, so a multi-type value such as
    ``"Taijutsu, Ninjutsu"`` maps to ``"Taijutsu"`` and compound names such as
    ``"Medical Ninjutsu"`` map to ``"Ninjutsu"``.

    Args:
        jutsu: The raw type string (possibly several comma-separated types).
            Non-string values such as ``None`` or ``NaN`` are treated as
            "no category".

    Returns:
        ``"Genjutsu"``, ``"Taijutsu"`` or ``"Ninjutsu"``; ``None`` when none of
        them occurs in the input or the input is not a string.
    """
    # Missing values arrive as None/float NaN; `in` on those would raise TypeError.
    if not isinstance(jutsu, str):
        return None

    # Order matters: the first category found wins.
    for category in ('Genjutsu', 'Taijutsu', 'Ninjutsu'):
        if category in jutsu:
            return category

    return None

In [13]:
# Build the two modelling columns in one step:
#   text  = "<jutsu name>. <description>"
#   jutsu = coarse class returned by simplify_jutsu (None when no class matches)
labelled = df_jus.assign(
    text=df_jus['jutsu_name'] + '. ' + df_jus['jutsu_description'],
    jutsu=df_jus['jutsu_type'].apply(simplify_jutsu),
)

# Keep only those two columns and drop rows where either one is missing.
df_jus = labelled[['text', 'jutsu']].dropna()
df_jus.head()

Unnamed: 0,text,jutsu
0,10 Hit Combo. Lars punches the opponent before...,Taijutsu
1,Air Sand Protective Wall. This air defence tec...,Ninjutsu
2,All-Killing Ash Bones. A certain-kill techniqu...,Ninjutsu
3,"Air Raid Shot. Kankurō's puppet, Karasu, soars...",Ninjutsu
4,All is Suffering. Using the giant statue as a ...,Ninjutsu


In [14]:
# Map the class names to integer ids for the model.
# `le` is kept so the ids can be translated back to names later.
le = preprocessing.LabelEncoder()
jutsu_names = df_jus[label_column_name].tolist()
df_jus['label'] = le.fit_transform(jutsu_names)
df_jus.head()

Unnamed: 0,text,jutsu,label
0,10 Hit Combo. Lars punches the opponent before...,Taijutsu,2
1,Air Sand Protective Wall. This air defence tec...,Ninjutsu,1
2,All-Killing Ash Bones. A certain-kill techniqu...,Ninjutsu,1
3,"Air Raid Shot. Kankurō's puppet, Karasu, soars...",Ninjutsu,1
4,All is Suffering. Using the giant statue as a ...,Ninjutsu,1


In [15]:
# The classes are heavily imbalanced, so compute one 'balanced' weight per class
# for use in the loss.
label_values = df_jus['label'].to_numpy()

# `classes` is passed as a sorted ndarray (np.unique) rather than a Python list:
# scikit-learn releases that validate parameter types reject a plain list here.
# The resulting weights are ordered by class id, matching the logit columns.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(label_values),
    y=label_values,
).tolist()

In [16]:
# Stratified hold-out split so both parts keep the same class proportions.
# `random_state` is fixed so the split (and therefore the reported metrics)
# can be reproduced on a fresh kernel.
df_train, df_test = train_test_split(
    df_jus,
    test_size=test_size,
    stratify=df_jus['label'],
    random_state=42,
)

# Convert both parts to Hugging Face datasets.
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    """Tokenise a batch of rows, truncating over-long texts.

    The text column is taken from the configured `text_column_name` instead of a
    hard-coded literal. Padding is not applied here.
    """
    return tokenizer(examples[text_column_name], truncation=True)

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/2174 [00:00<?, ? examples/s]

Map:   0%|          | 0/544 [00:00<?, ? examples/s]

In [17]:
# Pretrained encoder with a classification head sized to `num_labels`.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)

# Collator that pads the tokenised examples when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Accuracy metric from the `evaluate` library, used by compute_metrics.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level `metric`.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max class ids compared
        against the reference labels.
    """
    scores, references = eval_pred
    return metric.compute(
        predictions=np.argmax(scores, axis=-1),  # highest-scoring class per row
        references=references,
    )

#huggingface trainer model
class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the module-level `class_weights`.

    The per-class weights counteract the class imbalance in the training data.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: The model being trained or evaluated.
            inputs: Batch mapping; must contain ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            **kwargs: Accepted and ignored. Newer ``Trainer`` versions pass
                extra keywords such as ``num_items_in_batch``, which would raise
                ``TypeError`` with the narrower signature.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is set.
        """
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Build the weights on the logits' own device so the loss does not depend
        # on a separately configured global device string.
        weights = torch.tensor(class_weights, device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Training arguments.
# The per-epoch evaluation option was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases; pick whichever name the
# installed version defines so this cell runs on both.
eval_strategy_arg = (
    "eval_strategy"
    if hasattr(TrainingArguments, "eval_strategy")
    else "evaluation_strategy"
)

# NOTE(review): learning_rate=2e-4 is higher than the 2e-5 to 5e-5 range commonly used
# for fine-tuning BERT-style models, and the recorded run never predicts class 0.
# Worth revisiting.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_strategy="epoch",
    **{eval_strategy_arg: "epoch"},
)

# Trainer with the class-weighted loss; it evaluates on the held-out split each epoch.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]



In [18]:
# Fine-tune, then write the resulting model to the local 'jutsu_model' directory.
trainer.train()
trainer.save_model(output_dir='jutsu_model')


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9822,0.828864,0.827206
2,0.8955,0.990038,0.840074
3,0.86,0.882879,0.834559
4,0.8351,1.041402,0.849265
5,0.6869,0.899352,0.836397


In [19]:
# Per-class precision/recall on the training split.
train_output = trainer.predict(tokenized_train)
# `predictions` is the first field of the predict() result; take the arg-max class per row.
preds = np.argmax(train_output.predictions, axis=1)
# Ground-truth ids, in the same row order as the dataset built from df_train.
GT = df_train['label'].tolist()
print(classification_report(GT, preds))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        78
           1       0.92      0.95      0.93      1599
           2       0.84      0.89      0.86       497

    accuracy                           0.90      2174
   macro avg       0.59      0.61      0.60      2174
weighted avg       0.87      0.90      0.88      2174



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
# Per-class precision/recall on the held-out test split.
test_output = trainer.predict(tokenized_test)
# `predictions` is the first field of the predict() result; take the arg-max class per row.
preds = np.argmax(test_output.predictions, axis=1)
# Ground-truth ids, in the same row order as the dataset built from df_test.
GT = df_test['label'].tolist()
print(classification_report(GT, preds))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        19
           1       0.88      0.90      0.89       400
           2       0.70      0.75      0.73       125

    accuracy                           0.84       544
   macro avg       0.53      0.55      0.54       544
weighted avg       0.81      0.84      0.82       544



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
