In [1]:
import pandas as pd 
import numpy as np


In [2]:
data_path = "../data/jutsu.jsonl"
df = pd.read_json(data_path, lines=True)
df

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description
0,Area Scanning Technique,Ninjutsu,By touching a solid surface with their hand or...
1,Armour of Sticky Gold,"Hiden, Ninjutsu",Kidōmaru secretes Spider Sticky Gold from his ...
2,Arm Growth Technique,Ninjutsu,"From the shoulders, the user is able to create..."
3,Arm of Shukaku,Ninjutsu,Gaara engulfs himself in a sand barrier as Shu...
4,Armour of Sand,Ninjutsu,"This technique is the second part of Gaara's ""..."
...,...,...,...
5701,Absolute: Fang Passing Fang,"Taijutsu, Collaboration Techniques",Kiba and Akamaru perform the Fang Passing Fang...
5702,16 Hit Combo,Taijutsu,"A very effective move, Ino uses this as a quic..."
5703,1000 Metre Punch,Taijutsu,The user focuses a large amount of chakra into...
5704,100% Single Punch,Taijutsu,Tsunade gathers large amounts of chakra in her...


In [3]:
def simplify_jutsu(jutsu):
    if "Genjutsu" in jutsu:
        return "Genjutsu"
    if "Ninjutsu" in jutsu:
        return "Ninjutsu"
    if "Taijutsu" in jutsu:
        return "Taijutsu"
    

In [4]:
df['jutsu_type_simplified'] = df['jutsu_type'].apply(simplify_jutsu)

In [5]:
df

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description,jutsu_type_simplified
0,Area Scanning Technique,Ninjutsu,By touching a solid surface with their hand or...,Ninjutsu
1,Armour of Sticky Gold,"Hiden, Ninjutsu",Kidōmaru secretes Spider Sticky Gold from his ...,Ninjutsu
2,Arm Growth Technique,Ninjutsu,"From the shoulders, the user is able to create...",Ninjutsu
3,Arm of Shukaku,Ninjutsu,Gaara engulfs himself in a sand barrier as Shu...,Ninjutsu
4,Armour of Sand,Ninjutsu,"This technique is the second part of Gaara's ""...",Ninjutsu
...,...,...,...,...
5701,Absolute: Fang Passing Fang,"Taijutsu, Collaboration Techniques",Kiba and Akamaru perform the Fang Passing Fang...,Taijutsu
5702,16 Hit Combo,Taijutsu,"A very effective move, Ino uses this as a quic...",Taijutsu
5703,1000 Metre Punch,Taijutsu,The user focuses a large amount of chakra into...,Taijutsu
5704,100% Single Punch,Taijutsu,Tsunade gathers large amounts of chakra in her...,Taijutsu


In [6]:
df['jutsu_type_simplified'].value_counts()

jutsu_type_simplified
Ninjutsu    4402
Taijutsu     783
Genjutsu     189
Name: count, dtype: int64

In [7]:
df['text'] = df['jutsu_name'] + ". " + df['jutsu_description']
df['jutsu'] = df['jutsu_type_simplified']
df = df[['text', 'jutsu']]
df = df.dropna()

In [8]:
df

Unnamed: 0,text,jutsu
0,Area Scanning Technique. By touching a solid s...,Ninjutsu
1,Armour of Sticky Gold. Kidōmaru secretes Spide...,Ninjutsu
2,"Arm Growth Technique. From the shoulders, the ...",Ninjutsu
3,Arm of Shukaku. Gaara engulfs himself in a san...,Ninjutsu
4,Armour of Sand. This technique is the second p...,Ninjutsu
...,...,...
5701,Absolute: Fang Passing Fang. Kiba and Akamaru ...,Taijutsu
5702,"16 Hit Combo. A very effective move, Ino uses ...",Taijutsu
5703,1000 Metre Punch. The user focuses a large amo...,Taijutsu
5704,100% Single Punch. Tsunade gathers large amoun...,Taijutsu


In [9]:
label_column_name = 'jutsu'
text_column_name = 'text'

In [10]:
from sklearn.preprocessing import LabelEncoder

In [11]:
le = LabelEncoder()
le.fit(df[label_column_name].tolist())

In [12]:
label_dict = {index : label_name for index, label_name in enumerate( le.__dict__['classes_'].tolist())}


In [13]:
label_dict

{0: 'Genjutsu', 1: 'Ninjutsu', 2: 'Taijutsu'}

In [14]:
df['label'] = le.transform(df[label_column_name].tolist())

In [15]:
df

Unnamed: 0,text,jutsu,label
0,Area Scanning Technique. By touching a solid s...,Ninjutsu,1
1,Armour of Sticky Gold. Kidōmaru secretes Spide...,Ninjutsu,1
2,"Arm Growth Technique. From the shoulders, the ...",Ninjutsu,1
3,Arm of Shukaku. Gaara engulfs himself in a san...,Ninjutsu,1
4,Armour of Sand. This technique is the second p...,Ninjutsu,1
...,...,...,...
5701,Absolute: Fang Passing Fang. Kiba and Akamaru ...,Taijutsu,2
5702,"16 Hit Combo. A very effective move, Ino uses ...",Taijutsu,2
5703,1000 Metre Punch. The user focuses a large amo...,Taijutsu,2
5704,100% Single Punch. Tsunade gathers large amoun...,Taijutsu,2


In [16]:
from sklearn.model_selection import train_test_split  
from transformers import AutoTokenizer
from datasets import Dataset

  _torch_pytree._register_pytree_node(


In [17]:
test_size=0.2
df_train, df_test = train_test_split(df, stratify=df['label'], test_size=test_size)

In [18]:
model_name = 'distilbert/distilbert-base-uncased'

In [19]:
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [20]:
def preprocess_function(tokenizer, examples):
    return tokenizer(examples[text_column_name], truncation=True)

In [21]:
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

tokenized_train = train_dataset.map(lambda examples: preprocess_function(tokenizer, examples), batched=True)
tokenized_test = test_dataset.map(lambda examples: preprocess_function(tokenizer, examples), batched=True)

Map:   0%|          | 0/4299 [00:00<?, ? examples/s]

Map:   0%|          | 0/1075 [00:00<?, ? examples/s]