In [4]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from datasets import Dataset

## Load dataset

In [5]:
# Location of the JSON Lines file, given relative to the notebook's working directory.
data_path = "../data/jutsus.jsonl"
# lines=True tells pandas to parse one JSON object per line.
df = pd.read_json(data_path, lines=True)
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description
0,Almighty Sakura,Taijutsu,Sakura gathers large quantities of chakra and ...
1,Allied Shinobi Forces Technique,"Kekkei Genkai, Hiden, Ninjutsu, Genjutsu, Taij...",This article is about collaborative techniques...
2,10 Hit Combo,Taijutsu,Lars punches the opponent before striking them...
3,All Weapons Above Heaven,Ninjutsu,This technique raises all the status boosts (S...
4,All Directions Shuriken,"Ninjutsu, Shurikenjutsu, Clone Techniques, Kin...",After using the Multiple Shadow Clone Techniqu...


In [6]:
def simplify_jutsu(jutsu):
    """Collapse a raw jutsu type value into one of three coarse classes.

    The checks run in a fixed priority order -- Genjutsu, then Ninjutsu, then
    Taijutsu -- so a value that mentions several of them maps to the first
    match in that order.

    Args:
        jutsu: The raw jutsu type value, e.g. ``"Ninjutsu, Shurikenjutsu"``.
            Missing values (``None`` or float ``NaN``) are accepted.

    Returns:
        ``"Genjutsu"``, ``"Ninjutsu"`` or ``"Taijutsu"``; ``None`` when the
        value is missing or mentions none of the three.
    """
    # A missing cell reaches ``Series.apply`` as None or float NaN. The ``in``
    # test below raises TypeError on either, so report "no class" instead.
    if jutsu is None or isinstance(jutsu, float):
        return None
    for category in ("Genjutsu", "Ninjutsu", "Taijutsu"):
        if category in jutsu:
            return category
    # Explicit so the "no match" outcome is visible to readers.
    return None

In [7]:
# Derive a coarse label column by passing every raw jutsu_type value through simplify_jutsu.
df['jutsu_type_simplified'] = df['jutsu_type'].apply(simplify_jutsu)

In [8]:
# Preview the frame, now including the jutsu_type_simplified column.
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description,jutsu_type_simplified
0,Almighty Sakura,Taijutsu,Sakura gathers large quantities of chakra and ...,Taijutsu
1,Allied Shinobi Forces Technique,"Kekkei Genkai, Hiden, Ninjutsu, Genjutsu, Taij...",This article is about collaborative techniques...,Genjutsu
2,10 Hit Combo,Taijutsu,Lars punches the opponent before striking them...,Taijutsu
3,All Weapons Above Heaven,Ninjutsu,This technique raises all the status boosts (S...,Ninjutsu
4,All Directions Shuriken,"Ninjutsu, Shurikenjutsu, Clone Techniques, Kin...",After using the Multiple Shadow Clone Techniqu...,Ninjutsu


In [9]:
# Rows per simplified jutsu type; value_counts leaves missing values out by default.
df['jutsu_type_simplified'].value_counts()

jutsu_type_simplified
Ninjutsu    2270
Taijutsu     398
Genjutsu     101
Name: count, dtype: int64

In [10]:
# Build the modelling frame: "text" is the jutsu name followed by its
# description, "jutsus" is the simplified type. Only those two columns are
# kept, and rows with a missing value in either are dropped.
df = (
    df.assign(
        text=df['jutsu_name'] + ". " + df['jutsu_description'],
        jutsus=df['jutsu_type_simplified'],
    )[['text', 'jutsus']]
    .dropna()
)

In [11]:
# Preview the two-column frame (text, jutsus).
df.head()

Unnamed: 0,text,jutsus
0,Almighty Sakura. Sakura gathers large quantiti...,Taijutsu
1,Allied Shinobi Forces Technique. This article ...,Genjutsu
2,10 Hit Combo. Lars punches the opponent before...,Taijutsu
3,All Weapons Above Heaven. This technique raise...,Ninjutsu
4,All Directions Shuriken. After using the Multi...,Ninjutsu


In [12]:
from bs4 import BeautifulSoup
class Cleaner():
    """Turn an HTML fragment into plain text.

    ``clean`` is the entry point: it separates paragraphs with newlines,
    strips the markup, and trims surrounding whitespace.
    """

    def __init__(self):
        pass

    def put_line_breaks(self, text):
        """Return ``text`` with a newline inserted after every closing paragraph tag.

        Without this, the text of adjacent paragraphs is joined with no
        separator once the tags are removed.
        """
        # The closing paragraph tag is "</p>" (forward slash). The pattern used
        # here before had a backslash instead, which never occurs in HTML, so
        # no line break was ever inserted.
        return text.replace("</p>", "</p>\n")

    def remove_html_tags(self, text):
        """Parse ``text`` with BeautifulSoup's ``lxml`` parser and return only its text content."""
        return BeautifulSoup(text, "lxml").text

    def clean(self, text):
        """Return the plain-text form of ``text``.

        Args:
            text: A string that may contain HTML markup.

        Returns:
            The text content, with a newline after each paragraph and with
            leading/trailing whitespace removed.
        """
        text = self.put_line_breaks(text)
        text = self.remove_html_tags(text)
        return text.strip()

In [13]:
# Column holding the raw input text (jutsu name followed by its description).
text_column_name = 'text'
# Column holding the simplified jutsu type, which is the classification target.
label_column_name = "jutsus"

In [14]:
# Clean Text: run every raw text through Cleaner.clean and store the result in
# a separate column, leaving the original text column untouched.
cleaner = Cleaner()
df['text_cleaned'] = df[text_column_name].apply(cleaner.clean)

In [15]:
# Preview the frame with the text_cleaned column added.
df.head()

Unnamed: 0,text,jutsus,text_cleaned
0,Almighty Sakura. Sakura gathers large quantiti...,Taijutsu,Almighty Sakura. Sakura gathers large quantiti...
1,Allied Shinobi Forces Technique. This article ...,Genjutsu,Allied Shinobi Forces Technique. This article ...
2,10 Hit Combo. Lars punches the opponent before...,Taijutsu,10 Hit Combo. Lars punches the opponent before...
3,All Weapons Above Heaven. This technique raise...,Ninjutsu,All Weapons Above Heaven. This technique raise...
4,All Directions Shuriken. After using the Multi...,Ninjutsu,All Directions Shuriken. After using the Multi...


In [16]:
# Encode Labels: learn the mapping from label names to integer ids.
le = preprocessing.LabelEncoder()
# Left as the last expression so the fitted encoder is shown as the cell output.
le.fit(df[label_column_name].tolist())

In [17]:
# Map each integer id to its label name. The fitted encoder exposes the class
# names through its public ``classes_`` attribute; a name's position in that
# array is the id that ``transform`` assigns to it.
label_dict = dict(enumerate(le.classes_.tolist()))
label_dict

{0: 'Genjutsu', 1: 'Ninjutsu', 2: 'Taijutsu'}

In [18]:
# Add the integer-encoded label for every row using the fitted encoder.
df['label'] = le.transform(df[label_column_name].tolist())

In [19]:
# Preview the frame with the integer label column added.
df.head()

Unnamed: 0,text,jutsus,text_cleaned,label
0,Almighty Sakura. Sakura gathers large quantiti...,Taijutsu,Almighty Sakura. Sakura gathers large quantiti...,2
1,Allied Shinobi Forces Technique. This article ...,Genjutsu,Allied Shinobi Forces Technique. This article ...,0
2,10 Hit Combo. Lars punches the opponent before...,Taijutsu,10 Hit Combo. Lars punches the opponent before...,2
3,All Weapons Above Heaven. This technique raise...,Ninjutsu,All Weapons Above Heaven. This technique raise...,1
4,All Directions Shuriken. After using the Multi...,Ninjutsu,All Directions Shuriken. After using the Multi...,1


In [20]:
# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# class proportions the same in both splits.
test_size = 0.2
# A fixed seed makes the split identical on every Restart & Run All; without
# it each run shuffles differently and the held-out rows change.
split_seed = 42
df_train, df_test = train_test_split(df,
                                     test_size=test_size,
                                     stratify=df['label'],
                                     random_state=split_seed,)

In [21]:
# Per-class row counts in the training split.
df_train['jutsus'].value_counts()

jutsus
Ninjutsu    1816
Taijutsu     318
Genjutsu      81
Name: count, dtype: int64

In [22]:
# Identifier of the pretrained checkpoint passed to AutoTokenizer.from_pretrained.
model_name = "distilbert/distilbert-base-uncased"

In [23]:
# Load the tokenizer that belongs to model_name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [24]:
def preprocess_function(tokenizer, examples, text_column='text_cleaned'):
    """Tokenize one batch of examples.

    Args:
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=True)``.
        examples: Mapping from column name to that column's values for the
            batch.
        text_column: Name of the column in ``examples`` that holds the text to
            tokenize. Defaults to ``'text_cleaned'``.

    Returns:
        Whatever ``tokenizer`` returns for the selected column's values.
    """
    return tokenizer(examples[text_column], truncation=True)

In [25]:
# Convert the pandas splits into Hugging Face datasets
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)


# Tokenize both splits in batches through one shared helper
def _tokenize_batch(examples):
    """Tokenize a batch of examples with the notebook-level tokenizer."""
    return preprocess_function(tokenizer, examples)


tokenized_train = train_dataset.map(_tokenize_batch, batched=True)
tokenized_test = test_dataset.map(_tokenize_batch, batched=True)

Map:   0%|          | 0/2215 [00:00<?, ? examples/s]

Map:   0%|          | 0/554 [00:00<?, ? examples/s]