In [1]:
import pandas as pd
from sklearn import preprocessing
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


# Load Dataset

In [2]:
data_path = "../data/jutsus.jsonl"
df = pd.read_json(data_path, lines = True)
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description
0,Acrobat,"Taijutsu, Kenjutsu","The Acrobat (荒繰鷺伐刀, Akurobatto) is a kenjutsu ..."
1,1000 Metre Punch,Taijutsu,The user focuses a large amount of chakra into...
2,Absorption Sphere,Ninjutsu,"Using the Jutsu Absorption Arm, the user creat..."
3,Acid Permeation,Ninjutsu,Utakata blows acidic bubbles from his pipe tha...
4,Acidic Sludge,Ninjutsu,The user spits out a small purple liquid at th...


In [3]:
# We need to keep only the primary jutsu classes for simplicity
def simplify_jutsu(jutsu):
    if "Genjutsu" in jutsu:
        return "Genjutsu"
    if "Ninjutsu" in jutsu:
        return "Ninjutsu"
    if "Taijutsu" in jutsu:
        return "Taijutsu"

In [4]:
df['jutsu_type_simplified'] = df['jutsu_type'].apply(simplify_jutsu)

In [5]:
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description,jutsu_type_simplified
0,Acrobat,"Taijutsu, Kenjutsu","The Acrobat (荒繰鷺伐刀, Akurobatto) is a kenjutsu ...",Taijutsu
1,1000 Metre Punch,Taijutsu,The user focuses a large amount of chakra into...,Taijutsu
2,Absorption Sphere,Ninjutsu,"Using the Jutsu Absorption Arm, the user creat...",Ninjutsu
3,Acid Permeation,Ninjutsu,Utakata blows acidic bubbles from his pipe tha...,Ninjutsu
4,Acidic Sludge,Ninjutsu,The user spits out a small purple liquid at th...,Ninjutsu


In [6]:
df['jutsu_type_simplified'].value_counts()

jutsu_type_simplified
Ninjutsu    2258
Taijutsu     397
Genjutsu     101
Name: count, dtype: int64

In [7]:
# The dataswt is skewed.

In [8]:
df['text'] = df['jutsu_name'] + ". " + df['jutsu_description']
df['jutsus'] = df['jutsu_type_simplified']
df = df[['text', 'jutsus']]
df = df.dropna()

In [9]:
df.head()

Unnamed: 0,text,jutsus
0,"Acrobat. The Acrobat (荒繰鷺伐刀, Akurobatto) is a ...",Taijutsu
1,1000 Metre Punch. The user focuses a large amo...,Taijutsu
2,Absorption Sphere. Using the Jutsu Absorption ...,Ninjutsu
3,Acid Permeation. Utakata blows acidic bubbles ...,Ninjutsu
4,Acidic Sludge. The user spits out a small purp...,Ninjutsu


In [10]:
# To clean the data scrapped off from the web. 
class Cleaner():
    def __init__(self):
        pass 
    
    def put_line_breaks(self, text):
        return text.replace("<\p>", "<\p>\n")
    
    def remove_html_tags(self, text):
        clean_text = BeautifulSoup(text, "lxml").text
        return clean_text

    def clean(self, text):
        text = self.put_line_breaks(text)
        text = self.remove_html_tags(text)
        text = text.strip()
        return text

  return text.replace("<\p>", "<\p>\n")
  return text.replace("<\p>", "<\p>\n")


In [11]:
text_column_name = 'text'
label_column_name = "jutsus"

In [12]:
# Clean Text
cleaner = Cleaner()
df['text_cleaned'] = df[text_column_name].apply(cleaner.clean)

  clean_text = BeautifulSoup(text, "lxml").text


In [13]:
df.head(2)

Unnamed: 0,text,jutsus,text_cleaned
0,"Acrobat. The Acrobat (荒繰鷺伐刀, Akurobatto) is a ...",Taijutsu,"Acrobat. The Acrobat (荒繰鷺伐刀, Akurobatto) is a ..."
1,1000 Metre Punch. The user focuses a large amo...,Taijutsu,1000 Metre Punch. The user focuses a large amo...


In [14]:
# Encode Labels for feeding to the learning network
le = preprocessing.LabelEncoder()
le.fit(df[label_column_name].tolist())

In [15]:
label_dict = {index:label_name for index, label_name in enumerate(le.__dict__['classes_'].tolist())}
label_dict

{0: 'Genjutsu', 1: 'Ninjutsu', 2: 'Taijutsu'}

In [16]:
df['label'] = le.transform(df[label_column_name].tolist())

In [17]:
df.head()

Unnamed: 0,text,jutsus,text_cleaned,label
0,"Acrobat. The Acrobat (荒繰鷺伐刀, Akurobatto) is a ...",Taijutsu,"Acrobat. The Acrobat (荒繰鷺伐刀, Akurobatto) is a ...",2
1,1000 Metre Punch. The user focuses a large amo...,Taijutsu,1000 Metre Punch. The user focuses a large amo...,2
2,Absorption Sphere. Using the Jutsu Absorption ...,Ninjutsu,Absorption Sphere. Using the Jutsu Absorption ...,1
3,Acid Permeation. Utakata blows acidic bubbles ...,Ninjutsu,Acid Permeation. Utakata blows acidic bubbles ...,1
4,Acidic Sludge. The user spits out a small purp...,Ninjutsu,Acidic Sludge. The user spits out a small purp...,1


In [18]:
test_size = 0.2
df_train, df_test = train_test_split(df, 
                                     test_size=test_size, 
                                     stratify=df['label'],)
# stratifying based on label as data set is sqewed and we might overrepresent one class in the train/test data.

In [19]:

df_train['jutsus'].value_counts()

jutsus
Ninjutsu    1806
Taijutsu     317
Genjutsu      81
Name: count, dtype: int64

In [20]:
# This is a transfomer model as well.
model_name = "distilbert/distilbert-base-uncased"

In [21]:
# And we are using distilbert-base-uncased to tokenize.
tokenizer = AutoTokenizer.from_pretrained(model_name)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [22]:
def preprocess_function(tokenizer,examples):
    return tokenizer(examples['text_cleaned'],truncation=True)

In [25]:
# Convert Pandas to a hugging face dataset
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

# tokenize the dataset
tokenized_train = train_dataset.map(lambda examples: preprocess_function(tokenizer, examples),
                                    batched=True)
tokenized_test = test_dataset.map(lambda examples: preprocess_function(tokenizer, examples),
                                    batched=True)

Map: 100%|██████████| 2204/2204 [00:00<00:00, 4355.83 examples/s]
Map: 100%|██████████| 552/552 [00:00<00:00, 3922.62 examples/s]
