In [3]:

from transformers import pipeline
from nltk.tokenize import sent_tokenize
import nltk
import torch
from glob import glob
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\VISHN\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

# Load Model


In [5]:
model_name = "facebook/bart-large-mnli"
device = 0 if torch.cuda.is_available() else 'cpu'

In [6]:
def load_model(device):
    theme_classifier = pipeline(
        "zero-shot-classification",
        model=model_name,
        device=device
    )

    return theme_classifier

In [7]:
theme_classifier = load_model(device)

Device set to use cpu


In [8]:
theme_list = ["friendship","hope","sacrifice","battle","self development","betrayal","love","dialogue"]

In [9]:
theme_classifier(
    "I gave him a right hook then a left jab",
    theme_list,
    multi_label=True
)

{'sequence': 'I gave him a right hook then a left jab',
 'labels': ['battle',
  'self development',
  'hope',
  'sacrifice',
  'dialogue',
  'betrayal',
  'love',
  'friendship'],
 'scores': [0.9121254086494446,
  0.47499966621398926,
  0.0878182128071785,
  0.04499955102801323,
  0.020132720470428467,
  0.012040308676660061,
  0.004292338155210018,
  0.0028172165621072054]}

# Load Dataset


In [10]:
files = glob('../data/Subtitles/*.ass')

In [11]:
files[:5]

['../data/Subtitles\\Naruto Season 1 - 01.ass',
 '../data/Subtitles\\Naruto Season 1 - 02.ass',
 '../data/Subtitles\\Naruto Season 1 - 03.ass',
 '../data/Subtitles\\Naruto Season 1 - 04.ass',
 '../data/Subtitles\\Naruto Season 1 - 05.ass']

In [12]:
with open(files[0],'r') as file:
    lines = file.readlines()
    lines = lines[27:]
    lines =  [ ",".join(line.split(',')[9:])  for line in lines ]

In [13]:
lines[:2]

['A long time ago, a powerful demon fox\\Nappeared with nine tails.\n',
 'With its powerful tails,\n']

In [14]:
lines = [ line.replace('\\N',' ') for line in lines]

In [15]:

lines[:2]

['A long time ago, a powerful demon fox appeared with nine tails.\n',
 'With its powerful tails,\n']

In [16]:
" ".join(lines[:10])

"A long time ago, a powerful demon fox appeared with nine tails.\n With its powerful tails,\n it could smash mountains and create tidal waves.\n A band of Ninjas rose to defend their village from attack.\n We have to wait until the Fourth Hokage gets here!\n We can't let it get any closer to our village!\n One great Ninja was able to imprison the monster,\n but died in the process.\n This Ninja was known asâ€¦ the Fourth Hokage.\n Naruto!\n"

In [17]:
int(files[0].split('-')[-1].split('.')[0].strip())

1

In [18]:
def load_subtitles_dataset(dataset_path):
    subtitles_paths = glob(dataset_path+'/*.ass')

    scripts=[]
    episode_num=[]

    for path in subtitles_paths:

        #Read Lines
        with open(path, 'r', encoding='utf-8') as file:
            lines = file.readlines()
            lines = lines[27:]
            lines =  [ ",".join(line.split(',')[9:])  for line in lines ]
        
        lines = [ line.replace('\\N',' ') for line in lines]
        script = " ".join(lines)

        episode = int(path.split('-')[-1].split('.')[0].strip())

        scripts.append(script)
        episode_num.append(episode)

    df = pd.DataFrame.from_dict({"episode":episode_num, "script":scripts })
    return df

In [19]:

dataset_path = "../data/Subtitles"
df = load_subtitles_dataset(dataset_path)


In [20]:

df.head()

Unnamed: 0,episode,script
0,1,"A long time ago, a powerful demon fox appeared..."
1,2,"C'mon!\n Running like a fugitive,\n Being chas..."
2,3,"C'mon!\n Running like a fugitive,\n Being chas..."
3,4,"C'mon!\n Running like a fugitive,\n Being chas..."
4,5,"C'mon!\n Running like a fugitive,\n Being chas..."



# Run Model


In [None]:

script = df.iloc[0]['script']


In [None]:

script
print(script)

A long time ago, a powerful demon fox appeared with nine tails.
 With its powerful tails,
 it could smash mountains and create tidal waves.
 A band of Ninjas rose to defend their village from attack.
 We have to wait until the Fourth Hokage gets here!
 We can't let it get any closer to our village!
 One great Ninja was able to imprison the monster,
 but died in the process.
 This Ninja was known as… the Fourth Hokage.
 Naruto!
 Why did you do such a thing?!
 You're really gonna get it this time!
 I don't care!
 You know your problem?
 You can't do the things I do!
 Only I can do this!
 I'm better than all of you! Believe it!
 There's a problem, sir!
 Lord Hokage!
 What is it?
 Did that Naruto do something again?
 Yes. He climbed onto the Mountainside Images…
 And he vandalized and graffitied all over them!
 Wait!
 Ha ha…
 Why should I?
 Hey, Naruto!
 How did you suddenly get here, lruka Sensei?
 The question is what are you doing here when you should be in class now?
 Now listen, Narut

In [52]:
import re

script = df.iloc[0]['script']

# Split on '.', '!', or '?' followed by a space or end of string
script_sentences = re.split(r'(?<=[.!?])\s+', script)

print(script_sentences[:3])


['A long time ago, a powerful demon fox appeared with nine tails.', 'With its powerful tails,\n it could smash mountains and create tidal waves.', 'A band of Ninjas rose to defend their village from attack.']


In [53]:
# Batch Sentence
sentence_batch_size=20
script_batches = []
for index in range(0,len(script_sentences),sentence_batch_size):
    sent = " ".join(script_sentences[index:index+sentence_batch_size])
    script_batches.append(sent)

In [54]:

script_batches[:2]

["A long time ago, a powerful demon fox appeared with nine tails. With its powerful tails,\n it could smash mountains and create tidal waves. A band of Ninjas rose to defend their village from attack. We have to wait until the Fourth Hokage gets here! We can't let it get any closer to our village! One great Ninja was able to imprison the monster,\n but died in the process. This Ninja was known as… the Fourth Hokage. Naruto! Why did you do such a thing?! You're really gonna get it this time! I don't care! You know your problem? You can't do the things I do! Only I can do this! I'm better than all of you! Believe it! There's a problem, sir! Lord Hokage! What is it? Did that Naruto do something again?",
 'Yes. He climbed onto the Mountainside Images…\n And he vandalized and graffitied all over them! Wait! Ha ha…\n Why should I? Hey, Naruto! How did you suddenly get here, lruka Sensei? The question is what are you doing here when you should be in class now? Now listen, Naruto. You failed t

In [55]:
theme_output = theme_classifier(
    script_batches[:2],
    theme_list,
    multi_label=True
)

In [56]:
theme_output

[{'sequence': "A long time ago, a powerful demon fox appeared with nine tails. With its powerful tails,\n it could smash mountains and create tidal waves. A band of Ninjas rose to defend their village from attack. We have to wait until the Fourth Hokage gets here! We can't let it get any closer to our village! One great Ninja was able to imprison the monster,\n but died in the process. This Ninja was known as… the Fourth Hokage. Naruto! Why did you do such a thing?! You're really gonna get it this time! I don't care! You know your problem? You can't do the things I do! Only I can do this! I'm better than all of you! Believe it! There's a problem, sir! Lord Hokage! What is it? Did that Naruto do something again?",
  'labels': ['dialogue',
   'betrayal',
   'battle',
   'sacrifice',
   'self development',
   'hope',
   'friendship',
   'love'],
  'scores': [0.9800736904144287,
   0.9396899938583374,
   0.8546878099441528,
   0.7349792122840881,
   0.7284933924674988,
   0.199098512530326

In [57]:

# Wrangle Ouput
# battle: [0.51489498, 0.2156498]
themes = {}
for output in theme_output:
    for label,score in zip(output['labels'],output['scores']):
        if label not in themes:
            themes[label] = []
        themes[label].append(score)

In [62]:

themes = {key: np.mean(np.array(value)) for key,value in themes.items()}

In [63]:

themes

{'dialogue': np.float64(0.9585431516170502),
 'betrayal': np.float64(0.7927071750164032),
 'battle': np.float64(0.7564093470573425),
 'sacrifice': np.float64(0.6804313659667969),
 'self development': np.float64(0.7981564700603485),
 'hope': np.float64(0.20166973769664764),
 'friendship': np.float64(0.07262793928384781),
 'love': np.float64(0.03414129093289375)}

In [64]:
def get_themes_inference(script):
    script_sentences = sent_tokenize(script)

    # Batch Sentence
    sentence_batch_size=20
    script_batches = []
    for index in range(0,len(script_sentences),sentence_batch_size):
        sent = " ".join(script_sentences[index:index+sentence_batch_size])
        script_batches.append(sent)
    
    # Run Model
    theme_output = theme_classifier(
        script_batches[:2],
        theme_list,
        multi_label=True
    )

    # Wrangle Output 
    themes = {}
    for output in theme_output:
        for label,score in zip(output['labels'],output['scores']):
            if label not in themes:
                themes[label] = []
            themes[label].append(score)

    themes = {key: np.mean(np.array(value)) for key,value in themes.items()}

    return themes


In [65]:

df = df.head(2)

In [66]:

df

Unnamed: 0,episode,script
0,1,"A long time ago, a powerful demon fox appeared..."
1,2,"C'mon!\n Running like a fugitive,\n Being chas..."


In [70]:
output_themes = df['script'].apply(get_themes_inference)

In [69]:
import nltk
nltk.download('punkt')  # or whichever resource is missing


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\VISHN\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [68]:
import nltk
nltk.download('all')  # downloads everything (large)


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\VISHN\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\VISHN\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\VISHN\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\VISHN\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers\averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\VISHN\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       tagge

True

In [71]:
output_themes

0    {'dialogue': 0.9585431516170502, 'betrayal': 0...
1    {'dialogue': 0.9606050848960876, 'sacrifice': ...
Name: script, dtype: object

In [72]:
theme_df = pd.DataFrame(output_themes.tolist())

In [73]:

theme_df

Unnamed: 0,dialogue,betrayal,battle,sacrifice,self development,hope,friendship,love
0,0.958543,0.792707,0.756409,0.680431,0.798156,0.20167,0.072628,0.034141
1,0.960605,0.429944,0.684843,0.570703,0.482808,0.154534,0.046261,0.173262


In [74]:

df

Unnamed: 0,episode,script
0,1,"A long time ago, a powerful demon fox appeared..."
1,2,"C'mon!\n Running like a fugitive,\n Being chas..."


In [75]:

df[theme_df.columns] = theme_df
df

Unnamed: 0,episode,script,dialogue,betrayal,battle,sacrifice,self development,hope,friendship,love
0,1,"A long time ago, a powerful demon fox appeared...",0.958543,0.792707,0.756409,0.680431,0.798156,0.20167,0.072628,0.034141
1,2,"C'mon!\n Running like a fugitive,\n Being chas...",0.960605,0.429944,0.684843,0.570703,0.482808,0.154534,0.046261,0.173262


# Visualize output


In [10]:
df = df.drop('dialogue', axis=1, errors='ignore')


In [21]:
theme_output = df.drop(['episode','script'],axis=1).sum().reset_index()
theme_output.columns = ['theme','score']
theme_output


KeyError: "['episode'] not found in axis"

In [6]:
%whos


Variable        Type        Data/Info
-------------------------------------
glob            function    <function glob at 0x0000028C1481CA40>
nltk            module      <module 'nltk' from 'c:\\<...>ages\\nltk\\__init__.py'>
np              module      <module 'numpy' from 'c:\<...>ges\\numpy\\__init__.py'>
pd              module      <module 'pandas' from 'c:<...>es\\pandas\\__init__.py'>
pipeline        function    <function pipeline at 0x0000028C35C9EB60>
sent_tokenize   function    <function sent_tokenize at 0x0000028C3754ACA0>
torch           module      <module 'torch' from 'c:\<...>ges\\torch\\__init__.py'>


['C:\\Users\\VISHN/nltk_data', 'c:\\Users\\VISHN\\AppData\\Local\\Programs\\Python\\Python313\\nltk_data', 'c:\\Users\\VISHN\\AppData\\Local\\Programs\\Python\\Python313\\share\\nltk_data', 'c:\\Users\\VISHN\\AppData\\Local\\Programs\\Python\\Python313\\lib\\nltk_data', 'C:\\Users\\VISHN\\AppData\\Roaming\\nltk_data', 'C:\\nltk_data', 'D:\\nltk_data', 'E:\\nltk_data', 'nltk_data', 'nltk_data', 'nltk_data', 'nltk_data']
