In [1]:
from transformers import pipeline
import nltk
from nltk import sent_tokenize
import torch #2.5.1+cu121
from glob import glob
import pandas as pd
import numpy as np

In [2]:
nltk.download('punkt')
nltk.download('punkt_tab')
  

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\VJ\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\VJ\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Load Model

In [3]:
model_name = "facebook/bart-large-mnli"
device = 0 if torch.cuda.is_available() else 'cpu'

In [4]:
device

0

In [5]:
def load_model(device):
    theme_classifier = pipeline('zero-shot-classification', model=model_name, device=device)
    return theme_classifier

In [6]:
classifier = load_model(device)

Device set to use cuda:0


In [7]:
theme_list = ["friendship", "hope", "sacrifice", "battle", "self development", "betrayal","love", "family"]

In [8]:
classifier("I want to secretly kill my best friend by giving him a right hook then a kick", theme_list,multi_label=True )
# classifier("I gave him a right hook then a left jab", theme_list, multi_label=True)

{'sequence': 'I want to secretly kill my best friend by giving him a right hook then a kick',
 'labels': ['family',
  'betrayal',
  'friendship',
  'battle',
  'sacrifice',
  'self development',
  'love',
  'hope'],
 'scores': [0.9812206029891968,
  0.8385915160179138,
  0.7063215374946594,
  0.4907201826572418,
  0.29688456654548645,
  0.015588836744427681,
  9.531954128760844e-05,
  7.946813275339082e-05]}

## Load Subtitle dataset

In [9]:
files = glob('../data/Subtitles/*.ass')


In [10]:
files[:5]

['../data/Subtitles\\Naruto Season 1 - 01.ass',
 '../data/Subtitles\\Naruto Season 1 - 02.ass',
 '../data/Subtitles\\Naruto Season 1 - 03.ass',
 '../data/Subtitles\\Naruto Season 1 - 04.ass',
 '../data/Subtitles\\Naruto Season 1 - 05.ass']

In [11]:
with open(files[0], 'r') as file:
    lines = file.readlines()
    lines = lines[27:]
    lines = [",".join(line.split(',')[9:]) for line in lines]


In [12]:
lines[:10]

['A long time ago, a powerful demon fox\\Nappeared with nine tails.\n',
 'With its powerful tails,\n',
 'it could smash mountains\\Nand create tidal waves.\n',
 'A band of Ninjas rose to\\Ndefend their village from attack.\n',
 'We have to wait until\\Nthe Fourth Hokage gets here!\n',
 "We can't let it get any closer\\Nto our village!\n",
 'One great Ninja was able to\\Nimprison the monster,\n',
 'but died in the process.\n',
 'This Ninja was known as…\\Nthe Fourth Hokage.\n',
 'Naruto!\n']

In [13]:
lines= [line.replace('\\N',' ') for line in lines]


In [14]:
lines

['A long time ago, a powerful demon fox appeared with nine tails.\n',
 'With its powerful tails,\n',
 'it could smash mountains and create tidal waves.\n',
 'A band of Ninjas rose to defend their village from attack.\n',
 'We have to wait until the Fourth Hokage gets here!\n',
 "We can't let it get any closer to our village!\n",
 'One great Ninja was able to imprison the monster,\n',
 'but died in the process.\n',
 'This Ninja was known as… the Fourth Hokage.\n',
 'Naruto!\n',
 'Why did you do such a thing?!\n',
 "You're really gonna get it this time!\n",
 "I don't care!\n",
 'You know your problem?\n',
 "You can't do the things I do!\n",
 'Only I can do this!\n',
 "I'm better than all of you! Believe it!\n",
 "There's a problem, sir!\n",
 'Lord Hokage!\n',
 'What is it?\n',
 'Did that Naruto do something again?\n',
 'Yes. He climbed onto the Mountainside Images…\n',
 'And he vandalized and graffitied all over them!\n',
 'Wait!\n',
 'Ha ha…\n',
 'Why should I?\n',
 'Hey, Naruto!\n',
 '

In [15]:
" ".join(lines[:10])

"A long time ago, a powerful demon fox appeared with nine tails.\n With its powerful tails,\n it could smash mountains and create tidal waves.\n A band of Ninjas rose to defend their village from attack.\n We have to wait until the Fourth Hokage gets here!\n We can't let it get any closer to our village!\n One great Ninja was able to imprison the monster,\n but died in the process.\n This Ninja was known as… the Fourth Hokage.\n Naruto!\n"

In [16]:
int(files[0].split('-')[-1].split('.')[0].strip()) # to get episode num

1

In [17]:
def load_subtitle_dataset(dataset_path):
    subtitles_path = glob(dataset_path)
    scripts = []
    episode_list = []

    for path in subtitles_path:    
        with open(path, 'r') as file:
            lines = file.readlines()
            lines = lines[27:]
            lines = [",".join(line.split(',')[9:]) for line in lines]

        lines= [line.replace('\\N',' ') for line in lines]

        script = " ".join(lines)

        episode_num = int(path.split('-')[-1].split('.')[0].strip()) # to get episode num


        scripts.append(script)
        episode_list.append(episode_num)
        
    df = pd.DataFrame.from_dict({"episode":episode_list, "script":scripts})
    return df


In [18]:
dataset_path = "../data/Subtitles/*.ass"
df = load_subtitle_dataset(dataset_path)

In [19]:
df

Unnamed: 0,episode,script
0,1,"A long time ago, a powerful demon fox appeared..."
1,2,"C'mon!\n Running like a fugitive,\n Being chas..."
2,3,"C'mon!\n Running like a fugitive,\n Being chas..."
3,4,"C'mon!\n Running like a fugitive,\n Being chas..."
4,5,"C'mon!\n Running like a fugitive,\n Being chas..."
...,...,...
213,216,"Summoning Jutsu!\n Oh, long time no see.\n We ..."
214,217,Gaara.\n Kankuro.\n Gaara.\n Primary Lotus!\n ...
215,218,To think the Leaf’s reinforcements will be you...
216,219,"Fly into the wavy and twisted sky, into your h..."


In [68]:
script = df.iloc[0]['script']
script

'A long time ago, a powerful demon fox appeared with nine tails.\n With its powerful tails,\n it could smash mountains and create tidal waves.\n A band of Ninjas rose to defend their village from attack.\n We have to wait until the Fourth Hokage gets here!\n We can\'t let it get any closer to our village!\n One great Ninja was able to imprison the monster,\n but died in the process.\n This Ninja was known as… the Fourth Hokage.\n Naruto!\n Why did you do such a thing?!\n You\'re really gonna get it this time!\n I don\'t care!\n You know your problem?\n You can\'t do the things I do!\n Only I can do this!\n I\'m better than all of you! Believe it!\n There\'s a problem, sir!\n Lord Hokage!\n What is it?\n Did that Naruto do something again?\n Yes. He climbed onto the Mountainside Images…\n And he vandalized and graffitied all over them!\n Wait!\n Ha ha…\n Why should I?\n Hey, Naruto!\n How did you suddenly get here, lruka Sensei?\n The question is what are you doing here when you should 

In [72]:
script_sentences = sent_tokenize(script)
script_sentences[:3]

['A long time ago, a powerful demon fox appeared with nine tails.',
 'With its powerful tails,\n it could smash mountains and create tidal waves.',
 'A band of Ninjas rose to defend their village from attack.']

In [73]:
sentence_batch_size = 20
script_batches= [ ]
for index in range(0,len(script_sentences), sentence_batch_size):
    sentence = " ".join(script_sentences[index:index + sentence_batch_size])
    script_batches.append(sentence)

In [74]:
script_batches

["A long time ago, a powerful demon fox appeared with nine tails. With its powerful tails,\n it could smash mountains and create tidal waves. A band of Ninjas rose to defend their village from attack. We have to wait until the Fourth Hokage gets here! We can't let it get any closer to our village! One great Ninja was able to imprison the monster,\n but died in the process. This Ninja was known as… the Fourth Hokage. Naruto! Why did you do such a thing?! You're really gonna get it this time! I don't care! You know your problem? You can't do the things I do! Only I can do this! I'm better than all of you! Believe it! There's a problem, sir! Lord Hokage! What is it? Did that Naruto do something again?",
 'Yes. He climbed onto the Mountainside Images…\n And he vandalized and graffitied all over them! Wait! Ha ha…\n Why should I? Hey, Naruto! How did you suddenly get here, lruka Sensei? The question is what are you doing here when you should be in class now? Now listen, Naruto. You failed t

In [75]:
theme_output = classifier(script_batches[:2], theme_list, multi_label=True)

In [76]:
theme_output

[{'sequence': "A long time ago, a powerful demon fox appeared with nine tails. With its powerful tails,\n it could smash mountains and create tidal waves. A band of Ninjas rose to defend their village from attack. We have to wait until the Fourth Hokage gets here! We can't let it get any closer to our village! One great Ninja was able to imprison the monster,\n but died in the process. This Ninja was known as… the Fourth Hokage. Naruto! Why did you do such a thing?! You're really gonna get it this time! I don't care! You know your problem? You can't do the things I do! Only I can do this! I'm better than all of you! Believe it! There's a problem, sir! Lord Hokage! What is it? Did that Naruto do something again?",
  'labels': ['betrayal',
   'battle',
   'family',
   'sacrifice',
   'self development',
   'hope',
   'friendship',
   'love'],
  'scores': [0.9396896362304688,
   0.8546874523162842,
   0.737002432346344,
   0.7349812388420105,
   0.7284945249557495,
   0.1990976780653,
   

In [78]:
# wrangle output
themes={}
for output in theme_output:
    for label,score in zip(output['labels'],output['scores']):
        if label not in themes:
            themes[label] = []
        themes[label].append(score)

In [95]:
themes

{'betrayal': np.float64(0.792706310749054),
 'battle': np.float64(0.7564084231853485),
 'family': np.float64(0.689649224281311),
 'sacrifice': np.float64(0.6804319322109222),
 'self development': np.float64(0.7981570661067963),
 'hope': np.float64(0.20166902244091034),
 'friendship': np.float64(0.07262797839939594),
 'love': np.float64(0.03414120804518461)}

In [97]:
themes = {key : (np.mean(np.array(val))) for key, val in themes.items()}
themes

{'betrayal': 0.792706310749054,
 'battle': 0.7564084231853485,
 'family': 0.689649224281311,
 'sacrifice': 0.6804319322109222,
 'self development': 0.7981570661067963,
 'hope': 0.20166902244091034,
 'friendship': 0.07262797839939594,
 'love': 0.03414120804518461}

In [20]:
def theme_scores(script):
    script_sentences = sent_tokenize(script)
    
    sentence_batch_size = 20
    script_batches= []
    for index in range(0,len(script_sentences), sentence_batch_size):
        sentence = " ".join(script_sentences[index:index + sentence_batch_size])
        script_batches.append(sentence)

    theme_output = classifier(script_batches[:2], theme_list, multi_label=True)
    themes={}
    for output in theme_output:
        for label,score in zip(output['labels'],output['scores']):
            if label not in themes:
                themes[label] = []
            themes[label].append(score)
    themes = {key : np.mean(np.array(val)) for key, val in themes.items()}
    return themes

In [21]:
d =df.head(10)

In [22]:
d

Unnamed: 0,episode,script
0,1,"A long time ago, a powerful demon fox appeared..."
1,2,"C'mon!\n Running like a fugitive,\n Being chas..."
2,3,"C'mon!\n Running like a fugitive,\n Being chas..."
3,4,"C'mon!\n Running like a fugitive,\n Being chas..."
4,5,"C'mon!\n Running like a fugitive,\n Being chas..."
5,6,"C'mon!\n Running like a fugitive,\n Being chas..."
6,7,"C'mon!\n Running like a fugitive,\n Being chas..."
7,8,"C'mon!\n Running like a fugitive,\n Being chas..."
8,9,"C'mon!\n Running like a fugitive,\n Being chas..."
9,12,"C'mon!\n Running like a fugitive,\n Being chas..."


In [23]:
output_themes = d['script'].apply(theme_scores)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [24]:
output_themes


0    {'betrayal': 0.792706310749054, 'battle': 0.75...
1    {'sacrifice': 0.5707018971443176, 'self develo...
2    {'sacrifice': 0.6615151166915894, 'self develo...
3    {'sacrifice': 0.6521520912647247, 'battle': 0....
4    {'betrayal': 0.8911022841930389, 'battle': 0.7...
5    {'battle': 0.5873122960329056, 'sacrifice': 0....
6    {'sacrifice': 0.8036543428897858, 'battle': 0....
7    {'family': 0.8762916624546051, 'sacrifice': 0....
8    {'battle': 0.9470804929733276, 'sacrifice': 0....
9    {'battle': 0.7906561493873596, 'sacrifice': 0....
Name: script, dtype: object

In [25]:
theme_df = pd.DataFrame(output_themes.tolist())
theme_df

Unnamed: 0,betrayal,battle,family,sacrifice,self development,hope,friendship,love
0,0.792706,0.756408,0.689649,0.680432,0.798157,0.201669,0.072628,0.034141
1,0.429943,0.684844,0.662919,0.570702,0.482807,0.154533,0.04626,0.17326
2,0.33949,0.707885,0.553195,0.661515,0.72414,0.573632,0.342218,0.394862
3,0.554723,0.723673,0.468861,0.652152,0.861326,0.445292,0.525341,0.315331
4,0.891102,0.783642,0.579165,0.585847,0.550455,0.093763,0.073856,0.05407
5,0.337699,0.587312,0.790307,0.858296,0.771086,0.387565,0.397151,0.361572
6,0.63195,0.854286,0.482102,0.803654,0.696881,0.307072,0.266852,0.383653
7,0.695014,0.893384,0.876292,0.842824,0.7543,0.315381,0.314524,0.128988
8,0.576206,0.94708,0.69728,0.793191,0.795375,0.393363,0.157878,0.156073
9,0.380064,0.790656,0.852948,0.717417,0.849929,0.431066,0.530748,0.377739


In [26]:
d[theme_df.columns] = theme_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d[theme_df.columns] = theme_df
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d[theme_df.columns] = theme_df
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d[theme_df.columns] = theme_df
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = val

In [27]:
d

Unnamed: 0,episode,script,betrayal,battle,family,sacrifice,self development,hope,friendship,love
0,1,"A long time ago, a powerful demon fox appeared...",0.792706,0.756408,0.689649,0.680432,0.798157,0.201669,0.072628,0.034141
1,2,"C'mon!\n Running like a fugitive,\n Being chas...",0.429943,0.684844,0.662919,0.570702,0.482807,0.154533,0.04626,0.17326
2,3,"C'mon!\n Running like a fugitive,\n Being chas...",0.33949,0.707885,0.553195,0.661515,0.72414,0.573632,0.342218,0.394862
3,4,"C'mon!\n Running like a fugitive,\n Being chas...",0.554723,0.723673,0.468861,0.652152,0.861326,0.445292,0.525341,0.315331
4,5,"C'mon!\n Running like a fugitive,\n Being chas...",0.891102,0.783642,0.579165,0.585847,0.550455,0.093763,0.073856,0.05407
5,6,"C'mon!\n Running like a fugitive,\n Being chas...",0.337699,0.587312,0.790307,0.858296,0.771086,0.387565,0.397151,0.361572
6,7,"C'mon!\n Running like a fugitive,\n Being chas...",0.63195,0.854286,0.482102,0.803654,0.696881,0.307072,0.266852,0.383653
7,8,"C'mon!\n Running like a fugitive,\n Being chas...",0.695014,0.893384,0.876292,0.842824,0.7543,0.315381,0.314524,0.128988
8,9,"C'mon!\n Running like a fugitive,\n Being chas...",0.576206,0.94708,0.69728,0.793191,0.795375,0.393363,0.157878,0.156073
9,12,"C'mon!\n Running like a fugitive,\n Being chas...",0.380064,0.790656,0.852948,0.717417,0.849929,0.431066,0.530748,0.377739
