In [3]:
# Import necessary libraries and modules  
import pandas as pd  
import numpy as np  
import nltk  
from nltk.corpus import stopwords  
from nltk.stem import WordNetLemmatizer  
import re  
from tqdm import tqdm  

In [5]:
# Download required NLTK resources  
nltk.download('stopwords')  
nltk.download('punkt')  
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tterr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tterr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tterr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
 # Load datasets from TSV files using updated on_bad_lines parameter  
movie_lines = pd.read_csv('data/movie_lines.tsv', sep='\t', header=None,  
                       names=['lineID', 'characterID', 'movieID', 'characterName', 'text'],  
                       encoding='utf-8', on_bad_lines='skip', quoting=3) 

print("Loaded movie_lines.tsv, shape:", movie_lines.shape)  
print(movie_lines.head(2)) 

Loaded movie_lines.tsv, shape: (304543, 5)
  lineID characterID movieID characterName          text
0  L1045          u0      m0        BIANCA  They do not!
1  L1044          u2      m0       CAMERON   They do to!


In [13]:
movie_characters = pd.read_csv('data/movie_characters_metadata.tsv', sep='\t', header=None,  
                                names=['characterID', 'characterName', 'movieID', 'movieTitle', 'gender', 'position'],  
                                encoding='utf-8', on_bad_lines='skip', quoting=3)  

print("Loaded movie_characters_metadata.tsv, shape:", movie_characters.shape)  
print(movie_characters.head(2)) 

Loaded movie_characters_metadata.tsv, shape: (9034, 6)
  characterID characterName movieID                  movieTitle gender  \
0          u0        BIANCA      m0  10 things i hate about you      f   
1          u1         BRUCE      m0  10 things i hate about you      ?   

  position  
0        4  
1        ?  


In [15]:
movie_titles = pd.read_csv('data/movie_titles_metadata.tsv', sep='\t', header=None,  
                            names=['movieID', 'movieTitle', 'movieYear', 'IMDB_rating', 'IMDB_votes', 'genres'],  
                            encoding='utf-8', on_bad_lines='skip', quoting=3)  

print("Loaded movie_titles_metadata.tsv, shape:", movie_titles.shape)  
print(movie_titles.head(2)) 

Loaded movie_titles_metadata.tsv, shape: (617, 6)
  movieID                  movieTitle movieYear  IMDB_rating  IMDB_votes  \
0      m0  10 things i hate about you      1999          6.9       62847   
1      m1  1492: conquest of paradise      1992          6.2       10421   

                                        genres  
0                         ['comedy' 'romance']  
1  ['adventure' 'biography' 'drama' 'history']  


In [17]:
# Combine dialogue per character from movie_lines  
# Group by characterID so we have a single combined dialogue per character  
character_dialogue = movie_lines.groupby('characterID')['text'].apply(lambda x: ' '.join(x.astype(str))).reset_index()  
character_dialogue.columns = ['characterID', 'combined_dialogue']  

print("Combined character dialogue (sample):")  
print(character_dialogue.head())  

Combined character dialogue (sample):
  characterID                                  combined_dialogue
0          u0  They do not! I hope so. Let's go. Okay -- you'...
1          u1  Just sent 'em through. Never Didn't have you p...
2         u10  Absolutely not. Your daughters went to the pro...
3        u100  She died in her sleep three days ago.  It was ...
4       u1000  Yeah and I'm gonna be right back at it tomorro...


In [19]:
# Preprocess the dialogue text to clean it.  
# Define a function to preprocess text: lowercasing, remove punctuation, tokenize, remove stopwords, lemmatize.  
stop_words = set(stopwords.words('english'))  
lemmatizer = WordNetLemmatizer()  

def preprocess_text(text):  
     # Lowercase the text  
     text = text.lower()  
     # Remove non-alphanumeric characters (keep spaces)  
     text = re.sub(r'[^a-z0-9\s]', ' ', text)  
     # Tokenize the text  
     tokens = nltk.word_tokenize(text)  
     # Remove stopwords and short tokens  
     tokens = [word for word in tokens if word not in stop_words and len(word) > 2]  
     # Lemmatize tokens  
     tokens = [lemmatizer.lemmatize(word) for word in tokens]  
     # Return processed text  
     return ' '.join(tokens)  

print("Preprocessing combined dialogue...")  
character_dialogue['processed_text'] = character_dialogue['combined_dialogue'].apply(preprocess_text)  

print("Sample processed text:")  
print(character_dialogue[['characterID', 'processed_text']].head())  

Preprocessing combined dialogue...
Sample processed text:
  characterID                                     processed_text
0          u0  hope let okay gon need learn lie kidding know ...
1          u1  sent never pegged gigglepuss fan little pre te...
2         u10  absolutely daughter went prom great time honey...
3        u100  died sleep three day ago paper tom dead callin...
4       u1000  yeah gon right back tomorrow tonight gon sit f...


In [21]:
# Now we move to trait classification using a pre-trained model from Hugging Face.  

from transformers import pipeline 

In [23]:
# Define candidate traits to check from each character's dialogue.  
candidate_traits = [  
 "friendly", "hostile",  
 "introverted", "extroverted",  
 "intelligent", "naive",  
 "courageous", "cowardly",  
 "compassionate", "ruthless",  
 "funny", "serious"  
]  

In [25]:
# Create the zero-shot classification pipeline using the model 'facebook/bart-large-mnli'  
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli") 

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development





model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0


In [27]:
# Use the classifier on each character's combined dialogue text.  
# We'll create a list of dictionaries with results.  
trait_predictions = []  

for idx, row in tqdm(character_dialogue.iterrows(), total=len(character_dialogue), desc="Classifying traits"):  
 text = row['combined_dialogue']  
 # Run zero-shot classification on the dialogue text with candidate traits.  
 result = classifier(text, candidate_traits)  
 dominant_trait = result['labels'][0]  
 trait_predictions.append({  
     'characterID': row['characterID'],  
     'combined_dialogue': text,  
     'dominant_trait': dominant_trait,  
     'trait_scores': result['scores']  
 })  

Classifying traits:   0%|          | 10/9034 [00:07<2:18:03,  1.09it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Classifying traits: 100%|██████████| 9034/9034 [1:29:10<00:00,  1.69it/s]


In [29]:
# Convert the list to a DataFrame  
traits_df = pd.DataFrame(trait_predictions) 

print("Trait classification complete. Sample results:")  
print(traits_df.head()) 

Trait classification complete. Sample results:
  characterID                                  combined_dialogue  \
0          u0  They do not! I hope so. Let's go. Okay -- you'...   
1          u1  Just sent 'em through. Never Didn't have you p...   
2         u10  Absolutely not. Your daughters went to the pro...   
3        u100  She died in her sleep three days ago.  It was ...   
4       u1000  Yeah and I'm gonna be right back at it tomorro...   

  dominant_trait                                       trait_scores  
0    introverted  [0.1399690806865692, 0.10658851265907288, 0.09...  
1          funny  [0.5015628337860107, 0.17719238996505737, 0.16...  
2        hostile  [0.1995924562215805, 0.18560273945331573, 0.13...  
3        hostile  [0.29701340198516846, 0.2610812783241272, 0.07...  
4        serious  [0.25978192687034607, 0.12038370221853256, 0.0...  


In [35]:
# Merge the trait classification results with movie characters metadata to form character_interest  
character_interest = pd.merge(movie_characters, traits_df[['characterID', 'dominant_trait']], on='characterID', how='left')  

print("Merged character_interest (movie_characters with trait classification):")  
print(character_interest.head()) 

Merged character_interest (movie_characters with trait classification):
  characterID characterName movieID                  movieTitle gender  \
0          u0        BIANCA      m0  10 things i hate about you      f   
1          u1         BRUCE      m0  10 things i hate about you      ?   
2          u2       CAMERON      m0  10 things i hate about you      m   
3          u3      CHASTITY      m0  10 things i hate about you      ?   
4          u4          JOEY      m0  10 things i hate about you      m   

  position dominant_trait  
0        4    introverted  
1        ?          funny  
2        3    extroverted  
3        ?    extroverted  
4        6        serious  


In [37]:
# Merge with movie_titles metadata to include movie title details  
character_interest = pd.merge(character_interest, movie_titles[['movieID','movieTitle']], on='movieID', how='left', suffixes=('', '_movie'))  

print("Character interest DataFrame (merged):")  
print(character_interest.head())  

Character interest DataFrame (merged):
  characterID characterName movieID                  movieTitle gender  \
0          u0        BIANCA      m0  10 things i hate about you      f   
1          u1         BRUCE      m0  10 things i hate about you      ?   
2          u2       CAMERON      m0  10 things i hate about you      m   
3          u3      CHASTITY      m0  10 things i hate about you      ?   
4          u4          JOEY      m0  10 things i hate about you      m   

  position dominant_trait            movieTitle_movie  
0        4    introverted  10 things i hate about you  
1        ?          funny  10 things i hate about you  
2        3    extroverted  10 things i hate about you  
3        ?    extroverted  10 things i hate about you  
4        6        serious  10 things i hate about you  


In [39]:
# Save the final classification results to a CSV file  
output_filename = 'character_trait_classification.csv'  
character_interest.to_csv(output_filename, index=False)  

print("Character trait classification saved to", output_filename)  

Character trait classification saved to character_trait_classification.csv
