In [1]:
import pandas as pd
import numpy as np
import spacy
from collections import Counter
from textblob import TextBlob
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
import nltk
from tqdm import tqdm
import re
from nltk.corpus import stopwords

In [37]:
# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tterr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\tterr\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tterr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tterr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# Load spaCy model for NLP analysis
nlp = spacy.load("en_core_web_sm")

### Load and Merge Data

In [11]:
# Load movie lines file

# Define column names
column_names = ['line_id', 'characterID', 'movieID', 'characterName', 'lineText']

# Read the file with the provided column names
df_lines = pd.read_csv('data/movie_lines.tsv', sep='\t', encoding='utf-8', header=None, names=column_names, on_bad_lines='skip')

df_lines.head()

Unnamed: 0,line_id,characterID,movieID,characterName,lineText
0,L1045,u0,m0,BIANCA,They do not!
1,L1044,u2,m0,CAMERON,They do to!
2,L985,u0,m0,BIANCA,I hope so.
3,L984,u2,m0,CAMERON,She okay?
4,L925,u0,m0,BIANCA,Let's go.


In [13]:
# Load movie characters file

# Define column names for movie_characters_metadata
column_names_char = ['characterID', 'characterName', 'movieID', 'movieTitle', 'gender', 'creditsPos']

# Read the file with the provided column names
df_characters = pd.read_csv('data/movie_characters_metadata.tsv', sep='\t', encoding='utf-8', header=None, names=column_names_char, on_bad_lines='skip')

df_characters.head()

Unnamed: 0,characterID,characterName,movieID,movieTitle,gender,creditsPos
0,u0,BIANCA,m0,10 things i hate about you,f,4
1,u1,BRUCE,m0,10 things i hate about you,?,?
2,u2,CAMERON,m0,10 things i hate about you,m,3
3,u3,CHASTITY,m0,10 things i hate about you,?,?
4,u4,JOEY,m0,10 things i hate about you,m,6


In [15]:
# Load movie titles file

# Define column names for movie_titles_metadata
column_names_titles = ['movieID', 'movieTitle', 'movieYear', 'IMDBRating', 'IMBDVotes', 'genres']

# Read the file with the provided column names
df_movies = pd.read_csv('data/movie_titles_metadata.tsv', sep='\t', encoding='utf-8', header=None, names=column_names_titles, on_bad_lines='skip')

df_movies.head()

Unnamed: 0,movieID,movieTitle,movieYear,IMDBRating,IMBDVotes,genres
0,m0,10 things i hate about you,1999,6.9,62847.0,['comedy' 'romance']
1,m1,1492: conquest of paradise,1992,6.2,10421.0,['adventure' 'biography' 'drama' 'history']
2,m2,15 minutes,2001,6.1,25854.0,['action' 'crime' 'drama' 'thriller']
3,m3,2001: a space odyssey,1968,8.4,163227.0,['adventure' 'mystery' 'sci-fi']
4,m4,48 hrs.,1982,6.9,22289.0,['action' 'comedy' 'crime' 'drama' 'thriller']


In [17]:
print("Movie lines shape:", df_lines.shape)
print("Characters shape:", df_characters.shape)
print("Movies shape:", df_movies.shape)

Movie lines shape: (293202, 5)
Characters shape: (9034, 6)
Movies shape: (617, 6)


In [21]:
# Group lines by both characterID and movieID
character_movie_lines = df_lines.groupby(['characterID', 'movieID'])['lineText'].apply(list).reset_index()

character_movie_lines.head()

Unnamed: 0,characterID,movieID,lineText
0,u0,m0,"[They do not!, I hope so., Let's go., Okay -- ..."
1,u1,m0,"[Just sent 'em through., Never, Didn't have yo..."
2,u10,m0,"[Absolutely not., Your daughters went to the p..."
3,u100,m6,[She died in her sleep three days ago. It was...
4,u1000,m65,[Yeah and I'm gonna be right back at it tomorr...


In [23]:
# Merge with character metadata
character_analysis = pd.merge(character_movie_lines, df_characters, on=['characterID', 'movieID'])

character_analysis.head()

Unnamed: 0,characterID,movieID,lineText,characterName,movieTitle,gender,creditsPos
0,u0,m0,"[They do not!, I hope so., Let's go., Okay -- ...",BIANCA,10 things i hate about you,f,4
1,u1,m0,"[Just sent 'em through., Never, Didn't have yo...",BRUCE,10 things i hate about you,?,?
2,u10,m0,"[Absolutely not., Your daughters went to the p...",SHARON,10 things i hate about you,?,?
3,u100,m6,[She died in her sleep three days ago. It was...,AMY,8mm,f,7
4,u1000,m65,[Yeah and I'm gonna be right back at it tomorr...,MCGRAW,from dusk till dawn,?,?


In [25]:
# Merge with movie titles
character_analysis = pd.merge(character_analysis, df_movies[['movieID', 'movieTitle']], on='movieID')

character_analysis.head()

Unnamed: 0,characterID,movieID,lineText,characterName,movieTitle_x,gender,creditsPos,movieTitle_y
0,u0,m0,"[They do not!, I hope so., Let's go., Okay -- ...",BIANCA,10 things i hate about you,f,4,10 things i hate about you
1,u1,m0,"[Just sent 'em through., Never, Didn't have yo...",BRUCE,10 things i hate about you,?,?,10 things i hate about you
2,u10,m0,"[Absolutely not., Your daughters went to the p...",SHARON,10 things i hate about you,?,?,10 things i hate about you
3,u100,m6,[She died in her sleep three days ago. It was...,AMY,8mm,f,7,8mm
4,u1000,m65,[Yeah and I'm gonna be right back at it tomorr...,MCGRAW,from dusk till dawn,?,?,from dusk till dawn


In [27]:
# Clean movieTitle columns
if 'movieTitle_x' in character_analysis.columns and 'movieTitle_y' in character_analysis.columns:
    # Drop movieTitle_x and rename movieTitle_y to movieTitle
    character_analysis.drop(columns=['movieTitle_x'], inplace=True)
    character_analysis.rename(columns={'movieTitle_y': 'movieTitle'}, inplace=True)
elif 'movieTitle' in character_analysis.columns:
    # Nothing to do if only one exists
    pass

character_analysis.head()

Unnamed: 0,characterID,movieID,lineText,characterName,gender,creditsPos,movieTitle
0,u0,m0,"[They do not!, I hope so., Let's go., Okay -- ...",BIANCA,f,4,10 things i hate about you
1,u1,m0,"[Just sent 'em through., Never, Didn't have yo...",BRUCE,?,?,10 things i hate about you
2,u10,m0,"[Absolutely not., Your daughters went to the p...",SHARON,?,?,10 things i hate about you
3,u100,m6,[She died in her sleep three days ago. It was...,AMY,f,7,8mm
4,u1000,m65,[Yeah and I'm gonna be right back at it tomorr...,MCGRAW,?,?,from dusk till dawn


In [29]:
print("\nCharacter analysis dataframe shape:", character_analysis.shape)


Character analysis dataframe shape: (8749, 7)


### Analyze Dialogue

In [31]:
# Function to analyze dialogue and extract traits
def analyze_character_dialogue(lines):
    if not lines or all(pd.isna(line) for line in lines):
        return {}
    
    # Join all lines for this character
    all_text = ' '.join([str(line) for line in lines if pd.notna(line)])
    
    # Basic statistics
    word_count = len(all_text.split())
    if word_count < 10:  # Skip very short dialogues
        return {'word_count': word_count}
    
    # Sentiment analysis
    blob = TextBlob(all_text)
    sentiment_polarity = blob.sentiment.polarity
    sentiment_subjectivity = blob.sentiment.subjectivity
    
    # Process with spaCy for linguistic features (limit text length to avoid memory issues)
    max_text_length = 100000  # Limit text length for processing
    doc = nlp(all_text[:max_text_length])
    
    # Count parts of speech
    pos_counts = Counter([token.pos_ for token in doc])
    total_tokens = len(doc)
    
    # Calculate POS ratios
    noun_ratio = pos_counts.get('NOUN', 0) / total_tokens if total_tokens > 0 else 0
    verb_ratio = pos_counts.get('VERB', 0) / total_tokens if total_tokens > 0 else 0
    adj_ratio = pos_counts.get('ADJ', 0) / total_tokens if total_tokens > 0 else 0
    adv_ratio = pos_counts.get('ADV', 0) / total_tokens if total_tokens > 0 else 0
    
    # Count question marks (curiosity/inquisitiveness)
    question_ratio = all_text.count('?') / (word_count + 1)
    
    # Count exclamation marks (expressiveness)
    exclamation_ratio = all_text.count('!') / (word_count + 1)
    
    # Average sentence length (complexity of thought)
    sentences = [s for s in all_text.split('.') if s]
    avg_sentence_length = np.mean([len(s.split()) for s in sentences]) if sentences else 0
    
    # Vocabulary richness (unique words / total words)
    words = [w.lower() for w in word_tokenize(all_text) if w.isalpha()]
    stop_words = set(stopwords.words('english'))
    content_words = [w for w in words if w not in stop_words]
    vocabulary_richness = len(set(content_words)) / len(content_words) if content_words else 0
    
    # Most frequent words (interests/preoccupations)
    word_freq = Counter(content_words)
    most_common_words = word_freq.most_common(10)
    
    # Return all extracted features
    return {
        'word_count': word_count,
        'sentiment_polarity': sentiment_polarity,
        'sentiment_subjectivity': sentiment_subjectivity,
        'noun_ratio': noun_ratio,
        'verb_ratio': verb_ratio,
        'adj_ratio': adj_ratio,
        'adv_ratio': adv_ratio,
        'question_ratio': question_ratio,
        'exclamation_ratio': exclamation_ratio,
        'avg_sentence_length': avg_sentence_length,
        'vocabulary_richness': vocabulary_richness,
        'most_common_words': most_common_words
    }

In [33]:
# Apply the analysis function to each character's dialogue
print("Analyzing character dialogues...")
character_traits = []

Analyzing character dialogues...


In [39]:
# Use a regular for loop with tqdm for progress tracking instead of progress_apply
for idx, row in tqdm(character_analysis.iterrows(), total=len(character_analysis), desc="Analyzing characters"):
    char_id = row['characterID']
    movie_id = row['movieID']
    lines = row['lineText']
    
    # Get character name and movie title
    char_name = str(row['characterName']) if 'characterName' in character_analysis.columns else "Unknown"
    movie_title = str(row['movieTitle']) if 'movieTitle' in character_analysis.columns else "Unknown"
    
    # Analyze dialogue
    traits = analyze_character_dialogue(lines)
    
    # Add character and movie info to traits
    traits['characterID'] = char_id
    traits['movieID'] = movie_id
    traits['characterName'] = char_name
    traits['movieTitle'] = movie_title
    
    character_traits.append(traits)

Analyzing characters: 100%|██████████| 8749/8749 [06:14<00:00, 23.33it/s]


In [41]:
# Convert to DataFrame
character_traits_df = pd.DataFrame(character_traits)
print("Analysis complete. Shape of traits DataFrame:", character_traits_df.shape)

Analysis complete. Shape of traits DataFrame: (8749, 16)


In [47]:
# Function to generate a character profile
def generate_character_profile(character_id, movie_id):
    # Filter the character_traits_df DataFrame for this character and movie
    subset = character_traits_df[(character_traits_df['characterID'] == character_id) & 
                                (character_traits_df['movieID'] == movie_id)]
    
    if subset.empty:
        return f"No trait data available for characterID {character_id} in movieID {movie_id}"
    
    # Get character info from character_analysis
    char_info = character_analysis[(character_analysis['characterID'] == character_id) & 
                                  (character_analysis['movieID'] == movie_id)]
    
    if char_info.empty:
        return f"No character info available for characterID {character_id} in movieID {movie_id}"
    
    # Extract basic information
    char_name = str(char_info['characterName'].iloc[0]) if 'characterName' in char_info.columns else f"Character {character_id}"
    movie_title = str(char_info['movieTitle'].iloc[0]) if 'movieTitle' in char_info.columns else f"Movie {movie_id}"
    
    # Get the dialogue lines - handle potential float values
    lines = char_info['lineText'].iloc[0]
    
    # If lines is not iterable (e.g., a float), convert it to a list with a single string entry
    if not hasattr(lines, '__iter__') or isinstance(lines, str):
        try:
            lines = [str(lines)]
        except Exception as e:
            lines = ["Error converting dialogue to string"]
    
    # Get the first row of traits for this character-movie combination
    traits_row = subset.iloc[0]
    
    # Start building the profile
    profile = f"CHARACTER PROFILE: {char_name}\n"
    profile += f"Movie: {movie_title}\n"
    profile += f"Character ID: {character_id}\n\n"
    
    # Basic dialogue statistics
    word_count = traits_row.get('word_count', 0)
    profile += f"DIALOGUE STATISTICS:\n"
    profile += f"- Total words: {word_count}\n"
    
    avg_sentence_length = traits_row.get('avg_sentence_length', 0)
    if avg_sentence_length > 0:
        profile += f"- Average sentence length: {avg_sentence_length:.2f} words\n"
    
    # Sentiment analysis
    profile += "\nSENTIMENT ANALYSIS:\n"
    sentiment_polarity = traits_row.get('sentiment_polarity', 0)
    sentiment_subjectivity = traits_row.get('sentiment_subjectivity', 0)
    
    # Interpret sentiment polarity
    if sentiment_polarity > 0.2:
        sentiment_desc = "Positive"
    elif sentiment_polarity < -0.2:
        sentiment_desc = "Negative"
    else:
        sentiment_desc = "Neutral"
    
    profile += f"- Overall sentiment: {sentiment_desc} ({sentiment_polarity:.2f})\n"
    profile += f"- Subjectivity: {sentiment_subjectivity:.2f} (0=objective, 1=subjective)\n"
    
    # Linguistic patterns
    profile += "\nLINGUISTIC PATTERNS:\n"
    
    # Parts of speech ratios
    noun_ratio = traits_row.get('noun_ratio', 0)
    verb_ratio = traits_row.get('verb_ratio', 0)
    adj_ratio = traits_row.get('adj_ratio', 0)
    adv_ratio = traits_row.get('adv_ratio', 0)
    
    profile += f"- Noun usage: {noun_ratio:.2f}\n"
    profile += f"- Verb usage: {verb_ratio:.2f}\n"
    profile += f"- Adjective usage: {adj_ratio:.2f}\n"
    profile += f"- Adverb usage: {adv_ratio:.2f}\n"
    
    # Question and exclamation usage
    question_ratio = traits_row.get('question_ratio', 0)
    exclamation_ratio = traits_row.get('exclamation_ratio', 0)
    
    profile += f"- Question frequency: {question_ratio:.4f}\n"
    profile += f"- Exclamation frequency: {exclamation_ratio:.4f}\n"
    
    # Vocabulary richness
    vocab_richness = traits_row.get('vocab_richness', 0)
    profile += f"- Vocabulary richness: {vocab_richness:.4f}\n"
    
    # Frequently used words
    profile += "\nFREQUENTLY USED WORDS:\n"
    for i in range(1, 6):  # Assuming top 5 words are stored
        word_key = f'top_word_{i}'
        count_key = f'top_word_{i}_count'
        if word_key in traits_row and count_key in traits_row:
            word = traits_row[word_key]
            count = traits_row[count_key]
            if pd.notna(word) and pd.notna(count):
                profile += f"- {word}: {int(count)} times\n"
    
    # Personality traits interpretation
    profile += "\nPERSONALITY TRAITS INTERPRETATION:\n"
    
    # Define trait thresholds and interpretations
    traits_interpretation = []
    
    # Interpret noun ratio (high = detail-oriented, concrete)
    if noun_ratio > 0.25:
        traits_interpretation.append("Detail-oriented")
    
    # Interpret verb ratio (high = action-focused)
    if verb_ratio > 0.20:
        traits_interpretation.append("Action-focused")
    
    # Interpret adjective ratio (high = descriptive, emotional)
    if adj_ratio > 0.10:
        traits_interpretation.append("Descriptive")
    
    # Interpret adverb ratio (high = nuanced, qualification-heavy)
    if adv_ratio > 0.08:
        traits_interpretation.append("Nuanced")
    
    # Interpret question ratio (high = curious, inquisitive)
    if question_ratio > 0.05:
        traits_interpretation.append("Inquisitive")
    
    # Interpret exclamation ratio (high = expressive, emotional)
    if exclamation_ratio > 0.05:
        traits_interpretation.append("Expressive")
    
    # Interpret sentiment (positive/negative outlook)
    if sentiment_polarity > 0.3:
        traits_interpretation.append("Optimistic")
    elif sentiment_polarity < -0.3:
        traits_interpretation.append("Pessimistic")
    
    # Interpret subjectivity (high = opinionated)
    if sentiment_subjectivity > 0.6:
        traits_interpretation.append("Opinionated")
    
    # Interpret vocabulary richness (high = articulate, intellectual)
    if vocab_richness > 0.6:
        traits_interpretation.append("Articulate")
    
    # Add the interpreted traits to the profile
    if traits_interpretation:
        profile += "Based on linguistic analysis, this character appears to be:\n"
        for trait in traits_interpretation:
            profile += f"- {trait}\n"
    else:
        profile += "No distinctive personality traits detected from linguistic analysis.\n"
    
    # Overall impression
    profile += "\nOVERALL IMPRESSION:\n"
    if traits_interpretation:
        profile += f"{char_name} is primarily a "
        if len(traits_interpretation) >= 3:
            profile += ", ".join(traits_interpretation[:-1]) + ", and " + traits_interpretation[-1]
        elif len(traits_interpretation) == 2:
            profile += traits_interpretation[0] + " and " + traits_interpretation[1]
        else:
            profile += traits_interpretation[0]
        profile += " character based on their dialogue patterns.\n"
    else:
        profile += f"{char_name} shows a balanced personality without strongly dominant traits.\n"
    
    return profile

In [49]:
# Save all character profiles to a file
with open('character_trait_profiles_by_movie.txt', 'w', encoding='utf-8') as f:
    # Get unique character-movie combinations
    char_movie_combos = character_traits_df[['characterID', 'movieID']].drop_duplicates().values
    
    for combo in tqdm(char_movie_combos, desc="Generating profiles"):
        char_id, movie_id = combo[0], combo[1]
        try:
            profile = generate_character_profile(char_id, movie_id)
            f.write(profile + "\n" + "="*50 + "\n\n")
        except Exception as e:
            f.write(f"Error generating profile for characterID {char_id} in movieID {movie_id}: {str(e)}\n" + "="*50 + "\n\n")
            continue

print("Character trait profiles by movie saved to 'character_trait_profiles_by_movie.txt'")

Generating profiles: 100%|██████████| 8749/8749 [00:17<00:00, 500.77it/s]


Character trait profiles by movie saved to 'character_trait_profiles_by_movie.txt'


In [51]:
# Display a few sample profiles
print("\nSample Character Profiles:")
sample_combos = character_traits_df[['characterID', 'movieID']].drop_duplicates().head(3).values
for combo in sample_combos:
    char_id, movie_id = combo[0], combo[1]
    try:
        print(generate_character_profile(char_id, movie_id))
        print("\n" + "-"*50 + "\n")
    except Exception as e:
        print(f"Error generating profile for characterID {char_id} in movieID {movie_id}: {str(e)}")
        print("\n" + "-"*50 + "\n")


Sample Character Profiles:
CHARACTER PROFILE: BIANCA
Movie: 10 things i hate about you
Character ID: u0

DIALOGUE STATISTICS:
- Total words: 896.0
- Average sentence length: 10.07 words

SENTIMENT ANALYSIS:
- Overall sentiment: Neutral (0.12)
- Subjectivity: 0.57 (0=objective, 1=subjective)

LINGUISTIC PATTERNS:
- Noun usage: 0.10
- Verb usage: 0.14
- Adjective usage: 0.05
- Adverb usage: 0.05
- Question frequency: 0.0346
- Exclamation frequency: 0.0100
- Vocabulary richness: 0.0000

FREQUENTLY USED WORDS:

PERSONALITY TRAITS INTERPRETATION:
No distinctive personality traits detected from linguistic analysis.

OVERALL IMPRESSION:
BIANCA shows a balanced personality without strongly dominant traits.


--------------------------------------------------

CHARACTER PROFILE: BRUCE
Movie: 10 things i hate about you
Character ID: u1

DIALOGUE STATISTICS:
- Total words: 23.0
- Average sentence length: 7.67 words

SENTIMENT ANALYSIS:
- Overall sentiment: Neutral (-0.19)
- Subjectivity: 0.50 (0