In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter

In [2]:
raw_data = pd.read_csv('../Data/retyped_data.csv')
data_tmp_2 = raw_data.copy()

<h3>Cast Column<h3>

In [3]:
def process_cast(cast_string):
    """Normalize Cast string"""
    if pd.isna(cast_string):
        return []
    # Handle incorrect format cases
    cast_string = str(cast_string)
    cast_string = cast_string.replace('[', '').replace(']', '')  # Remove square brackets
    # Split actors and remove extra whitespace
    return [actor.strip() for actor in cast_string.split(',')]

def get_similar_movies(row, data, n_similar=5):
    """Find similar movies based on Director and Genre"""
    # If both Director and Genre are missing, return random sample
    if pd.isna(row['Director']) and pd.isna(row['Genre']):
        return data.sample(n=n_similar)
    
    # Create initial mask for all rows
    mask = pd.Series(True, index=data.index)
    
    # Filter by Director if available
    if not pd.isna(row['Director']):
        mask &= (data['Director'] == row['Director'])
    # Filter by Genre if available  
    if not pd.isna(row['Genre']):
        mask &= (data['Genre'] == row['Genre'])
    
    # Get movies matching both filters
    similar_movies = data[mask]

    # If not enough similar movies, get more movies with same Director
    if len(similar_movies) < n_similar and not pd.isna(row['Director']):
        director_movies = data[data['Director'] == row['Director']]
        # Combine and remove duplicates
        similar_movies = pd.concat([similar_movies, director_movies]).drop_duplicates()
    
    # Return top n similar movies
    return similar_movies.head(n_similar)


def fill_cast(row, data):
    """Fill missing Cast based on similar movies"""
    if pd.isna(row['Cast']):
        similar_movies = get_similar_movies(row, data)
        # Get all casts from similar movies
        all_casts = []
        for _, movie in similar_movies.iterrows():
            if not pd.isna(movie['Cast']):
                all_casts.extend(process_cast(movie['Cast']))
        
        # Get the most frequent actors
        if all_casts:
            most_common = Counter(all_casts).most_common(3)
            # Create string list with correct format
            actors = [actor for actor, _ in most_common]
            return f"[{', '.join(f'{actor}' for actor in actors)}]"
        return 'Unknown Cast'
    return row['Cast']

In [4]:
data_test = data_tmp_2.copy()

# Randomly select 30% of rows with non-null Cast for testing
cast_not_null = data_test[data_test['Cast'].notna()].index
test_idx, _ = train_test_split(cast_not_null, test_size=0.3, random_state=42)

# Create Cast_Original column to store original values
data_test['Cast_Original'] = data_test['Cast']

# Set null for selected test rows
data_test.loc[test_idx, 'Cast'] = np.nan

# 2. Apply missing values fill method
data_test['Cast_Generated'] = data_test.apply(lambda row: fill_cast(row, raw_data), axis=1)

# 3. Compare results
comparison_df = pd.DataFrame({
    'Original_Cast': data_test.loc[test_idx, 'Cast_Original'],
    'Generated_Cast': data_test.loc[test_idx, 'Cast_Generated']
})

# Add context information
comparison_df['Director'] = data_test.loc[test_idx, 'Director']
comparison_df['Genre'] = data_test.loc[test_idx, 'Genre']

# 4. Display results and statistics
print("Number of test cases:", len(test_idx))
print("\nSome comparison examples:")
print(comparison_df.head())

# 5. Evaluate accuracy
def calculate_cast_overlap(row):
    """Calculate overlap between original and generated cast"""
    original = set(process_cast(row['Original_Cast']))
    generated = set(process_cast(row['Generated_Cast']))
    
    if len(original) == 0 or len(generated) == 0:
        return 0
    
    overlap = len(original.intersection(generated))
    return overlap / max(len(original), len(generated))

# Calculate average accuracy
comparison_df['Accuracy'] = comparison_df.apply(calculate_cast_overlap, axis=1)

print("\nAccuracy statistics:")
print(f"Average accuracy: {comparison_df['Accuracy'].mean():.2%}")
print(f"Highest accuracy: {comparison_df['Accuracy'].max():.2%}")
print(f"Lowest accuracy: {comparison_df['Accuracy'].min():.2%}")

Number of test cases: 3551

Some comparison examples:
                                          Original_Cast  \
1497  ['Matthew McConaughey', 'Gugu Mbatha-Raw', 'Ma...   
3139  ['Richard Attenborough', 'Hermione Baddeley', ...   
2879  ['Tye Sheridan', 'Logan Miller', 'Joey Morgan'...   
3171  ['Rob Lowe', 'Demi Moore', 'Andrew McCarthy', ...   
1233  ['Bruce Willis', 'Danny Aiello', 'Andie MacDow...   

                                         Generated_Cast  \
1497  ['Elizabeth Banks', 'Matthew McConaughey', 'Gu...   
3139  ['Richard Attenborough', 'Hermione Baddeley', ...   
2879  ['Jessica Rothe', 'Israel Broussard', 'Tye She...   
3171      ['Tommy Lee Jones', 'Rob Lowe', 'Demi Moore']   
1233  ['Bruce Willis', 'Danny Aiello', 'Andie MacDow...   

                    Director                                     Genre  
1497           ['Gary Ross']  ['History', 'Drama', 'Biography', 'War']  
3139       ['John Boulting']                        ['Crime', 'Drama']  
2879  ['Christoph

In [5]:
# Save index of rows with missing Cast before filling
missing_cast_idx = data_tmp_2[data_tmp_2['Cast'].isnull()].index

# 3. Apply missing values filling
data_tmp_2['Cast'] = data_tmp_2.apply(lambda row: fill_cast(row, data_tmp_2), axis=1)

print("Number of missing values in Cast after Preprocess:", data_tmp_2['Cast'].isnull().sum())

Number of missing values in Cast after Preprocess: 0


<h3>Director Column<h3>

In [6]:
# 1. First look at the distribution of Director
print("Number of missing values in Director:", data_tmp_2['Director'].isnull().sum())

def get_similar_movies_for_director(row, data, n_similar=5):
    """Find similar movies based on Cast and Genre"""
    if pd.isna(row['Cast']) and pd.isna(row['Genre']):
        return data.sample(n=n_similar)
    
    mask = pd.Series(True, index=data.index)
    
    # Use filled Cast
    if not pd.isna(row['Cast']):
        # Get list of actors
        row_cast = set(process_cast(row['Cast']))
        # Find movies with at least 1 matching actor
        cast_mask = data['Cast'].apply(lambda x: bool(row_cast.intersection(set(process_cast(x)))))
        mask &= cast_mask
    
    if not pd.isna(row['Genre']):
        mask &= (data['Genre'] == row['Genre'])
    
    similar_movies = data[mask]
    
    # If not enough similar movies, expand by Genre
    if len(similar_movies) < n_similar and not pd.isna(row['Genre']):
        genre_movies = data[data['Genre'] == row['Genre']]
        similar_movies = pd.concat([similar_movies, genre_movies]).drop_duplicates()
    
    return similar_movies.head(n_similar)

def fill_director(row, data):
    """Fill missing Director based on similar movies"""
    if pd.isna(row['Director']):
        similar_movies = get_similar_movies_for_director(row, data)
        # Get directors from similar movies
        directors = []
        for _, movie in similar_movies.iterrows():
            if not pd.isna(movie['Director']):
                directors.append(movie['Director'])
        
        # Get most frequent director
        if directors:
            most_common = Counter(directors).most_common(1)
            return most_common[0][0]
        return 'Unknown Director'
    return row['Director']

Number of missing values in Director: 35


In [7]:
# 1. Create test set
data_test = data_tmp_2.copy()

# Randomly select 30% of rows with non-null Director for testing
director_not_null = data_test[data_test['Director'].notna()].index
test_idx, _ = train_test_split(director_not_null, test_size=0.3, random_state=42)

# Create Director_Original column to store original values
data_test['Director_Original'] = data_test['Director']

# Set null for selected test rows
data_test.loc[test_idx, 'Director'] = np.nan

# 2. Apply missing values imputation method
data_test['Director_Generated'] = data_test.apply(lambda row: fill_director(row, raw_data), axis=1)

# 3. Compare results
comparison_df = pd.DataFrame({
    'Original_Director': data_test.loc[test_idx, 'Director_Original'],
    'Generated_Director': data_test.loc[test_idx, 'Director_Generated'],
    'Cast': data_test.loc[test_idx, 'Cast'],
    'Genre': data_test.loc[test_idx, 'Genre']
})

# 4. Display results and statistics
print("Number of test cases:", len(test_idx))
print("\nSome comparison examples:")
print(comparison_df.head())

# 5. Evaluate accuracy
def calculate_director_accuracy(row):
    """Calculate accuracy of Director imputation"""
    if row['Original_Director'] == row['Generated_Director']:
        return 1.0
    return 0.0

# Calculate average accuracy
comparison_df['Accuracy'] = comparison_df.apply(calculate_director_accuracy, axis=1)

print("\nAccuracy statistics:")
print(f"Average accuracy: {comparison_df['Accuracy'].mean():.2%}")
print(f"Number of correct predictions: {(comparison_df['Accuracy'] == 1.0).sum()}")
print(f"Number of incorrect predictions: {(comparison_df['Accuracy'] == 0.0).sum()}")
print(f"Number of Unknown Director cases: {(comparison_df['Generated_Director'] == '[Unknown Director]').sum()}")

Number of test cases: 3549

Some comparison examples:
         Original_Director     Generated_Director  \
4582     ['Matthew Watts']       ['Billy Wilder']   
1496   ['Graeme Campbell']  ['Richard Schenkman']   
2650       ['Alan Parker']        ['Alan Parker']   
2878    ['Rodman Flender']     ['Rodman Flender']   
4726  ['Darren Aronofsky']   ['Darren Aronofsky']   

                                                   Cast  \
4582  ['Caitlin FitzGerald', 'Cheyenne Jackson', 'Pe...   
1496  ['Timothy Busfield', 'Jennifer Dale', 'Dan Pet...   
2650  ['Irene Cara', 'Eddie Barth', 'Lee Curreri', '...   
2878  ['Devon Sawa', 'Seth Green', 'Elden Henson', '...   
4726  ['Sean Gullette', 'Mark Margolis', 'Ben Shenkm...   

                                          Genre  
4582                                 ['Comedy']  
1496                                 ['Sci-Fi']  
2650                                ['Musical']  
2878                                 ['Horror']  
4726  ['Mystery & Thri

In [8]:
# Lưu index của các row có Director missing
missing_director_idx = data_tmp_2[data_tmp_2['Director'].isnull()].index

# Áp dụng fill_director
data_tmp_2['Director'] = data_tmp_2.apply(lambda row: fill_director(row, data_tmp_2), axis=1)

# In số lượng missing values sau khi điền
print("Number of missing values in Director after Preprocess:", data_tmp_2['Director'].isnull().sum())

Number of missing values in Director after Preprocess: 0


<h3>Genre Column<h3>

In [9]:
# 1. First, let's look at the distribution of Genre data
print("Number of missing values in Genre:", data_tmp_2['Genre'].isnull().sum())

# 2. Create helper functions to process Genre
def process_genre(genre_string):
    """Normalize Genre string"""
    if pd.isna(genre_string):
        return []
    # Handle incorrect format cases
    genre_string = str(genre_string)
    genre_string = genre_string.replace('[', '').replace(']', '')  # Remove square brackets
    # Split genres and remove extra whitespace
    return [genre.strip() for genre in genre_string.split(',')]

def get_similar_movies_for_genre(row, data, n_similar=5):
    """Find similar movies based on Cast and Director"""
    if pd.isna(row['Cast']) and pd.isna(row['Director']):
        return data.sample(n=n_similar)
    
    mask = pd.Series(True, index=data.index)
    
    # Filter by Cast if available
    if not pd.isna(row['Cast']):
        mask &= (data['Cast'] == row['Cast'])
    
    # Filter by Director if available
    if not pd.isna(row['Director']):
        mask &= (data['Director'] == row['Director'])
    
    similar_movies = data[mask]
    
    # If not enough similar movies, get more movies with same Director
    if len(similar_movies) < n_similar and not pd.isna(row['Director']):
        director_movies = data[data['Director'] == row['Director']]
        similar_movies = pd.concat([similar_movies, director_movies]).drop_duplicates()
    
    return similar_movies.head(n_similar)


def fill_genre(row, data):
    """Fill missing Genre based on similar movies"""
    if pd.isna(row['Genre']):
        similar_movies = get_similar_movies_for_genre(row, data)
        # Get all genres from similar movies
        all_genres = []
        for _, movie in similar_movies.iterrows():
            if not pd.isna(movie['Genre']):
                all_genres.extend(process_genre(movie['Genre']))
        
        # Get the most frequent genres
        if all_genres:
            most_common = Counter(all_genres).most_common(2)  # Lấy 2 thể loại phổ biến nhất
            # Create string list with correct format
            genres = [genre for genre, _ in most_common]
            return f"[{', '.join(f'{genre}' for genre in genres)}]"
        return 'Unknown Genre'
    return row['Genre']

Number of missing values in Genre: 46


In [10]:
# 1. Tạo tập test
data_test = data_tmp_2.copy()

# Chọn ngẫu nhiên 30% rows có Genre không phải null để test
genre_not_null = data_test[data_test['Genre'].notna()].index
test_idx, _ = train_test_split(genre_not_null, test_size=0.3, random_state=42)

# Tạo cột Genre_Original để lưu giá trị gốc
data_test['Genre_Original'] = data_test['Genre']

# Set null cho các rows được chọn để test
data_test.loc[test_idx, 'Genre'] = np.nan

# 2. Áp dụng phương pháp điền missing values
data_test['Genre_Generated'] = data_test.apply(lambda row: fill_genre(row, raw_data), axis=1)

# 3. So sánh kết quả
comparison_df = pd.DataFrame({
    'Original_Genre': data_test.loc[test_idx, 'Genre_Original'],
    'Generated_Genre': data_test.loc[test_idx, 'Genre_Generated'],
    'Cast': data_test.loc[test_idx, 'Cast'],
    'Director': data_test.loc[test_idx, 'Director']
})

# 4. Hiển thị kết quả và thống kê
print("Số lượng test cases:", len(test_idx))
print("\nMột vài ví dụ so sánh:")
print(comparison_df.head())

# 5. Đánh giá độ chính xác
def calculate_genre_overlap(row):
    """Tính độ chính xác của Genre imputation dựa trên overlap"""
    original = set(process_genre(row['Original_Genre']))
    generated = set(process_genre(row['Generated_Genre']))
    
    if len(original) == 0 or len(generated) == 0:
        return 0
    
    overlap = len(original.intersection(generated))
    return overlap / max(len(original), len(generated))

# Tính độ chính xác trung bình
comparison_df['Accuracy'] = comparison_df.apply(calculate_genre_overlap, axis=1)

print("\nThống kê độ chính xác:")
print(f"Độ chính xác trung bình: {comparison_df['Accuracy'].mean():.2%}")
print(f"Độ chính xác cao nhất: {comparison_df['Accuracy'].max():.2%}")
print(f"Độ chính xác thấp nhất: {comparison_df['Accuracy'].min():.2%}")
print(f"Số cases là Unknown Genre: {(comparison_df['Generated_Genre'] == '[Unknown Genre]').sum()}")

Số lượng test cases: 3542

Một vài ví dụ so sánh:
                      Original_Genre         Generated_Genre  \
2262            ['Comedy', 'Action']    ['Comedy', 'Action']   
2482  ['Horror', 'Sci-Fi', 'Action']    ['Horror', 'Sci-Fi']   
2883          ['Mystery & Thriller']  ['Mystery & Thriller']   
2417             ['Action', 'Drama']     ['Action', 'Drama']   
1490             ['Comedy', 'Drama']     ['Comedy', 'Drama']   

                                                   Cast  \
2262  ['Michael Peña', 'Dax Shepard', "Vincent D'Ono...   
2482  ['Ethan Hawke', 'Willem Dafoe', 'Claudia Karva...   
2883  ['Clark Gable', 'Gene Tierney', 'Bernard Miles...   
2417           ['Dev', 'Rukmini Maitra', 'Chandan Sen']   
1490  ['Cezmi Baskin', 'Özgü Namal', 'Sirri Sureyya ...   

                     Director  
2262          ['Dax Shepard']  
2482        ['Peter Spierig']  
2883         ['Delmer Daves']  
2417          ['Raja Chanda']  
1490  ['Sirri Sureyya Onder']  

Thống kê độ chính

In [11]:
# Save index of rows with missing Genre before filling
missing_genre_idx = data_tmp_2[data_tmp_2['Genre'].isnull()].index

# Apply missing values filling
data_tmp_2['Genre'] = data_tmp_2.apply(lambda row: fill_genre(row, data_tmp_2), axis=1)

print("Number of missing values in Genre after Preprocess:", data_tmp_2['Genre'].isnull().sum())

Number of missing values in Genre after Preprocess: 0
