In [None]:
# Import required libraries
# Standard library
import json
import logging
import os
import urllib.request
import warnings
from io import StringIO
from typing import List, Dict, Optional, Union

# Third-party
import bs4 as bs
import numpy as np
import pandas as pd
import requests
from tmdbv3api import TMDb, Movie
from tqdm import tqdm

# Configure logging: timestamped INFO-level messages for every pipeline stage.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Configure warnings
# NOTE(review): this silences every warning category for the whole kernel,
# including pandas deprecation notices -- consider narrowing the filter.
warnings.filterwarnings('ignore')

# Configure TMDB
# The key is read from the TMDB_API_KEY environment variable so that it never
# has to be written into (or committed with) the notebook.
_tmdb_api_key = os.environ.get('TMDB_API_KEY', '')
if not _tmdb_api_key:
    logging.warning(
        "TMDB_API_KEY is not set; TMDB genre lookups will fail until it is provided"
    )

tmdb = TMDb()
tmdb.api_key = _tmdb_api_key
tmdb_movie = Movie()

# Helper functions for data validation
def validate_string(s: str) -> bool:
    """Return True only for a ``str`` that is non-empty and not pure whitespace.

    Any falsy value and any non-string value yields False.
    """
    if not s:
        return False
    if not isinstance(s, str):
        return False
    return not s.isspace()

def validate_list(lst: list) -> bool:
    """Return True only for a non-empty ``list`` whose items all pass ``validate_string``."""
    if not lst or not isinstance(lst, list):
        return False
    for item in lst:
        if not validate_string(item):
            return False
    return True

def safe_request(url: str, timeout: float = 30.0) -> Optional[bytes]:
    """
    Fetch a URL and return the raw response body, or None on any failure.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait on the connection before giving up, so a
            stalled server cannot hang the notebook indefinitely.

    Returns:
        The response body as bytes, or None if the request raised. The
        failure is logged rather than propagated.
    """
    try:
        # The context manager closes the response even if read() raises.
        with urllib.request.urlopen(url, timeout=timeout) as response:
            return response.read()
    except Exception as e:
        logging.error(f"Error fetching URL {url}: {str(e)}")
        return None


In [None]:
# Data extraction functions
def get_genre(title: str, timeout: float = 10.0) -> Optional[str]:
    """
    Look up a movie on TMDB and return its genre names.

    The title is searched through ``tmdb_movie``; the first hit's id is then
    used to fetch the movie details, from which the genre names are taken.

    Args:
        title: Movie title to search for.
        timeout: Seconds to wait for the TMDB details request.

    Returns:
        Genre names joined by single spaces, or None when the search has no
        results, the movie has no genres, or any error occurs (errors are
        logged, never raised).
    """
    try:
        result = tmdb_movie.search(title)
        if not result:
            logging.warning(f"No TMDB results found for movie: {title}")
            return None

        movie_id = result[0].id
        # Pass the key through `params` so it is URL-encoded, and always set a
        # timeout: without one a stalled connection blocks the whole apply().
        response = requests.get(
            f'https://api.themoviedb.org/3/movie/{movie_id}',
            params={'api_key': tmdb.api_key},
            timeout=timeout,
        )
        # Surface HTTP failures (bad key, rate limit) as errors instead of
        # letting them be reported as "no genres found".
        response.raise_for_status()
        details = response.json()

        if not details.get('genres'):
            logging.warning(f"No genres found for movie: {title}")
            return None

        return " ".join(genre['name'] for genre in details['genres'])

    except Exception as e:
        logging.error(f"Error getting genres for {title}: {str(e)}")
        return None

def extract_name(text: str, role: str) -> Optional[str]:
    """
    Return the part of ``text`` that precedes the first `` (<role>)`` tag.

    Args:
        text: Credit text to search.
        role: Role label expected inside the parentheses.

    Returns:
        The text before the tag when the tag is present and that text passes
        ``validate_string``; otherwise None. Any exception is logged and
        turned into None.
    """
    try:
        marker = f" ({role})"
        if marker not in text:
            return None
        candidate, _, _ = text.partition(marker)
        if validate_string(candidate):
            return candidate
        return None
    except Exception as e:
        logging.error(f"Error extracting {role} name: {str(e)}")
        return None

def get_director(text: str) -> Optional[str]:
    """Return the first name found by ``extract_name`` across the director role tags, else None."""
    director_roles = ("director", "directors", "director/screenplay")
    # Lazy generator: roles are tried in order and evaluation stops at the first hit.
    candidates = (extract_name(text, role) for role in director_roles)
    return next((name for name in candidates if name), None)

def get_actor(text: str, position: int) -> Optional[str]:
    """
    Extract the actor at a 1-based position from a cast-and-crew string.

    The cast is taken to be the comma-separated list that follows the last
    crew tag, i.e. the text after the final "); ". This also covers rows whose
    last crew tag is not "screenplay" (for example only "(director); ..."),
    which previously leaked the crew prefix into the first actor's name.

    Args:
        text: Text containing actor information.
        position: 1-based position of the actor in the cast list.

    Returns:
        The stripped actor name, or None when the position is below 1, beyond
        the end of the cast list, the name is blank, or an error occurs
        (errors are logged, never raised).
    """
    try:
        # Guard against 0/negative positions, which would otherwise index
        # from the end of the list and return the wrong actor.
        if position < 1:
            return None

        cast = text.rsplit("); ", 1)[-1]
        actors = cast.split(", ")
        if len(actors) < position:
            return None

        name = actors[position - 1].strip()
        # Empty or whitespace-only entries count as "not found".
        return name or None
    except Exception as e:
        logging.error(f"Error extracting actor {position} name: {str(e)}")
        return None


In [None]:
# Data extraction from Wikipedia
logging.info("Starting Wikipedia data extraction...")

# Wikipedia URL for 2020 movies
WIKI_URL = "https://en.wikipedia.org/wiki/List_of_American_films_of_2020"

# Get page content (safe_request returns None on any failure)
source = safe_request(WIKI_URL)
if not source:
    raise RuntimeError("Failed to fetch Wikipedia page")

# Parse HTML and collect every sortable wikitable on the page
soup = bs.BeautifulSoup(source, 'lxml')
tables = soup.find_all('table', class_='wikitable sortable')

if not tables:
    raise RuntimeError("No tables found on Wikipedia page")

logging.info(f"Found {len(tables)} quarterly tables")

# Extract data from all tables; a table that fails to parse is logged and skipped
dfs = []
for i, table in enumerate(tables, 1):
    try:
        # Wrap the markup in StringIO: passing a literal HTML string to
        # read_html is deprecated in recent pandas releases.
        df = pd.read_html(StringIO(str(table)))[0]
        logging.info(f"Extracted {len(df)} movies from Q{i} table")
        dfs.append(df)
    except Exception as e:
        logging.error(f"Error processing Q{i} table: {str(e)}")

# Fail with a clear message instead of pd.concat's "No objects to concatenate"
if not dfs:
    raise RuntimeError("None of the Wikipedia tables could be parsed")

# Combine all tables
df_2020 = pd.concat(dfs, ignore_index=True)
logging.info(f"Total movies extracted: {len(df_2020)}")

# Display sample
print("\nSample of extracted data:")
display(df_2020.head())


In [None]:
# Process and clean the data
logging.info("Starting data processing...")

# Select required columns. The explicit copy makes the column assignments
# below act on an independent frame rather than on a slice of the raw table.
df_2020 = df_2020[['Title', 'Cast and crew']].copy()

# Add genres using TMDB API
logging.info("Fetching genres from TMDB API...")
# progress_apply only exists on Series after tqdm has registered itself with
# pandas; without this call the next line raises AttributeError.
tqdm.pandas(desc="TMDB genres")
df_2020['genres'] = df_2020['Title'].progress_apply(get_genre)

# Extract director and actors
logging.info("Extracting director and actor information...")
df_2020['director_name'] = df_2020['Cast and crew'].apply(get_director)
df_2020['actor_1_name'] = df_2020['Cast and crew'].apply(lambda x: get_actor(x, 1))
df_2020['actor_2_name'] = df_2020['Cast and crew'].apply(lambda x: get_actor(x, 2))
df_2020['actor_3_name'] = df_2020['Cast and crew'].apply(lambda x: get_actor(x, 3))

# Rename title column
df_2020 = df_2020.rename(columns={'Title': 'movie_title'})

# Convert movie titles to lowercase
df_2020['movie_title'] = df_2020['movie_title'].str.lower()

# Create combined features column (space-separated actors, director, genres)
df_2020['comb'] = (df_2020['actor_1_name'] + ' ' + 
                   df_2020['actor_2_name'] + ' ' + 
                   df_2020['actor_3_name'] + ' ' + 
                   df_2020['director_name'] + ' ' + 
                   df_2020['genres'])

# Select final columns
final_cols = ['director_name', 'actor_1_name', 'actor_2_name', 
              'actor_3_name', 'genres', 'movie_title', 'comb']
new_df20 = df_2020[final_cols]

# Data quality checks
logging.info("\nData quality check:")
logging.info(f"Total rows: {len(new_df20)}")
logging.info("\nMissing values:")
display(new_df20.isna().sum())

# Drop rows with missing values
new_df20 = new_df20.dropna(how='any')
logging.info(f"\nRows after dropping missing values: {len(new_df20)}")

# Display sample of processed data
print("\nSample of processed data:")
display(new_df20.head())


In [None]:
# Merge the 2020 movies into the existing dataset, if one is present on disk
logging.info("Starting dataset merge...")

try:
    # Existing dataset; a missing file is handled by the FileNotFoundError branch
    old_df = pd.read_csv('final_data.csv')
    logging.info(f"Loaded existing dataset with {len(old_df)} rows")

    # Count titles that repeat anywhere in the stacked title columns
    all_titles = pd.concat([old_df['movie_title'], new_df20['movie_title']])
    duplicates = all_titles.duplicated()
    duplicate_count = sum(duplicates)
    if duplicate_count > 0:
        logging.warning(f"Found {duplicate_count} duplicate movies")

    # Stack both datasets, then keep the first row seen for each title
    final_df = pd.concat([old_df, new_df20], ignore_index=True).drop_duplicates(
        subset=['movie_title'], keep='first'
    )

    logging.info(f"Final dataset shape: {final_df.shape}")

    # Summary figures for the merged dataset
    actor_columns = ('actor_1_name', 'actor_2_name', 'actor_3_name')
    unique_actors = set().union(*(final_df[column].tolist() for column in actor_columns))
    genre_tokens = set(' '.join(final_df['genres'].fillna('')).split())

    print("\nFinal dataset summary:")
    print(f"Total movies: {len(final_df)}")
    print(f"Unique directors: {final_df['director_name'].nunique()}")
    print(f"Unique actors: {len(unique_actors)}")
    print(f"Unique genres: {len(genre_tokens)}")

    # Persist the merged dataset
    final_df.to_csv('main_data.csv', index=False)
    logging.info("Successfully saved final dataset to main_data.csv")

except FileNotFoundError:
    # No previous dataset: the 2020 rows become the whole output
    logging.warning("No existing dataset found, saving only 2020 data")
    new_df20.to_csv('main_data.csv', index=False)
    logging.info("Successfully saved 2020 dataset to main_data.csv")

except Exception as e:
    # Log for context, then re-raise so the cell still fails visibly
    logging.error(f"Error during dataset merge: {str(e)}")
    raise


In [None]:
# Import the required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Ignore every warning category from here on (this also hides deprecation notices).
warnings.filterwarnings('ignore')
# Select the 'seaborn-v0_8' matplotlib style sheet for the figures drawn in later cells.
plt.style.use('seaborn-v0_8')


In [None]:
# Load the previously processed datasets
data_old = pd.read_csv('data.csv')
data_2017 = pd.read_csv('data_2017_integrated.csv')

# Report the size of each frame
for shape_line in (
    f"Old Dataset shape: {data_old.shape}",
    f"2017 Dataset shape: {data_2017.shape}",
):
    print(shape_line)

# Show the first three rows of each frame
for heading, frame in (
    ("\nSample old data:", data_old),
    ("\nSample 2017 data:", data_2017),
):
    print(heading)
    display(frame.head(3))


In [None]:
# Stack the two datasets into one frame with a fresh 0..n-1 index
data = pd.concat([data_old, data_2017], ignore_index=True)
print(f"Combined Dataset shape: {data.shape}")

# Flag every row whose title already appeared earlier in the frame
duplicate_mask = data.duplicated(subset=['movie_title'])
duplicates = duplicate_mask.sum()
print(f"\nDuplicates found: {duplicates}")

# Keep only the first occurrence of each title.
# Note: the surviving rows keep their original index labels (gaps are not renumbered).
if duplicates > 0:
    data = data[~duplicate_mask]
    print(f"Shape after removing duplicates: {data.shape}")

# Display sample of combined data
print("\nSample combined data:")
display(data.head())


In [None]:
# TF-IDF vectorization of the genres column
tfidf = TfidfVectorizer(stop_words='english')
# Missing genres are treated as empty documents; NaN would make the vectorizer raise.
genres_tfidf = tfidf.fit_transform(data['genres'].fillna(''))
# Reuse data's index: after duplicate removal its labels have gaps, and a
# default 0..n-1 index here would make the column-wise concat misalign rows
# (producing extra, half-empty rows).
genres_df = pd.DataFrame(
    genres_tfidf.toarray(),
    columns=tfidf.get_feature_names_out(),
    index=data.index,
)
data = pd.concat([data, genres_df], axis=1)

# Label encoding for the categorical name columns.
# Each column gets its own encoder, kept in label_encoders, so every mapping
# can still be inverted later; a single reused encoder only remembers the last
# column it was fitted on.
encoded_columns = {
    'director_name': 'director_encoded',
    'actor_1_name': 'actor1_encoded',
    'actor_2_name': 'actor2_encoded',
    'actor_3_name': 'actor3_encoded',
}
label_encoders = {}
for source_col, encoded_col in encoded_columns.items():
    le = LabelEncoder()
    data[encoded_col] = le.fit_transform(data[source_col])
    label_encoders[source_col] = le

print("Features after engineering:")
print("\nGenre features:", genres_df.columns.tolist())
print("\nEncoded features:", ['director_encoded', 'actor1_encoded', 'actor2_encoded', 'actor3_encoded'])


In [None]:
def optimize_memory(df, allow_float16=False):
    """
    Shrink numeric columns of a DataFrame to the smallest dtype that holds them.

    Only plain NumPy signed-integer, unsigned-integer and float columns are
    touched. Everything else (object, bool, datetime, category and other
    extension dtypes) is left exactly as it is. Columns are never widened.

    Args:
        df: DataFrame to optimize. It is modified in place.
        allow_float16: When True, float columns may be reduced to float16 if
            their range fits. Off by default because float16 keeps only about
            three significant decimal digits, which silently corrupts most
            real-valued features.

    Returns:
        The same DataFrame object, with narrowed column dtypes.
    """
    integer_ladders = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
    }
    float_ladder = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        current = df[col].dtype

        # Extension dtypes (category, nullable integers, strings, ...) are skipped.
        if not isinstance(current, np.dtype):
            continue

        if current.kind in integer_ladders:
            candidates, limits_of = integer_ladders[current.kind], np.iinfo
        elif current.kind == 'f':
            candidates, limits_of = float_ladder, np.finfo
        else:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        for candidate in candidates:
            # Candidates are ordered by size; stop once they no longer save memory.
            if np.dtype(candidate).itemsize >= current.itemsize:
                break
            limits = limits_of(candidate)
            # Inclusive bounds: the extreme representable values do fit.
            # NaN min/max (empty or all-NaN column) fail both comparisons,
            # so such a column keeps its dtype.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    return df

# Report the frame's deep memory footprint before and after dtype optimization
bytes_per_mb = 1024**2

memory_before_mb = data.memory_usage(deep=True).sum() / bytes_per_mb
print("Memory usage before optimization:", memory_before_mb, "MB")

data = optimize_memory(data)

memory_after_mb = data.memory_usage(deep=True).sum() / bytes_per_mb
print("Memory usage after optimization:", memory_after_mb, "MB")


In [None]:
def quality_assurance(df):
    """
    Print a final data-quality report for a DataFrame.

    The report covers missing values, fully duplicated rows, column dtypes,
    deep memory usage, descriptive statistics of the numeric columns and, for
    each known text column present, the number of empty strings, the number
    of unique values and a few sampled values.

    Args:
        df: DataFrame to inspect. It is not modified.

    Returns:
        True once the report has been printed.
    """
    print("=== FINAL QUALITY ASSURANCE ===")

    row_count = len(df)

    # Check missing values
    missing = df.isnull().sum()
    print("\nMissing values:")
    print(missing[missing > 0] if missing.any() else "No missing values")

    # Check duplicates (guard the percentage against an empty frame)
    duplicates = df.duplicated().sum()
    duplicate_pct = duplicates / row_count * 100 if row_count else 0.0
    print(f"\nDuplicates: {duplicates} ({duplicate_pct:.2f}%)")

    # Check data types
    print("\nData types:")
    print(df.dtypes)

    # Check memory usage
    print(f"\nMemory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    # Check feature distributions; describe() raises on a frame with no columns
    print("\nFeature statistics:")
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        display(df[numeric_cols].describe())
    else:
        print("No numeric columns")

    # Check text data quality
    print("\nText data quality:")
    text_cols = ['movie_title', 'director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name', 'genres']
    # sample() raises when asked for more rows than exist, so cap the sample size
    sample_size = min(3, row_count)
    for col in text_cols:
        if col in df.columns:
            column = df[col]
            # Accept both object columns and pandas' dedicated string dtypes
            is_text = column.dtype == object or pd.api.types.is_string_dtype(column.dtype)
            empty = int((column.str.len() == 0).sum()) if is_text else 0
            print(f"\n{col}:")
            print(f"  - Empty strings: {empty}")
            print(f"  - Unique values: {column.nunique()}")
            print(f"  - Sample values: {column.sample(sample_size).tolist()}")

    return True

# Print the quality report for the engineered dataset (the call returns True).
quality_assurance(data)


In [None]:
# Visualize numeric feature distributions
numeric_cols = data.select_dtypes(include=[np.number]).columns[:6]  # first six numeric columns, for demo purposes
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

# One histogram per selected column, each on its own panel of the 2x3 grid
for panel, column in zip(axes, numeric_cols):
    sns.histplot(data=data, x=column, ax=panel)
    panel.set_title(f'Distribution of {column}')
    panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Visualize correlations between the same columns
plt.figure(figsize=(12, 8))
correlations = data[numeric_cols].corr()
sns.heatmap(correlations, annot=True, cmap='coolwarm', center=0)
plt.title('Feature Correlations')
plt.show()

# Top directors and lead actors: ten most frequent values of each column
top_ten_charts = (
    ('director_name', 'Top 10 Directors'),
    ('actor_1_name', 'Top 10 Lead Actors'),
)
for name_column, chart_title in top_ten_charts:
    plt.figure(figsize=(12, 6))
    data[name_column].value_counts().head(10).plot(kind='barh')
    plt.title(chart_title)
    plt.xlabel('Number of Movies')
    plt.show()


In [None]:
# Save the processed data
# NOTE(review): 'main_data.csv' is also the output path of the dataset-merge
# cell earlier in this notebook -- confirm that overwriting it here is intended.
data.to_csv('main_data.csv', index=False)
print("Final data saved successfully!")

# Show closing information about the saved frame
final_memory_mb = data.memory_usage(deep=True).sum() / 1024**2
dtype_counts = data.dtypes.value_counts()

print("\nFinal dataset information:")
print(f"Shape: {data.shape}")
print(f"Memory usage: {final_memory_mb:.2f} MB")
print(f"Number of features: {len(data.columns)}")
print("\nFeature types:")
print(dtype_counts)
