In [1]:
import numpy as np
import pandas as pd
import ast
import logging

# Basic logging setup: INFO level, each record prefixed with timestamp and level name
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)


In [2]:
# Read the raw credits CSV into `data`.
# NOTE(review): a later cell reads this same file again as `credits`, and `data` is
# reassigned by the later merge cells - confirm whether this cell is still needed.
logging.info("Loading credits data...")
data = pd.read_csv('datasets/credits.csv')


2025-06-12 22:31:01,923 - INFO - Loading credits data...


In [3]:
# Read the movie metadata CSV into `meta`.
logging.info("Loading metadata...")
meta = pd.read_csv('datasets/movie_metadata.csv')


2025-06-12 22:31:08,558 - INFO - Loading metadata...


In [4]:
# Expose the `title_year` column under the shorter column name `year`.
logging.info("Processing release dates...")
meta['year'] = meta['title_year']


2025-06-12 22:31:11,063 - INFO - Processing release dates...


In [6]:
def validate_json_data(df, columns):
    """
    Check whether the values in the given columns parse as Python literals.

    Every non-empty string is passed to ``ast.literal_eval``; a value counts
    as valid when that call succeeds. A per-column summary is logged at INFO
    level.

    Args:
        df (pandas.DataFrame): DataFrame to inspect.
        columns (list): Names of the columns to validate. Names that are not
            columns of ``df`` are skipped and do not appear in the result.

    Returns:
        dict: Maps each validated column name to a dict holding the counts
            under the keys 'valid', 'invalid', 'empty' and 'total'.
    """
    # Exceptions ast.literal_eval is documented to raise on malformed input.
    # Catching only these keeps KeyboardInterrupt/SystemExit propagating,
    # which the previous bare `except:` swallowed.
    parse_errors = (ValueError, TypeError, SyntaxError, MemoryError, RecursionError)

    results = {}

    for col in columns:
        if col not in df.columns:
            continue

        total = len(df[col])
        valid_json = 0
        invalid_json = 0
        empty_json = 0

        for item in df[col]:
            if isinstance(item, str):
                if item == '':
                    empty_json += 1
                    continue
                try:
                    # Try to parse the string as a Python literal
                    ast.literal_eval(item)
                except parse_errors:
                    invalid_json += 1
                else:
                    valid_json += 1
                continue

            # Non-string values: pd.isna returns an array for list-likes, so
            # only a scalar boolean result can mean "missing".
            missing = pd.isna(item)
            if isinstance(missing, (bool, np.bool_)) and missing:
                empty_json += 1
            else:
                # Not a string, so there is nothing to parse
                invalid_json += 1

        # An empty column has no meaningful rate; report 0 instead of dividing by zero
        validity_rate = (valid_json / total * 100) if total else 0.0

        # Log the results
        logging.info(f"\n{col}:")
        logging.info(f"  Valid JSON: {valid_json}")
        logging.info(f"  Invalid JSON: {invalid_json}")
        logging.info(f"  Empty JSON: {empty_json}")
        logging.info(f"  Validity rate: {validity_rate:.2f}%")

        results[col] = {
            'valid': valid_json,
            'invalid': invalid_json,
            'empty': empty_json,
            'total': total
        }

    return results


In [12]:
# Load the raw data and run the initial processing
logging.info("Loading data files...")
credits = pd.read_csv('datasets/credits.csv')
meta = pd.read_csv('datasets/movie_metadata.csv')

logging.info("=== DATASET INFORMATION ===")
logging.info(f"Credits shape: {credits.shape}")
logging.info(f"Metadata shape: {meta.shape}")

# Derive the release year
logging.info("Processing release dates...")
meta['year'] = meta['title_year']

# Validate the JSON columns of the credits dataset
json_columns = ['cast', 'crew']  # Only cast and crew are validated because genres lives in meta
validation_results = validate_json_data(credits, json_columns)

# Show the distribution of movies per year
logging.info("\n=== YEAR DISTRIBUTION ===")
year_dist = meta['year'].value_counts().sort_index()
display(year_dist)

# Keep only the 2017 movies - same logic as the original
# NOTE(review): the year distribution recorded for this cell ends at 2016, so this
# filter appears to select no rows from this metadata file - confirm the intended input.
logging.info("Filtering 2017 movies...")
new_meta = meta.loc[meta.year == 2017, ['genres', 'movie_title', 'title_year']]

# Merge the datasets
# NOTE(review): the recorded run fails on the merge below with KeyError: 'movie_title'.
# `credits` has only 3 columns and later cells join it on 'id', so it presumably has no
# 'movie_title' column - the join key or the metadata file needs to be reconciled.
logging.info("Merging datasets...")
data = pd.merge(new_meta, credits, on='movie_title')
pd.set_option('display.max_colwidth', 75)

logging.info("\n=== MERGED DATA INFORMATION ===")
logging.info(f"Shape after merge: {data.shape}")
display(data.head())


2025-06-12 22:49:42,511 - INFO - Loading data files...
2025-06-12 22:49:46,012 - INFO - === DATASET INFORMATION ===
2025-06-12 22:49:46,013 - INFO - Credits shape: (45476, 3)
2025-06-12 22:49:46,014 - INFO - Metadata shape: (5043, 28)
2025-06-12 22:49:46,015 - INFO - Processing release dates...
2025-06-12 22:50:07,160 - INFO - 
cast:
2025-06-12 22:50:07,161 - INFO -   Valid JSON: 45476
2025-06-12 22:50:07,162 - INFO -   Invalid JSON: 0
2025-06-12 22:50:07,163 - INFO -   Empty JSON: 0
2025-06-12 22:50:07,164 - INFO -   Validity rate: 100.00%
2025-06-12 22:50:24,003 - INFO - 
crew:
2025-06-12 22:50:24,004 - INFO -   Valid JSON: 45476
2025-06-12 22:50:24,004 - INFO -   Invalid JSON: 0
2025-06-12 22:50:24,004 - INFO -   Empty JSON: 0
2025-06-12 22:50:24,005 - INFO -   Validity rate: 100.00%
2025-06-12 22:50:24,006 - INFO - 
=== YEAR DISTRIBUTION ===


year
1916.0      1
1920.0      1
1925.0      1
1927.0      1
1929.0      2
         ... 
2012.0    221
2013.0    237
2014.0    252
2015.0    226
2016.0    106
Name: count, Length: 91, dtype: int64

2025-06-12 22:50:24,009 - INFO - Filtering 2017 movies...
2025-06-12 22:50:24,012 - INFO - Merging datasets...


KeyError: 'movie_title'

In [None]:
# Derive the release year from `release_date`. `meta` is read with pd.read_csv without
# date parsing, so the column is not datetime-typed and `.dt` would raise; convert first.
# Values that cannot be parsed become NaT (and a missing year) instead of raising.
meta['year'] = pd.to_datetime(meta['release_date'], errors='coerce').dt.year


In [None]:
# Number of rows per `year` value, ordered by year
meta['year'].value_counts().sort_index()


In [None]:
# Getting only 2017 movies as we already have movies up to the year 2016 in preprocessing 1 file.
# We don't have enough data for the movies from 2018, 2019 and 2020.
# We'll deal with it in the upcoming preprocessing files
# `.copy()` gives `new_meta` its own data, so assigning columns on it afterwards does not
# operate on a slice of `meta` (which triggers pandas' SettingWithCopyWarning).
logging.info("Filtering 2017 movies...")
new_meta = meta.loc[meta.year == 2017, ['genres', 'id', 'title', 'year']].copy()


In [None]:
# Cast the movie id column to int.
# NOTE(review): presumably so the ids compare equal to `credits['id']` in the merge on 'id' - confirm
new_meta['id'] = new_meta['id'].astype(int)


In [None]:
# Join the filtered metadata rows with `credits` on the shared 'id' column
# (pd.merge defaults to an inner join, so rows without a match on either side are dropped)
logging.info("Merging datasets...")
data = pd.merge(new_meta, credits, on='id')


In [None]:
# Cap the displayed width of each column at 75 characters, then show the merged frame
pd.set_option('display.max_colwidth', 75)
data


In [None]:
# Convert the string-encoded `genres`, `cast` and `crew` values into Python objects.
# ast.literal_eval evaluates a string containing a Python literal or container display.
logging.info("Parsing JSON data...")
data['genres'] = data['genres'].map(ast.literal_eval)
data['cast'] = data['cast'].map(ast.literal_eval)
data['crew'] = data['crew'].map(ast.literal_eval)


In [None]:
def make_genresList(x):
    """
    Build a space-separated string of genre names.

    Args:
        x: Iterable of dicts; the 'name' entry of each dict is used. The name
            'Science Fiction' is replaced by 'Sci-Fi'. Entries without a
            'name' are skipped.

    Returns:
        str: The genre names joined by single spaces, or ``np.nan`` when no
            name is available.
    """
    names = []
    for genre in x:
        name = genre.get('name')
        if name is None:
            # A missing name would make str.join raise TypeError
            continue
        names.append('Sci-Fi' if name == 'Science Fiction' else name)

    if not names:
        # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0
        return np.nan
    return " ".join(names)


In [None]:
# Collapse each movie's parsed genre entries into one space-separated string
logging.info("Creating genres list...")
data['genres_list'] = data['genres'].map(make_genresList)


In [None]:
def _get_cast_name(x, position):
    """Return the 'name' of the cast entry at index ``position``, or ``np.nan`` if there is no such entry."""
    names = [member.get('name') for member in x]
    if position >= len(names):
        # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0
        return np.nan
    return names[position]


def get_actor1(x):
    """
    Return the name of the first cast entry.

    Args:
        x: Iterable of dicts, each read through its 'name' entry.

    Returns:
        The first entry's name, or ``np.nan`` when ``x`` is empty.
    """
    return _get_cast_name(x, 0)


def get_actor2(x):
    """
    Return the name of the second cast entry.

    Args:
        x: Iterable of dicts, each read through its 'name' entry.

    Returns:
        The second entry's name, or ``np.nan`` when ``x`` has fewer than two entries.
    """
    return _get_cast_name(x, 1)


def get_actor3(x):
    """
    Return the name of the third cast entry.

    Args:
        x: Iterable of dicts, each read through its 'name' entry.

    Returns:
        The third entry's name, or ``np.nan`` when ``x`` has fewer than three entries.
    """
    return _get_cast_name(x, 2)


In [None]:
# Pull the first three cast names of every movie into their own columns
logging.info("Extracting actor information...")
data['actor_1_name'] = data['cast'].map(get_actor1)
data['actor_2_name'] = data['cast'].map(get_actor2)
data['actor_3_name'] = data['cast'].map(get_actor3)


In [None]:
def get_directors(x):
    """
    Build a space-separated string of the directors' names.

    Args:
        x: Iterable of dicts; entries whose 'job' equals 'Director' contribute
            their 'name'. Director entries without a 'name' are skipped.

    Returns:
        str: The director names joined by single spaces, or ``np.nan`` when
            there is none.
    """
    directors = [
        member.get('name')
        for member in x
        if member.get('job') == 'Director' and member.get('name') is not None
    ]
    if not directors:
        # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0
        return np.nan
    return " ".join(directors)


In [None]:
# Reduce each movie's parsed crew entries to its director name(s)
logging.info("Extracting director information...")
data['director_name'] = data['crew'].map(get_directors)


In [None]:
logging.info("Preparing final dataset...")
movie = data.loc[:, ['director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name', 'genres_list', 'title']]
# Keep only movies for which every one of the selected fields is present
movie = movie.dropna(how='any')

# Rename columns
movie = movie.rename(columns={'genres_list': 'genres'})
movie = movie.rename(columns={'title': 'movie_title'})

# Lower-case the movie titles
movie['movie_title'] = movie['movie_title'].str.lower()

# Build the combined-features column
movie['comb'] = movie['actor_1_name'] + ' ' + movie['actor_2_name'] + ' ' + movie['actor_3_name'] + ' ' + movie['director_name'] + ' ' + movie['genres']

# Load the old data and merge
logging.info("Loading and processing old data...")
old = pd.read_csv('data.csv')
old['comb'] = old['actor_1_name'] + ' ' + old['actor_2_name'] + ' ' + old['actor_3_name'] + ' ' + old['director_name'] + ' ' + old['genres']

# Append the newly prepared movies to the old data. Previously `old` was loaded and
# featurized but never used, so only the new movies were written to the output file.
# NOTE(review): a title present in both frames is kept twice - dedupe on 'movie_title'
# here if titles must be unique downstream.
combined = pd.concat([old, movie], ignore_index=True)

# Save the result
logging.info("Saving results...")
combined.to_csv('new_data.csv', index=False)
