<a href="https://colab.research.google.com/github/yasuke123/DAAN_888/blob/main/DAAN888_cleaned.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Download the two Kaggle datasets used by this notebook as zip archives:
# the TMDB movie table and the scraped Letterboxd/Metacritic review tables.
# NOTE(review): the Kaggle CLI presumably needs API credentials configured
# in the runtime beforehand — confirm before a fresh Run All.
!kaggle datasets download -d asaniczka/tmdb-movies-dataset-2023-930k-movies
!kaggle datasets download -d joyshil0599/movie-reviews-dataset-10k-scraped-data

Dataset URL: https://www.kaggle.com/datasets/asaniczka/tmdb-movies-dataset-2023-930k-movies
License(s): ODC Attribution License (ODC-By)
Downloading tmdb-movies-dataset-2023-930k-movies.zip to /content
 99% 208M/209M [00:06<00:00, 38.7MB/s]
100% 209M/209M [00:06<00:00, 32.6MB/s]
Dataset URL: https://www.kaggle.com/datasets/joyshil0599/movie-reviews-dataset-10k-scraped-data
License(s): CC0-1.0
Downloading movie-reviews-dataset-10k-scraped-data.zip to /content
100% 2.43M/2.43M [00:00<00:00, 4.56MB/s]
100% 2.43M/2.43M [00:00<00:00, 3.93MB/s]


In [2]:
!unzip tmdb-movies-dataset-2023-930k-movies.zip
!unzip movie-reviews-dataset-10k-scraped-data.zip

Archive:  tmdb-movies-dataset-2023-930k-movies.zip
  inflating: TMDB_movie_dataset_v11.csv  
Archive:  movie-reviews-dataset-10k-scraped-data.zip
  inflating: letterboxd-reviews.csv  
  inflating: metacritic-reviews.csv  


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re
warnings.filterwarnings("ignore")

In [4]:
import os
import pandas as pd

# Detect whether the notebook runs inside Google Colab: the `google.colab`
# package is importable there, and the import fails elsewhere.
try:
    import google.colab  # noqa: F401 -- imported only as an environment probe
    in_colab = True
except ImportError:
    in_colab = False

# Directory holding the CSV files: "/content/" in Colab, otherwise a local
# 'data' subfolder relative to the working directory.
data_dir = "/content/" if in_colab else "./data/"

# Full paths of the three input datasets.
tmdb_file = os.path.join(data_dir, "TMDB_movie_dataset_v11.csv")
metacritic_file = os.path.join(data_dir, "metacritic-reviews.csv")
letterboxd_file = os.path.join(data_dir, "letterboxd-reviews.csv")

# Fail before any loading starts, but report *every* missing file at once so
# the user does not have to fix them one re-run at a time.
missing_files = [
    path
    for path in (tmdb_file, metacritic_file, letterboxd_file)
    if not os.path.exists(path)
]
if missing_files:
    for path in missing_files:
        print(f"Error: File not found at {path}")
    print("Please ensure the following datasets are placed in the correct directory:")
    print("- TMDB_movie_dataset_v11.csv")
    print("- metacritic-reviews.csv")
    print("- letterboxd-reviews.csv")
    if in_colab:
        print("In Colab, upload the files to '/content/' or adjust 'data_dir' in the code.")
    else:
        print("Locally, place the files in a 'data' subfolder or adjust 'data_dir' in the code.")
    raise FileNotFoundError("Missing file(s): " + ", ".join(missing_files))



In [5]:
# Read the three source tables into DataFrames.
tmdb_df = pd.read_csv(tmdb_file, encoding="utf-8")

# Both review files are decoded as ISO-8859-1; rows of the Metacritic file
# that pandas cannot parse are skipped instead of raising.
# NOTE(review): if these files are really UTF-8, decoding them as ISO-8859-1
# turns non-ASCII characters into multi-character sequences (the star-rating
# parser later in this notebook matches such sequences) — confirm the files'
# true encoding.
legacy_encoding = "ISO-8859-1"
metacritic_df = pd.read_csv(
    metacritic_file,
    on_bad_lines="skip",
    encoding=legacy_encoding,
)
letterboxd_df = pd.read_csv(letterboxd_file, encoding=legacy_encoding)

print("Datasets loaded successfully!")

Datasets loaded successfully!


In [6]:
# Give the TMDB title column the same name as the join key used by the two
# review tables.
tmdb_df.rename(columns={'title': 'Movie name'}, inplace=True)

# Inner-join TMDB -> Metacritic -> Letterboxd on the movie title, then remove
# rows that are exact duplicates across all columns.
# NOTE(review): the join key is the title only, so every row sharing a title
# is paired with every other across the tables — confirm that this
# many-to-many expansion is intended.
merged_df = (
    tmdb_df
    .merge(metacritic_df, on="Movie name", how="inner")
    .merge(letterboxd_df, on="Movie name", how="inner")
    .drop_duplicates()
)

# Quick summary of the merged result.
print(f"Total Records (Rows): {merged_df.shape[0]}")
print(f"Total Columns: {merged_df.shape[1]}\n")
print("Column Headers:", list(merged_df.columns), "\n")
print(merged_df.head())







Total Records (Rows): 18406
Total Columns: 36

Column Headers: ['id', 'Movie name', 'vote_average', 'vote_count', 'status', 'release_date', 'revenue', 'runtime', 'adult', 'backdrop_path', 'budget', 'homepage', 'imdb_id', 'original_language', 'original_title', 'overview', 'popularity', 'poster_path', 'tagline', 'genres', 'production_companies', 'production_countries', 'spoken_languages', 'keywords', 'Release Date', 'Rating_x', 'summary', 'User rating', 'Website rating', 'Release Year', 'Rating_y', 'Reviewer name', 'Review date', 'Review', 'Comment count', 'Like count'] 

       id    Movie name  vote_average  vote_count    status release_date  \
0  157336  Interstellar         8.417       32571  Released   2014-11-05   
1  157336  Interstellar         8.417       32571  Released   2014-11-05   
2  157336  Interstellar         8.417       32571  Released   2014-11-05   
3  157336  Interstellar         8.417       32571  Released   2014-11-05   
4  157336  Interstellar         8.417      

In [7]:
# Explore the merged frame: schema, first rows, summary statistics, null
# counts, remaining duplicates, and the distinct values of two flag columns.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
merged_df.info()
print("\n")

print(merged_df.head())
print("\n")

print(merged_df.describe())
print("\n")

print(merged_df.isna().sum())
print("\n")

# Count of fully duplicated rows still present after the merge.
duplicates = merged_df.duplicated().sum()
print(f"5. Duplicate Rows Remaining: {duplicates}\n")

if 'status' in merged_df.columns:
    print("Unique 'status' Values:", merged_df['status'].unique())
    print()

if 'adult' in merged_df.columns:
    print("Unique 'adult' Values:", merged_df['adult'].unique())
    print()



def parse_star_rating(star_str):
    if not isinstance(star_str, str):
        return None
    star_str = star_str.strip()
    full_stars = star_str.count("â??")
    half_star = 0.5 if "â½" in star_str else 0.0
    return full_stars + half_star

# Replace the raw star-glyph rating column with a numeric one. Guarded so that
# re-running this cell after "Rating_y" has already been dropped is a no-op
# instead of a KeyError.
if "Rating_y" in merged_df.columns:
    merged_df["Rating_y_cleaned"] = merged_df["Rating_y"].apply(parse_star_rating)
    merged_df.drop(columns="Rating_y", inplace=True)

# Convert dates; values that cannot be parsed become NaT (errors="coerce").
date_cols = ["Release Date", "Review date"]
for col in date_cols:
    if col in merged_df.columns:
        merged_df[col] = pd.to_datetime(merged_df[col], errors="coerce")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18406 entries, 0 to 18405
Data columns (total 36 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    18406 non-null  int64  
 1   Movie name            18406 non-null  object 
 2   vote_average          18406 non-null  float64
 3   vote_count            18406 non-null  int64  
 4   status                18406 non-null  object 
 5   release_date          16198 non-null  object 
 6   revenue               18406 non-null  int64  
 7   runtime               18406 non-null  int64  
 8   adult                 18406 non-null  bool   
 9   backdrop_path         7988 non-null   object 
 10  budget                18406 non-null  int64  
 11  homepage              4837 non-null   object 
 12  imdb_id               12244 non-null  object 
 13  original_language     18406 non-null  object 
 14  original_title        18406 non-null  object 
 15  overview           

In [8]:

# Columns removed before analysis: mostly-missing non-numerical fields,
# identifiers, and columns that repeat information held elsewhere.
cols_to_drop = [
    "homepage",
    "backdrop_path",
    "tagline",
    "poster_path",
    "release_date",
    "imdb_id",
    "keywords",
]

# errors="ignore" already skips labels that are absent from the frame, so no
# separate membership filter is needed.
merged_df.drop(columns=cols_to_drop, inplace=True, errors="ignore")



# Concat digits for like and comment count
def concatenate_all_digits(text):
    """Join every run of digits found in *text* into a single integer.

    Parameters
    ----------
    text : object
        Raw count value, e.g. a string containing digit groups separated by
        other characters. Non-string values are converted with ``str()``,
        except floats (see below).

    Returns
    -------
    int or None
        The integer formed by concatenating all digit runs in order, or
        ``None`` when the value is missing (``pd.isna``), is an infinite
        float, or contains no digits. Float input is truncated to an integer.
    """
    if pd.isna(text):
        return None

    # A float is already numeric. Stringifying it first would split 6608.0
    # into the digit runs "6608" and "0" and return 66080 -- which is what
    # happens when this function is applied again to a column it has already
    # converted and that is stored as float.
    if isinstance(text, (float, np.floating)):
        try:
            return int(text)
        except (OverflowError, ValueError):  # +/-inf has no integer value
            return None

    digit_runs = re.findall(r'\d+', str(text))
    if not digit_runs:
        return None
    return int("".join(digit_runs))  # e.g. ["6", "6", "08"] -> 6608

# Convert the like and comment count columns to integers, skipping any
# column that is not present in the frame.
for count_col in ("Like count", "Comment count"):
    if count_col in merged_df.columns:
        merged_df[count_col] = merged_df[count_col].apply(concatenate_all_digits)


# Check the cleaned frame: schema, null counts, first rows.
# DataFrame.info() prints its own report and returns None, so it is not
# wrapped in print() (that would append a stray "None" line).
merged_df.info()
print(merged_df.isna().sum())
print(merged_df.head())



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18406 entries, 0 to 18405
Data columns (total 29 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   id                    18406 non-null  int64         
 1   Movie name            18406 non-null  object        
 2   vote_average          18406 non-null  float64       
 3   vote_count            18406 non-null  int64         
 4   status                18406 non-null  object        
 5   revenue               18406 non-null  int64         
 6   runtime               18406 non-null  int64         
 7   adult                 18406 non-null  bool          
 8   budget                18406 non-null  int64         
 9   original_language     18406 non-null  object        
 10  original_title        18406 non-null  object        
 11  overview              17157 non-null  object        
 12  popularity            18406 non-null  float64       
 13  genres          

In [9]:
# Write the cleaned, merged table as a UTF-8 CSV (without the index column)
# to a path relative to the current working directory.
cleaned_file = "merged_dataset.csv"
merged_df.to_csv(
    path_or_buf=cleaned_file,
    encoding="utf-8",
    index=False,
)
print(f" saved to: {cleaned_file}")

 saved to: merged_dataset.csv
