In [27]:
import os
import tarfile
import requests
import pandas as pd
from pathlib import Path
from typing import Optional
from pydantic import BaseModel, ConfigDict


class MovieDataset(BaseModel):
    """Download, extract, and load the CMU Movie Dataset.

    Instantiating the class performs all of the work: it creates the download
    directory, fetches the archive if it is not already on disk, unpacks it if
    the extracted directory is missing, and reads the three data files into
    pandas DataFrames (``movie_metadata``, ``plot_summaries``,
    ``character_metadata``). A table whose file is missing is set to ``None``.
    """

    # Remote location and name of the archive.
    base_url: str = "http://www.cs.cmu.edu/~ark/personas/data/"
    dataset_filename: str = "MovieSummaries.tar.gz"

    # Local layout. The derived paths below are computed once, at class
    # definition time, from the default ``download_dir``.
    download_dir: Path = Path("downloads")
    extracted_dir: Path = download_dir / "MovieSummaries"
    dataset_path: Path = download_dir / dataset_filename

    # Paths to the individual dataset files inside the extracted directory.
    movie_metadata_path: Path = extracted_dir / "movie.metadata.tsv"
    plot_summaries_path: Path = extracted_dir / "plot_summaries.txt"
    character_metadata_path: Path = extracted_dir / "character.metadata.tsv"

    # Loaded tables; populated by __init__, None when the file is missing.
    movie_metadata: Optional[pd.DataFrame] = None
    plot_summaries: Optional[pd.DataFrame] = None
    character_metadata: Optional[pd.DataFrame] = None

    model_config = ConfigDict(arbitrary_types_allowed=True)  # Allow DataFrame types

    def __init__(self):
        """Initialize the model, then download, extract, and load the data.

        Raises:
            requests.RequestException: If the archive has to be downloaded and
                the request fails or times out.
            tarfile.TarError: If the archive cannot be read or extracted.
        """
        super().__init__()

        # parents=True so a nested download directory can also be created.
        self.download_dir.mkdir(parents=True, exist_ok=True)

        # Download dataset if not already present
        if not self.dataset_path.exists():
            self.download_dataset()

        # Extract dataset if not already extracted
        if not self.extracted_dir.exists():
            self.extract_dataset()

        # Load the three tables. object.__setattr__ writes the attribute
        # directly, bypassing pydantic's own __setattr__ hook.
        for attr_name, source_path in (
            ("movie_metadata", self.movie_metadata_path),
            ("plot_summaries", self.plot_summaries_path),
            ("character_metadata", self.character_metadata_path),
        ):
            object.__setattr__(self, attr_name, self.load_dataset(source_path, sep="\t"))

    def download_dataset(self, timeout: float = 30.0):
        """Download the archive to ``dataset_path``.

        The body is streamed into a sibling ``*.part`` file and only moved to
        ``dataset_path`` once the transfer has finished, so an interrupted
        download never leaves a truncated archive that later runs would
        mistake for a complete one.

        Args:
            timeout: Seconds to wait for the connection and for each chunk of
                the response before giving up (passed to ``requests.get``).

        Raises:
            requests.RequestException: On connection errors, timeouts, or a
                non-success HTTP status.
        """
        print(f"Downloading {self.dataset_filename}...")

        url = self.base_url + self.dataset_filename
        self.dataset_path.parent.mkdir(parents=True, exist_ok=True)
        partial_path = self.dataset_path.with_name(self.dataset_path.name + ".part")

        try:
            # The context manager releases the streamed connection even if
            # writing fails part-way through.
            with requests.get(url, stream=True, timeout=timeout) as response:
                response.raise_for_status()
                with open(partial_path, "wb") as file:
                    for chunk in response.iter_content(chunk_size=8192):
                        file.write(chunk)
            # Publish the finished file under its final name.
            partial_path.replace(self.dataset_path)
        finally:
            # After a successful replace the partial file is gone; otherwise
            # remove the incomplete leftovers.
            if partial_path.exists():
                partial_path.unlink()

        print("Download complete.")

    def extract_dataset(self):
        """Extract the tar.gz archive at ``dataset_path`` into ``download_dir``.

        Raises:
            tarfile.TarError: If the archive is unreadable, or if a member is
                rejected by the extraction filter.
        """
        print("Extracting dataset...")

        with tarfile.open(self.dataset_path, "r:gz") as tar:
            if hasattr(tarfile, "data_filter"):
                # The "data" filter refuses members that would land outside
                # the destination (absolute paths, "..", unsafe links).
                tar.extractall(path=self.download_dir, filter="data")
            else:
                # Older Python without extraction filters: unfiltered extract.
                tar.extractall(path=self.download_dir)

        print("Extraction complete.")

    def load_dataset(self, file_path: Path, sep: str = "\t") -> Optional[pd.DataFrame]:
        """Read a delimited text file without a header row into a DataFrame.

        Args:
            file_path: Location of the file to read.
            sep: Field separator; defaults to a tab.

        Returns:
            The parsed DataFrame with integer column labels, or ``None`` (after
            printing a warning) when ``file_path`` does not exist.
        """
        if not file_path.exists():
            print(f"Warning: {file_path} not found!")
            return None

        # NOTE(review): pandas' default quote handling is applied here; for
        # free-text columns, confirm the parsed row count matches the file.
        return pd.read_csv(file_path, sep=sep, header=None)


#if __name__ == "__main__":
#    movie_data = MovieDataset()
#
#    # Accessing instance attributes
#    print(movie_data.movie_metadata.head())  # Metadata
#    print(movie_data.plot_summaries.head())  # Plot summaries
#    print(movie_data.character_metadata.head())  # Character metadata




In [None]:
# Build the dataset object. The constructor has side effects: it creates the
# download directory, downloads and extracts the archive when they are not
# already on disk, and loads the three tables into DataFrame attributes.
test_data = MovieDataset()


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"
