In [37]:
import os
import tarfile
import requests
import pandas as pd
from pathlib import Path
from typing import Optional, Dict
from pydantic import BaseModel, ConfigDict


class MovieDataset(BaseModel):
    """Class to handle downloading, extracting, and loading the CMU Movie Dataset.

    Attributes:
        base_url (str): URL for downloading the dataset.
        dataset_filename (str): Name of the dataset archive file.
        download_dir (Path): Directory where dataset will be downloaded.
        extracted_dir (Path): Directory where dataset will be extracted.
        dataset_path (Path): Full path to the downloaded archive.
        dataframes (Dict[str, Optional[pd.DataFrame]]): Dictionary holding all loaded DataFrames.
    """

    base_url: str = "http://www.cs.cmu.edu/~ark/personas/data/"
    dataset_filename: str = "MovieSummaries.tar.gz"
    download_dir: Path = Path("downloads")
    extracted_dir: Path = download_dir / "MovieSummaries"
    dataset_path: Path = download_dir / dataset_filename

    # Dictionary to store all dynamically loaded datasets
    dataframes: Dict[str, Optional[pd.DataFrame]] = {}

    model_config = ConfigDict(arbitrary_types_allowed=True)  # Allow Pandas DataFrames

    def __init__(self):
        """
        Initializes the MovieDataset class.
        - Creates necessary directories.
        - Downloads dataset if missing.
        - Extracts dataset if needed.
        - Dynamically loads all `.tsv` and `.txt` files into Pandas DataFrames.
        """

        super().__init__()

        # Ensure the download directory exists
        self.download_dir.mkdir(exist_ok=True)

        # Download dataset if it does not exist
        if not self.dataset_path.exists():
            self.download_dataset()

        # Extract dataset if it has not been extracted
        if not self.extracted_dir.exists():
            self.extract_dataset()

        # Load all available dataset files dynamically
        self.load_all_datasets()

    def download_dataset(self):
        """Downloads the dataset from the specified URL if it does not already exist."""
        print(f"Downloading {self.dataset_filename}...")

        try:
            response = requests.get(self.base_url + self.dataset_filename, stream=True)
            response.raise_for_status()

            with open(self.dataset_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)

            print("Download complete.")

        except requests.exceptions.RequestException as e:
            print(f"Download failed: {e}")

    def extract_dataset(self):
        """Extracts the dataset archive into the designated directory."""
        print("Extracting dataset...")

        try:
            with tarfile.open(self.dataset_path, "r:gz") as tar:
                tar.extractall(path=self.download_dir)
            print("Extraction complete.")
        except tarfile.TarError as e:
            print(f"Error extracting dataset: {e}")

    def load_all_datasets(self):
        """
        Dynamically loads all `.tsv` and `.txt` files from the extracted directory into Pandas DataFrames.
        - Each dataset is stored in a dictionary (`dataframes`) using the filename (without extension) as the key.
        """
        print("Loading datasets...")

        if not self.extracted_dir.exists():
            print(f"Error: Extracted directory {self.extracted_dir} does not exist.")
            return

        for file_path in self.extracted_dir.glob("*"):
            if file_path.suffix in [".tsv", ".txt"]:  # Load only relevant file types
                self.load_dataset(file_path, sep="\t")
        
    def load_dataset(self, file_path: Path, sep: str = "\t"):
        """
        Loads a dataset file into a Pandas DataFrame and stores it in `dataframes`.

        Args:
            file_path (Path): Path to the dataset file.
            sep (str): Separator used in the file (default is tab-separated).
        """
        dataset_name = file_path.stem  # Extract filename without extension

        print(f"Checking file: {file_path}")  # Debugging line

        if file_path.exists():
            try:
                df = pd.read_csv(file_path, sep=sep, header=None)
                object.__setattr__(self, dataset_name, df)  # Dynamically set as attribute
                self.dataframes[dataset_name] = df  # Store in dictionary
                print(f"Loaded dataset: {dataset_name}, Shape: {df.shape}")
            except Exception as e:
                print(f"ERROR while loading {file_path}: {e}")
                self.dataframes[dataset_name] = None
        else:
            print(f"ERROR: {file_path} NOT FOUND!")
            self.dataframes[dataset_name] = None


#if __name__ == "__main__":
#    movie_data = MovieDataset()
#
#    # Accessing instance attributes
#    print(movie_data.movie_metadata.head())  # Metadata
#    print(movie_data.plot_summaries.head())  # Plot summaries
#    print(movie_data.character_metadata.head())  # Character metadata


In [38]:
test_data = MovieDataset()


Extracting dataset...


  tar.extractall(path=self.download_dir)


Extraction complete.
Loading datasets...
Checking file: downloads/MovieSummaries/plot_summaries.txt
Loaded dataset: plot_summaries, Shape: (42303, 2)
Checking file: downloads/MovieSummaries/movie.metadata.tsv
Loaded dataset: movie.metadata, Shape: (81741, 9)
Checking file: downloads/MovieSummaries/name.clusters.txt
Loaded dataset: name.clusters, Shape: (2666, 2)
Checking file: downloads/MovieSummaries/README.txt
Loaded dataset: README, Shape: (52, 1)
Checking file: downloads/MovieSummaries/character.metadata.tsv
Loaded dataset: character.metadata, Shape: (450669, 13)
Checking file: downloads/MovieSummaries/tvtropes.clusters.txt
Loaded dataset: tvtropes.clusters, Shape: (501, 2)
