In [None]:
import os
import tarfile
import requests
import pandas as pd
from pathlib import Path
from typing import Optional
from pydantic import BaseModel, ConfigDict


class MovieDataset(BaseModel):
    """Download, extract, and load the CMU Movie Dataset metadata.

    Instantiating the class runs the whole pipeline: the archive is fetched
    unless it is already on disk, unpacked unless the metadata file is already
    present, and ``movie.metadata.tsv`` is then read into ``movie_metadata``.
    """

    # Remote directory hosting the archive; ``dataset_filename`` is appended to it.
    base_url: str = "http://www.cs.cmu.edu/~ark/personas/data/"
    # Name of the archive, both remotely and inside ``download_dir``.
    dataset_filename: str = "MovieSummaries.tar.gz"
    # Local working directory for the archive and its extracted contents.
    download_dir: Path = Path("downloads")
    # Directory the archive is expected to unpack into.
    extracted_dir: Path = download_dir / "MovieSummaries"
    # Local path of the downloaded archive.
    dataset_path: Path = download_dir / dataset_filename
    # Tab-separated metadata file that gets loaded into ``movie_metadata``.
    movie_metadata_path: Path = extracted_dir / "movie.metadata.tsv"

    # Filled by ``load_dataset``; columns are unnamed (integer labels 0..N-1).
    movie_metadata: Optional[pd.DataFrame] = None

    model_config = ConfigDict(arbitrary_types_allowed=True)  # Allow DataFrame

    def __init__(self):
        """Initialize the class: download, extract, and load the dataset.

        Raises:
            requests.RequestException: If the archive cannot be downloaded.
            tarfile.TarError: If the archive cannot be read or safely extracted.
            FileNotFoundError: If the metadata file is missing after extraction.
        """
        super().__init__()
        self.download_dir.mkdir(parents=True, exist_ok=True)

        # Download dataset if not already present
        if not self.dataset_path.exists():
            self.download_dataset()

        # Extract when the directory is missing OR when an earlier extraction
        # was interrupted and left the directory without the metadata file.
        if not self.extracted_dir.exists() or not self.movie_metadata_path.exists():
            self.extract_dataset()

        # Load dataset into Pandas DataFrame
        self.load_dataset()

    def download_dataset(self, timeout: float = 30.0):
        """Download the archive to ``dataset_path``.

        The payload is streamed into a temporary ``.part`` file and only moved
        to ``dataset_path`` once the transfer has finished, so an interrupted
        download never leaves a truncated archive that later runs would
        mistake for a complete one.

        Args:
            timeout: Seconds to wait for the server to connect / send data
                before giving up (forwarded to ``requests.get``).

        Raises:
            requests.RequestException: On connection errors, timeouts, or a
                non-success HTTP status.
        """
        print(f"Downloading {self.dataset_filename}...")

        url = self.base_url + self.dataset_filename
        self.dataset_path.parent.mkdir(parents=True, exist_ok=True)
        partial_path = self.dataset_path.with_name(self.dataset_path.name + ".part")

        try:
            # The context manager releases the streamed connection even on error.
            with requests.get(url, stream=True, timeout=timeout) as response:
                response.raise_for_status()
                with open(partial_path, "wb") as file:
                    for chunk in response.iter_content(chunk_size=8192):
                        file.write(chunk)
        except BaseException:
            # Also covers KeyboardInterrupt (kernel interrupt): drop the
            # incomplete file, then let the original error propagate.
            partial_path.unlink(missing_ok=True)
            raise

        partial_path.replace(self.dataset_path)

        print("Download complete.")

    def extract_dataset(self):
        """Extract the tar.gz archive into ``download_dir``.

        Raises:
            tarfile.TarError: If the archive is unreadable or a member is
                rejected by the extraction filter.
        """
        print("Extracting dataset...")

        with tarfile.open(self.dataset_path, "r:gz") as tar:
            if hasattr(tarfile, "data_filter"):
                # The archive comes from the network: the "data" filter rejects
                # members that would land outside ``download_dir`` (absolute
                # paths, "..", unsafe links).
                tar.extractall(path=self.download_dir, filter="data")
            else:
                # Older interpreters without extraction filters.
                tar.extractall(path=self.download_dir)

        print("Extraction complete.")

    def load_dataset(self):
        """Load the movie metadata into a Pandas DataFrame.

        The file is read as tab-separated text without a header row, so the
        resulting columns carry integer labels.

        Raises:
            FileNotFoundError: If ``movie_metadata_path`` does not exist.
        """
        if not self.movie_metadata_path.exists():
            raise FileNotFoundError(f"{self.movie_metadata_path} not found!")

        self.movie_metadata = pd.read_csv(
            self.movie_metadata_path, sep="\t", header=None
        )
        print("Dataset loaded successfully.")


if __name__ == "__main__":
    # Constructing the object runs the full download / extract / load pipeline.
    movie_data = MovieDataset()
    # Plain-text preview of the first five rows of the loaded metadata.
    print(movie_data.movie_metadata.head(n=5))








Dataset loaded successfully.
          0           1                                                  2  \
0    975900   /m/03vyhn                                     Ghosts of Mars   
1   3196793   /m/08yl5d  Getting Away with Murder: The JonBenét Ramsey ...   
2  28463795  /m/0crgdbh                                        Brun bitter   
3   9363483  /m/0285_cd                                   White Of The Eye   
4    261236   /m/01mrr1                                  A Woman in Flames   

            3           4      5                                   6  \
0  2001-08-24  14010832.0   98.0  {"/m/02h40lc": "English Language"}   
1  2000-02-16         NaN   95.0  {"/m/02h40lc": "English Language"}   
2        1988         NaN   83.0  {"/m/05f_3": "Norwegian Language"}   
3        1987         NaN  110.0  {"/m/02h40lc": "English Language"}   
4        1983         NaN  106.0   {"/m/04306rv": "German Language"}   

                                           7  \
0  {"/m/09c7w0": "Uni

In [None]:

# `head` lives on the loaded DataFrame, not on the MovieDataset model itself.
MovieDataset().movie_metadata.head()


Dataset loaded successfully.


AttributeError: 'MovieDataset' object has no attribute 'head'