In [14]:
import os
import requests
import tarfile
import pandas as pd
from pydantic import BaseModel
from typing import Optional

class DataDownloader(BaseModel):
    dataframes: dict = {}
    url: str
    download_path: str = 'downloads/MS_data.tar.gz'
    extract_path: str = 'downloads/MS_extracted/'

    def __init__(self, url: str, **data):
        super().__init__(url=url, **data)
        self.dataframes = {}
        self.download_data()
        self.unzip_data()
        self.load_dataframes()
        self.set_dataframe_attributes()

    def download_data(self):
        if not os.path.exists(self.download_path):
            os.makedirs(os.path.dirname(self.download_path), exist_ok=True)
            response = requests.get(self.url)
            with open(self.download_path, 'wb') as file:
                file.write(response.content)
            print(f"Downloaded data to {self.download_path}")
        else:
            print(f"Data already exists at {self.download_path}")

    def unzip_data(self):
        with tarfile.open(self.download_path, 'r:gz') as tar_ref:
            tar_ref.extractall(self.extract_path)
        print(f"Extracted data to {self.extract_path}")

    def load_dataframes(self):
        # Assuming the zip file contains CSV files
        self.dataframes = {}
        for file_name in os.listdir(self.extract_path):
            if file_name.endswith('.csv'):
                df_name = file_name.split('.')[0]
                self.dataframes[df_name] = pd.read_csv(os.path.join(self.extract_path, file_name))
                print(f"Loaded {file_name} into dataframe {df_name}")

    def set_dataframe_attributes(self):
        for df_name, df in self.dataframes.items():
            setattr(self, df_name, df)
            print(f"Set attribute {df_name} for dataframe")

# Example usage:
downloader = DataDownloader(url='http://www.cs.cmu.edu/~ark/personas/data/MovieSummaries.tar.gz')

Data already exists at downloads/MS_data.tar.gz


  tar_ref.extractall(self.extract_path)


Extracted data to downloads/MS_extracted/
