# Data Catalog of DS-Students-Resources

In [1]:
# Import the necessary libraries 
import os
import pandas as pd
import glob
from pathlib import Path
import csv
from datetime import datetime
import mimetypes

In [2]:
# Recursively find all dataset files under a directory (e.g. a local clone of the
# GitHub repository), ignoring anything inside .ipynb_checkpoints.
def find_datasets(root_path, extensions):
    """Return every file below ``root_path`` whose name ends in one of ``extensions``.

    Parameters
    ----------
    root_path : str or os.PathLike
        Directory that is searched recursively.
    extensions : iterable of str
        File extensions without the leading dot (e.g. ``"csv"``); each one is
        turned into the glob pattern ``*.<ext>``.

    Returns
    -------
    list of pathlib.Path
        Matching files, grouped in the order in which the extensions were given.
        Any path containing ``.ipynb_checkpoints`` is skipped, and so are
        directories whose name happens to match a pattern.
    """
    root = Path(root_path)
    datasets = []
    for ext in extensions:
        datasets.extend(
            path
            for path in root.rglob(f"*.{ext}")
            # rglob also yields directories (e.g. a folder named "archive.data");
            # only regular files are datasets.
            if ".ipynb_checkpoints" not in str(path) and path.is_file()
        )
    return datasets

### List (not exhaustive) of dataset file formats and their corresponding extensions:

- CSV (Comma Separated Values): csv
- TSV (Tab Separated Values): tsv
- JSON (JavaScript Object Notation): json
- Excel: xls, xlsx
- Plain Text: txt
- Generic data files (`.data`): data
- HDF5 (Hierarchical Data Format): h5, hdf5
- NetCDF (Network Common Data Form): nc, nc4
- XML (eXtensible Markup Language): xml
- Parquet: parquet
- Avro: avro
- Feather: feather
- ORC (Optimized Row Columnar): orc
- Protocol Buffers: pb, pbf
- GeoJSON: geojson
- Pickle: pkl, pickle
- MATLAB: mat
- ARFF (Attribute-Relation File Format): arff
- NPY (NumPy array): npy
- NPZ (NumPy Zipped): npz
- SAS: sas7bdat
- STATA: dta
- R (RData, RDS): RData, rds

In [3]:
# Extensions (without the leading dot) that mark a file as a dataset, grouped by
# format family. The order of this list determines the order of the search results.
dataset_extensions = [
    "csv", "tsv",                          # delimited text
    "json",                                # JSON
    "xlsx", "xls",                         # Excel
    "txt", "data",                         # plain text / generic data files
    "h5", "hdf5",                          # HDF5
    "nc", "nc4",                           # NetCDF
    "xml",                                 # XML
    "parquet", "avro", "feather", "orc",   # Parquet, Avro, Feather, ORC
    "pb", "pbf",                           # Protocol Buffers
    "geojson",                             # GeoJSON
    "pkl", "pickle",                       # Pickle
    "mat",                                 # MATLAB
    "arff",                                # ARFF
    "npy", "npz",                          # NumPy
    "sas7bdat", "dta",                     # SAS, STATA
    "RData", "rds",                        # R
]

# Search from the current working directory, i.e. the root of the local GitHub repository.
root_dir = "."
datasets = find_datasets(root_path=root_dir, extensions=dataset_extensions)


In [4]:
# Extract the catalog metadata for a single dataset file.
def get_dataset_metadata(dataset_path):
    """Collect file-system metadata describing one dataset file.

    Parameters
    ----------
    dataset_path : pathlib.Path or str
        Location of the dataset file. The file must exist because it is stat'ed.

    Returns
    -------
    dict
        ``filename``      -- final path component,
        ``path``          -- the path as a string,
        ``extension``     -- file suffix without the leading dot ("" if none),
        ``size``          -- ``st_size`` of the file (bytes),
        ``mime_type``     -- type guessed from the file name, or ``"unknown"``,
        ``modified_date`` -- ``st_mtime`` formatted as ``YYYY-MM-DD HH:MM:SS``
                             in local time.

    Raises
    ------
    OSError
        If the file cannot be stat'ed (for example ``FileNotFoundError``).
    """
    dataset_path = Path(dataset_path)  # accept plain string paths as well
    # Stat once so that size and modification time describe the same snapshot.
    file_stat = dataset_path.stat()
    modified = datetime.fromtimestamp(file_stat.st_mtime)
    metadata = {
        "filename": dataset_path.name,
        "path": str(dataset_path),
        "extension": dataset_path.suffix[1:],
        "size": file_stat.st_size,
        "mime_type": mimetypes.guess_type(str(dataset_path))[0] or "unknown",
        "modified_date": modified.strftime('%Y-%m-%d %H:%M:%S'),
    }
    return metadata


In [5]:
# Build the catalog: one metadata record per dataset, collected in a pandas DataFrame
# and written to a CSV file.
# The columns are named explicitly so the frame (and the CSV header) stays well-formed
# even when no datasets were found; pd.DataFrame([]) would otherwise have no columns
# and a later lookup such as metadata_df["size"] would raise KeyError.
metadata_columns = ["filename", "path", "extension", "size", "mime_type", "modified_date"]
dataset_metadata = [get_dataset_metadata(ds) for ds in datasets]
metadata_df = pd.DataFrame(dataset_metadata, columns=metadata_columns)
metadata_df.to_csv("ds-datasets.csv", index=False)

In [6]:
# Preview the first five rows of the dataset catalog.
metadata_df.head(5)

Unnamed: 0,filename,path,extension,size,mime_type,modified_date
0,ds-video-catalog.csv,ds-video-catalog.csv,csv,58906,text/csv,2023-05-15 18:01:33
1,ds-datasets.csv,ds-datasets.csv,csv,75844,text/csv,2023-05-27 07:30:58
2,ds-jupyter-notebooks.csv,ds-jupyter-notebooks.csv,csv,99403,text/csv,2023-05-27 07:31:14
3,DS-Students.csv,DS108-Databases/NoSQL/Data/DS-Students.csv,csv,12452,text/csv,2021-11-18 13:17:16
4,addresses.csv,DS108-Databases/SQL/Workshops/Week-2/08-Export...,csv,52009,text/csv,2021-12-03 16:18:53


In [7]:
# Compute and print the combined size of all catalogued datasets.
total_size_bytes = metadata_df["size"].sum()
# One megabyte is taken as 1024 * 1024 bytes here.
total_size_mb = total_size_bytes / (1024 * 1024)
print(f"Total size of all Datasets: {total_size_mb:.2f} MB")

Total size of all Datasets: 919.12 MB


In [8]:
# Count and print how many dataset files the repository search returned.
total_datasets = len(datasets)
print(f"Total number of Datasets: {total_datasets}")

Total number of Datasets: 566
