# Data Catalog of DS-Students-Resources

In [11]:
import os
import pandas as pd
import glob
from pathlib import Path
import csv
from datetime import datetime
import mimetypes

In [12]:
# function to recursively find all dataset files in a local GitHub repository
def find_datasets(root_path, extensions):
    """Recursively collect dataset files below ``root_path``.

    Args:
        root_path: Directory to search (``str`` or path-like).
        extensions: Iterable of file extensions, e.g. ``["csv", "json"]``.
            A leading dot is tolerated (``".csv"`` is treated as ``"csv"``).

    Returns:
        list[pathlib.Path]: Matching regular files, grouped in the order the
        extensions were given. Each path appears at most once.
    """
    root = Path(root_path)
    datasets = []
    seen = set()
    for ext in extensions:
        # Accept both "csv" and ".csv"; the glob pattern supplies the dot.
        pattern = f"*.{ext.lstrip('.')}"
        for candidate in root.rglob(pattern):
            # rglob matches on the name only, so a directory called e.g.
            # "raw.data" would otherwise be reported as a dataset.
            if candidate in seen or not candidate.is_file():
                continue
            seen.add(candidate)
            datasets.append(candidate)
    return datasets

### List (not exhaustive) of dataset file formats and their corresponding extensions:

- CSV (Comma Separated Values): csv
- TSV (Tab Separated Values): tsv
- JSON (JavaScript Object Notation): json
- Excel: xls, xlsx
- Plain Text: txt
- Generic data file: data
- HDF5 (Hierarchical Data Format): h5, hdf5
- NetCDF (Network Common Data Form): nc, nc4
- XML (eXtensible Markup Language): xml
- Parquet: parquet
- Avro: avro
- Feather: feather
- ORC (Optimized Row Columnar): orc
- Protocol Buffers: pb, pbf
- GeoJSON: geojson
- Pickle: pkl, pickle
- MATLAB: mat
- ARFF (Attribute-Relation File Format): arff
- NPY (NumPy array): npy
- NPZ (NumPy Zipped): npz
- SAS: sas7bdat
- Stata: dta
- R (RData, RDS): RData, rds

In [13]:
# File extensions (without the leading dot) that count as datasets, followed by
# the recursive search that collects every matching path.
dataset_extensions = [
    "csv", "tsv", "json",                   # delimited text / JSON
    "xlsx", "xls",                          # Excel
    "txt", "data",                          # plain text / generic data
    "h5", "hdf5", "nc", "nc4",              # HDF5 / NetCDF
    "xml",
    "parquet", "avro", "feather", "orc",    # columnar / binary table formats
    "pb", "pbf",                            # Protocol Buffers
    "geojson",
    "pkl", "pickle",                        # Python pickles
    "mat", "arff",                          # MATLAB / ARFF
    "npy", "npz",                           # NumPy
    "sas7bdat", "dta", "RData", "rds",      # SAS / Stata / R
]

# Root directory of the local GitHub repository to scan.
root_dir = "."
datasets = find_datasets(root_path=root_dir, extensions=dataset_extensions)


In [14]:
# Create a function to extract the necessary metadata from each dataset file.
def get_dataset_metadata(dataset_path):
    """Collect file-level metadata for a single dataset file.

    Args:
        dataset_path: Location of the file (``pathlib.Path`` or ``str``).

    Returns:
        dict with the keys ``filename``, ``path``, ``extension`` (suffix
        without the leading dot), ``size`` (bytes), ``mime_type`` (guessed
        from the file name, ``"unknown"`` when no guess is available) and
        ``modified_date`` (local time, ``YYYY-MM-DD HH:MM:SS``).

    Raises:
        OSError: If the file cannot be stat'ed (e.g. it does not exist).
    """
    dataset_path = Path(dataset_path)  # also accept plain string paths

    # One stat() call so size and mtime come from the same snapshot.
    file_stat = dataset_path.stat()

    # guess_type() only accepts path-like objects from Python 3.8 on;
    # passing a str works on every version.
    guessed_type, _encoding = mimetypes.guess_type(str(dataset_path))

    modified = datetime.fromtimestamp(file_stat.st_mtime)
    metadata = {
        "filename": dataset_path.name,
        "path": str(dataset_path),
        "extension": dataset_path.suffix[1:],
        "size": file_stat.st_size,
        "mime_type": guessed_type or "unknown",
        "modified_date": modified.strftime('%Y-%m-%d %H:%M:%S'),
    }
    return metadata


In [15]:
# Build one metadata record per dataset, load the records into a pandas
# DataFrame and write the catalog to a CSV file.
catalog_path = Path("ds-datasets.csv")
catalog_resolved = catalog_path.resolve()

# The catalog is written to the working directory. When that directory lies
# inside the searched tree, a re-run would list the catalog itself as a
# dataset, so it is skipped to keep repeated runs reproducible.
dataset_metadata = [
    get_dataset_metadata(ds)
    for ds in datasets
    if Path(ds).resolve() != catalog_resolved
]
metadata_df = pd.DataFrame(dataset_metadata)
metadata_df.to_csv(catalog_path, index=False)