In [None]:
import time
from pathlib import Path
import requests
import json
import pandas as pd

In [None]:
# Filesystem layout and identifiers shared by the cells below.
working_dir = Path("/kaggle/working/")
# Folder that receives the dataset archive and its dataset-metadata.json.
dataset_dir = working_dir.joinpath("dataset")
# Folder passed to download_judgments() as the target for newly fetched PDFs.
pdf_temp_dir = working_dir.joinpath("new-pdfs")
# Metadata CSV, located inside the cloned repository directory under working_dir.
metadata_path = working_dir.joinpath(
    "indian-supreme-court-judgments/data/metadata/clean/judgments.csv"
)
# "<owner>/<slug>" identifier handed to the kaggle CLI.
kaggle_dataset_id = "vangap/indian-supreme-court-judgments"


In [None]:
import zipfile


def get_zip_files(zip_file_path):
    """Return the member names stored in the zip archive at ``zip_file_path``.

    Args:
        zip_file_path: Path (or path-like/str) of the archive to inspect.

    Returns:
        A list with the ``filename`` of every entry, in the order the archive
        reports them. An archive without entries yields an empty list.
    """
    with zipfile.ZipFile(zip_file_path, 'r') as archive:
        return [member.filename for member in archive.infolist()]

def add_files_to_zip(zip_file_path: Path, files_to_add: list[Path], directory_in_zip: str):
    """Append files to a zip archive, placing them under ``directory_in_zip``.

    The archive is opened in append mode, so existing members are kept and the
    archive is created if it does not exist yet. Each file is stored as
    ``"<directory_in_zip>/<file name>"``.

    Files whose archive name is already present are skipped. Append mode would
    otherwise store a second member under the same name, so re-running the
    upload step would silently bloat the archive with duplicates.

    Args:
        zip_file_path: Archive to append to.
        files_to_add: Files on disk to add; only their base name is used for
            the name inside the archive.
        directory_in_zip: Directory prefix inside the archive.
    """
    with zipfile.ZipFile(zip_file_path, 'a') as zip_ref:
        # Names already in the archive, extended as we add, so that duplicates
        # within ``files_to_add`` are also written only once.
        present = {info.filename for info in zip_ref.infolist()}
        for file_to_add in files_to_add:
            arcname = f"{directory_in_zip}/{file_to_add.name}"
            if arcname in present:
                continue
            zip_ref.write(file_to_add, arcname=arcname)
            present.add(arcname)

def download_file(url, file_path, timeout=60):
    """Download ``url`` and save the response body to ``file_path``.

    Args:
        url: Address to fetch with an HTTP GET.
        file_path: Destination file. Its parent directory is created if it is
            missing, so callers do not have to prepare the folder first.
        timeout: Seconds passed to ``requests.get`` as its timeout. Without a
            timeout a stalled server would block the notebook indefinitely.

    Returns:
        True when the body was written to ``file_path``; False when the
        request failed (connection problem, timeout, or an HTTP error status).
        Request errors are printed rather than raised so a caller looping over
        many files can carry on.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx responses into RequestException
        # The target folder may not exist yet (nothing else creates it).
        Path(file_path).parent.mkdir(parents=True, exist_ok=True)
        with open(file_path, "wb") as f:
            f.write(response.content)
        print(f"Downloaded {file_path}")
        return True
    except requests.exceptions.RequestException as e:
        print(f"Error downloading file from {url}: {e}")
        return False

    
def download_judgments(df, output_dir, existing_pdfs, delay_seconds=1):
    """Download every judgment PDF listed in ``df`` that is not already known.

    For each row the local file name is built as
    ``"<diary_no>___<temp_link>"`` with every ``/`` replaced by ``__``. The
    file is fetched from ``https://main.sci.gov.in/<temp_link>`` unless that
    name is in ``existing_pdfs``.

    Args:
        df: DataFrame with ``diary_no`` and ``temp_link`` columns.
        output_dir: Directory that receives the downloads; created if missing.
        existing_pdfs: Iterable of file names that must not be downloaded
            again.
        delay_seconds: Pause after each successful download, to avoid
            hammering the server. Defaults to the original one second.

    Returns:
        ``(downloaded_pdfs, failed_pdfs)``: two lists of ``Path`` objects
        inside ``output_dir`` for the files that were saved and the files
        whose download failed.

    Notes:
        * Rows whose ``temp_link`` is not a non-empty string (for example the
          NaN that ``pandas.read_csv`` produces for an empty cell) are
          skipped; they cannot be turned into a URL.
        * Each file name is attempted at most once per call, even if it
          occurs in several rows.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # A set gives constant-time membership checks; testing against the
    # original list was linear per row.
    seen = set(existing_pdfs)

    failed_pdfs = []
    downloaded_pdfs = []
    for _, row in df.iterrows():
        temp_link = row["temp_link"]
        if not isinstance(temp_link, str) or not temp_link:
            continue

        file_name = f"{row['diary_no']}___{temp_link}".replace("/", "__")
        if file_name in seen:
            continue
        seen.add(file_name)

        file_path = output_dir / file_name
        url = f"https://main.sci.gov.in/{temp_link}"
        if download_file(url, file_path):
            downloaded_pdfs.append(file_path)
            time.sleep(delay_seconds)
        else:
            failed_pdfs.append(file_path)
    return downloaded_pdfs, failed_pdfs

def get_existing_pdfs(dataset_zip_file):
    """List the base names of archive members stored under ``pdfs/``.

    Args:
        dataset_zip_file: Path of the dataset zip archive to inspect.

    Returns:
        A list with the final path component of every member whose name
        starts with ``"pdfs/"``, in archive order.
    """
    pdf_names = []
    for member in get_zip_files(dataset_zip_file):
        if not member.startswith("pdfs/"):
            continue
        pdf_names.append(Path(member).name)
    return pdf_names


In [None]:
# Clone the source repository; `metadata_path` points at a CSV inside the
# resulting `indian-supreme-court-judgments` directory.
# NOTE(review): the clone lands in the current working directory, while
# `metadata_path` expects it under `working_dir` — confirm the notebook starts there.
!git clone https://github.com/vanga/indian-supreme-court-judgments

In [None]:
from kaggle_secrets import UserSecretsClient
kaggle_cred_path = Path('/root/.kaggle/')
kaggle_cred_path.mkdir(exist_ok=True)    
kaggle_api_key = UserSecretsClient().get_secret("KAGGLE_API_KEY")
kaggle_username = UserSecretsClient().get_secret("KAGGLE_USERNAME")


with open(kaggle_cred_path / "kaggle.json", 'w') as fid:
    fid.writelines(json.dumps({"username":kaggle_username,"key":kaggle_api_key}))
!chmod 600 /root/.kaggle/kaggle.json

dataset_dir.mkdir(exist_ok=True, parents=True)
with open(dataset_dir / 'dataset-metadata.json', 'w') as json_fid:
    json_fid.write(json.dumps({"title": "Indian Supreme Court Judgments", "id": kaggle_dataset_id}))

!kaggle datasets download {kaggle_dataset_id} -p {dataset_dir}

In [None]:
# Switch the kernel's working directory to `working_dir` before doing any work.
%cd {working_dir}
# Archive inside the dataset folder; presumably the file written by the earlier
# `kaggle datasets download` step — verify the name matches.
dataset_zip_path = dataset_dir / "indian-supreme-court-judgments.zip"
# Judgment metadata; download_judgments() reads its `diary_no` and `temp_link` columns.
df = pd.read_csv(metadata_path)
# File names already stored under pdfs/ in the archive are skipped below.
existing_pdfs = get_existing_pdfs(dataset_zip_path)
print(f"Existing pdfs: {len(existing_pdfs)}")
# Fetch every judgment not yet in the archive into `pdf_temp_dir`.
downloaded_pdfs, failed_pdfs = download_judgments(df, pdf_temp_dir, existing_pdfs)
print(f"Downloaded {len(downloaded_pdfs)} new judgments, {len(failed_pdfs)} failed")

In [None]:
if downloaded_pdfs:
    add_files_to_zip(dataset_zip_path, downloaded_pdfs, "pdfs")
    %cd {dataset_dir}
    !kaggle datasets version -m "Latest"    
else:
    print("No new judgments")

In [None]:
# Ask the kaggle CLI for the current status of the dataset.
!kaggle datasets status {kaggle_dataset_id}