In [1]:
import time
import random
import logging
import requests
import json
import pandas as pd
from typing import Optional, List, Dict, Union
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from urllib.parse import quote
import os

In [2]:
class CommonCrawlScraper:
    def __init__(self, base_index: str = "CC-MAIN-2024-30"):
        self.base_url = f"https://index.commoncrawl.org/{base_index}-index"
        self.session = requests.Session()
        
        # Set up logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
    
    def fetch_page(self, search_term: str, page: int) -> Optional[List[Dict]]:
        """
        Fetch a single page of results for a search term.
        
        Args:
            search_term: Term to search for (e.g., "*.gov")
            page: Page number to fetch
            
        Returns:
            List of result dictionaries or None if page is invalid.
        """
        encoded_term = quote(search_term)
        url = f"{self.base_url}?url={encoded_term}&output=json&page={page}"
        logging.info(f"Fetching page {page} for {search_term}")
        try:
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            
            results = []
            for line in response.text.strip().split('\n'):
                try:
                    data = json.loads(line)
                    if 'message' in data and 'invalid' in data['message'].lower():
                        logging.info(f"Reached end of valid pages at page {page} for {search_term}")
                        return None
                    results.append(data)
                except json.JSONDecodeError as e:
                    logging.warning(f"Could not parse line on page {page} for {search_term}: {e}")
                    continue
            logging.info(f"Page {page} returned {len(results)} results.")
            return results
        except requests.exceptions.RequestException as e:
            logging.error(f"Error fetching page {page} for {search_term}: {e}")
            return None
    
    def scrape_pattern(self, country: str, pattern: str, start_page: int = 0, 
                      delay: float = 1.0) -> pd.DataFrame:
        """
        Scrape all pages for a single search pattern.
        
        Args:
            country: Country label (e.g., "United States")
            pattern: Domain pattern to search (e.g., "*.gov")
            start_page: Page number to start from
            delay: Base delay between requests in seconds
            
        Returns:
            DataFrame containing results for this pattern.
        """
        all_results = []
        page = start_page
        pages_scraped = 0
        
        logging.info(f"Starting scrape for {country} pattern: {pattern}")
        while True:
            results = self.fetch_page(pattern, page)
            if not results:
                logging.info(f"No more results for {country} pattern {pattern} after page {page}.")
                break
            for result in results:
                result['country'] = country
                result['pattern'] = pattern
            all_results.extend(results)
            pages_scraped += 1
            logging.info(f"Scraped page {page} for {country} pattern {pattern} ({len(results)} results). Total pages scraped: {pages_scraped}")
            
            sleep_time = delay + random.uniform(0, delay)
            logging.info(f"Sleeping for {sleep_time:.2f} seconds before next page request.")
            time.sleep(sleep_time)
            page += 1
        
        logging.info(f"Finished scraping pattern {pattern} for {country}. Total results: {len(all_results)}")
        return pd.DataFrame(all_results) if all_results else pd.DataFrame()
    
    def scrape_all(self, search_terms: Dict[str, Union[str, List[str]]], 
               delay: float = 1.0) -> pd.DataFrame:
        """
        Scrape all pages for multiple countries and their patterns.

        Args:
            search_terms: Dictionary mapping countries to patterns, 
                          e.g., {"USA": ["*.gov", "*.fed.us"], "Canada": ["*.gc.ca"]}
            delay: Base delay between requests in seconds.

        Returns:
            DataFrame containing all results.
        """
        all_dfs = []
        for country, patterns in search_terms.items():
            if isinstance(patterns, str):
                patterns = [patterns]
            for pattern in patterns:
                safe_pattern = pattern.replace('*', 'ALL').replace('.', '_')
                filename = f'commoncrawl_{country.lower()}_{safe_pattern}_results.parquet'
                if os.path.exists(filename):
                    logging.info(f"File {filename} already exists; skipping scrape for {country} pattern {pattern}")
                    continue

                logging.info(f"Starting scrape for {country} pattern: {pattern}")
                try:
                    df = self.scrape_pattern(country, pattern, delay=delay)
                    if not df.empty:
                        all_dfs.append(df)
                        df.to_parquet(filename, index=False)
                        logging.info(f"Saved intermediate results for {country} pattern {pattern} to {filename}")
                    else:
                        logging.info(f"No data found for {country} pattern {pattern}.")
                except Exception as e:
                    logging.error(f"Error processing {country} pattern {pattern}: {e}")
                    continue
        if all_dfs:
            final_df = pd.concat(all_dfs, ignore_index=True)
            final_filename = "commoncrawl_all_results.parquet"
            final_df.to_parquet(final_filename, index=False)
            logging.info(f"Scrape complete. Total results: {len(final_df)}. Saved final results to {final_filename}")

            summary = final_df.groupby(['country', 'pattern']).size().reset_index(name='count')
            logging.info("Summary of results by country and pattern:")
            logging.info(summary.to_string(index=False))
            return final_df
        else:
            logging.info("No results were scraped.")
            return pd.DataFrame()

In [3]:
gov_domains = {
    # North America
    "United States": ["*.gov", "*.mil"],
    "Canada": ["*.gc.ca", "*.canada.ca"],
    "Mexico": "*.gob.mx",
    
    # Caribbean
    "Jamaica": "*.gov.jm",
    "Trinidad and Tobago": "*.gov.tt",
    "Barbados": "*.gov.bb",
    "Bahamas": "*.gov.bs",
    "Dominican Republic": "*.gob.do",
    
    # Central America
    "Costa Rica": "*.go.cr",
    "Panama": "*.gob.pa",
    "Guatemala": "*.gob.gt",
    "El Salvador": "*.gob.sv",
    "Honduras": "*.gob.hn",
    "Nicaragua": "*.gob.ni",
    "Belize": "*.gov.bz",
    
    # South America
    "Brazil": "*.gov.br",
    "Argentina": "*.gob.ar",
    "Chile": "*.gob.cl",
    "Colombia": "*.gov.co",
    "Peru": "*.gob.pe",
    "Venezuela": "*.gob.ve",
    "Ecuador": "*.gob.ec",
    "Bolivia": "*.gob.bo",
    "Paraguay": "*.gov.py",
    "Uruguay": "*.gub.uy",
    "Guyana": "*.gov.gy",
    "Suriname": "*.gov.sr",
    
    # Western Europe
    "United Kingdom": "*.gov.uk",
    "France": "*.gouv.fr",
    "Germany": ["*.bund.de", "*.bayern.de"],
    "Italy": "*.gov.it",
    "Spain": "*.gob.es",
    "Portugal": "*.gov.pt",
    "Netherlands": ["*.overheid.nl", "*.regering.nl"],
    "Belgium": ["*.belgium.be", "*.fed.be"],
    "Ireland": "*.gov.ie",
    "Luxembourg": "*.gouvernement.lu",
    "Monaco": "*.gouv.mc",
    
    # Northern Europe
    "Sweden": ["*.regeringen.se", "*.gov.se"],
    "Norway": "*.regjeringen.no",
    "Denmark": "*.gov.dk",
    "Finland": "*.gov.fi",
    "Iceland": "*.island.is",
    "Estonia": "*.gov.ee",
    "Latvia": "*.gov.lv",
    "Lithuania": "*.gov.lt",
    
    # Eastern Europe
    "Poland": "*.gov.pl",
    "Czech Republic": "*.gov.cz",
    "Slovakia": "*.gov.sk",
    "Hungary": "*.gov.hu",
    "Romania": "*.gov.ro",
    "Bulgaria": "*.government.bg",
    "Moldova": "*.gov.md",
    "Ukraine": "*.gov.ua",
    "Belarus": "*.gov.by",
    
    # Southern Europe
    "Greece": "*.gov.gr",
    "Croatia": "*.gov.hr",
    "Serbia": "*.gov.rs",
    "Slovenia": "*.gov.si",
    "Albania": "*.gov.al",
    "North Macedonia": "*.gov.mk",
    "Bosnia and Herzegovina": "*.gov.ba",
    "Montenegro": "*.gov.me",
    "Malta": "*.gov.mt",
    "Cyprus": "*.gov.cy",
    
    # South Asia
    "India": ["*.gov.in", "*.nic.in"],
    "Pakistan": "*.gov.pk",
    "Bangladesh": "*.gov.bd",
    "Sri Lanka": "*.gov.lk",
    "Nepal": "*.gov.np",
    "Bhutan": "*.gov.bt",
    "Maldives": "*.gov.mv",
    "Afghanistan": "*.gov.af",
    
    # East Asia
    "Japan": "*.go.jp",
    "South Korea": "*.go.kr",
    "North Korea": "*.gov.kp",
    "China": ["*.gov.cn", "*.政务.cn"],
    "Mongolia": "*.gov.mn",
    "Taiwan": "*.gov.tw",

    # Southeast Asia
    "Indonesia": "*.go.id",
    "Malaysia": "*.gov.my",
    "Singapore": "*.gov.sg",
    "Philippines": "*.gov.ph",
    "Thailand": "*.go.th",
    "Vietnam": "*.gov.vn",
    "Myanmar": "*.gov.mm",
    "Cambodia": "*.gov.kh",
    "Laos": "*.gov.la",
    "Brunei": "*.gov.bn",
    "Timor-Leste": "*.gov.tl",
    
    # Central Asia
    "Kazakhstan": "*.gov.kz",
    "Uzbekistan": "*.gov.uz",
    "Kyrgyzstan": "*.gov.kg",
    "Tajikistan": "*.gov.tj",
    "Turkmenistan": "*.gov.tm",
    
    # Middle East
    "Saudi Arabia": "*.gov.sa",
    "UAE": "*.gov.ae",
    "Iran": "*.gov.ir",
    "Iraq": "*.gov.iq",
    "Israel": "*.gov.il",
    "Jordan": "*.gov.jo",
    "Lebanon": "*.gov.lb",
    "Oman": "*.gov.om",
    "Qatar": "*.gov.qa",
    "Kuwait": "*.gov.kw",
    "Bahrain": "*.gov.bh",
    "Yemen": "*.gov.ye",
    "Syria": "*.gov.sy",
    
    # North Africa
    "Egypt": "*.gov.eg",
    "Morocco": "*.gov.ma",
    "Tunisia": "*.gov.tn",
    "Algeria": "*.gov.dz",
    "Libya": "*.gov.ly",
    "Sudan": "*.gov.sd",
    
    # West Africa
    "Nigeria": "*.gov.ng",
    "Ghana": "*.gov.gh",
    "Senegal": "*.gouv.sn",
    "Ivory Coast": "*.gouv.ci",
    "Mali": "*.gouv.ml",
    "Burkina Faso": "*.gov.bf",
    "Guinea": "*.gov.gn",
    "Sierra Leone": "*.gov.sl",
    "Liberia": "*.gov.lr",
    "Togo": "*.gouv.tg",
    "Benin": "*.gouv.bj",
    "Niger": "*.gouv.ne",
    "Gambia": "*.gov.gm",
    "Guinea-Bissau": "*.gov.gw",
    "Cape Verde": "*.gov.cv",
    
    # East Africa
    "Kenya": "*.go.ke",
    "Tanzania": "*.go.tz",
    "Uganda": "*.go.ug",
    "Ethiopia": "*.gov.et",
    "Rwanda": "*.gov.rw",
    "Burundi": "*.gov.bi",
    "South Sudan": "*.gov.ss",
    "Eritrea": "*.gov.er",
    "Djibouti": "*.gouv.dj",
    "Somalia": "*.gov.so",
    
    # Southern Africa
    "South Africa": "*.gov.za",
    "Namibia": "*.gov.na",
    "Botswana": "*.gov.bw",
    "Zimbabwe": "*.gov.zw",
    "Mozambique": "*.gov.mz",
    "Zambia": "*.gov.zm",
    "Malawi": "*.gov.mw",
    "Angola": "*.gov.ao",
    "Madagascar": "*.gov.mg",
    "Mauritius": "*.gov.mu",
    "Seychelles": "*.gov.sc",
    "Lesotho": "*.gov.ls",
    "Eswatini": "*.gov.sz",
    
    # Oceania
    "Australia": "*.gov.au",
    "New Zealand": "*.govt.nz",
    "Papua New Guinea": "*.gov.pg",
    "Fiji": "*.gov.fj",
    "Solomon Islands": "*.gov.sb",
    "Vanuatu": "*.gov.vu",
    "New Caledonia": "*.gouv.nc",
    "Samoa": "*.gov.ws",
    "Tonga": "*.gov.to",
    "French Polynesia": "*.gouv.pf"
}

In [4]:
scraper = CommonCrawlScraper()

df = scraper.scrape_all(gov_domains)

2025-02-18 20:43:59,730 - INFO - File commoncrawl_united states_ALL_gov_results.parquet already exists; skipping scrape for United States pattern *.gov
2025-02-18 20:43:59,731 - INFO - File commoncrawl_united states_ALL_mil_results.parquet already exists; skipping scrape for United States pattern *.mil
2025-02-18 20:43:59,731 - INFO - File commoncrawl_canada_ALL_gc_ca_results.parquet already exists; skipping scrape for Canada pattern *.gc.ca
2025-02-18 20:43:59,731 - INFO - File commoncrawl_canada_ALL_canada_ca_results.parquet already exists; skipping scrape for Canada pattern *.canada.ca
2025-02-18 20:43:59,732 - INFO - File commoncrawl_mexico_ALL_gob_mx_results.parquet already exists; skipping scrape for Mexico pattern *.gob.mx
2025-02-18 20:43:59,732 - INFO - File commoncrawl_jamaica_ALL_gov_jm_results.parquet already exists; skipping scrape for Jamaica pattern *.gov.jm
2025-02-18 20:43:59,732 - INFO - File commoncrawl_trinidad and tobago_ALL_gov_tt_results.parquet already exists; s

### Upload to Harvard Dataverse

In [8]:
# Configuration
API_KEY = "945730d0-ef77-4c33-b67a-1165e79519ea"
DATAVERSE_URL = "https://dataverse.harvard.edu"
doi = "doi:10.7910/DVN/NKHAKM"  # Your dataset persistent ID
folder_path = Path(".")

# Set headers for authentication
headers = {"X-Dataverse-key": API_KEY}

# URL to fetch dataset metadata (to get existing file names)
metadata_url = f"{DATAVERSE_URL}/api/datasets/:persistentId?persistentId={quote(doi)}"

def get_existing_file_labels():
    response = requests.get(metadata_url, headers=headers)
    if response.status_code != 200:
        print("Error getting dataset metadata:", response.status_code)
        return []
    try:
        data = response.json()["data"]
    except Exception as e:
        print("Error parsing JSON metadata:", e)
        return []
    # Navigate to the latest version; file info is usually stored in the "files" list.
    latest_version = data.get("latestVersion", {})
    files = latest_version.get("files", [])
    # Extract file names. Dataverse may store the filename in "dataFile" or as "label".
    labels = []
    for f in files:
        if "dataFile" in f and f["dataFile"].get("filename"):
            labels.append(f["dataFile"]["filename"])
        elif "label" in f:
            labels.append(f["label"])
    return labels

# Retrieve existing file names in the dataset
existing_files = get_existing_file_labels()
print("Existing files in dataset:", existing_files)

# Build the upload URL using the persistentId endpoint
upload_url = f"{DATAVERSE_URL}/api/datasets/:persistentId/add?persistentId={quote(doi)}"

# Loop over all Parquet files in the folder
for fn in sorted(folder_path.glob("*.parquet")):
    if fn.name in existing_files:
        print("File already exists, skipping:", fn.name)
        continue
    print("Uploading:", fn.name)
    with fn.open("rb") as f:
        files = {
            "file": (fn.name, f, "application/octet-stream"),
            "jsonData": (None, "{}")  # Send empty JSON metadata
        }
        response = requests.post(upload_url, headers=headers, files=files)
    
    print("Status code:", response.status_code)
    print("Response text:", response.text)

Existing files in dataset: ['commoncrawl_albania_ALL_gov_al_results.parquet', 'commoncrawl_argentina_ALL_gob_ar_results.parquet', 'commoncrawl_barbados_ALL_gov_bb_results.parquet', 'commoncrawl_belgium_ALL_belgium_be_results.parquet', 'commoncrawl_belize_ALL_gov_bz_results.parquet', 'commoncrawl_benin_ALL_gouv_bj_results.parquet', 'commoncrawl_botswana_ALL_gov_bw_results.parquet', 'commoncrawl_brazil_ALL_gov_br_results.parquet', 'commoncrawl_bulgaria_ALL_government_bg_results.parquet', 'commoncrawl_burkina faso_ALL_gov_bf_results.parquet', 'commoncrawl_cambodia_ALL_gov_kh_results.parquet', 'commoncrawl_canada_ALL_canada_ca_results.parquet', 'commoncrawl_canada_ALL_gc_ca_results.parquet', 'commoncrawl_china_ALL_gov_cn_results.parquet', 'commoncrawl_croatia_ALL_gov_hr_results.parquet', 'commoncrawl_czech republic_ALL_gov_cz_results.parquet', 'commoncrawl_denmark_ALL_gov_dk_results.parquet', 'commoncrawl_djibouti_ALL_gouv_dj_results.parquet', 'commoncrawl_estonia_ALL_gov_ee_results.parque

KeyboardInterrupt: 

In [13]:
from pyDataverse.api import Api
from pathlib import Path
from urllib.parse import quote

# Configuration
API_KEY = "945730d0-ef77-4c33-b67a-1165e79519ea"
DATAVERSE_URL = "https://dataverse.harvard.edu"
doi = "doi:10.7910/DVN/NKHAKM"  # Dataset persistent ID (include the "doi:" prefix)
folder_path = Path(".")

# Instantiate the API client with the API token passed as a constructor argument
api = Api(DATAVERSE_URL, api_token=API_KEY)

# Function to retrieve existing file labels from the dataset metadata
def get_existing_file_labels():
    metadata_url = f"{DATAVERSE_URL}/api/datasets/:persistentId?persistentId={quote(doi)}"
    resp = api.get_dataset(doi)
    if resp.status_code != 200:
        print("Error getting dataset metadata:", resp.status_code)
        return []
    try:
        data = resp.json()["data"]
    except Exception as e:
        print("Error parsing JSON metadata:", e)
        return []
    latest_version = data.get("latestVersion", {})
    files = latest_version.get("files", [])
    labels = []
    for f in files:
        if "dataFile" in f and f["dataFile"].get("filename"):
            labels.append(f["dataFile"]["filename"])
        elif "label" in f:
            labels.append(f["label"])
    return labels

# Retrieve existing file names in the dataset
existing_files = get_existing_file_labels()
print("Existing files in dataset:", existing_files)

# Build the upload URL using the persistentId endpoint
upload_url = f"{DATAVERSE_URL}/api/datasets/:persistentId/add?persistentId={quote(doi)}"

# Loop over all Parquet files in the folder and upload those not already present
for fn in sorted(folder_path.glob("*.parquet")):
    if fn.name in existing_files:
        print("File already exists, skipping:", fn.name)
        continue
    print("Uploading:", fn.name)
    with fn.open("rb") as f:
        files = {
            "file": (fn.name, f, "application/octet-stream"),
            "jsonData": (None, "{}")  # Provide an empty JSON object for metadata
        }
        resp = api.upload_file(doi, str(fn), json_str="{}")
    print("Status code:", resp.status_code)
    print("Response text:", resp.text)

AttributeError: 'Api' object has no attribute 'get_dataset'

In [14]:
from pyDataverse.api import Api
from pathlib import Path
from urllib.parse import quote

# Configuration
API_KEY = "945730d0-ef77-4c33-b67a-1165e79519ea"
DATAVERSE_URL = "https://dataverse.harvard.edu"
doi = "doi:10.7910/DVN/NKHAKM"  # Dataset persistent ID (include the "doi:" prefix)
folder_path = Path(".")

# Instantiate the API client with the API token passed as a constructor argument
api = Api(DATAVERSE_URL, api_token=API_KEY)

# Function to retrieve existing file labels from the dataset metadata
def get_existing_file_labels():
    metadata_url = f"{DATAVERSE_URL}/api/datasets/:persistentId?persistentId={quote(doi)}"
    resp = api.get_dataset(doi)
    if resp.status_code != 200:
        print("Error getting dataset metadata:", resp.status_code)
        return []
    try:
        data = resp.json()["data"]
    except Exception as e:
        print("Error parsing JSON metadata:", e)
        return []
    latest_version = data.get("latestVersion", {})
    files = latest_version.get("files", [])
    labels = []
    for f in files:
        if "dataFile" in f and f["dataFile"].get("filename"):
            labels.append(f["dataFile"]["filename"])
        elif "label" in f:
            labels.append(f["label"])
    return labels

# Retrieve existing file names in the dataset
existing_files = get_existing_file_labels()
print("Existing files in dataset:", existing_files)

# Build the upload URL using the persistentId endpoint
upload_url = f"{DATAVERSE_URL}/api/datasets/:persistentId/add?persistentId={quote(doi)}"

# Loop over all Parquet files in the folder and upload those not already present
for fn in sorted(folder_path.glob("*.parquet")):
    if fn.name in existing_files:
        print("File already exists, skipping:", fn.name)
        continue
    print("Uploading:", fn.name)
    with fn.open("rb") as f:
        files = {
            "file": (fn.name, f, "application/octet-stream"),
            "jsonData": (None, "{}")  # Provide an empty JSON object for metadata
        }
        resp = api.upload_file(doi, str(fn), json_str="{}")
    print("Status code:", resp.status_code)
    print("Response text:", resp.text)

AttributeError: 'Api' object has no attribute 'get_dataset'

In [17]:
import requests
import json
from pathlib import Path
from urllib.parse import quote
from pyDataverse.api import Api  # using Api, since NativeApi isn't available

# Configuration
API_KEY = "945730d0-ef77-4c33-b67a-1165e79519ea"
DATAVERSE_URL = "https://dataverse.harvard.edu"
doi = "doi:10.7910/DVN/NKHAKM"  # Dataset persistent ID (with "doi:" prefix)
folder_path = Path(".")

# Instantiate the API client, supplying the API token in the constructor if available.
# (If your version doesn't support setting the token via the constructor,
#  we can simply pass the token in our requests.)
api = Api(DATAVERSE_URL, api_token=API_KEY)

# Since your Api object doesn't offer get_dataset, we'll use requests directly
headers = {"X-Dataverse-key": API_KEY}

def get_existing_file_labels():
    """Fetch dataset metadata using the persistentId endpoint and return a list of existing file names."""
    metadata_url = f"{DATAVERSE_URL}/api/datasets/:persistentId?persistentId={quote(doi)}"
    resp = requests.get(metadata_url, headers=headers)
    if resp.status_code != 200:
        print("Error getting dataset metadata:", resp.status_code)
        return []
    try:
        data = resp.json()["data"]
    except Exception as e:
        print("Error parsing JSON metadata:", e)
        return []
    latest_version = data.get("latestVersion", {})
    files = latest_version.get("files", [])
    labels = []
    for f in files:
        if "dataFile" in f and f["dataFile"].get("filename"):
            labels.append(f["dataFile"]["filename"])
        elif "label" in f:
            labels.append(f["label"])
    return labels

existing_files = get_existing_file_labels()
print("Existing files in dataset:", existing_files)

# Build the upload URL using the persistentId endpoint
upload_url = f"{DATAVERSE_URL}/api/datasets/:persistentId/add?persistentId={quote(doi)}"

# Loop over all Parquet files in the folder and upload those that do not already exist
for fn in sorted(folder_path.glob("*.parquet")):
    if fn.name in existing_files:
        print("File already exists, skipping:", fn.name)
        continue
    print("Uploading:", fn.name)
    with fn.open("rb") as f:
        files_payload = {
            "file": (fn.name, f, "application/octet-stream"),
            "jsonData": (None, "{}")  # Provide an empty JSON object for metadata
        }
        # Using pyDataverse's upload_file if available; if not, fallback to requests
        try:
            resp = api.upload_data(doi, str(fn), json_str="{}")
        except Exception as e:
            print(f"api.upload_file failed for {fn.name} with error: {e}")
            print("Falling back to direct requests.post...")
            resp = requests.post(upload_url, headers=headers, files=files_payload)
    
    print("Status code:", resp.status_code)
    print("Response text:", resp.text)


Existing files in dataset: ['commoncrawl_albania_ALL_gov_al_results.parquet', 'commoncrawl_argentina_ALL_gob_ar_results.parquet', 'commoncrawl_barbados_ALL_gov_bb_results.parquet', 'commoncrawl_belgium_ALL_belgium_be_results.parquet', 'commoncrawl_belize_ALL_gov_bz_results.parquet', 'commoncrawl_benin_ALL_gouv_bj_results.parquet', 'commoncrawl_botswana_ALL_gov_bw_results.parquet', 'commoncrawl_brazil_ALL_gov_br_results.parquet', 'commoncrawl_bulgaria_ALL_government_bg_results.parquet', 'commoncrawl_burkina faso_ALL_gov_bf_results.parquet', 'commoncrawl_cambodia_ALL_gov_kh_results.parquet', 'commoncrawl_canada_ALL_canada_ca_results.parquet', 'commoncrawl_canada_ALL_gc_ca_results.parquet', 'commoncrawl_china_ALL_gov_cn_results.parquet', 'commoncrawl_croatia_ALL_gov_hr_results.parquet', 'commoncrawl_czech republic_ALL_gov_cz_results.parquet', 'commoncrawl_denmark_ALL_gov_dk_results.parquet', 'commoncrawl_djibouti_ALL_gouv_dj_results.parquet', 'commoncrawl_estonia_ALL_gov_ee_results.parque

KeyboardInterrupt: 

In [20]:
import requests
import hashlib
import json
from pathlib import Path
from urllib.parse import quote
from pyDataverse.api import Api

# Configuration
API_KEY = "945730d0-ef77-4c33-b67a-1165e79519ea"
DATAVERSE_URL = "https://dataverse.harvard.edu"
doi = "doi:10.7910/DVN/NKHAKM"  # Your dataset persistent ID
folder_path = Path(".")

# Instantiate the API client (even if we don't use its methods, it shows we're using pyDataverse)
api = Api(DATAVERSE_URL, api_token=API_KEY)

# Set headers for direct requests
headers = {"X-Dataverse-key": API_KEY}

# Function to compute MD5 checksum of a file
def md5_checksum(file_path, chunk_size=8192):
    md5 = hashlib.md5()
    with file_path.open("rb") as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            md5.update(chunk)
    return md5.hexdigest()

# Retrieve dataset metadata using requests to get existing file checksums
metadata_url = f"{DATAVERSE_URL}/api/datasets/:persistentId?persistentId={quote(doi)}"
resp = requests.get(metadata_url, headers=headers)
if resp.status_code != 200:
    print("Error retrieving dataset metadata:", resp.status_code)
    existing_checksums = set()
else:
    try:
        data = resp.json().get("data", {})
    except Exception as e:
        print("Error parsing JSON metadata:", e)
        data = {}
    latest_version = data.get("latestVersion", {})
    files = latest_version.get("files", [])
    existing_checksums = set()
    for f in files:
        # Some Dataverse versions store the checksum under f["dataFile"]["checksum"]["value"]
        if "dataFile" in f and f["dataFile"].get("checksum", {}).get("value"):
            existing_checksums.add(f["dataFile"]["checksum"]["value"])
print("Existing file checksums in dataset:", existing_checksums)

# Build the upload URL using the persistentId endpoint
upload_url = f"{DATAVERSE_URL}/api/datasets/:persistentId/add?persistentId={quote(doi)}"

# Loop over all Parquet files in the folder
for fn in sorted(folder_path.glob("*.parquet")):
    file_md5 = md5_checksum(fn)
    if file_md5 in existing_checksums:
        print(f"Skipping {fn.name} (duplicate content; MD5: {file_md5}).")
        continue
    print("Uploading:", fn.name)
    with fn.open("rb") as f:
        files_payload = {
            "file": (fn.name, f, "application/octet-stream"),
            "jsonData": (None, json.dumps({"description": "Uploaded via API"}))
        }
        response = requests.post(upload_url, headers=headers, files=files_payload)
    
    print("Status code:", response.status_code)
    print("Response text:", response.text)


Existing file checksums in dataset: {'ab01e2d0af832c76b2d1173ffbb1b155', 'e43fc01546cdf1434b099fd5e2e4f588', '4b51852060c1882d8363b057d9bc7648', '6fd45dd490ad555bede0ada109584b4d', '10a32403670c8dd8e7c197414f327fe8', '649e404530c4b7ca07a9ad92d9c62db2', '43d632c74bc8b4083ea11b8201b0ad32', 'dc5780faf421d7d6463b954d806269af', '06dab45d62b2008e4e3a9c49118f3e55', '890ee1b47380a1526919c74977676652', '79ce9f110ac4a4fee8c56268c9e8979d', '36eacf7eb8acc06a44a9a334d6e95368', '25a701b7b6dbfc0a3f5ea08cc9be3540', 'c7b3e44899acd214e189625050894df5', 'baf650d3959d48d8373958cff131d379', 'cbcebade24e9e8896b60542b50ead644', 'e21a3ff2da1e494d9d308a682029e5db', '751a9d98fe5da2df15f22a225025e460', '471411ce5da6eaa861b222f8d8ee4da0', '7f2968f28658b9c129ff0fa776d3f855', 'b7c97d8346663e429f5ab065e57796ec', '6f79a9f443828da71e10ae557546f5eb', 'a2b9b4ec95033b72ef78d43b5b21ca75', '4a78560bfdb04e69ebe1e06778921d7f', '50a244af6656621f01df104d5576abf4', '8bf81f9091dfd2150772e31cb010b85c', '7c2ca81bf932010f81092ffd6e

KeyboardInterrupt: 