In [1]:
!pip install internetarchive requests

Collecting internetarchive
  Downloading internetarchive-5.4.0-py3-none-any.whl.metadata (4.7 kB)
Downloading internetarchive-5.4.0-py3-none-any.whl (105 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.9/105.9 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: internetarchive
Successfully installed internetarchive-5.4.0


In [2]:
import os
import requests
from internetarchive import search_items, get_item, download

def search_internet_archive(query, max_results=5):
    """
    Search IA for items that mention `query` in title OR subject OR description.
    Returns up to max_results dicts with 'identifier' and 'title'.
    """
    # broadened JQL to search multiple metadata fields
    jql = (
        f'title:({query}) '
        f'OR subject:({query}) '
        f'OR description:({query})'
    )
    results = []
    for count, item in enumerate(search_items(jql), start=1):
        results.append({
            'identifier': item['identifier'],
            'title': item.get('title', '(no title)')
        })
        if count >= max_results:
            break
    return results

def download_from_internet_archive(
    identifier,
    download_dir='downloads',
    max_downloads=10,
    extensions=('.pdf', '.lcpdf', '.txt', '.djvu', '.epub')
):
    """
    Download up to max_downloads files for `identifier` whose names end
    with one of extensions, skipping only those explicitly marked restricted.
    """
    item = get_item(identifier)
    os.makedirs(os.path.join(download_dir, identifier), exist_ok=True)

    to_fetch = []
    for f in item.files:
        name = f.get('name', '')
        if any(name.lower().endswith(ext) for ext in extensions):
            if f.get('restricted') == '1':
                print(f"Skipping restricted: {name}")
                continue
            to_fetch.append(name)
            if len(to_fetch) >= max_downloads:
                break

    if not to_fetch:
        print(f"No public files with extensions {extensions} for '{identifier}'")
        return

    try:
        download(
            identifier,
            files=to_fetch,
            destdir=os.path.join(download_dir, identifier),
            verbose=True
        )
    except Exception as e:
        print(f"Error downloading from {identifier}: {e}")

def download_gutenberg_text(book_id, save_dir='downloads/gutenberg'):
    """
    Download the plain-text version of a Gutenberg book by its ID.
    """
    os.makedirs(save_dir, exist_ok=True)
    for variant in (f'{book_id}-0.txt', f'{book_id}.txt'):
        url = f'https://www.gutenberg.org/files/{book_id}/{variant}'
        try:
            r = requests.get(url)
            r.raise_for_status()
            path = os.path.join(save_dir, f'gutenberg_{book_id}.txt')
            with open(path, 'w', encoding='utf-8') as out:
                out.write(r.text)
            print(f"Downloaded Gutenberg book {book_id} as {variant}")
            return
        except requests.HTTPError:
            continue
    print(f"No text file found for Gutenberg ID {book_id}")

if __name__ == '__main__':
    # 1) search & download IA items for "Amazon archaeology"
    items = search_internet_archive('Amazon archaeology', max_results=100)
    for itm in items:
        print(f"Found “{itm['title']}” (ID: {itm['identifier']})")
        download_from_internet_archive(
            itm['identifier'],
            max_downloads=10
        )

    # 2) Gutenberg examples (optional)
    for gid in (12472, 22752):
        download_gutenberg_text(gid)

Found “(no title)” (ID: 2010-mattievich-enrico-journey-to-the-mythological-inferno-americas-discovery-by)


2010-mattievich-enrico-journey-to-the-mythological-inferno-americas-discovery-by:
 downloading 1995 Enrico Mattievich ~ Tαξίδι στη μυθολογική κόλαση - Η ανακάλυψη της Αμερικής από τους Έλληνες  #1#.pdf: 100%|██████████| 9.91M/9.91M [00:00<00:00, 13.4MiB/s]
 downloading 1995 Enrico Mattievich ~ Tαξίδι στη μυθολογική κόλαση - Η ανακάλυψη της Αμερικής από τους Έλληνες  #1#_djvu.txt: 100%|██████████| 480k/480k [00:00<00:00, 1.58MiB/s]
 downloading 2010 Mattievich, Enrico ~ Journey to the Mythological Inferno_ America's Discovery by the Ancient Greeks [Rogem Press] _.pdf: 100%|██████████| 34.2M/34.2M [00:01<00:00, 23.0MiB/s]
 downloading 2010 Mattievich, Enrico ~ Journey to the Mythological Inferno_ America's Discovery by the Ancient Greeks [Rogem Press] __djvu.txt: 100%|██████████| 424k/424k [00:00<00:00, 1.41MiB/s]


Found “(no title)” (ID: 3CreepyTv)
No public files with extensions ('.pdf', '.lcpdf', '.txt', '.djvu', '.epub') for '3CreepyTv'
Found “(no title)” (ID: 4lv08yhcls19uobhwcsegehdjg8x2ptbysyaev8x)
No public files with extensions ('.pdf', '.lcpdf', '.txt', '.djvu', '.epub') for '4lv08yhcls19uobhwcsegehdjg8x2ptbysyaev8x'
Found “(no title)” (ID: ABudABowlShowHD420PodcastHistoryOfCannabisInTheAncientWorldBCPartTwo)
No public files with extensions ('.pdf', '.lcpdf', '.txt', '.djvu', '.epub') for 'ABudABowlShowHD420PodcastHistoryOfCannabisInTheAncientWorldBCPartTwo'
Found “(no title)” (ID: AlfredWebreAudio)
No public files with extensions ('.pdf', '.lcpdf', '.txt', '.djvu', '.epub') for 'AlfredWebreAudio'
Found “(no title)” (ID: AncientSuppressedDiscoveriesWithJonathanGray)
No public files with extensions ('.pdf', '.lcpdf', '.txt', '.djvu', '.epub') for 'AncientSuppressedDiscoveriesWithJonathanGray'
Found “(no title)” (ID: AssemblyLinesPodcast83)
No public files with extensions ('.pdf', '.lcpdf

ERIC_ED371971:
 downloading ERIC_ED371971.djvu: 100%|██████████| 3.67M/3.67M [00:00<00:00, 5.45MiB/s]
 downloading ERIC_ED371971.pdf: 100%|██████████| 1.35M/1.35M [00:00<00:00, 3.05MiB/s]
 downloading ERIC_ED371971_djvu.txt: 124kiB [00:00, 1.59MiB/s]


Found “(no title)” (ID: KIDE_91_3_FM_20170217_170000)
No public files with extensions ('.pdf', '.lcpdf', '.txt', '.djvu', '.epub') for 'KIDE_91_3_FM_20170217_170000'
Found “(no title)” (ID: KIDE_91_3_FM_20170218_020000)
No public files with extensions ('.pdf', '.lcpdf', '.txt', '.djvu', '.epub') for 'KIDE_91_3_FM_20170218_020000'
Found “(no title)” (ID: KIDE_91_3_FM_20170224_170000)
No public files with extensions ('.pdf', '.lcpdf', '.txt', '.djvu', '.epub') for 'KIDE_91_3_FM_20170224_170000'
Found “(no title)” (ID: KPFA_94_1_FM_20180312_010000)
No public files with extensions ('.pdf', '.lcpdf', '.txt', '.djvu', '.epub') for 'KPFA_94_1_FM_20180312_010000'
Found “(no title)” (ID: KSKA_91_1_FM_20170218_050000)
No public files with extensions ('.pdf', '.lcpdf', '.txt', '.djvu', '.epub') for 'KSKA_91_1_FM_20170218_050000'
Found “(no title)” (ID: ManlyPhall)


ManlyPhall:
 error downloading file downloads/ManlyPhall/ManlyPhall/Astrology-and-Reincarnation-Manly-Palmer-Hall.pdf, exception raised: 403 Client Error: Forbidden for url: https://ia902904.us.archive.org/34/items/ManlyPhall/Astrology-and-Reincarnation-Manly-Palmer-Hall.pdf


Error downloading from ManlyPhall: 403 Client Error: Forbidden for url: https://ia902904.us.archive.org/34/items/ManlyPhall/Astrology-and-Reincarnation-Manly-Palmer-Hall.pdf
Found “(no title)” (ID: NASA_NTRS_Archive_20030016612)


NASA_NTRS_Archive_20030016612:
 downloading NASA_NTRS_Archive_20030016612.pdf: 100%|██████████| 3.10M/3.10M [00:00<00:00, 5.31MiB/s]
 downloading NASA_NTRS_Archive_20030016612_djvu.txt: 100%|██████████| 225k/225k [00:00<00:00, 1.04MiB/s]


Found “(no title)” (ID: Pmb094PersonalSpaceBrianBennettGiantClawComoCarouselMakersBuddy)
No public files with extensions ('.pdf', '.lcpdf', '.txt', '.djvu', '.epub') for 'Pmb094PersonalSpaceBrianBennettGiantClawComoCarouselMakersBuddy'
Found “(no title)” (ID: ROTAmiolo)


ROTAmiolo:
 downloading ROTAmiolo.pdf: 100%|██████████| 180M/180M [00:04<00:00, 43.5MiB/s]
 downloading ROTAmiolo_djvu.txt: 1.56MiB [00:00, 5.10MiB/s]


Found “(no title)” (ID: SanAgustnastronautaDePiedraSingle)
No public files with extensions ('.pdf', '.lcpdf', '.txt', '.djvu', '.epub') for 'SanAgustnastronautaDePiedraSingle'
Found “(no title)” (ID: VOA_Africa_20181105_190000)
No public files with extensions ('.pdf', '.lcpdf', '.txt', '.djvu', '.epub') for 'VOA_Africa_20181105_190000'
Found “(no title)” (ID: VOA_Global_English_20181105_130000)
No public files with extensions ('.pdf', '.lcpdf', '.txt', '.djvu', '.epub') for 'VOA_Global_English_20181105_130000'
Found “(no title)” (ID: VOA_Global_English_20181105_190000)
No public files with extensions ('.pdf', '.lcpdf', '.txt', '.djvu', '.epub') for 'VOA_Global_English_20181105_190000'
Found “(no title)” (ID: VirtualSkeptics118242015)
No public files with extensions ('.pdf', '.lcpdf', '.txt', '.djvu', '.epub') for 'VirtualSkeptics118242015'
Found “(no title)” (ID: ancientastronautarchive)


ancientastronautarchive:
 downloading Lost Book of Enki, Memories and Prophecies of an Extraterrestrial God - Zecharia Sitchin.pdf: 100%|██████████| 970k/970k [00:00<00:00, 2.73MiB/s]
 downloading Lost Book of Enki, Memories and Prophecies of an Extraterrestrial God - Zecharia Sitchin_djvu.txt: 453kiB [00:00, 2.57MiB/s]
 downloading Lost Tablets of Enki.pdf: 100%|██████████| 945k/945k [00:00<00:00, 2.71MiB/s]
 downloading Lost Tablets of Enki_djvu.txt: 452kiB [00:00, 2.50MiB/s]


Found “(no title)” (ID: annualreviewofan15sieg)


annualreviewofan15sieg:
 downloading annualreviewofan15sieg.lcpdf: 100%|██████████| 27.6M/27.6M [00:10<00:00, 2.88MiB/s]
 error downloading file downloads/annualreviewofan15sieg/annualreviewofan15sieg/annualreviewofan15sieg.pdf, exception raised: 403 Client Error: Forbidden for url: https://ia801309.us.archive.org/1/items/annualreviewofan15sieg/annualreviewofan15sieg.pdf


Error downloading from annualreviewofan15sieg: 403 Client Error: Forbidden for url: https://ia801309.us.archive.org/1/items/annualreviewofan15sieg/annualreviewofan15sieg.pdf
Found “(no title)” (ID: archaologischeun0000hilb)


archaologischeun0000hilb:
 downloading archaologischeun0000hilb.lcpdf: 100%|██████████| 20.4M/20.4M [00:00<00:00, 25.7MiB/s]
 error downloading file downloads/archaologischeun0000hilb/archaologischeun0000hilb/archaologischeun0000hilb.pdf, exception raised: 403 Client Error: Forbidden for url: https://ia601009.us.archive.org/28/items/archaologischeun0000hilb/archaologischeun0000hilb.pdf


Error downloading from archaologischeun0000hilb: 403 Client Error: Forbidden for url: https://ia601009.us.archive.org/28/items/archaologischeun0000hilb/archaologischeun0000hilb.pdf
Found “(no title)” (ID: borderlines-false-self-unlike-narcissists-see-pinned-comment)
No public files with extensions ('.pdf', '.lcpdf', '.txt', '.djvu', '.epub') for 'borderlines-false-self-unlike-narcissists-see-pinned-comment'
Found “(no title)” (ID: davidicke)


davidicke:
 error downloading file downloads/davidicke/davidicke/David Icke - Alice In Wonderland And The World Trade Center Disaster.pdf, exception raised: 403 Client Error: Forbidden for url: https://ia802902.us.archive.org/9/items/davidicke/David%20Icke%20-%20Alice%20In%20Wonderland%20And%20The%20World%20Trade%20Center%20Disaster.pdf


Error downloading from davidicke: 403 Client Error: Forbidden for url: https://ia802902.us.archive.org/9/items/davidicke/David%20Icke%20-%20Alice%20In%20Wonderland%20And%20The%20World%20Trade%20Center%20Disaster.pdf
Found “(no title)” (ID: dudufilm)
No public files with extensions ('.pdf', '.lcpdf', '.txt', '.djvu', '.epub') for 'dudufilm'
Found “(no title)” (ID: encyclopediaofge00dina)


encyclopediaofge00dina:
 downloading encyclopediaofge00dina.lcpdf: 100%|██████████| 58.5M/58.5M [00:03<00:00, 16.2MiB/s]
 error downloading file downloads/encyclopediaofge00dina/encyclopediaofge00dina/encyclopediaofge00dina.pdf, exception raised: 403 Client Error: Forbidden for url: https://ia800504.us.archive.org/24/items/encyclopediaofge00dina/encyclopediaofge00dina.pdf


Error downloading from encyclopediaofge00dina: 403 Client Error: Forbidden for url: https://ia800504.us.archive.org/24/items/encyclopediaofge00dina/encyclopediaofge00dina.pdf
Found “(no title)” (ID: encyclopediaofge00dina_0)


encyclopediaofge00dina_0:
 downloading encyclopediaofge00dina_0.lcpdf: 100%|██████████| 55.6M/55.6M [00:01<00:00, 31.3MiB/s]
 error downloading file downloads/encyclopediaofge00dina_0/encyclopediaofge00dina_0/encyclopediaofge00dina_0.pdf, exception raised: 401 Client Error: Unauthorized for url: https://dn790007.ca.archive.org/0/items/encyclopediaofge00dina_0/encyclopediaofge00dina_0.pdf


Error downloading from encyclopediaofge00dina_0: 401 Client Error: Unauthorized for url: https://dn790007.ca.archive.org/0/items/encyclopediaofge00dina_0/encyclopediaofge00dina_0.pdf
Found “(no title)” (ID: encyclopediaofge00dina_1)


encyclopediaofge00dina_1:
 downloading encyclopediaofge00dina_1.lcpdf: 100%|██████████| 54.6M/54.6M [00:02<00:00, 28.6MiB/s]
 error downloading file downloads/encyclopediaofge00dina_1/encyclopediaofge00dina_1/encyclopediaofge00dina_1.pdf, exception raised: 401 Client Error: Unauthorized for url: https://dn790001.ca.archive.org/0/items/encyclopediaofge00dina_1/encyclopediaofge00dina_1.pdf


Error downloading from encyclopediaofge00dina_1: 401 Client Error: Unauthorized for url: https://dn790001.ca.archive.org/0/items/encyclopediaofge00dina_1/encyclopediaofge00dina_1.pdf
Found “(no title)” (ID: encyclopediaofwo0000unse_d8h7)


encyclopediaofwo0000unse_d8h7:
 downloading encyclopediaofwo0000unse_d8h7.lcpdf: 100%|██████████| 46.3M/46.3M [00:01<00:00, 27.6MiB/s]
 error downloading file downloads/encyclopediaofwo0000unse_d8h7/encyclopediaofwo0000unse_d8h7/encyclopediaofwo0000unse_d8h7.pdf, exception raised: 403 Client Error: Forbidden for url: https://ia803202.us.archive.org/26/items/encyclopediaofwo0000unse_d8h7/encyclopediaofwo0000unse_d8h7.pdf


Error downloading from encyclopediaofwo0000unse_d8h7: 403 Client Error: Forbidden for url: https://ia803202.us.archive.org/26/items/encyclopediaofwo0000unse_d8h7/encyclopediaofwo0000unse_d8h7.pdf
Found “(no title)” (ID: eustacemullinscollection1)


eustacemullinscollection1:
 error downloading file downloads/eustacemullinscollection1/eustacemullinscollection1/Eusatce Mullins Secrets of The Fed.pdf, exception raised: 403 Client Error: Forbidden for url: https://ia802902.us.archive.org/13/items/eustacemullinscollection1/Eusatce%20Mullins%20Secrets%20of%20The%20Fed.pdf


Error downloading from eustacemullinscollection1: 403 Client Error: Forbidden for url: https://ia802902.us.archive.org/13/items/eustacemullinscollection1/Eusatce%20Mullins%20Secrets%20of%20The%20Fed.pdf
Found “(no title)” (ID: michaeltsarion)


michaeltsarion:
 error downloading file downloads/michaeltsarion/michaeltsarion/Michael Tsarion - Astro-Theology And Sidereal Mythology.pdf, exception raised: 403 Client Error: Forbidden for url: https://ia902802.us.archive.org/7/items/michaeltsarion/Michael%20Tsarion%20-%20Astro-Theology%20And%20Sidereal%20Mythology.pdf


Error downloading from michaeltsarion: 403 Client Error: Forbidden for url: https://ia902802.us.archive.org/7/items/michaeltsarion/Michael%20Tsarion%20-%20Astro-Theology%20And%20Sidereal%20Mythology.pdf
Found “(no title)” (ID: moundbuildersofa0000roos)


moundbuildersofa0000roos:
 downloading moundbuildersofa0000roos.lcpdf: 100%|██████████| 38.1M/38.1M [00:01<00:00, 27.7MiB/s]
 error downloading file downloads/moundbuildersofa0000roos/moundbuildersofa0000roos/moundbuildersofa0000roos.pdf, exception raised: 401 Client Error: Unauthorized for url: https://dn720006.ca.archive.org/0/items/moundbuildersofa0000roos/moundbuildersofa0000roos.pdf


Error downloading from moundbuildersofa0000roos: 401 Client Error: Unauthorized for url: https://dn720006.ca.archive.org/0/items/moundbuildersofa0000roos/moundbuildersofa0000roos.pdf
Found “(no title)” (ID: n0djrpzdyullrq6h1we6qrpb1ii1owgv9wacm9ow)
No public files with extensions ('.pdf', '.lcpdf', '.txt', '.djvu', '.epub') for 'n0djrpzdyullrq6h1we6qrpb1ii1owgv9wacm9ow'
Found “(no title)” (ID: nitaiveda)
No public files with extensions ('.pdf', '.lcpdf', '.txt', '.djvu', '.epub') for 'nitaiveda'
Found “(no title)” (ID: paulwallis)
No public files with extensions ('.pdf', '.lcpdf', '.txt', '.djvu', '.epub') for 'paulwallis'
Found “(no title)” (ID: postcolonialamaz0000penr_e8i4)


postcolonialamaz0000penr_e8i4:
 downloading postcolonialamaz0000penr_e8i4.lcpdf: 100%|██████████| 21.2M/21.2M [00:01<00:00, 20.8MiB/s]
 error downloading file downloads/postcolonialamaz0000penr_e8i4/postcolonialamaz0000penr_e8i4/postcolonialamaz0000penr_e8i4.pdf, exception raised: 401 Client Error: Unauthorized for url: https://dn720006.ca.archive.org/0/items/postcolonialamaz0000penr_e8i4/postcolonialamaz0000penr_e8i4.pdf


Error downloading from postcolonialamaz0000penr_e8i4: 401 Client Error: Unauthorized for url: https://dn720006.ca.archive.org/0/items/postcolonialamaz0000penr_e8i4/postcolonialamaz0000penr_e8i4.pdf
Found “(no title)” (ID: retncrv2006vtarchaeologyjpetersentribute)
No public files with extensions ('.pdf', '.lcpdf', '.txt', '.djvu', '.epub') for 'retncrv2006vtarchaeologyjpetersentribute'
Found “(no title)” (ID: rwugbiupl1umttkibxzx4vjh4jd0c8nszblewbpb)
No public files with extensions ('.pdf', '.lcpdf', '.txt', '.djvu', '.epub') for 'rwugbiupl1umttkibxzx4vjh4jd0c8nszblewbpb'
Found “(no title)” (ID: siytdc-Aerial_Archaeology_-_STEM_in_30_-_Season_11_-_Episode_3)
No public files with extensions ('.pdf', '.lcpdf', '.txt', '.djvu', '.epub') for 'siytdc-Aerial_Archaeology_-_STEM_in_30_-_Season_11_-_Episode_3'
Found “(no title)” (ID: swastika-remixed-third-reich-documentary)
No public files with extensions ('.pdf', '.lcpdf', '.txt', '.djvu', '.epub') for 'swastika-remixed-third-reich-documentary

voltairecollection:
 error downloading file downloads/voltairecollection/voltairecollection/VOL9.pdf, exception raised: 401 Client Error: Unauthorized for url: https://dn790000.ca.archive.org/0/items/voltairecollection/VOL9.pdf


Error downloading from voltairecollection: 401 Client Error: Unauthorized for url: https://dn790000.ca.archive.org/0/items/voltairecollection/VOL9.pdf
Found “(no title)” (ID: wcn8gzsww7tumgb4jqtbmlmidolufmmsdabftrhj)
No public files with extensions ('.pdf', '.lcpdf', '.txt', '.djvu', '.epub') for 'wcn8gzsww7tumgb4jqtbmlmidolufmmsdabftrhj'
Found “(no title)” (ID: wgcgccnc0coljdauunpmvxjptjtutyvzibqbhw8r)
No public files with extensions ('.pdf', '.lcpdf', '.txt', '.djvu', '.epub') for 'wgcgccnc0coljdauunpmvxjptjtutyvzibqbhw8r'
Found “(no title)” (ID: worldheritagecon0000cave)


worldheritagecon0000cave:
 downloading worldheritagecon0000cave.lcpdf: 100%|██████████| 20.3M/20.3M [00:14<00:00, 1.43MiB/s]
 error downloading file downloads/worldheritagecon0000cave/worldheritagecon0000cave/worldheritagecon0000cave.pdf, exception raised: 401 Client Error: Unauthorized for url: https://dn790006.ca.archive.org/0/items/worldheritagecon0000cave/worldheritagecon0000cave.pdf


Error downloading from worldheritagecon0000cave: 401 Client Error: Unauthorized for url: https://dn790006.ca.archive.org/0/items/worldheritagecon0000cave/worldheritagecon0000cave.pdf
Found “(no title)” (ID: ybpzahaflaub4f0vhelm5dt1inay8qlsshvlvso4)
No public files with extensions ('.pdf', '.lcpdf', '.txt', '.djvu', '.epub') for 'ybpzahaflaub4f0vhelm5dt1inay8qlsshvlvso4'
Found “(no title)” (ID: youtube--3oSd8DC8Kc)
No public files with extensions ('.pdf', '.lcpdf', '.txt', '.djvu', '.epub') for 'youtube--3oSd8DC8Kc'
Found “(no title)” (ID: youtube-AW55J2zE3N4)
No public files with extensions ('.pdf', '.lcpdf', '.txt', '.djvu', '.epub') for 'youtube-AW55J2zE3N4'
Found “(no title)” (ID: youtube-DmKU44ZeuRQ)
No public files with extensions ('.pdf', '.lcpdf', '.txt', '.djvu', '.epub') for 'youtube-DmKU44ZeuRQ'
Found “(no title)” (ID: youtube-FsrYR52Tv54)
No public files with extensions ('.pdf', '.lcpdf', '.txt', '.djvu', '.epub') for 'youtube-FsrYR52Tv54'
Found “(no title)” (ID: youtube-Mhs