In [1]:
import os
import pandas as pd
import requests
from urllib.parse import urlparse
import re
from disk_image_handling.c64_disk_image_processing import Corpus

# 1. Load data

In [2]:
with open('csv/mw_import.csv', encoding='utf-16') as main_dataset, open('csv/diskmags_csdb.csv', encoding='utf-8') as csdb_dataset, open('csv/diskmags_demozoo.csv', encoding='utf-8') as demozoo_dataset, open('csv/diskmags_pouet.csv', encoding='utf-8') as pouet_dataset:
    df_main = pd.read_csv(main_dataset)
    df_csdb = pd.read_csv(csdb_dataset)
    df_demozoo = pd.read_csv(demozoo_dataset)
    df_pouet = pd.read_csv(pouet_dataset)

## 1.1 Create subset of German-language magazines for the Commodore 64

In [3]:
df_main_german = df_main[df_main['Magazine[Language]'].apply(lambda x: isinstance(x,str) and 'German' in x.split('; '))]
df_main_german = df_main_german[df_main_german['Magazine[Systems]'].apply(lambda x: isinstance(x,str) and 'Commodore 64' in x.split('; '))]
df_main_german.to_csv('csv/c64_diskmag_titles.csv', index=False)
df_main_german['systems_lower'] = df_main_german['Magazine[Systems]'].apply(lambda x: x.lower() if isinstance(x, str) else None)

## 1.2 Prepare other datsets

In [5]:
df_csdb['system_lower'] = df_csdb['platform'].apply(lambda x: x.lower() if isinstance(x, str) else None)
df_demozoo['system_lower'] = df_demozoo['platform'].apply(lambda x: x.lower() if isinstance(x, str) else None)
df_pouet['system_lower'] = df_pouet['platform'].apply(lambda x: x.lower() if isinstance(x, str) else None)

# 2. Get the list of all issues

In [6]:
german_diskmags_list = []

# Iterate over all rows of the main dataset
for index, row in df_main_german.iterrows():
    if not isinstance(row['systems_lower'], str): continue
    # Get values
    title = row['Title'].split('; ')
    system = row['systems_lower'].split('; ')
    source = row['Magazine[Source]'].split('; ')
    
    # CSDB
    if 'CSDB' in source:
        subset_csdb = df_csdb[df_csdb['title'].apply(lambda x: any(item in x.split(', ') for item in title))]
        subset_csdb_cleaned = subset_csdb[['title', 'issue', 'download_links', 'platform']]
        german_diskmags_list.append(subset_csdb_cleaned)
        
    # Demozoo
    if 'Demozoo' in source:
        subset_demozoo = df_demozoo[df_demozoo['title'].apply(lambda x: any(item in x.split(', ') for item in title))]
        subset_demozoo_cleaned = subset_demozoo[subset_demozoo['system_lower'].apply(lambda x: any(item in x for item in system))]
        if not subset_demozoo_cleaned.empty:
            subset_demozoo_cleaned = subset_demozoo_cleaned[['title', 'issue', 'download_links', 'platform']]
            german_diskmags_list.append(subset_demozoo_cleaned)
            
    # Pouet
    if 'Pouet' in source:
        subset_pouet = df_pouet[df_pouet['title'].apply(lambda x: any(item in x.split(', ') for item in title))]
        subset_pouet_cleaned = subset_pouet[subset_pouet['system_lower'].apply(lambda x: any(item in x for item in system))]
        if not subset_pouet_cleaned.empty:
            subset_pouet_cleaned = subset_pouet_cleaned[['title', 'issue', 'download_links', 'platform']]
            german_diskmags_list.append(subset_pouet_cleaned)
            
# Concat subsets to the dataframe
subset_issues_german = pd.concat(german_diskmags_list, ignore_index=True)
subset_issues_german = subset_issues_german[~subset_issues_german['download_links'].duplicated(keep='first')] # Remove duplicates
subset_issues_german = subset_issues_german[~subset_issues_german['issue'].duplicated(keep='first')] # Remove duplicates
subset_issues_german = subset_issues_german.sort_values(by='issue', key=lambda x: x.str.lower())

In [7]:
subset_issues_german.to_csv('csv/c64_diskmag_issues.csv', index=False)

# 3. Download disk images

In [8]:
# Get sorted list of titles
titles = subset_issues_german['title'].tolist()
titles = sorted(set(titles))

In [9]:
os.mkdir('../data/disk_images')
#os.chdir('disk_images')

In [10]:
supplement = []

for index, row in subset_issues_german.iterrows():
    # Get data
    title = row['title']
    issue = row['issue']
    sanitized_issue = ''.join(c for c in issue if c not in '\/:*?<>|')
    platform = row['platform']
    download_urls = row['download_links']

    # Create folder for each magazine
    path_magazine = os.path.join('disk_images', title)
    if not os.path.exists(path_magazine):
        os.mkdir(path_magazine)

    # Check download links, if empty -> skip iteration
    if isinstance(download_urls, str):
        download_urls = [elem for elem in download_urls.split(', ') if not elem.startswith('ftp')]
    else:
        continue

    # Create folder for the issue
    path_magazine_issue = os.path.join(path_magazine, sanitized_issue)
    if not os.path.exists(path_magazine_issue):
        os.mkdir(path_magazine_issue)

    files = set()
    used_urls = []
    for url in download_urls:
        #parsed_url = urlparse(url)
        #file_name = os.path.basename(parsed_url.path)
        #file_name = re.sub(r'[%#_-]+', '', file_name).lower()
        file_name = os.path.basename(url)
        file_name = re.sub(r'[\s%#_-]+', '', file_name).lower()
        if file_name in files:
            continue
        files.add(file_name)
        try:
            response = requests.get(url, allow_redirects=True)
            if response.status_code == 200:
                file_path = os.path.join(path_magazine_issue, file_name)
                with open(file_path, 'wb') as file:
                    file.write(response.content)
                print(f'Downloaded {file_name}')
                files.add(file_name)
                used_urls.append(url)
            else:
                print(f'Failed to download {file_name}')
        except (ConnectionError, OSError) as e:
            print(f'Unable to locate file {file_name}\n{e}\n')

    # Track donwloaded files
    files = ', '.join(files)
    used_urls = ', '.join(used_urls)
    record = {'title': title, 'issue': issue, 'files': files, 'urls': used_urls, 'system': platform}
    supplement.append(record)


# Save the records to a CSV files
df_downloads = pd.DataFrame(supplement)
df_downloads = df_downloads.sort_values('issue', key=lambda x: x.str.lower()) # Sort by title
df_downloads.to_csv('disk_images/downloaded_files.csv', index=False)

Downloaded actionnews01.zip
Downloaded actionnews02.zip
Downloaded actionnews03.zip
Downloaded actionnews04.zip
Downloaded actionnews05.zip
Downloaded actionnews06.zip
Downloaded actionnews08.zip
Downloaded actionnews09sphericaldesigns.zip
Downloaded actionnews01.zip
Downloaded actionnews02.zip
Downloaded actionnews03.zip
Downloaded actionnews04.zip
Downloaded actionnews05.zip
Downloaded actionnews06.zip
Downloaded actionnews08.zip
Downloaded actionnews09.zip
Downloaded bioblech01.zip
Downloaded bioblech02.zip
Downloaded bioblech03.zip
Downloaded biometal01.zip
Downloaded biometal02.zip
Downloaded biometal03.zip
Downloaded biometal04.zip
Downloaded cn5.zip
Downloaded darkstar13.zip
Downloaded digitalnews8802.zip
Unable to locate file download.php?id=8442
No connection adapters were found for 'ftp://ftp.scs-trc.net/pub/c64/Magazines/Digital_Talk/DigitalTalk_01.zip'
Downloaded dt01.zip
Downloaded digitaltalk01.zip
Downloaded digitaltalk02.lzh
Downloaded dt02.zip
Downloaded 003.zip
Downlo

# 4. Delete empty folders

In [11]:
path = 'disk_images'
# Iterate over magazines
for magazine in os.listdir(path):
    magazine_path = os.path.join(path, magazine)
    if os.path.isdir(magazine_path):
        # Iterate over issues
        for issue in os.listdir(magazine_path):
            issue_path = os.path.join(magazine_path, issue)
            if os.path.isdir(issue_path):
                if not os.listdir(issue_path):
                    print(issue_path)
                    os.rmdir(issue_path)

disk_images\Digital Talk\Digital Talk #01
disk_images\Digital Talk\Digital Talk #06
disk_images\Digital Talk\Digital Talk #07
disk_images\Digital Talk\Digital Talk #08
disk_images\Digital Talk\Digital Talk #11
disk_images\Digital Talk\Digital Talk #12
disk_images\Digital Talk\Digital Talk #46
disk_images\Digital Talk\digital talk #56
disk_images\Digital Talk\Digital Talk #57
disk_images\Digital Talk\Digital Talk #59
disk_images\Digital Talk\Digital Talk #61
disk_images\Digital Talk\Digital Talk #62 (Green Tears)
disk_images\Digital Talk\Digital Talk #74
disk_images\Scene World\Scene World #11
disk_images\Scene World\Scene World #7
disk_images\Scene World\Scene World #8
disk_images\Scene World\Scene World #9
disk_images\Tiger-Disk, Tiger Disk\Tiger Disk #01
disk_images\Tiger-Disk, Tiger Disk\Tiger Disk #02
disk_images\Tiger-Disk, Tiger Disk\tiger disk #98


# 5. Unpack zips

In [12]:
corpus = Corpus(corpus_name='german diskmags', corpus_path=r'../data/disk_images')

In [13]:
corpus.unpack(remove_zip=True)

Unpacking .zip files:   3%|▎         | 14/499 [00:00<00:07, 66.61file/s]

Unable to open the archive "digitaltalk50.zip": File is not a zip file
Unable to open the archive "digitaltalk29.zip": File is not a zip file


Unpacking .zip files:   9%|▉         | 46/499 [00:00<00:09, 49.92file/s]

Unable to open the archive "digitaltalk30.zip": File is not a zip file
Unable to open the archive "digitaltalk34.zip": File is not a zip file


Unpacking .zip files:  13%|█▎        | 64/499 [00:01<00:08, 52.06file/s]

Unable to open the archive "digitaltalk51.zip": File is not a zip file
Unable to open the archive "digitaltalk24.zip": File is not a zip file
Unable to open the archive "digitaltalk48.zip": File is not a zip file


Unpacking .zip files:  17%|█▋        | 87/499 [00:01<00:08, 48.07file/s]

Unable to open the archive "swo30all.zip": That compression method is not supported
Unable to open the archive "digitaltalk19.zip": File is not a zip file


Unpacking .zip files:  28%|██▊       | 139/499 [00:02<00:08, 44.16file/s]

Unable to open the archive "digitaltalk28.zip": File is not a zip file


Unpacking .zip files:  30%|███       | 150/499 [00:03<00:07, 44.15file/s]

Unable to open the archive "digitaltalk22.zip": File is not a zip file


Unpacking .zip files:  32%|███▏      | 160/499 [00:03<00:08, 38.60file/s]

Unable to open the archive "digitaltalk49.zip": File is not a zip file
Unable to open the archive "digitaltalk39.zip": File is not a zip file
Unable to open the archive "digitaltalk53.zip": File is not a zip file


Unpacking .zip files:  35%|███▌      | 175/499 [00:03<00:08, 38.70file/s]

Unable to open the archive "digitaltalk43.zip": File is not a zip file
Unable to open the archive "digitaltalk20.zip": File is not a zip file
Unable to open the archive "swo28.zip": That compression method is not supported


Unpacking .zip files:  37%|███▋      | 185/499 [00:04<00:08, 38.74file/s]

Unable to open the archive "digitaltalk14.zip": File is not a zip file


Unpacking .zip files:  38%|███▊      | 191/499 [00:04<00:07, 41.64file/s]

Unable to open the archive "digitaltalk37.zip": File is not a zip file


Unpacking .zip files:  43%|████▎     | 216/499 [00:04<00:06, 41.39file/s]

Unable to open the archive "digitaltalk17.zip": File is not a zip file


Unpacking .zip files:  47%|████▋     | 233/499 [00:05<00:05, 48.88file/s]

Unable to open the archive "digitaltalk18.zip": File is not a zip file


Unpacking .zip files:  51%|█████     | 253/499 [00:05<00:05, 42.70file/s]

Unable to open the archive "digitaltalk26.zip": File is not a zip file
Unable to open the archive "digitaltalk52.zip": File is not a zip file


Unpacking .zip files:  54%|█████▎    | 268/499 [00:06<00:05, 40.50file/s]

Unable to open the archive "digitaltalk41.zip": File is not a zip file


Unpacking .zip files:  58%|█████▊    | 287/499 [00:06<00:05, 38.55file/s]

Unable to open the archive "digitaltalk16.zip": File is not a zip file


Unpacking .zip files:  62%|██████▏   | 311/499 [00:07<00:05, 35.66file/s]

Unable to open the archive "digitaltalk35.zip": File is not a zip file


Unpacking .zip files:  64%|██████▍   | 321/499 [00:07<00:04, 39.83file/s]

Unable to open the archive "digitaltalk27.zip": File is not a zip file
Unable to open the archive "digitaltalk38.zip": File is not a zip file


Unpacking .zip files:  76%|███████▌  | 378/499 [00:08<00:02, 44.13file/s]

Unable to open the archive "swo30.zip": That compression method is not supported
Unable to open the archive "digitaltalk13.zip": File is not a zip file


Unpacking .zip files:  85%|████████▍ | 422/499 [00:09<00:01, 51.14file/s]

Unable to open the archive "swo29.zip": That compression method is not supported


Unpacking .zip files:  93%|█████████▎| 466/499 [00:10<00:00, 42.76file/s]

Unable to open the archive "digitaltalk15.zip": File is not a zip file


Unpacking .zip files: 100%|██████████| 499/499 [00:12<00:00, 41.32file/s]


In [14]:
path = 'disk_images'
# Iterate over magazines
for magazine in os.listdir(path):
    magazine_path = os.path.join(path, magazine)
    if os.path.isdir(magazine_path):
        # Iterate over issues
        for issue in os.listdir(magazine_path):
            issue_path = os.path.join(magazine_path, issue)
            if os.path.isdir(issue_path):
                if not os.listdir(issue_path):
                    print(issue_path)
                    os.rmdir(issue_path)

disk_images\Digital Talk\Digital Talk #13
disk_images\Digital Talk\Digital Talk #14
disk_images\Digital Talk\Digital Talk #15
disk_images\Digital Talk\Digital Talk #16
disk_images\Digital Talk\Digital Talk #17
disk_images\Digital Talk\Digital Talk #18
disk_images\Digital Talk\Digital Talk #19
disk_images\Digital Talk\Digital Talk #20
disk_images\Digital Talk\Digital Talk #22
disk_images\Digital Talk\Digital Talk #24
disk_images\Digital Talk\Digital Talk #26
disk_images\Digital Talk\Digital Talk #27
disk_images\Digital Talk\Digital Talk #28
disk_images\Digital Talk\Digital Talk #29
disk_images\Digital Talk\Digital Talk #30
disk_images\Digital Talk\Digital Talk #34
disk_images\Digital Talk\Digital Talk #35
disk_images\Digital Talk\Digital Talk #37
disk_images\Digital Talk\Digital Talk #38
disk_images\Digital Talk\Digital Talk #39
disk_images\Digital Talk\Digital Talk #41
disk_images\Digital Talk\Digital Talk #43
disk_images\Digital Talk\Digital Talk #48
disk_images\Digital Talk\Digital T