In [1]:
import os
import pandas as pd
import requests
import urllib.request
from urllib.parse import urlparse, quote
import re
from disk_image_handling.c64_disk_image_processing import Corpus

# 1. Load data

In [2]:
# Open the four source tables (all handles are open before anything is parsed)
# and read each one into its own DataFrame. The main wiki export is UTF-16;
# the three scene-database exports are UTF-8.
with open('csv/mw_import.csv', encoding='utf-16') as main_file:
    with open('csv/diskmags_csdb.csv', encoding='utf-8') as csdb_file:
        with open('csv/diskmags_demozoo.csv', encoding='utf-8') as demozoo_file:
            with open('csv/diskmags_pouet.csv', encoding='utf-8') as pouet_file:
                df_main = pd.read_csv(main_file)
                df_csdb = pd.read_csv(csdb_file)
                df_demozoo = pd.read_csv(demozoo_file)
                df_pouet = pd.read_csv(pouet_file)

## 1.1 Create subset of German-language magazines for the Commodore 64

In [3]:
def _cell_mentions(column, needle):
    """Return a boolean mask over ``df_main``: True where ``column`` holds a
    string that contains ``needle``. Non-string cells (e.g. NaN) yield False."""
    return df_main[column].apply(
        lambda cell: isinstance(cell, str) and needle in cell
    ).astype(bool)


# Keep only magazines whose language list mentions German AND whose system
# list mentions the Commodore 64.
german_c64_mask = (
    _cell_mentions('Magazine[Language]', 'German')
    & _cell_mentions('Magazine[Systems]', 'Commodore 64')
)

# .copy() turns the filtered rows into an independent frame, so adding a column
# below neither triggers SettingWithCopyWarning nor depends on whether pandas
# handed back a view of df_main.
df_main_german = df_main[german_c64_mask].copy()

# Lower-cased system list; None where the cell is not a string.
df_main_german['systems_lower'] = df_main_german['Magazine[Systems]'].apply(
    lambda x: x.lower() if isinstance(x, str) else None
)

# Persist the title subset for later steps / manual inspection.
df_main_german.to_csv('csv/c64_diskmag_titles.csv', index=False)

## 1.2 Prepare other datasets

In [4]:
def _lowercase_or_none(value):
    """Return ``value`` lower-cased when it is a string, otherwise ``None``."""
    if isinstance(value, str):
        return value.lower()
    return None


# Add a lower-cased copy of the platform label to each external catalogue.
for catalogue in (df_csdb, df_demozoo, df_pouet):
    catalogue['system_lower'] = catalogue['platform'].apply(_lowercase_or_none)

# 2. Get the list of all issues

In [5]:
# Fields taken over from every source catalogue into the combined issue list.
columns = [
    'title', 'issue', 'issue_normalized',
    'link', 'download_links',
    'release_converted', 'group',
]

# Empty frame with exactly those fields; it is filled by the cell that follows.
issue_col = pd.DataFrame(columns=columns)

In [6]:
def _issues_for_title(catalogue, title, source_name, platform=None):
    """Return the rows of ``catalogue`` that belong to the magazine ``title``.

    Parameters
    ----------
    catalogue : pd.DataFrame
        One of the external issue tables; must contain every name in
        ``columns`` plus ``platform`` when a platform filter is requested.
    title : str
        Magazine title to match exactly against the ``title`` column.
    source_name : str
        Label written to the added ``source`` column.
    platform : str or None
        When given, keep only rows whose ``platform`` text contains this
        literal substring; rows with a missing platform are dropped.

    Returns
    -------
    pd.DataFrame
        The matching rows restricted to ``columns`` plus ``source``.
    """
    matches = catalogue[catalogue['title'] == title]
    if platform is not None:
        # na=False: a missing platform counts as "no match" instead of putting
        # NaN into the boolean mask. regex=False: plain substring search.
        on_platform = matches['platform'].str.contains(platform, na=False, regex=False)
        matches = matches[on_platform]
    # Explicit copy so the new column is not written onto a slice.
    subset = matches[columns].copy()
    subset['source'] = source_name
    return subset


# (label as it appears in 'Magazine[Source]', catalogue, platform filter).
# The CSDB table is taken unfiltered; Demozoo and Pouet are restricted to
# rows that mention the Commodore 64.
source_catalogues = [
    ('CSDB', df_csdb, None),
    ('Demozoo', df_demozoo, 'Commodore 64'),
    ('Pouet', df_pouet, 'Commodore 64'),
]

# Collect the per-title subsets in a list and concatenate once at the end:
# this avoids quadratic re-copying and makes the cell safe to re-run, because
# the result no longer accumulates onto whatever issue_col already held.
issue_parts = []
for index, row in df_main_german.iterrows():
    if not isinstance(row['systems_lower'], str):
        continue

    title = row['Title']
    raw_sources = row['Magazine[Source]']
    # A magazine without any listed source (NaN) simply contributes no issues.
    source = raw_sources.split('; ') if isinstance(raw_sources, str) else []

    for source_name, catalogue, platform in source_catalogues:
        if source_name in source:
            issue_parts.append(_issues_for_title(catalogue, title, source_name, platform))

if issue_parts:
    issue_col = pd.concat(issue_parts)
else:
    # Nothing matched: keep an empty frame with the expected fields.
    issue_col = pd.DataFrame(columns=columns + ['source'])

In [7]:
def _lowercase_key(values):
    """Sort key: the column's strings converted to lower case."""
    return values.str.lower()


# Order the combined list by normalized issue label, ignoring letter case.
issue_col = issue_col.sort_values(by='issue_normalized', key=_lowercase_key)

In [8]:
# Write the sorted issue list to disk, without the index column.
issues_csv_path = 'csv/c64_diskmag_issues.csv'
issue_col.to_csv(issues_csv_path, index=False)

# 3. Start download

In [9]:
# Root folder (relative to the working directory) under which the download
# loop stores files as <title>/<issue>/<file>; sub-folders are created on
# demand by that loop.
base_path = 'disk_images_automatically_collected/'

In [10]:
# Normalized issue labels for which a file has already been fetched; the
# download loop consults and extends this set.
downloaded_issues: set = set()

In [11]:
def _encode_download_url(url):
    """Return ``url`` percent-encoded so that urllib can request it.

    Characters that are not legal in a URL (spaces, non-ASCII letters, ...)
    are percent-encoded. URL delimiters and existing ``%XX`` escapes are kept
    as they are, so query strings and already-encoded links are not mangled.
    """
    return quote(url, safe=":/?#[]@!$&'()*+,;=%")


# Fetch one file per issue: walk the issue's candidate links in order and stop
# at the first one that downloads without an error.
for index, row in issue_col.iterrows():
    title = row['title']
    issue = row['issue_normalized']
    download_links = row['download_links']

    # Rows without a link string (e.g. NaN) have nothing to download.
    if not isinstance(download_links, str):
        continue
    download_links = download_links.split(', ')

    # NOTE(review): the set is keyed by the issue label alone; if two magazines
    # can share the same normalized label, the second one is skipped here.
    # Confirm that 'issue_normalized' is unique across titles.
    if issue in downloaded_issues:
        continue

    # Target folder is the same for every candidate link of this issue.
    folder_path = os.path.join(base_path, f'{title}/{issue}')

    for url in download_links:
        if 'download.php' in url:
            continue
        try:
            url = url.strip()
            filename = os.path.basename(urlparse(url).path)
            if not filename:
                # Without a file name the target would be the folder itself.
                print(f'Error downloading file: no file name in {url}')
                continue
            os.makedirs(folder_path, exist_ok=True)
            full_path = os.path.join(folder_path, filename)
            # Request the percent-encoded form; the raw link may contain
            # spaces or other characters that urllib refuses to send.
            encoded_url = _encode_download_url(url)
            urllib.request.urlretrieve(encoded_url, filename=full_path)
            downloaded_issues.add(issue)
            print(f'Downloading {filename}')

            break
        except Exception as e:
            # Best effort: report the failure and try the issue's next link.
            print(f'Error downloading file: {e}')

Downloading actionnews01.zip
Downloading actionnews02.zip
Downloading actionnews03.zip
Downloading actionnews04.zip
Downloading Action_News_05.zip
Downloading Action_News_06.zip
Downloading actionnews08.zip
Downloading Action_News_08.zip
Downloading Action_News_09.zip
Downloading atn1.zip
Downloading atn2.zip
Downloading atn3.zip
Downloading atn4.zip
Downloading atn5.zip
Downloading atn6.zip
Downloading Bioblech_01.zip
Downloading Bioblech_02.zip
Downloading bioblech03.zip
Downloading biometal01.zip
Downloading biometal02.zip
Downloading biometal03.zip
Downloading biometal04.zip
Downloading chaos_1_.zip
Downloading Chaos 
Downloading Chaos_03.zip
Downloading CN5.zip
Downloading DarkStar13.zip
Downloading Digital-News-April-88-2Sides-ESI.zip
Downloading Digital_News_88_02.zip
Downloading Digital-News-88-01.zip
Downloading Digital_News_88_02.zip
Downloading Digital_news_RADWAR.d64
Downloading digitaltalk_01.zip
Downloading DigitalTalk_01.zip
Downloading DT100.zip
Downloading DT101.zip
Do