In [1]:
import time
import re
import pandas as pd
import requests
import json
from bs4 import BeautifulSoup

In [8]:
class RadioUKScraper:
    def __init__(self):
        self.base_url = 'https://www.radio-uk.co.uk'
        self.all_channels = []
        self.channel_file = 'all_channels.json'
        self.channel_data_file = 'channel_data.json'
        self.data = []
        self.channel_no = 0

    def get_channel_urls(self, overwrite=False):
        if not overwrite:
            try:
                with open(self.channel_file, 'r', encoding='utf-8') as f:
                    self.all_channels = json.load(f)
                return f"SUCCESS: Channel URLs loaded from {self.channel_file} with {len(self.all_channels)} entries."
            except FileNotFoundError:
                print(f"File {self.channel_file} not found. Fetching channel URLs...")
                overwrite = True
        if overwrite:
            for x in range(1, 62):
                soup = BeautifulSoup(requests.get(f'{self.base_url}/?page={x}').text, 'html.parser')
                for tile in soup.select('.mdc-grid-tile'):
                    title_tag = tile.select_one('.mdc-grid-tile__title')
                    a_tag = tile.find('a', href=True)
                    if title_tag and a_tag:
                        title = title_tag.get_text(strip=True)
                        href = a_tag['href']
                        self.all_channels.append({'title': title, 'href': href})

            # Writing it back as a JSON File
            with open(self.channel_file, 'w', encoding='utf-8') as f:
                json.dump(self.all_channels, f, ensure_ascii=False, indent=2)
        return f"SUCCESS: Channel URLs for {len(self.all_channels)} fetched successfully."
    
    def scrape_channel_data(self, use_file=True):
        current_run = 0
        if use_file:
            try:
                with open(self.channel_data_file, 'r', encoding='utf-8') as f:
                    self.data = json.load(f)
                    self.channel_no = len(self.data)
                    print(f"Loaded {self.channel_no} channels from {self.channel_data_file}.")
            except FileNotFoundError:
                print(f"File {self.channel_data_file} not found. Starting fresh scrape.")
        
        for channel in self.all_channels:
            if current_run < self.channel_no:
                print(f"Skipping channel {current_run}/{len(self.all_channels)}: {channel['title']} since already run priorly.", end='\r')
            else:
                print(f"------------ Processing channel: {channel['title']} --- {self.channel_no + 1}/{len(self.all_channels)} ---------------------", end='\r')
                soup = BeautifulSoup(requests.get(self.base_url + channel['href']).text, 'html.parser')

                frequencies = ','.join([x.text.strip() for x in soup.select('.frequency-item')])
                if frequencies == '':
                    frequencies = 'Internet/Community'
                email = ','.join([x.text for x in soup.select('span.small-margin-top-bottom') if "@" in x.text])
                categories = ''.join([','.join(list(x.stripped_strings)[1:]) for x in soup.select('.categories')])
                self.data.append({
                    'title': channel['title'],
                    'href': self.base_url + channel['href'],
                    'frequencies': frequencies,
                    'email': email,
                    'categories': categories
                })
                self.channel_no += 1
                if (current_run % 300 == 0) or (current_run == len(self.all_channels) - 1):
                    print(f"Processed {current_run} channels so far. Saving progress...")
                    with open(self.channel_data_file, 'w', encoding='utf-8') as f:
                        json.dump(self.data, f, ensure_ascii=False, indent=2)
            current_run += 1
        return self.data

In [9]:
radiouk = RadioUKScraper()

In [10]:
radiouk.get_channel_urls(overwrite=False)

'SUCCESS: Channel URLs loaded from all_channels.json with 3665 entries.'

In [11]:
predf = radiouk.scrape_channel_data(use_file=True)

Loaded 3601 channels from channel_data.json.
Processed 3664 channels so far. Saving progress...s Report --- 3665/3665 ---------------------------5 ---------------------


In [14]:
pd.DataFrame(predf).to_csv('radiouk_channels.csv', index=False, encoding='utf-8-sig')