In [2]:

import requests
from bs4 import BeautifulSoup

# Sample XML file on the Little Fox CDN; the "captionxml" path segment suggests
# it is a caption/subtitle file for one episode — TODO confirm.
# NOTE(review): not referenced by any of the cells visible below.
GENERIC_XML_URL = "https://cdn.littlefox.co.kr/cn/captionxml/C0001143.xml"


# Series Class Definition
Represents a single series on Little Fox Chinese (LFC).

In [30]:
import requests
from bs4 import BeautifulSoup

class Series:
    """
    A single series on the Little Fox Chinese story site.

    Wraps the series' contents-list page and scrapes it for the number of
    result pages and for the episode IDs listed on each page.
    """

    # Seconds to wait on each HTTP request. Without a timeout, `requests`
    # waits indefinitely and a stalled connection would hang the kernel.
    # The value itself is an arbitrary but generous choice.
    REQUEST_TIMEOUT = 30

    def __init__(self, series_title, series_id):
        """
        Initialize the Series with its title and ID.

        :param series_title: The human-readable title of the series.
        :param series_id: The ID of the series (e.g. 'DP000732').
        """
        self.series_title = series_title
        self.series_id = series_id
        self.main_url = f'https://chinese.littlefox.com/en/story/contents_list/{series_id}'

    def __repr__(self):
        """Return a short description, useful when displaying lists of series."""
        return f'Series({self.series_title!r}, {self.series_id!r})'

    def _fetch_soup(self, url, error_label):
        """
        Download a page and parse it as HTML.

        :param url: The URL to fetch.
        :param error_label: Text printed before the error details on failure.
        :return: The parsed BeautifulSoup document, or None if the request failed.
        """
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"{error_label}: {e}")
            return None
        return BeautifulSoup(response.text, 'html.parser')

    def get_page_count(self):
        """
        Get the total number of pages for the series.

        :return: The maximum page number or None if not found.
        """
        soup = self._fetch_soup(self.main_url, "Error fetching main URL")
        if soup is None:
            return None

        paging_div = soup.find('div', class_='lf_paging')
        if paging_div is None:
            return None

        # Only purely numeric links are page numbers. isdecimal() (unlike
        # isdigit()) guarantees that int() will accept the text.
        page_numbers = [
            int(text)
            for a_tag in paging_div.find_all('a')
            if (text := a_tag.text.strip()).isdecimal()
        ]
        return max(page_numbers, default=None)

    def get_page_urls(self):
        """
        Generate a list of URLs for all pages in the series.

        :return: A list of page URLs or None if no pages are found.
        """
        max_page_count = self.get_page_count()
        if not max_page_count:
            print("No pages found")
            return None
        return [f'{self.main_url}?&page={page}' for page in range(1, max_page_count + 1)]

    def get_ep_ids(self):
        """
        Extract episode IDs from all pages in the series.

        Pages that fail to download are reported and skipped, so the result
        may be incomplete when the network is unreliable.

        :return: A list of episode IDs (empty if no pages were found).
        """
        page_urls = self.get_page_urls()
        if not page_urls:
            return []

        ids = []
        for url in page_urls:
            soup = self._fetch_soup(url, f"Error fetching page URL {url}")
            if soup is None:
                continue

            for item in soup.find_all('div', class_='item'):
                # A multi-word class_ string matches the exact class attribute
                # value, so the class order here must match the page markup.
                checkbox = item.find('input', class_='LF_CHK s2 contentsCheck')
                if checkbox is None:
                    continue
                ep_id = checkbox.get('value')
                # Skip inputs without a value so the list never contains None.
                if ep_id:
                    ids.append(ep_id)
        return ids

In [29]:
# Series requires both a title and an ID; the single-argument call only worked
# against an older class definition left in the kernel. Sanity check: count
# the episodes scraped for one known series.
single_stories_1 = Series('Single Stories', 'DP000732')
len(single_stories_1.get_ep_ids())

71

# Series Extraction
Extract each series' title and ID from the homepage.
The first six series have no subtitles, so they are skipped.

In [36]:
homepage_url = 'https://chinese.littlefox.com/en/story'
# The first six series listed on the homepage have no subtitles and are skipped.
SERIES_WITHOUT_SUBTITLES = 6

# Defined before the request so that `series` exists (empty) even when the
# homepage cannot be fetched.
series = []

try:
    # A timeout keeps a stalled connection from hanging the kernel.
    response = requests.get(homepage_url, timeout=30)
    response.raise_for_status()
except requests.RequestException as e:
    print(f"Error fetching main URL: {e}")
else:
    # Parse only after a successful fetch; otherwise `response` would be
    # undefined (or stale from an earlier run).
    soup = BeautifulSoup(response.text, 'html.parser')
    # NOTE(review): 'constents_wrap' looks misspelled but is presumably the
    # site's own class name — confirm against the page before changing it.
    contents_divs = soup.find_all('div', class_='constents_wrap')

    for div in contents_divs[SERIES_WITHOUT_SUBTITLES:]:
        series_id = div.get('data-smid')
        title_div = div.find('div', class_='thumb_titl')
        title_link = title_div.find('a') if title_div is not None else None
        if not series_id or title_link is None:
            # Report layout surprises instead of crashing mid-loop.
            print(f"Skipping series entry with missing id or title: {series_id}")
            continue

        # Some titles carry stray surrounding whitespace in the page markup.
        title = title_link.get_text(strip=True)
        print(title)
        series.append(Series(title, series_id))

data-smid: DP000777, title: Nihao Chinese!
data-smid: DP000778, title: Introduction to Tones
data-smid: DP000779, title: Introduction to Simple Finals
data-smid: DP000780, title: Introduction to Initials
data-smid: DP000782, title: Introduction to Compound Finals
data-smid: DP000781, title: Tone Change Rules
data-smid: DP000732, title: Single Stories
data-smid: DP000733, title: Mrs. Kelly's Class
data-smid: DP000791, title: Who Am I?
data-smid: DP000783, title: Where Am I?
data-smid: DP000740, title: The Big Green Forest
data-smid: DP000742, title: Bat and Friends
data-smid: DP000792, title: Car School
data-smid: DP000796, title: Dino Buddies
data-smid: DP000738, title: Single Stories
data-smid: DP000799, title: Space Patrol
data-smid: DP000794, title: Peter Rabbit and Friends
data-smid: DP000739, title: Wacky Ricky 
data-smid: DP000751, title: Magic Marker
data-smid: DP000750, title: Bird and Kip
data-smid: DP000737, title: Sam and Lucky 
data-smid: DP000746, title: Meet the Animals
d