In [3]:
import requests
from bs4 import BeautifulSoup
import csv

# Function to clean text (remove leading/trailing whitespaces)
def clean_text(text):
    """Return ``text`` without leading/trailing whitespace.

    Any falsy input (``None``, ``''``) is normalised to the empty string.
    """
    if not text:
        return ''
    return text.strip()

# Function to extract drama information from a given URL
def extract_drama_info(url, timeout=30):
    """Fetch one drama detail page and return its fields as a dict of strings.

    Args:
        url: Absolute URL of the drama page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with the keys 'ID', 'Title', 'Genre', 'Tags', 'Synopsis',
        'Rank', 'Popularity', 'Score', 'Episodes', 'Duration', 'Watchers',
        'Start_date', 'End_date', 'Day_aired' and 'Main Role'. A field whose
        element (or separator) is not present on the page is stored as ''.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout a stalled connection would block the whole scrape.
    page = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    def node_text(node):
        # Cleaned text of a tag, or '' when the lookup found nothing.
        return clean_text(node.text) if node is not None else ''

    def value_after_colon(css_class):
        # For an <li> rendered as "Label: value", return the raw part after
        # the first ':' ('' when the element or the colon is missing).
        _, _, value = node_text(soup.find('li', class_=css_class)).partition(':')
        return value

    drama_data = {}

    # The ID is read from the title element nested in the 'box-movie-adv' box.
    adv_box = soup.find('div', class_='box-movie-adv')
    id_node = None
    if adv_box is not None:
        id_node = adv_box.find('div', class_='movie-hover-title')
    drama_data['ID'] = node_text(id_node)

    drama_data['Title'] = node_text(soup.find('h1', class_='film-title'))
    drama_data['Genre'] = clean_text(value_after_colon('show-genres'))
    # Drop the call-to-action text that shares the tags element.
    drama_data['Tags'] = clean_text(
        value_after_colon('show-tags').replace('(Vote or add tags)', ''))

    description = soup.find('meta', {'name': 'description'})
    drama_data['Synopsis'] = clean_text(
        description.get('content') if description is not None else '')

    drama_data['Rank'] = clean_text(value_after_colon('rank'))
    drama_data['Popularity'] = clean_text(value_after_colon('popularity'))
    drama_data['Score'] = clean_text(value_after_colon('score'))
    drama_data['Episodes'] = clean_text(value_after_colon('episodes'))
    drama_data['Duration'] = clean_text(value_after_colon('duration'))
    drama_data['Watchers'] = node_text(soup.find('span', class_='number'))

    # The aired text is split on its first '-'; with no '-' present the whole
    # text becomes the start date and the end date is left empty.
    start_date, _, end_date = node_text(
        soup.find('span', class_='aired')).partition('-')
    drama_data['Start_date'] = clean_text(start_date)
    drama_data['End_date'] = clean_text(end_date)
    drama_data['Day_aired'] = node_text(soup.find('span', class_='show-status'))

    # Extracting main role actors: keep cast entries whose role label is
    # 'Main Role' (label is trimmed so surrounding whitespace cannot hide it).
    # NOTE(review): the full text of the list item is stored, which may
    # include more than the actor's name - confirm against the page markup.
    main_role_actors = []
    for cast_item in soup.find_all('li', class_='list-item col-sm-4'):
        role_label = cast_item.find('small', class_='text-muted')
        if role_label is not None and clean_text(role_label.text) == 'Main Role':
            main_role_actors.append(clean_text(cast_item.text))
    drama_data['Main Role'] = ', '.join(main_role_actors)

    return drama_data

# Main scraping logic
top_korean_dramas_url = 'https://mydramalist.com/shows/top_korean_dramas'
# A timeout stops the script from hanging on a stalled connection, and
# raise_for_status() aborts on an HTTP error instead of silently parsing an
# error page into an empty result.
page = requests.get(top_korean_dramas_url, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# Extracting drama URLs from the main page; href=True skips any matching
# anchor that carries no href attribute.
drama_urls = [a['href']
              for a in soup.find_all('a', class_='title text-primary', href=True)]

# Scraping each drama's information and storing in a list. The hrefs are
# appended to the site root to build the URL that is fetched.
drama_list = []
for drama_url in drama_urls:
    drama_data = extract_drama_info(f'https://mydramalist.com{drama_url}')
    drama_list.append(drama_data)

# Writing data to CSV file. newline='' lets the csv module control line
# endings; the column order is fixed by fieldnames.
csv_filename = 'top_korean_dramas100.csv'
with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['ID', 'Title', 'Genre', 'Tags', 'Synopsis', 'Rank', 'Popularity',
                  'Score', 'Episodes', 'Duration', 'Watchers', 'Start_date', 'End_date',
                  'Day_aired', 'Main Role']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    writer.writerows(drama_list)

print(f'Data has been successfully scraped and saved to {csv_filename}.')


Data has been successfully scraped and saved to top_korean_dramas100.csv.
