## **1. Install Selenium**

In [4]:
# %%shell
# # Ubuntu no longer distributes chromium-browser outside of snap
# #
# # Proposed solution: https://askubuntu.com/questions/1204571/how-to-install-chromium-without-snap

# # Add debian buster
# cat > /etc/apt/sources.list.d/debian.list <<'EOF'
# deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
# deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
# deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
# EOF

# # Add keys
# apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
# apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
# apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A

# apt-key export 77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg
# apt-key export 22F3D138 | gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
# apt-key export E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg

# # Prefer debian repo for chromium* packages only
# # Note the double-blank lines between entries
# cat > /etc/apt/preferences.d/chromium.pref << 'EOF'
# Package: *
# Pin: release a=eoan
# Pin-Priority: 500


# Package: *
# Pin: origin "deb.debian.org"
# Pin-Priority: 300


# Package: chromium*
# Pin: origin "deb.debian.org"
# Pin-Priority: 700
# EOF

# # Install chromium and chromium-driver
# apt-get update
# apt-get install chromium chromium-driver

# # Install selenium
# pip install selenium

## **2. Import libraries**

In [5]:
import pandas as pd
import os
import requests
import time
import re
import random

from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

## **3. Crawl poem**

In [None]:
WEBDRIVER_DELAY_TIME_INT = 10
TIMEOUT_INT = 10
service = Service(ChromeDriverManager().install())
# service = Service(executable_path=r'/usr/bin/chromedriver') # use for run on Google Colab
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('window-size=1920x1080')
chrome_options.headless = True
driver = webdriver.Chrome(service=service, options=chrome_options)
driver.implicitly_wait(TIMEOUT_INT)
wait = WebDriverWait(driver, WEBDRIVER_DELAY_TIME_INT)

In [7]:
def clean_poem_html(html):
    html = re.sub(r'<img.*?>', '', html, flags=re.IGNORECASE)
    html = re.sub(r'<i>.*?</i>', '', html, flags=re.IGNORECASE | re.DOTALL)
    html = re.sub(r'<b>(.*?)</b>(?!\s*(?:<br\s*/?>\s*){2,})', r'\1', html, flags=re.IGNORECASE)
    html = re.sub(r'<br\s*/?>', '\n', html, flags=re.IGNORECASE)
    html = re.sub(r'</?p>', '', html, flags=re.IGNORECASE)

    return html.strip()

def process_poem_content(html, poem_src, poem_url, default_title=''):
    cleaned = clean_poem_html(html)

    pattern = re.compile(r'<b>(.*?)</b>\s*\n{2,}', flags=re.IGNORECASE)
    matches = list(pattern.finditer(cleaned))

    poems = []
    if matches:
        for i, match in enumerate(matches):
            title = match.group(1).strip()
            start = match.end()
            end = matches[i+1].start() if i + 1 < len(matches) else len(cleaned)
            content = cleaned[start:end].strip('\n')
            poems.append({
                'title': title,
                'content': content,
                'source': poem_src,
                'url': poem_url
            })
    else:
        poems.append({
            'title': default_title,
            'content': cleaned,
            'source': poem_src,
            'url': poem_url
        })
    return poems

In [8]:
def extract_poem_links(driver, page_idx):
    main_url = f'https://www.thivien.net/searchpoem.php?PoemType=16&ViewType=1&Country=2&Age[]=3&Page={page_idx}'

    driver.get(main_url)
    time.sleep(random.uniform(3, 5))

    content_tags_xpath = '//*[@class="page-content container"]//div[@class="page-content-main"]//div[@class="list-item"]'

    content_tags = driver.find_elements(By.XPATH, content_tags_xpath)
    poem_links = []

    for tag in content_tags:
        try:
            link_element = tag.find_element(By.XPATH, './/h4[@class="list-item-header"]/a')
            poem_title = link_element.text
            poem_url = link_element.get_attribute('href')
            poem_links.append({'title': poem_title, 'url': poem_url})
        except Exception as e:
            print(f"Error extracting link: {e}")
            continue

    return poem_links

In [9]:
def scrape_poem(driver, poem_url):
    driver.get(poem_url)
    time.sleep(random.uniform(3, 5))

    poem_content_tag = WebDriverWait(driver, 10).until(
        EC.visibility_of_element_located((By.CSS_SELECTOR, 'div.poem-content'))
    )

    html_content = poem_content_tag.get_attribute('innerHTML')

    try:
        poem_src_tag = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//div[@class="small"]'))
        )
        poem_src = poem_src_tag.text
    except Exception:
        poem_src = ''

    return process_poem_content(html_content, poem_src, poem_url)

In [10]:
def scrape_poems(driver, num_pages=10):
    datasets = []
    for page_idx in tqdm(range(1, num_pages + 1)):
        poem_links = extract_poem_links(driver, page_idx)
        for poem in poem_links:
            poem_url = poem['url']
            try:
                poems = scrape_poem(driver, poem_url)
                datasets.extend(poems)
            except Exception as e:
                print(f"Error processing {poem_url}: {e}")
                continue
    return datasets

In [11]:
# Scrape poems from 10 pages
datasets = scrape_poems(driver, num_pages=10)
driver.quit()

 40%|████      | 4/10 [04:04<06:13, 62.22s/it]

Error processing https://www.thivien.net/Th%C3%A1i-B%C3%A1-T%C3%A2n/V%C6%B0%E1%BB%A3t-tr%E1%BB%9F-ng%E1%BA%A1i-l%C3%A0-kh%C3%B3/poem-0HMLBkZ0f1EJMprmg1Mqew: Message: 
Stacktrace:
	GetHandleVerifier [0x00470B43+25139]
	(No symbol) [0x004013F4]
	(No symbol) [0x002E04E3]
	(No symbol) [0x003283D7]
	(No symbol) [0x0032872B]
	(No symbol) [0x00371002]
	(No symbol) [0x0034D014]
	(No symbol) [0x0036E778]
	(No symbol) [0x0034CDC6]
	(No symbol) [0x0031BDE9]
	(No symbol) [0x0031D124]
	GetHandleVerifier [0x00774373+3185251]
	GetHandleVerifier [0x0079291A+3309578]
	GetHandleVerifier [0x0078CF42+3286578]
	GetHandleVerifier [0x00507AE0+643536]
	(No symbol) [0x0040A20D]
	(No symbol) [0x004070B8]
	(No symbol) [0x00407257]
	(No symbol) [0x003F9E00]
	BaseThreadInitThunk [0x75975D49+25]
	RtlInitializeExceptionChain [0x76EFCDEB+107]
	RtlGetAppContainerNamedObjectPath [0x76EFCD71+561]

Error processing https://www.thivien.net/Th%C3%A1i-B%C3%A1-T%C3%A2n/Xinh-%C4%91%E1%BA%B9p-kh%C3%B4ng-c%C3%B3-ngh%C4%A9a/poem

100%|██████████| 10/10 [13:57<00:00, 83.71s/it]


In [12]:
datasets

[{'title': '',
  'content': 'Bạn xấu như chiếc bóng\nCứ bám riết theo anh\nKhi anh sáng, rực rỡ\nNhư mặt trời long lanh\n\nNhưng họ sẽ biến mất\nKhi trời phủ mây đen\nTức là khi anh đói\nTrong túi không có tiền',
  'source': '[Thông tin 1 nguồn tham khảo đã được ẩn]',
  'url': 'https://www.thivien.net/Th%C3%A1i-B%C3%A1-T%C3%A2n/B%E1%BA%A1n-x%E1%BA%A5u-nh%C6%B0-chi%E1%BA%BFc-b%C3%B3ng/poem-4r1MApTHwgzgxicR8oy7nQ'},
 {'title': '',
  'content': 'Cái làm ta hạnh phúc\nThực ra cũng chẳng nhiều\nChỉ cần có ai đó\nĐể ta thầm thương yêu\n\nRồi thêm chút công việc\nCho ta làm hàng ngày\nCuối cùng, chút mơ mộng\nĐể đưa ta lên mây',
  'source': '[Thông tin 1 nguồn tham khảo đã được ẩn]',
  'url': 'https://www.thivien.net/Th%C3%A1i-B%C3%A1-T%C3%A2n/C%C3%A1i-l%C3%A0m-ta-h%E1%BA%A1nh-ph%C3%BAc/poem-t24-M5Dn2cxmEDVSrQhtCw'},
 {'title': '',
  'content': 'Chiều vừa xốp trên tay\nChợt nghe thoáng ong bay\nCó ai vừa chết nhỉ\nMây thắt tang trăng gầy\n\nỚt đỏ sao cứ đỏ\nTáo chín cho thật vàng\nEm đẹp cho 

In [13]:
len(datasets)

106

## **4. Save data to file**

In [14]:
df = pd.DataFrame(datasets)
df.to_csv('poem_dataset.csv', index=True)

In [15]:
df

Unnamed: 0,title,content,source,url
0,,Bạn xấu như chiếc bóng\nCứ bám riết theo anh\n...,[Thông tin 1 nguồn tham khảo đã được ẩn],https://www.thivien.net/Th%C3%A1i-B%C3%A1-T%C3...
1,,Cái làm ta hạnh phúc\nThực ra cũng chẳng nhiều...,[Thông tin 1 nguồn tham khảo đã được ẩn],https://www.thivien.net/Th%C3%A1i-B%C3%A1-T%C3...
2,,Chiều vừa xốp trên tay\nChợt nghe thoáng ong b...,[Thông tin 1 nguồn tham khảo đã được ẩn],https://www.thivien.net/L%C3%A2m-Huy-Nhu%E1%BA...
3,,Chơi thân không có nghĩa\nKhông cãi nhau bao g...,[Thông tin 1 nguồn tham khảo đã được ẩn],https://www.thivien.net/Th%C3%A1i-B%C3%A1-T%C3...
4,,"Có thể buồn chút ít\nMột mình, không người yêu...",[Thông tin 1 nguồn tham khảo đã được ẩn],https://www.thivien.net/Th%C3%A1i-B%C3%A1-T%C3...
...,...,...,...,...
101,,Chợ trưa ai đợi mẹ\nBến khuya ai đợi đò\nSông ...,,https://www.thivien.net/%C4%90%E1%BB%97-Qu%E1%...
102,,Người ấy đến rất say\ntừ cao trời bay xuống\nm...,Thu 84,https://www.thivien.net/Nguy%E1%BB%85n-Thu%E1%...
103,,Tôi giật mình đánh rơi\nCon mèo con xuống giến...,[Thông tin 1 nguồn tham khảo đã được ẩn],https://www.thivien.net/T%C3%B4-H%C3%A0/%C3%81...
104,,"Ôi, con sóng chết khô,\nvật vờ trong bùn quánh...",,https://www.thivien.net/%C4%90%E1%BB%97-Qu%E1%...


In [None]:
# !zip -r poem_dataset.zip poem_dataset.csv

'zip' is not recognized as an internal or external command,
operable program or batch file.


In [17]:
import shutil
shutil.make_archive("poem_dataset", 'zip', root_dir=".", base_dir="poem_dataset.csv")

'e:\\OneDrive\\1.0 DS & AI\\AIO2024\\AIO-Exercise\\Module_08\\Text Project Poem Generation\\poem_dataset.zip'