In [None]:
# Install Google Chrome (stable) from Google's apt repository.
# The signing key is stored in a dedicated keyring and referenced via `signed-by`
# instead of the deprecated `apt-key add`, so the key is trusted for this repo only.
# `--yes` lets gpg overwrite the keyring when this cell is re-run.
!wget -q -O - https://dl.google.com/linux/linux_signing_key.pub | gpg --dearmor --yes -o /usr/share/keyrings/google-chrome.gpg
!echo "deb [arch=amd64 signed-by=/usr/share/keyrings/google-chrome.gpg] https://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list
!apt-get update
!apt-get install -y google-chrome-stable


In [None]:
# Print the installed Chrome version; the chromedriver downloaded in the next
# cell must be the build that matches this version string.
!google-chrome --version


Google Chrome 138.0.7204.92 


Replace "138.0.7204.92" in the cell below with whatever Chrome version you are using (the version printed by the command above).

In [None]:
!wget https://storage.googleapis.com/chrome-for-testing-public/138.0.7204.92/linux64/chromedriver-linux64.zip
!unzip chromedriver-linux64.zip
!mv chromedriver-linux64/chromedriver /usr/local/bin/
!chmod +x /usr/local/bin/chromedriver


In [None]:
!pip install selenium


In [None]:
import csv
import time
import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException

# Scrape title, featured image link, body text and tags from every URL listed in
# `input_file`, appending one CSV row per URL to `output_file`. The index of the
# next URL to process is stored in `progress_file` so an interrupted run can resume.

# Set up Chrome options: run without a window, disable the sandbox, and avoid
# /dev/shm for shared memory.
# NOTE(review): these flags are the usual set for Chrome running as root in a
# container (e.g. Colab) — confirm they are still needed on your host.
options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# File paths
input_file = "technology_article_links.txt"
output_file = "blogdata.csv"
progress_file = "progress.txt"

# Absolute XPaths of the page elements to extract. They are tied to the exact
# page layout; a missing element is recorded as "N/A" rather than failing the row.
xpaths = {
    "title": "/html/body/div[1]/div[6]/div/div/div[1]/div[1]/h1",
    "featured_image": "/html/body/div[1]/div[6]/div/div/div[2]/div/a",
    "content": "/html/body/div[1]/div[6]/div/div/div[3]/article/div[1]",
    "tags_parent": "/html/body/div[1]/div[6]/div/div/div[3]/article/div[2]"
}

# Read URLs (one per line, blank lines ignored). This happens before the browser
# is launched so a missing input file does not leave a Chrome process behind.
with open(input_file, "r", encoding="utf-8") as f:
    urls = [line.strip() for line in f if line.strip()]

# Load progress if it exists
start_index = 0
if os.path.exists(progress_file):
    with open(progress_file, "r") as pf:
        try:
            # Clamp to 0: a negative index would slice from the end of `urls`.
            start_index = max(0, int(pf.read().strip()))
            print(f"🔁 Resuming from index {start_index}")
        except ValueError:
            # Empty or corrupted progress file: start from the beginning.
            start_index = 0

# Write the header when the CSV is new OR exists but is still empty
# (e.g. a previous run created the file and died before writing anything).
file_exists = os.path.exists(output_file)
needs_header = not file_exists or os.path.getsize(output_file) == 0

# Set up WebDriver
driver = webdriver.Chrome(service=Service('/usr/local/bin/chromedriver'), options=options)

try:
    # Open CSV and append or create header if needed
    with open(output_file, "a", newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        if needs_header:
            writer.writerow(["title", "featuredimage", "content", "tags", "author", "blogurl"])

        for i, url in enumerate(urls[start_index:], start=start_index):
            print(f"[{i+1}/{len(urls)}] Visiting: {url}")
            try:
                driver.get(url)
                # Fixed pause to give the page time to finish rendering.
                time.sleep(2)

                try:
                    title = driver.find_element("xpath", xpaths["title"]).text
                    print("  ✓ Title found")
                except NoSuchElementException:
                    title = "N/A"
                    print("  ✗ Title not found")

                try:
                    image_href = driver.find_element("xpath", xpaths["featured_image"]).get_attribute("href")
                    print("  ✓ Featured image found")
                except NoSuchElementException:
                    image_href = "N/A"
                    print("  ✗ Featured image not found")

                try:
                    content = driver.find_element("xpath", xpaths["content"]).text
                    print("  ✓ Content found")
                except NoSuchElementException:
                    content = "N/A"
                    print("  ✗ Content not found")

                try:
                    tag_parent = driver.find_element("xpath", xpaths["tags_parent"])
                    tag_elements = tag_parent.find_elements("tag name", "a")
                    tags = ", ".join([tag.text for tag in tag_elements]) if tag_elements else "N/A"
                    print(f"  ✓ {len(tag_elements)} tag(s) found")
                except NoSuchElementException:
                    tags = "N/A"
                    print("  ✗ Tags not found")

                # Static values
                author = "KahawaTungu"
                blogurl = url

                # Write to CSV and flush immediately so a crash loses at most one row.
                writer.writerow([title, image_href, content, tags, author, blogurl])
                csvfile.flush()
                print("  → Row saved to CSV")

            except Exception as e:
                # Best effort: record the failure as an "Error" row and move on.
                print(f"  ⚠️ Failed to process {url}: {e}")
                writer.writerow(["Error", "Error", "Error", "Error", "Error", url])
                csvfile.flush()

            # Save progress (index of the next URL). This also advances past failed
            # URLs, so they are not retried on resume; their rows are marked "Error".
            with open(progress_file, "w") as pf:
                pf.write(str(i + 1))

            print()
finally:
    # Always shut the browser down, even on KeyboardInterrupt or an unexpected
    # error, so no headless Chrome process is left running.
    driver.quit()

print(f"\n✅ Done. Data saved to {output_file}")
