In [None]:
# Fetch article URLs for each coin and save them to one CSV file per coin (5 files in total).

import requests
import csv
import logging
from bs4 import BeautifulSoup

# Configure logging: the root logger emits INFO and above, and every record is
# prefixed with a timestamp and its level name.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Coins to scrape. Each name is used as the search term in BASE_URL and as the
# prefix of the CSV file written for that coin.
COINS = ["dogecoin", "bitcoin", "ethereum", "solana", "hamster"]

# Base URL pattern for a search-results page; filled in with str.format.
BASE_URL = "https://thenewscrypto.com/page/{}/?s={}"  # Format: (page_num, coin_name)

# User-Agent to avoid getting blocked: sent with every request so the scraper
# identifies itself as a desktop browser.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
}

def get_article_links(coin, page_num):
    """Fetch up to 10 article links from one search-results page for a coin.

    Args:
        coin: Search term substituted into the ``s`` query parameter of BASE_URL.
        page_num: Results page number substituted into the URL path.

    Returns:
        A list of at most 10 article URLs, in page order. Relative ``href``
        values are resolved against the results-page URL. The list is empty
        when the request fails, the response status is not 200, or no matching
        headings are present.
    """
    from urllib.parse import urljoin

    max_links = 10
    url = BASE_URL.format(page_num, coin)
    logging.info("Scraping %s - Page %s: %s", coin, page_num, url)

    # Only the network call can raise RequestException, so keep the try narrow.
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
    except requests.exceptions.RequestException as e:
        logging.error("Error fetching %s page %s: %s", coin, page_num, e)
        return []

    if response.status_code != 200:
        logging.warning(
            "Failed to fetch %s page %s, status code: %s",
            coin, page_num, response.status_code,
        )
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    articles = soup.find_all("h3", class_="card-title fs-17")

    # Cap the number of *links* collected rather than the number of headings
    # inspected, so headings without a usable anchor do not eat into the quota.
    links = []
    for article in articles:
        link_tag = article.find("a", href=True)
        if link_tag is None:
            continue
        # urljoin leaves absolute URLs untouched and resolves relative ones.
        links.append(urljoin(url, link_tag["href"]))
        if len(links) == max_links:
            break

    logging.info("Found %s articles for %s on page %s", len(links), coin, page_num)
    return links

def scrape_articles_for_coin(coin):
    """Collect first-page article links for ``coin`` and store them in a CSV.

    The links come from ``get_article_links(coin, 1)``. When at least one link
    is returned, ``<coin>_news_urls.csv`` is (re)written with a header row
    followed by one ``[coin, url]`` row per link; otherwise only a warning is
    logged and no file is touched.
    """
    article_urls = get_article_links(coin, 1)  # first results page only

    # Guard clause: nothing to persist.
    if not article_urls:
        logging.warning(f"No articles found for {coin}.")
        return

    csv_path = f"{coin}_news_urls.csv"
    with open(csv_path, "w", newline="", encoding="utf-8") as handle:
        out = csv.writer(handle)
        out.writerow(["Coin", "Article URL"])  # header row
        out.writerows([coin, article_url] for article_url in article_urls)

    logging.info(f"Saved {len(article_urls)} articles for {coin} to {csv_path}")

# Run the scraper for each coin.
# Only runs when this code is executed directly (not when imported); each coin
# in COINS gets its own <coin>_news_urls.csv via scrape_articles_for_coin.
if __name__ == "__main__":
    for coin in COINS:
        scrape_articles_for_coin(coin)


In [None]:
#Extracting the data from the web URLs
import trafilatura
import csv
import pdfkit  #to use this we need to install wkhtmltopdf from https://wkhtmltopdf.org/downloads.html

# Coins to process. process_coin reads "<coin>_news_urls.csv" for each entry,
# so these names must match the ones used when those CSV files were written.
COINS = ["dogecoin", "bitcoin", "ethereum", "solana", "hamster"]

def _read_article_urls(csv_file):
    """Return the non-empty URLs found in the second column of ``csv_file``."""
    urls = []
    with open(csv_file, newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip header; the default avoids StopIteration on an empty file
        for row in reader:
            # Blank or single-column rows carry no URL; skip them instead of raising IndexError.
            if len(row) < 2:
                continue
            clean_url = row[1].strip().strip("'")
            if clean_url:
                urls.append(clean_url)
    return urls


def process_coin(coin):
    """Download the articles listed for ``coin`` and save them as HTML and PDF.

    Reads URLs from ``<coin>_news_urls.csv``, fetches each one with
    trafilatura, and writes the extracted text (one ``<p>`` per article) to
    ``<coin>_articles.html``. That file is then converted to
    ``<coin>_data.pdf`` with pdfkit. Progress and failures are reported with
    ``print``; a failure for one URL does not stop the others.

    Args:
        coin: Coin name used as the prefix of the input and output file names.

    Returns:
        None. Returns early when the CSV file is missing, when no article text
        could be extracted, or when PDF generation fails.
    """
    import html

    csv_file = f"{coin}_news_urls.csv"

    # Read URLs from CSV
    try:
        urls = _read_article_urls(csv_file)
    except FileNotFoundError:
        print(f"⚠️ CSV file not found for {coin}")
        return

    print(f"Loaded URLs for {coin}:", urls)

    # Collect fragments in a list and join once instead of repeated string +=.
    paragraphs = []

    for url in urls:
        print(f"Fetching: {url}")
        try:
            downloaded = trafilatura.fetch_url(url)
            content = trafilatura.extract(downloaded) if downloaded else None
        except Exception as e:  # best-effort: one bad URL must not abort the batch
            print(f"⚠️ Error fetching {url}: {e}")
            continue

        if not downloaded:
            print(f"❌ Failed to download {url}")
        elif not content:
            print(f"❌ No content extracted from {url}")
        else:
            print(f"✅ Extracted content from {url}")
            # The extracted text is plain text: escape it so characters such
            # as '<' and '&' are rendered literally rather than parsed as markup.
            paragraphs.append(f"<p>{html.escape(content, quote=False)}</p>\n")

    if not paragraphs:
        print(f"❌ No articles extracted for {coin}, check URLs or website restrictions.")
        return

    # Save extracted content to an HTML file
    html_file = f"{coin}_articles.html"
    pdf_file = f"{coin}_data.pdf"

    with open(html_file, 'w', encoding='utf-8') as file:
        file.write("".join(paragraphs))
    print(f"✅ {html_file} successfully created!")

    try:
        # The HTML fragment declares no charset, so tell wkhtmltopdf explicitly
        # that the file is UTF-8; otherwise non-ASCII text can be garbled.
        pdfkit.from_file(html_file, pdf_file, options={"encoding": "UTF-8"})
    except OSError as e:
        # pdfkit reports a missing wkhtmltopdf binary or a conversion failure
        # as IOError/OSError; report it and let the caller continue.
        print(f"⚠️ PDF generation failed for {coin}: {e}")
        return
    print(f"✅ PDF successfully generated: {pdf_file}")

# Run the process for each coin.
# Only runs when this code is executed directly (not when imported); each coin
# in COINS is handled independently by process_coin.
if __name__ == "__main__":
    for coin in COINS:
        process_coin(coin)


In [None]:
#Converting the pdf data into txt
import pdfplumber

def extract_text_fast(pdf_files, output_file):
    """Extract the text of several PDFs into a single UTF-8 text file.

    Pages are processed in order, PDF by PDF. The text of every page that
    yields any text is written to ``output_file`` followed by a newline.
    A progress line is printed after every 100 pages of a PDF.

    Args:
        pdf_files: Iterable of PDF paths to read, in output order. Paths that
            do not point to an existing file are skipped with a message
            instead of aborting the whole run.
        output_file: Path of the text file to create (overwritten if present).

    Returns:
        None.
    """
    import os

    with open(output_file, "w", encoding="utf-8") as f:
        for pdf_path in pdf_files:
            # Only path-like entries are checked; anything else is passed to
            # pdfplumber.open unchanged, exactly as before.
            if isinstance(pdf_path, (str, os.PathLike)) and not os.path.isfile(pdf_path):
                print(f"Skipping missing PDF: {pdf_path}")
                continue

            with pdfplumber.open(pdf_path) as pdf:
                total_pages = len(pdf.pages)  # hoisted: constant for this PDF
                for page_number, page in enumerate(pdf.pages, start=1):
                    text = page.extract_text()
                    if text:  # Only write non-empty text
                        f.write(text + "\n")

                    # Print progress every 100 pages
                    if page_number % 100 == 0:
                        print(f"Processed {page_number}/{total_pages} pages from {pdf_path}...")

# PDFs produced by the article-extraction step, in the order their text
# should appear in the combined output.
pdf_files = [
    "dogecoin_data.pdf",
    "bitcoin_data.pdf",
    "ethereum_data.pdf",
    "solana_data.pdf",
    "hamster_data.pdf",
]
output_file = "output.txt"

# Merge the text of all listed PDFs into the single output file.
extract_text_fast(pdf_files=pdf_files, output_file=output_file)

print("Extraction completed! Check output.txt")
