<a href="https://colab.research.google.com/github/vai-sys/3D-website/blob/main/scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install requests beautifulsoup4





In [None]:
import requests
from bs4 import BeautifulSoup
import json

# URL of the website to scrape
url = "https://thehackernews.com"  # Use the main URL of the website you want to scrape

# Fetch the page content
response = requests.get(url)
if response.status_code == 200:
    # Parse the content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # List to store the scraped data
    articles = []

    # Find all article containers (assuming 'body-post clear' class wraps each post)
    for post in soup.find_all('div', class_='body-post clear'):
        # Extract the article title
        title_tag = post.find('h2', class_='home-title')
        title = title_tag.text.strip() if title_tag else 'No Title'

        # Extract the article link
        link_tag = post.find('a', class_='story-link')
        link = link_tag['href'] if link_tag else 'No Link'

        # Extract the tags
        tags_span = post.find('span', class_='h-tags')
        tags = tags_span.text.strip() if tags_span else 'No Tags'

        # Extract the description
        desc_div = post.find('div', class_='home-desc')
        description = desc_div.text.strip() if desc_div else 'No Description'

        # Store each article's information as a dictionary
        article_info = {
            'title': title,
            'link': link,
            'tags': tags,
            'description': description
        }

        # Append to the articles list
        articles.append(article_info)

    # Save the data to a JSON file
    with open('articles.json', 'w') as json_file:
        json.dump(articles, json_file, indent=4)

    print("Data has been scraped and saved to articles.json")
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")


Data has been scraped and saved to articles.json


In [None]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime

# Define the URL
url = "https://thehackernews.com/"

# Send a request to the website
response = requests.get(url)
response.raise_for_status()  # Raise an error if request failed

# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Prepare a list to store article data
articles_data = []

# Find articles on the main page
articles = soup.find_all('div', class_='body-post clear')

for article in articles:
    # Extract title
    title_tag = article.find('h2', class_='home-title')
    title = title_tag.text if title_tag else "Unknown"

    # Extract URL
    url_tag = article.find('a', class_='story-link')
    article_url = url_tag['href'] if url_tag else "Unknown"

    # Extract description
    description_tag = article.find('div', class_='home-desc')
    description = description_tag.text.strip() if description_tag else "Description not available"

    # Extract tags (if available)
    tags = []
    tags_tag = article.find('span', class_='h-tags')
    if tags_tag:
        tags = [tag.strip() for tag in tags_tag.text.split('/')]

    # Extract publication date (if available)
    date_tag = article.find('span', class_='h-datetime')
    date_str = date_tag.text.strip() if date_tag else "Unknown"
    try:
        date = datetime.strptime(date_str, '%b %d, %Y').strftime('%Y-%m-%d')
    except:
        date = "Unknown"

    # Define incident type based on tags (example categorization)
    incident_type = "Unknown"
    if "Ransomware" in tags:
        incident_type = "Ransomware"
    elif "Phishing" in tags:
        incident_type = "Phishing"
    # Additional rules can be added for other incident types

    # Append the collected data to articles_data list
    articles_data.append({
        "title": title,
        "date": date,
        "tags": tags,
        "description": description,
        "url": article_url,
        "incident_type": incident_type,
        "threat_actor": "Unknown"  # This would likely require further analysis
    })

# Print the scraped data in JSON format
import json
print(json.dumps(articles_data, indent=4))


[
    {
        "title": "North Korean Hackers Target Crypto Firms with Hidden Risk Malware on macOS",
        "date": "Unknown",
        "tags": [
            "Cryptocurrency",
            "Malware"
        ],
        "description": "A threat actor with ties to the Democratic People's Republic of Korea (DPRK) has been observed targeting cryptocurrency-related businesses with a multi-stage malware capable of infecting Apple macOS devices .  Cybersecurity company SentinelOne, which dubbed the campaign Hidden Risk , attributed it with high confidence to BlueNoroff, which has been previously linked to malware families such as RustBucket , KANDYKORN , ObjCShellz , RustDoor  (aka Thiefbucket ), and TodoSwift .  The activity \"uses emails propagating fake news about cryptocurrency trends to infect targets via a malicious application disguised as a PDF file,\" researchers Raffaele Sabato, Phil Stokes, and Tom Hegel said  in a report shared with The Hacker News.  \"The campaign likely began as

In [22]:
class SecurityNewsScraper:
    def __init__(self):  # Fixed: __init__ instead of _init_
        self.urls = [
            'https://thehackernews.com/',
            'https://www.darkreading.com/latest-news',
            'https://www.csoonline.com/in/',
        ]

    async def fetch_page(self, session, url):
        """Fetch a single page"""
        try:
            async with session.get(url, ssl=True) as response:
                if response.status == 200:
                    return await response.text()
                else:
                    print(f"Error fetching {url}: Status {response.status}")
                    return None
        except Exception as e:
            print(f"Error fetching {url}: {str(e)}")
            return None

    def parse_hacker_news(self, soup):
        """Parse TheHackerNews articles"""
        articles = []
        for post in soup.find_all('div', class_='body-post'):
            try:
                title = post.find('h2', class_='home-title')
                date = post.find('div', class_='item-label')
                description = post.find('div', class_='home-desc')

                articles.append({
                    'date': date.text.strip() if date else None,
                    'title': title.text.strip() if title else None,
                    'description': description.text.strip() if description else None,
                    'source': 'TheHackerNews',
                    'scraped_at': datetime.now().isoformat()
                })
            except Exception as e:
                print(f"Error parsing HackerNews article: {str(e)}")
        return articles

    def parse_dark_reading(self, soup):
        """Parse DarkReading articles"""
        articles = []
        for post in soup.find_all('div', class_='article-content'):
            try:
                title = post.find('h3')
                date = post.find('time')
                description = post.find('p')

                articles.append({
                    'date': date.text.strip() if date else None,
                    'title': title.text.strip() if title else None,
                    'description': description.text.strip() if description else None,
                    'source': 'DarkReading',
                    'scraped_at': datetime.now().isoformat()
                })
            except Exception as e:
                print(f"Error parsing DarkReading article: {str(e)}")
        return articles

    def parse_csoonline(self, soup):
        """Parse CSOOnline articles"""
        articles = []
        for post in soup.find_all('div', class_='article-text'):
            try:
                title = post.find('h3')
                date = post.find('time')
                description = post.find('p')

                articles.append({
                    'date': date.text.strip() if date else None,
                    'title': title.text.strip() if title else None,
                    'description': description.text.strip() if description else None,
                    'source': 'CSOOnline',
                    'scraped_at': datetime.now().isoformat()
                })
            except Exception as e:
                print(f"Error parsing CSOOnline article: {str(e)}")
        return articles

    async def scrape_data(self):
        """Main scraping function"""
        async with aiohttp.ClientSession() as session:
            # Fetch all pages concurrently
            tasks = [self.fetch_page(session, url) for url in self.urls]
            pages = await asyncio.gather(*tasks)

            all_articles = []

            # Parse each page based on its source
            for url, page_content in zip(self.urls, pages):
                if page_content:
                    soup = BeautifulSoup(page_content, 'html.parser')

                    if 'thehackernews.com' in url:
                        articles = self.parse_hacker_news(soup)
                    elif 'darkreading.com' in url:
                        articles = self.parse_dark_reading(soup)
                    elif 'csoonline.com' in url:
                        articles = self.parse_csoonline(soup)
                    else:
                        articles = []

                    all_articles.extend(articles)

            return all_articles

    def real_time_scraping(self, interval_minutes=10):
        """Run continuous scraping with specified interval"""
        while True:
            try:
                print(f"\nStarting scraping at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}...")

                # Run the async scraping
                loop = asyncio.get_event_loop()
                articles = loop.run_until_complete(self.scrape_data())

                # Convert to DataFrame
                df = pd.DataFrame(articles)

                # Clean and process the data
                df = df.dropna(subset=['title'])  # Remove entries without titles
                df = df.drop_duplicates(subset=['title'])  # Remove duplicates

                # Save to JSON
                filename = f'security_news_{datetime.now().strftime("%Y%m%d_%H%M%S")}.json'
                df.to_json(filename, orient='records', indent=2)
                print(f"Saved {len(df)} articles to {filename}")

                # Display sample of results
                print("\nLatest articles:")
                display(df[['source', 'title']].head())

                print(f"\nWaiting {interval_minutes} minutes before next scrape...")
                time.sleep(interval_minutes * 60)

            except Exception as e:
                print(f"Error in scraping cycle: {str(e)}")
                print("Waiting 1 minute before retrying...")
                time.sleep(60)

# Usage
scraper = SecurityNewsScraper()
scraper.real_time_scraping(interval_minutes=10)



Starting scraping at 2024-11-12 12:54:30...
Saved 10 articles to security_news_20241112_125430.json

Latest articles:


Unnamed: 0,source,title
0,TheHackerNews,5 Ways Behavioral Analytics is Revolutionizing...
1,TheHackerNews,New Ymir Ransomware Exploits Memory for Stealt...
2,TheHackerNews,"THN Recap: Top Cybersecurity Threats, Tools, a..."
3,TheHackerNews,New GootLoader Campaign Targets Users Searchin...
4,TheHackerNews,The ROI of Security Investments: How Cybersecu...



Waiting 10 minutes before next scrape...


KeyboardInterrupt: 