**Install necessary modules**

In [None]:
!pip install pandas numpy requests beautifulsoup4 tldextract python-whois scikit-learn transformers torch tensorflow nanoid gensim


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import requests
import tldextract
import whois
from datetime import datetime
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import re
from transformers import pipeline
from sklearn.ensemble import RandomForestClassifier
from gensim.models import Word2Vec
import tensorflow as tf
import nanoid
from PIL import Image
from io import BytesIO


In [None]:
# Create comprehensive Malayalam news sources database
def create_trusted_sources_db():
    """Build the lookup table of known Malayalam news outlets and fact checkers.

    Returns:
        dict: Maps a site's host name to a metadata dict. Every entry has
        ``name`` and ``credibility_score``; individual entries additionally
        carry ``established`` and ``headquarters``, ``region``, or ``type``.
    """
    # Newspapers, TV channels and portals
    major_news_sources = {
        'manoramaonline.com': dict(name='Malayala Manorama', credibility_score=0.95, established=1888, headquarters='Kottayam'),
        'mathrubhumi.com': dict(name='Mathrubhumi', credibility_score=0.95, established=1923, headquarters='Kozhikode'),
        'madhyamam.com': dict(name='Madhyamam', credibility_score=0.90, established=1987, headquarters='Calicut'),
        'deshabhimani.com': dict(name='Deshabhimani', credibility_score=0.85, established=1942, headquarters='Thiruvananthapuram'),
        'keralakaumudi.com': dict(name='Kerala Kaumudi', credibility_score=0.90, established=1911, headquarters='Thiruvananthapuram'),
        'chandrikadaily.com': dict(name='Chandrika', credibility_score=0.80, region='Kozhikode'),
        'janmabhumi.in': dict(name='Janmabhumi', credibility_score=0.80, region='Thiruvananthapuram'),
        'sirajlive.com': dict(name='Siraj Daily', credibility_score=0.75, region='Kozhikode'),
        'metrovaartha.com': dict(name='Metro Vaartha', credibility_score=0.75, region='Kochi'),
        'southlive.in': dict(name='South Live', credibility_score=0.70),
        'thejasnews.com': dict(name='Thejas News', credibility_score=0.70),
        '24newslive.com': dict(name='24 News', credibility_score=0.75),
        'asianetnews.com': dict(name='Asianet News', credibility_score=0.85),
        'deepika.com': dict(name='Deepika', credibility_score=0.85),
        'janamtv.com': dict(name='Janam TV', credibility_score=0.75),
        'reporter.live': dict(name='Reporter', credibility_score=0.75),
        'mangalam.com': dict(name='Mangalam', credibility_score=0.75),
        'keralabhooshanam.com': dict(name='Kerala Bhooshanam', credibility_score=0.70),
        'suprabhaatham.com': dict(name='Suprabhaatham', credibility_score=0.70),
        'malayalamvaarthakal.com': dict(name='Malayalam Vaarthakal', credibility_score=0.70),
    }

    # Dedicated fact-checking desks (keyed by their full sub-domain)
    fact_checkers = {
        'malayalam.factcrescendo.com': dict(name='Fact Crescendo Malayalam', credibility_score=0.90, type='Fact Checker'),
        'malayalam.vishvasnews.com': dict(name='Vishvas News Malayalam', credibility_score=0.85, type='Fact Checker'),
        'factcheck.malayalam.samayam.com': dict(name='Samayam Fact Check', credibility_score=0.85, type='Fact Checker'),
    }

    return {**major_news_sources, **fact_checkers}


In [None]:
# Enhanced Link2Vec Model Implementation
class Link2VecModel:
    """Tokenises URLs (plus page titles) and turns them into fixed-size vectors.

    Attributes:
        embedding_dim: Length of the vector returned by ``get_url_embedding``.
        model: Optional embedding model; ``None`` until one is assigned.
            NOTE(review): ``get_url_embedding`` assumes a gensim ``Word2Vec``
            style object exposing ``.wv`` - confirm against the training code.
        headers: HTTP headers sent with every page request.
    """

    def __init__(self, embedding_dim=100):
        self.embedding_dim = embedding_dim
        self.model = None
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

    def extract_webpage_content(self, url):
        """Download ``url`` and pull out its title, text and first image URLs.

        Returns:
            dict with ``title`` (str), ``text`` (str) and ``images`` (list of
            at most five absolute URLs), or ``None`` if anything goes wrong
            (the error is printed, not raised).
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            soup = BeautifulSoup(response.content, 'html.parser')

            # `.string` is None for an empty <title> or one with nested tags;
            # always hand callers a string.
            title = (soup.title.string if soup.title else "") or ""
            text_content = ' '.join(p.text for p in soup.find_all(['p', 'article', 'section']))

            # urljoin leaves absolute URLs untouched and resolves relative ones
            images = [
                urljoin(url, img.get('src'))
                for img in soup.find_all('img')
                if img.get('src')
            ]

            return {
                'title': title,
                'text': text_content,
                'images': images[:5]
            }
        except Exception as e:
            # Best-effort: callers treat None as "no content available"
            print(f"Error extracting content: {str(e)}")
            return None

    def preprocess_url(self, url):
        """Split ``url`` into tokens and append the lower-cased page-title words.

        Note that this fetches the page to read its title.

        Returns:
            list[str]: Non-empty tokens: scheme, host, path segments, query
            pairs, fragment, then title words (when a title was found).
        """
        parsed = urlparse(url)
        components = [
            parsed.scheme,
            parsed.netloc,
            *parsed.path.split('/'),
            *parsed.query.split('&'),
            parsed.fragment
        ]

        content = self.extract_webpage_content(url)
        if content and content['title']:
            components.extend(content['title'].lower().split())

        return [c for c in components if c]

    def get_url_embedding(self, url):
        """Return an ``embedding_dim``-long vector for ``url``.

        The vector is the mean of the model's vectors for every token of
        ``preprocess_url(url)`` that the model knows. When no model has been
        assigned, or none of the tokens is known, a zero vector is returned so
        that downstream feature dicts always have the same shape.
        """
        tokens = self.preprocess_url(url)
        if self.model is None:
            return np.zeros(self.embedding_dim)

        word_vectors = self.model.wv
        known = [word_vectors[token] for token in tokens if token in word_vectors]
        if not known:
            return np.zeros(self.embedding_dim)
        return np.mean(known, axis=0)


In [None]:
# URL Feature Extractor with Enhanced Content Analysis
class URLFeatureExtractor:
    """Turns a URL into a flat feature dict (lexical, content, WHOIS, embedding).

    Attributes:
        trusted_sources: Host-name -> metadata table from
            ``create_trusted_sources_db()``.
        suspicious_tlds: Public suffixes treated as a negative signal.
        link2vec: Object providing ``extract_webpage_content(url)`` and
            ``get_url_embedding(url)``.
    """

    def __init__(self, link2vec_model):
        self.trusted_sources = create_trusted_sources_db()
        self.suspicious_tlds = ['xyz', 'top', 'buzz', 'guru', 'club', 'online']
        self.link2vec = link2vec_model

    def extract_features(self, url):
        """Compute all features for ``url``.

        Returns:
            dict of feature name -> value (``embedding_<i>`` keys hold the
            embedding components), or ``None`` if extraction failed (the
            error is printed, not raised).
        """
        try:
            parsed_url = urlparse(url)
            extracted = tldextract.extract(url)  # parse once, reuse below
            domain = extracted.registered_domain

            content = self.link2vec.extract_webpage_content(url)

            # Some trusted entries are keyed by a full sub-domain, which the
            # registered domain alone can never match - check both forms.
            is_trusted = (
                domain in self.trusted_sources
                or extracted.fqdn in self.trusted_sources
            )

            features = {
                'url_length': len(url),
                'domain_length': len(domain),
                'path_length': len(parsed_url.path),
                'num_dots': url.count('.'),
                'num_hyphens': url.count('-'),
                'num_underscores': url.count('_'),
                'num_slashes': url.count('/'),
                'num_equals': url.count('='),
                'num_digits': sum(c.isdigit() for c in url),
                'has_https': int(url.startswith('https://')),
                'is_trusted_domain': int(is_trusted),
                'has_suspicious_tld': int(extracted.suffix in self.suspicious_tlds),
                'has_title': int(bool(content and content['title'])),
                'has_images': int(bool(content and content['images'])),
                'content_length': len(content['text']) if content else 0
            }

            embedding = self.link2vec.get_url_embedding(url)
            embedding_features = {f'embedding_{i}': v for i, v in enumerate(embedding)}

            features['domain_age'] = self._lookup_domain_age(domain)

            return {**features, **embedding_features}

        except Exception as e:
            print(f"Error extracting features: {str(e)}")
            return None

    def _lookup_domain_age(self, domain):
        """Return the domain's age in days from WHOIS, or -1 when unknown."""
        try:
            creation_date = whois.whois(domain).creation_date
            if isinstance(creation_date, list):
                creation_date = creation_date[0]
            # Use the record's own tzinfo so aware and naive dates both subtract
            return (datetime.now(creation_date.tzinfo) - creation_date).days
        except Exception:
            # WHOIS is best-effort: lookup failures and missing or unparsed
            # dates all mean "age unknown".
            return -1


In [None]:
class URLFeatureExtractor:
    """Turns a URL into a flat feature dict (lexical, WHOIS, embedding).

    NOTE: this cell re-defines ``URLFeatureExtractor``; once it runs it
    replaces the earlier definition in the kernel. This version does not
    compute the page-content features (``has_title``, ``has_images``,
    ``content_length``).

    Attributes:
        trusted_sources: Host-name -> metadata table from
            ``create_trusted_sources_db()``.
        suspicious_tlds: Public suffixes treated as a negative signal.
        link2vec: Object providing ``get_url_embedding(url)``.
    """

    def __init__(self, link2vec_model):
        self.trusted_sources = create_trusted_sources_db()
        self.suspicious_tlds = ['xyz', 'top', 'buzz', 'guru', 'club', 'online']
        self.link2vec = link2vec_model

    def extract_features(self, url):
        """Compute all features for ``url``.

        Returns:
            dict of feature name -> value (``embedding_<i>`` keys hold the
            embedding components), or ``None`` if extraction failed (the
            error is printed, not raised).
        """
        try:
            parsed_url = urlparse(url)
            extracted = tldextract.extract(url)  # parse once, reuse below
            domain = extracted.registered_domain

            # Some trusted entries are keyed by a full sub-domain, which the
            # registered domain alone can never match - check both forms.
            is_trusted = (
                domain in self.trusted_sources
                or extracted.fqdn in self.trusted_sources
            )

            # Basic features
            basic_features = {
                'url_length': len(url),
                'domain_length': len(domain),
                'path_length': len(parsed_url.path),
                'num_dots': url.count('.'),
                'num_hyphens': url.count('-'),
                'num_underscores': url.count('_'),
                'num_slashes': url.count('/'),
                'num_equals': url.count('='),
                'num_digits': sum(c.isdigit() for c in url),
                'has_https': int(url.startswith('https://')),
                'is_trusted_domain': int(is_trusted),
                'has_suspicious_tld': int(extracted.suffix in self.suspicious_tlds)
            }

            # Add Link2Vec embedding
            embedding = self.link2vec.get_url_embedding(url)
            embedding_features = {f'embedding_{i}': v for i, v in enumerate(embedding)}

            # Add domain age
            basic_features['domain_age'] = self._lookup_domain_age(domain)

            # Combine all features
            return {**basic_features, **embedding_features}

        except Exception as e:
            print(f"Error extracting features: {str(e)}")
            return None

    def _lookup_domain_age(self, domain):
        """Return the domain's age in days from WHOIS, or -1 when unknown."""
        try:
            creation_date = whois.whois(domain).creation_date
            if isinstance(creation_date, list):
                creation_date = creation_date[0]
            # Use the record's own tzinfo so aware and naive dates both subtract
            return (datetime.now(creation_date.tzinfo) - creation_date).days
        except Exception:
            # WHOIS is best-effort: lookup failures and missing or unparsed
            # dates all mean "age unknown".
            return -1

In [None]:
class URLCredibilityAnalyzer:
    """Scores a URL's credibility from the trusted-source table and URL features.

    Attributes:
        feature_extractor: ``URLFeatureExtractor`` used for features and for
            its ``trusted_sources`` table.
        classifier: An unfitted ``RandomForestClassifier``; nothing in this
            class trains or queries it.
    """

    def __init__(self, link2vec_model):
        self.feature_extractor = URLFeatureExtractor(link2vec_model)
        self.classifier = RandomForestClassifier()

    def analyze_url(self, url):
        """Analyse ``url`` and return a result dict.

        The result always has ``credibility_score`` (0.0-1.0), ``status``
        (``'trusted'``, ``'analyzed'`` or ``'error'``) and ``warning_flags``.
        ``'trusted'`` and ``'analyzed'`` results also carry ``features``;
        ``'trusted'`` adds ``source_name``; ``'error'`` adds ``message``.
        Errors are reported in the dict, never raised.
        """
        try:
            extracted = tldextract.extract(url)
            features = self.feature_extractor.extract_features(url)

            if features is None:
                return {
                    'credibility_score': 0.0,
                    'status': 'error',
                    'message': 'Failed to extract features',
                    'warning_flags': []
                }

            # Trusted entries are keyed either by registered domain or by a
            # full sub-domain (the fact checkers), so try both forms.
            trusted_sources = self.feature_extractor.trusted_sources
            source_info = (
                trusted_sources.get(extracted.registered_domain)
                or trusted_sources.get(extracted.fqdn)
            )
            if source_info:
                return {
                    'credibility_score': source_info['credibility_score'],
                    'status': 'trusted',
                    'source_name': source_info['name'],
                    'features': features,
                    'warning_flags': []
                }

            # Blend the heuristic score with the embedding-based score
            link2vec_score = self._calculate_link2vec_score(features)
            base_score = 0.5

            if features.get('has_https', False):
                base_score += 0.1
            if features.get('domain_age', 0) > 365:
                base_score += 0.1
            if features.get('has_suspicious_tld', False):
                base_score -= 0.2

            final_score = (base_score + link2vec_score) / 2
            warning_flags = self._get_warning_flags(features)

            return {
                'credibility_score': max(min(final_score, 1.0), 0.0),
                'status': 'analyzed',
                'features': features,
                'warning_flags': warning_flags
            }

        except Exception as e:
            return {
                'credibility_score': 0.0,
                'status': 'error',
                'message': str(e),
                'warning_flags': []
            }

    def _calculate_link2vec_score(self, features):
        """Return the embedding-based credibility component in [0, 1].

        ``analyze_url`` calls this method, but the class previously did not
        define it, so every non-trusted URL fell into the error branch.
        No fitted model is available in this class to interpret the
        ``embedding_*`` features, so a neutral 0.5 is returned and the final
        score is driven by the heuristic part alone.
        NOTE(review): replace with a real model-based score once the
        classifier is trained.
        """
        return 0.5

    def _get_warning_flags(self, features):
        """List human-readable warnings derived from ``features``."""
        flags = []

        # Use get() with defaults so a partial feature dict cannot raise
        if not features.get('has_https', False):
            flags.append('No HTTPS security')
        # A negative age means "unknown" (WHOIS failed), which is not evidence
        # of a recent registration - only flag genuinely young domains.
        if 0 <= features.get('domain_age', -1) < 180:
            flags.append('Recently registered domain')
        if features.get('has_suspicious_tld', False):
            flags.append('Suspicious top-level domain')

        return flags


In [None]:
class URLExistenceChecker:
    """Probes whether a URL answers over HTTP(S).

    Attributes:
        session: ``requests.Session`` reused for all probes.
        timeout: Per-request timeout in seconds.
        headers: HTTP headers sent with every probe.
    """

    def __init__(self):
        self.session = requests.Session()
        self.timeout = 10
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

    def check_url_exists(self, url):
        """Probe ``url`` (``https://`` is prepended when no scheme is given).

        A HEAD request is tried first; if it does not answer 200, a GET is
        issued instead.

        Returns:
            On any HTTP response: dict with ``exists`` (True), ``status_code``,
            ``accessible`` (status is 200), ``final_url``, ``is_redirect`` and
            ``content_type``.
            On a request failure: dict with ``exists`` (False), ``error`` and
            ``reason``.
        """
        try:
            if not url.startswith(('http://', 'https://')):
                url = 'https://' + url

            response = self.session.head(
                url,
                timeout=self.timeout,
                allow_redirects=True,
                headers=self.headers
            )

            if response.status_code != 200:
                response.close()
                # Only status and headers are needed, so stream the GET
                # instead of downloading the whole body.
                response = self.session.get(
                    url,
                    timeout=self.timeout,
                    allow_redirects=True,
                    headers=self.headers,
                    stream=True
                )

            try:
                return {
                    'exists': True,
                    'status_code': response.status_code,
                    'accessible': response.status_code == 200,
                    'final_url': response.url,
                    'is_redirect': len(response.history) > 0,
                    'content_type': response.headers.get('content-type', '')
                }
            finally:
                # Hand the connection back to the session's pool
                response.close()

        # Timeout must be tested before ConnectionError: a connect timeout
        # inherits from both and would otherwise be reported as a plain
        # connection failure.
        except requests.Timeout:
            return {
                'exists': False,
                'error': 'Timeout',
                'reason': 'Request timed out'
            }
        except requests.ConnectionError:
            return {
                'exists': False,
                'error': 'Connection failed',
                'reason': 'Unable to connect to server'
            }
        except requests.RequestException as e:
            return {
                'exists': False,
                'error': 'Request failed',
                'reason': str(e)
            }

In [None]:
class ContentScraper:
    """Downloads an article page and extracts its main fields.

    Attributes:
        headers: HTTP headers sent with the page request.
    """

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

    def scrape_content(self, url):
        """Fetch ``url`` and return its extracted content.

        Returns:
            dict with ``title``, ``meta_description``, ``article_text``,
            ``images``, ``author``, ``publish_date`` and ``social_shares``;
            or, on any failure, ``{'error': <message>, 'status': 'failed'}``.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=15)
            soup = BeautifulSoup(response.content, 'html.parser')

            title = self._extract_title(soup)
            meta_description = self._extract_meta_description(soup)
            images = self._extract_images(soup, url)
            author = self._extract_author(soup)
            publish_date = self._extract_publish_date(soup)
            social_shares = self._extract_social_shares(soup)
            # Must run last: it strips <header>/<nav>/<footer> from the soup,
            # which would otherwise hide bylines, dates and images from the
            # extractors above.
            article_text = self._extract_article_text(soup)

            return {
                'title': title,
                'meta_description': meta_description,
                'article_text': article_text,
                'images': images,
                'author': author,
                'publish_date': publish_date,
                'social_shares': social_shares
            }

        except Exception as e:
            return {
                'error': str(e),
                'status': 'failed'
            }

    def _extract_title(self, soup):
        """Return the first <h1> text if there is one, else the <title> text."""
        h1 = soup.find('h1')
        if h1:
            return h1.text.strip()
        # `.string` is None for an empty <title> or one with nested tags
        return (soup.title.string if soup.title else "") or ""

    def _extract_meta_description(self, soup):
        """Return the page description from standard or Open Graph meta tags."""
        # Open Graph tags normally use `property`, plain ones use `name`
        candidates = (
            {'name': 'description'},
            {'property': 'og:description'},
            {'name': 'og:description'},
        )
        for attrs in candidates:
            meta = soup.find('meta', attrs=attrs)
            if meta and meta.get('content'):
                return meta['content']
        return ""

    def _extract_article_text(self, soup):
        """Return the page's paragraph text, one paragraph per line.

        Removes script/style/nav/footer/header elements from ``soup`` in place.
        """
        # Remove unwanted elements
        for element in soup.find_all(['script', 'style', 'nav', 'footer', 'header']):
            element.decompose()

        # Visit each <p> exactly once. Walking every div/section/article and
        # collecting its descendants repeated a paragraph once per ancestor.
        paragraphs = (p.text.strip() for p in soup.find_all('p'))
        return '\n'.join(text for text in paragraphs if text)

    def _extract_images(self, soup, base_url):
        """Return one dict (url, alt, width, height) per <img> that has a src."""
        images = []
        for img in soup.find_all('img'):
            src = img.get('src', '')
            if src:
                images.append({
                    # urljoin keeps absolute URLs and resolves relative ones
                    'url': urljoin(base_url, src),
                    'alt': img.get('alt', ''),
                    'width': img.get('width', ''),
                    'height': img.get('height', '')
                })
        return images

    def _extract_author(self, soup):
        """Return the author from meta tags, itemprop markup or byline classes."""
        # The markers are alternatives; requiring all of them on a single tag
        # (as one combined attribute filter does) matches almost nothing.
        tag = (
            soup.find('meta', attrs={'name': 'author'})
            or soup.find(attrs={'itemprop': 'author'})
            or soup.find(['span', 'div', 'a'], class_=['author', 'byline'])
        )
        if not tag:
            return ""
        # <meta> carries the value in `content`; other tags in their text
        return (tag.get('content') or tag.get_text(strip=True) or "").strip()

    def _extract_publish_date(self, soup):
        """Return the publication timestamp string, or "" when none is found."""
        candidates = (
            {'property': 'article:published_time'},
            {'property': 'og:published_time'},
            {'itemprop': 'datePublished'},
        )
        for attrs in candidates:
            tag = soup.find(['meta', 'time'], attrs=attrs)
            if tag:
                # <meta> uses `content`, <time> uses `datetime`
                value = tag.get('content') or tag.get('datetime')
                if value:
                    return value
        return ""

    def _extract_social_shares(self, soup):
        """Return share counters; ``total`` sums every numeric counter found.

        The per-network keys are always present and stay 0 (the markup does
        not say which network a counter belongs to); ``total`` is only added
        when at least one numeric counter exists.
        """
        shares = {
            'facebook': 0,
            'twitter': 0,
            'whatsapp': 0
        }
        share_elements = soup.find_all(['span', 'div'], {
            'class': ['share-count', 'social-count']
        })
        for element in share_elements:
            count_text = element.text.strip()
            if count_text.isdigit():
                # Accumulate instead of letting the last counter win
                shares['total'] = shares.get('total', 0) + int(count_text)
        return shares


In [None]:
def verify_url(url, link2vec_model):
    """Check that ``url`` is reachable, analyse it, and print a report.

    Args:
        url: Address to verify; ``https://`` is prepended when no scheme is
            given.
        link2vec_model: Model handed to ``URLCredibilityAnalyzer``.

    Returns:
        - The existence-check dict (``exists`` False, ``error``, ``reason``)
          when the URL cannot be reached.
        - ``None`` when fetching or analysing the page raises.
        - Otherwise a dict with ``existence_check``, ``credibility_check``,
          ``content_info``, ``security_info``, ``final_score``,
          ``is_accessible`` and ``is_trusted``.
    """
    # Normalise up front so the page fetch and the HTTPS checks below see the
    # same URL the existence checker probes.
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url

    # First check URL existence
    existence_checker = URLExistenceChecker()
    existence_result = existence_checker.check_url_exists(url)

    print(f"\n📊 URL Analysis Results")
    print("=" * 50)
    print(f"🔗 URL: {url}")
    print("-" * 50)

    if not existence_result['exists']:
        print("❌ Status: Not Accessible")
        print(f"Error: {existence_result['error']}")
        print(f"Reason: {existence_result['reason']}")
        return existence_result

    # Extract webpage content
    try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Get title and metadata; `.string` and `content` may both be absent
        title = (soup.title.string if soup.title else None) or "No title found"
        meta_desc = soup.find('meta', {'name': 'description'})
        description = (meta_desc.get('content') if meta_desc else None) or "No description found"

        # Count images and extract their sources
        images = soup.find_all('img')
        image_count = len(images)
        image_sources = [img.get('src', '') for img in images if img.get('src')]

        # Get article date if available
        date_meta = soup.find('meta', {'property': ['article:published_time', 'og:published_time']})
        pub_date = date_meta.get('content') if date_meta else None

        # Perform Link2Vec and credibility analysis
        analyzer = URLCredibilityAnalyzer(link2vec_model)
        result = analyzer.analyze_url(url)

        # Print comprehensive results
        print(f"✅ Status: {'Accessible' if existence_result['accessible'] else 'Not Accessible'}")
        print(f"📝 Title: {title}")
        print(f"🖼️ Images Found: {image_count}")
        print(f"📅 Published Date: {pub_date if pub_date else 'Not found'}")
        print(f"🔒 HTTPS: {'Yes' if url.startswith('https://') else 'No'}")
        print(f"↪️ Redirects: {'Yes' if existence_result['is_redirect'] else 'No'}")

        if existence_result['is_redirect']:
            print(f"➡️ Final URL: {existence_result['final_url']}")

        print("\n📊 Credibility Analysis")
        print("-" * 50)

        if result['status'] == 'trusted':
            print(f"✅ Trusted Source: {result['source_name']}")
            print(f"⭐ Credibility Score: {result['credibility_score']:.2f}")
            # 'established' lives in the trusted-source table, not in the
            # extracted features, so look it up there.
            extracted = tldextract.extract(url)
            trusted_sources = analyzer.feature_extractor.trusted_sources
            source_info = (
                trusted_sources.get(extracted.registered_domain)
                or trusted_sources.get(extracted.fqdn)
                or {}
            )
            if 'established' in source_info:
                print(f"📅 Established: {source_info['established']}")

        elif result['status'] == 'analyzed':
            print(f"⭐ Credibility Score: {result['credibility_score']:.2f}")

        if result['warning_flags']:
            print("\n⚠️ Warning Flags:")
            for flag in result['warning_flags']:
                print(f"• {flag}")

        # Additional security checks
        print("\n🔒 Security Analysis")
        print("-" * 50)
        ssl_valid = existence_result.get('status_code') == 200 and url.startswith('https://')
        print(f"SSL Certificate: {'✅ Valid' if ssl_valid else '❌ Invalid/Missing'}")

        return {
            'existence_check': existence_result,
            'credibility_check': result,
            'content_info': {
                'title': title,
                'description': description,
                'image_count': image_count,
                'image_sources': image_sources[:5],  # First 5 images
                'published_date': pub_date
            },
            'security_info': {
                'ssl_valid': ssl_valid,
                'is_https': url.startswith('https://'),
                # The existence result has no 'history' entry; count the hops
                # recorded on the page fetch made above instead.
                'redirect_count': len(response.history)
            },
            'final_score': result.get('credibility_score', 0.0),
            'is_accessible': existence_result['accessible'],
            'is_trusted': result['status'] == 'trusted'
        }

    except Exception as e:
        print(f"❌ Error analyzing content: {str(e)}")
        return None

# Test the enhanced verification
# Create the shared model here: nothing earlier in the notebook defines
# `link2vec_model`, so without this the cell fails on a fresh kernel.
link2vec_model = Link2VecModel()

test_urls = [
"https://www.manoramaonline.com/sports/other-sports/2024/12/29/historic-win-kerala-in-national-senior-handball-championship-final.html",
"https://www.sirajlive.com/plane-crash-in-south-korea-28-dead.html",
]

for url in test_urls:
    result = verify_url(url, link2vec_model)
    print("\n" + "="*50)



📊 URL Analysis Results
🔗 URL: https://www.manoramaonline.com/sports/other-sports/2024/12/29/historic-win-kerala-in-national-senior-handball-championship-final.html
--------------------------------------------------
✅ Status: Accessible
📝 Title: ദേശീയ സീനിയർ പുരുഷ ഹാൻഡ്‌ബോൾ ചാംപ്യൻഷിപ്: സർവീസസിനെ തകർത്ത് കേരളം ഫൈനലിൽ | Manorama Online News - Handball | Changanassery | Sports News
🖼️ Images Found: 14
📅 Published Date: Not found
🔒 HTTPS: Yes
↪️ Redirects: No

📊 Credibility Analysis
--------------------------------------------------
✅ Trusted Source: Malayala Manorama
⭐ Credibility Score: 0.95

🔒 Security Analysis
--------------------------------------------------
SSL Certificate: ✅ Valid


📊 URL Analysis Results
🔗 URL: https://www.sirajlive.com/plane-crash-in-south-korea-28-dead.html
--------------------------------------------------
✅ Status: Accessible
📝 Title: ദക്ഷിണ കൊറിയയില്‍ വിമാന ദുരന്തം; 179 പേര്‍ മരിച്ചു | Sirajlive.com
🖼️ Images Found: 44
📅 Published Date: Not found
🔒 HTTPS: Ye

In [None]:
def verify_and_scrape_url(url, link2vec_model, preview_chars=500):
    """Run ``verify_url`` and, if the page is accessible, scrape its content.

    Args:
        url: Address to verify and scrape.
        link2vec_model: Model passed through to ``verify_url``.
        preview_chars: Maximum number of article characters printed as the
            preview.

    Returns:
        Whatever ``verify_url`` returned (``None`` on failure). When the page
        was accessible, the scraper's result is attached under ``'content'``,
        including its error dict if scraping failed.
    """
    # First verify the URL
    verification_result = verify_url(url, link2vec_model)

    if verification_result is None:
        return None

    # For an unreachable URL verify_url returns the bare existence-check dict,
    # which has no 'is_accessible' key.
    if verification_result.get('is_accessible'):
        scraper = ContentScraper()
        content = scraper.scrape_content(url)

        print("\n📄 Content Analysis")
        print("-" * 50)
        if 'error' in content:
            # The scraper reports failures as {'error': ..., 'status': 'failed'}
            print(f"Error: {content['error']}")
        else:
            print(f"Title: {content['title']}")
            print(f"Author: {content['author']}")
            print(f"Published: {content['publish_date']}")
            print(f"Images Found: {len(content['images'])}")
            # Print a real preview rather than the whole article
            print(f"\nArticle Preview: {content['article_text'][:preview_chars]}...")

        # Add content to verification result
        verification_result['content'] = content

    return verification_result

# Run the combined verification + scraping flow on a sample article.
# Relies on `link2vec_model` already existing in the kernel namespace.
test_urls = [
    "https://www.mathrubhumi.com/special-pages/mt-vasudevan-nair/articles/mv-shreyams-kumar-writes-about-mt-vasudevan-nair-1.10178804"

]

for url in test_urls:
    result = verify_and_scrape_url(url, link2vec_model)
    print("\n" + "="*50)



📊 URL Analysis Results
🔗 URL: https://www.mathrubhumi.com/special-pages/mt-vasudevan-nair/articles/mv-shreyams-kumar-writes-about-mt-vasudevan-nair-1.10178804
--------------------------------------------------
✅ Status: Accessible
📝 Title: 'സ്വന്തം കൈപ്പടയില്‍ കത്തെഴുതുമായിരുന്ന പത്രാധിപര്‍ ഏത് സ്ഥാപനത്തിനാണ് അഭിമാനമാവാത്തത്!', MT Vasudevan Nair death, Indian author, malayalam script writer, passed away, MT vasudevan nair age
🖼️ Images Found: 25
📅 Published Date: 2024-12-25T05:01:00Z
🔒 HTTPS: Yes
↪️ Redirects: No

📊 Credibility Analysis
--------------------------------------------------
✅ Trusted Source: Mathrubhumi
⭐ Credibility Score: 0.95

🔒 Security Analysis
--------------------------------------------------
SSL Certificate: ✅ Valid

📄 Content Analysis
--------------------------------------------------
Title: 'സ്വന്തം കൈപ്പടയില്‍ കത്തെഴുതുമായിരുന്ന പത്രാധിപര്‍ ഏത് സ്ഥാപനത്തിനാണ് അഭിമാനമാവാത്തത്!'
Author: 
Published: 
Images Found: 25

Article Preview: 
MALAYALAM
ENGLISH
Newspaper
