

*   Vrutika Prajapati
*   U01994496




In [2]:
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline


In [2]:
def rate_url_validity(user_query: str, url: str) -> dict:
    """
    Evaluates the validity of a given URL by computing various metrics including
    domain trust, content relevance, fact-checking, bias, and citation scores.

    Args:
        user_query (str): The user's original query.
        url (str): The URL to analyze.

    Returns:
        dict: A dictionary containing scores for different validity aspects.
    """

    # === Step 1: Fetch Page Content ===
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        page_text = " ".join([p.text for p in soup.find_all("p")])  # Extract paragraph text
    except Exception as e:
        return {"error": f"Failed to fetch content: {str(e)}"}

    # === Step 2: Domain Authority Check (Moz API) ===
    # Replace with actual Moz API call
    domain_trust = 60  # Placeholder value (Scale: 0-100)

    # === Step 3: Content Relevance (Semantic Similarity using Hugging Face) ===
    model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
    similarity_score = util.pytorch_cos_sim(model.encode(user_query), model.encode(page_text)).item() * 100

    # === Step 4: Fact-Checking (Google Fact Check API) ===
    fact_check_score = check_facts(page_text)

     # === Step 5: Bias Detection (NLP Sentiment Analysis) ===
    sentiment_pipeline = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment")
    sentiment_result = sentiment_pipeline(page_text[:512])[0]  # Process first 512 characters
    bias_score = 100 if sentiment_result["label"] == "POSITIVE" else 50 if sentiment_result["label"] == "NEUTRAL" else 30

    # === Step 6: Citation Check (Google Scholar via SerpAPI) ===
    citation_count = check_google_scholar(url)
    citation_score = min(citation_count * 10, 100)  # Normalize

    # === Step 7: Compute Final Validity Score ===
    final_score = (
        (0.3 * domain_trust) +
        (0.3 * similarity_score) +
        (0.2 * fact_check_score) +
        (0.1 * bias_score) +
        (0.1 * citation_score)
    )

    return {
        "Domain Trust": domain_trust,
        "Content Relevance": similarity_score,
        "Fact-Check Score": fact_check_score,
        "Bias Score": bias_score,
        "Citation Score": citation_score,
        "Final Validity Score": final_score
    }

In [3]:
# === Helper Function: Fact-Checking via Google API ===
def check_facts(text: str) -> int:
    """
    Cross-checks text against Google Fact Check API.
    Returns a score between 0-100 indicating factual reliability.
    """
    api_url = f"https://toolbox.google.com/factcheck/api/v1/claimsearch?query={text[:200]}"
    try:
        response = requests.get(api_url)
        data = response.json()
        if "claims" in data and data["claims"]:
            return 80  # If found in fact-checking database
        return 40  # No verification found
    except:
        return 50  # Default uncertainty score

In [4]:
# === Helper Function: Citation Count via Google Scholar API ===
def check_google_scholar(url: str) -> int:
    """
    Checks Google Scholar citations using SerpAPI.
    Returns the count of citations found.
    """
    serpapi_key = "YOUR_SERPAPI_KEY"
    params = {"q": url, "engine": "google_scholar", "api_key": serpapi_key}
    try:
        response = requests.get("https://serpapi.com/search", params=params)
        data = response.json()
        return len(data.get("organic_results", []))
    except:
        return -1  # Assume no citations found

In [5]:
user_prompt = "I have just been on an international flight, can I come back home to hold my 1-month-old newborn?"
url_to_check = "https://www.bhtp.com/blog/when-safe-to-travel-with-newborn/"

result = rate_url_validity(user_prompt, url_to_check)
print(result)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Device set to use cpu


{'Domain Trust': 60, 'Content Relevance': 56.91344738006592, 'Fact-Check Score': 50, 'Bias Score': 30, 'Citation Score': 0, 'Final Validity Score': 48.074034214019775}


In [None]:
!pip install textstat

In [8]:
import requests
from bs4 import BeautifulSoup
from textstat import flesch_kincaid_grade
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.sentiment import SentimentIntensityAnalyzer
import re
from datetime import datetime
import nltk
nltk.download('vader_lexicon')

def rate_url_validity(user_query: str, url: str) -> dict:
    """
    Evaluates the validity of a given URL based on various factors without external APIs.

    Args:
        user_query (str): The user's original query.
        url (str): The URL to analyze.

    Returns:
        dict: A dictionary containing scores for different validity aspects.
    """

    # === Step 1: Fetch Page Content ===
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        page_text = " ".join([p.text for p in soup.find_all("p")])  # Extract paragraph text
    except Exception as e:
        return {"error": f"Failed to fetch content: {str(e)}"}

    # === Step 2: Content Readability (Flesch-Kincaid Grade) ===
    readability_score = flesch_kincaid_grade(page_text)

    # === Step 3: Keyword Matching (TF-IDF) ===
    vectorizer = TfidfVectorizer(stop_words="english")
    tfidf_matrix = vectorizer.fit_transform([user_query, page_text])
    similarity_score = tfidf_matrix[0, :].dot(tfidf_matrix[1, :].T).toarray()[0][0] * 100

    # === Step 4: Content Length & Depth ===
    content_length = len(page_text.split())  # Number of words
    content_depth = len(page_text.split('\n'))  # Number of paragraphs (based on line breaks)

    # === Step 5: External vs. Internal Links (Citation Quality) ===
    external_links_count = 0
    internal_links_count = 0
    for link in soup.find_all("a", href=True):
        href = link['href']
        if href.startswith("http"):
            external_links_count += 1
        else:
            internal_links_count += 1

    # === Step 6: Page Update Frequency (Last Modified) ===
    last_modified = response.headers.get('Last-Modified', None)
    if last_modified:
        last_modified_date = datetime.strptime(last_modified, "%a, %d %b %Y %H:%M:%S GMT")
        page_update_score = (datetime.now() - last_modified_date).days
    else:
        page_update_score = 365  # Assume outdated if no last modified date

    # === Step 7: Clickbait Detection (Basic Keyword Filter) ===
    clickbait_keywords = ['shocking', 'unbelievable', 'you won\'t believe', 'this one trick', 'amazing']
    clickbait_score = sum([1 for keyword in clickbait_keywords if re.search(r'\b' + keyword + r'\b', page_text.lower())]) * 10

    # === Step 8: Author Credibility ===
    author_name = None
    for author_tag in soup.find_all(["meta", "span"], {"name": "author"}):
        author_name = author_tag.get("content") or author_tag.text.strip()
    author_score = 80 if author_name else 0  # If author found, consider 80 score

    # === Step 9: Sentiment Consistency ===
    sentiment_analyzer = SentimentIntensityAnalyzer()
    sentiment_scores = sentiment_analyzer.polarity_scores(page_text)
    sentiment_score = 100 if sentiment_scores['compound'] >= 0 else 50

    # === Step 10: Advertisement & Sponsored Content Detection ===
    ad_keywords = ['advertisement', 'sponsored', 'buy now', 'promoted']
    ad_score = sum([1 for keyword in ad_keywords if re.search(r'\b' + keyword + r'\b', page_text.lower())]) * 10

    # === Step 11: Fake Reviews or User-Generated Content Detection ===
    user_generated_keywords = ['review', 'testimonial', 'comment']
    review_score = sum([1 for keyword in user_generated_keywords if re.search(r'\b' + keyword + r'\b', page_text.lower())]) * 10

    # === Step 12: Compute Final Validity Score ===
    final_score = (
        (0.1 * readability_score) +
        (0.2 * similarity_score) +
        (0.1 * content_length) +
        (0.1 * content_depth) +
        (0.15 * external_links_count) +
        (0.05 * page_update_score) +
        (0.1 * clickbait_score) +
        (0.1 * author_score) +
        (0.05 * sentiment_score) +
        (0.05 * ad_score) +
        (0.05 * review_score)
    )

    return {
        "Readability Score": readability_score,
        "Keyword Matching Score": similarity_score,
        "Content Length": content_length,
        "Content Depth": content_depth,
        "External Links": external_links_count,
        "Page Update Frequency": page_update_score,
        "Clickbait Score": clickbait_score,
        "Author Credibility Score": author_score,
        "Sentiment Score": sentiment_score,
        "Ad Score": ad_score,
        "Review Score": review_score,
        "Final Validity Score": final_score
    }

# Example Usage
user_query = "I have just been on an international flight, can I come back home to hold my 1-month-old newborn?"
url_to_check = "https://www.bhtp.com/blog/when-safe-to-travel-with-newborn/"

result = rate_url_validity(user_query, url_to_check)
print(result)


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


{'Readability Score': 8.7, 'Keyword Matching Score': 14.043013790557927, 'Content Length': 4537, 'Content Depth': 1, 'External Links': 36, 'Page Update Frequency': 6, 'Clickbait Score': 10, 'Author Credibility Score': 0, 'Sentiment Score': 100, 'Ad Score': 0, 'Review Score': 10, 'Final Validity Score': 469.67860275811165}


In [9]:


user_prompt = "I have just been on an international flight, can I come back home to hold my 1-month-old newborn?"
url_to_check = "https://www.quora.com/How-soon-can-I-take-my-newborn-with-me-when-I-fly-internationally"

result = rate_url_validity(user_prompt, url_to_check)
print(result)

{'Readability Score': 1.5, 'Keyword Matching Score': 0.0, 'Content Length': 9, 'Content Depth': 1, 'External Links': 0, 'Page Update Frequency': 365, 'Clickbait Score': 0, 'Author Credibility Score': 0, 'Sentiment Score': 50, 'Ad Score': 0, 'Review Score': 0, 'Final Validity Score': 21.9}


In [10]:
user_prompt = "I have just been on an international flight, can I come back home to hold my 1-month-old newborn?"
url_to_check = "https://www.mayoclinic.org/healthy-lifestyle/infant-and-toddler-health/expert-answers/air-travel-with-infant/faq-20058539"

result = rate_url_validity(user_prompt, url_to_check)
print(result)

{'Readability Score': 9.1, 'Keyword Matching Score': 1.6545745424836964, 'Content Length': 711, 'Content Depth': 2, 'External Links': 108, 'Page Update Frequency': 365, 'Clickbait Score': 0, 'Author Credibility Score': 0, 'Sentiment Score': 100, 'Ad Score': 0, 'Review Score': 10, 'Final Validity Score': 112.49091490849675}
