

*   Vrutika Prajapati
*   U01994496




In [52]:
import requests
import json
import re
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

class URLValidator:
    """
    A production-ready URL validation class that evaluates the credibility of a webpage
    using multiple factors: domain trust, content relevance, fact-checking, bias detection, and citations.
    """

    def __init__(self):
        self.serpapi_key = '3ca058cff0f926a3db441da5c1aae0868ffc7b98d95c9040e0989a1dfd918390'

        # Load models once to avoid redundant API calls
        self.similarity_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
        self.fake_news_classifier = pipeline("text-classification", model="mrm8488/bert-tiny-finetuned-fake-news-detection")
        self.sentiment_analyzer = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment")

    def fetch_page_content(self, url: str) -> str:
        """Fetches and extracts text content from the given URL with error handling."""
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            return " ".join([p.text for p in soup.find_all("p")])
        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}")
            return ""
    def save_to_csv(self, user_prompt: str, url_to_check: str, func_rating: float, custom_rating: float):
        with open('credibility_scores.csv', mode='a', newline='') as file:
            writer = csv.writer(file)
            writer.writerow([user_prompt, url_to_check, func_rating, custom_rating])
    def preprocess_text(self, text: str) -> str:
        """Cleans text by removing stopwords, special characters, and lowercasing."""
        text = re.sub(r'\W+', ' ', text.lower())
        stop_words = set(stopwords.words('english'))
        return " ".join([word for word in text.split() if word not in stop_words])

    def get_domain_trust(self, content: str) -> int:
        """Computes the domain trust score based on content credibility."""
        if not content:
            return 50  # Default score
        result = self.fake_news_classifier(content[:512])[0]
        return 100 if result["label"] == "REAL" else 30 if result["label"] == "FAKE" else 50

    def compute_similarity_score(self, user_query: str, content: str) -> int:
        """Computes semantic similarity between user query and page content."""
        if not content:
            return 0
        query_clean = self.preprocess_text(user_query)
        content_clean = self.preprocess_text(content)
        return int(util.pytorch_cos_sim(self.similarity_model.encode(query_clean),
                                        self.similarity_model.encode(content_clean)).item() * 100)

    def check_facts(self, content: str) -> int:
        """Cross-checks extracted content with Google Fact Check API."""
        if not content:
            return 50
        api_url = f"https://toolbox.google.com/factcheck/api/v1/claimsearch?query={content[:200]}"
        try:
            response = requests.get(api_url)
            data = response.json()
            return 80 if "claims" in data and data["claims"] else 40
        except:
            return 50

    def detect_bias(self, content: str) -> int:
        """Uses NLP sentiment analysis to detect potential bias in content."""
        if not content:
            return 50
        sentiment_result = self.sentiment_analyzer(content[:512])[0]
        return 100 if sentiment_result["label"] == "POSITIVE" else 50 if sentiment_result["label"] == "NEUTRAL" else 30

    def check_google_scholar(self, url: str) -> int:
        """Checks Google Scholar citations using SerpAPI."""
        params = {"q": url, "engine": "google_scholar", "api_key": self.serpapi_key}
        try:
            response = requests.get("https://serpapi.com/search", params=params)
            data = response.json()
            return min(len(data.get("organic_results", [])) * 10, 100)
        except:
            return 0
    def get_domain_trust_huggingface(self, content: str) -> int:
      """ Uses a Hugging Face fake news detection model to assess credibility with summarized content. """
      if not content:
          return 50
      summarized_content = summarize_text(content)
      result = self.fake_news_classifier(summarized_content[:512])[0]  # Process only first 512 characters
      return 100 if result["label"] == "REAL" else 30 if result["label"] == "FAKE" else 50


    def get_star_rating(self, score: float) -> tuple:
        """Converts a score (0-100) into a 1-5 star rating."""
        stars = max(1, min(5, round(score / 20)))
        return stars, "⭐" * stars

    def generate_explanation(self, domain_trust, similarity_score, fact_check_score, bias_score, citation_score) -> str:
        """Generates a human-readable explanation for the score."""
        reasons = []
        if domain_trust < 50:
            reasons.append("The source has low domain authority.")
        if similarity_score < 50:
            reasons.append("The content is not highly relevant to your query.")
        if fact_check_score < 50:
            reasons.append("Limited fact-checking verification found.")
        if bias_score < 50:
            reasons.append("Potential bias detected in the content.")
        if citation_score < 30:
            reasons.append("Few citations found for this content.")
        return " ".join(reasons) if reasons else "This source is highly credible and relevant."

    def rate_url_validity(self, user_query: str, url: str) -> dict:
        """Main function to evaluate the validity of a webpage."""
        content = self.fetch_page_content(url)
        domain_trust = self.get_domain_trust(content)
        similarity_score = self.compute_similarity_score(user_query, content)
        fact_check_score = self.check_facts(content)
        bias_score = self.detect_bias(content)
        citation_score = self.check_google_scholar(url)

        final_score = (
            (0.3 * domain_trust) +
            (0.3 * similarity_score) +
            (0.2 * fact_check_score) +
            (0.1 * bias_score) +
            (0.1 * citation_score)
        )

        stars, icon = self.get_star_rating(final_score)
        explanation = self.generate_explanation(domain_trust, similarity_score, fact_check_score, bias_score, citation_score)

        return {
            "raw_score": {
                "Domain Trust": domain_trust,
                "Content Relevance": similarity_score,
                "Fact-Check Score": fact_check_score,
                "Bias Score": bias_score,
                "Citation Score": citation_score,
                "Final Validity Score": final_score
            },
            "stars": {
                "score": stars,
                "icon": icon
            },
            "explanation": explanation
        }



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [53]:


# Instantiate the URLValidator class
validator = URLValidator()

# Define user prompt and URL
user_prompt = "I have just been on an international flight, can I come back home to hold my 1-month-old newborn?"
url_to_check = "https://www.mayoclinic.org/healthy-lifestyle/infant-and-toddler-health/expert-answers/air-travel-with-infant/faq-20058539"

# Run the validation
result = validator.rate_url_validity(user_prompt, url_to_check)

# Print the results
import json
print(json.dumps(result, indent=2))

Device set to use cpu
Device set to use cpu


{
  "raw_score": {
    "Domain Trust": 50,
    "Content Relevance": 46,
    "Fact-Check Score": 50,
    "Bias Score": 30,
    "Citation Score": 0,
    "Final Validity Score": 41.8
  },
  "stars": {
    "score": 2,
    "icon": "\u2b50\u2b50"
  },
  "explanation": "The content is not highly relevant to your query. Potential bias detected in the content. Few citations found for this content."
}


In [42]:
import csv
import os

# Instantiate the URLValidator class
validator = URLValidator()

# Define user prompt and URL
user_prompt = "I have just been on an international flight, can I come back home to hold my 1-month-old newborn?"
url_to_check = "https://www.mayoclinic.org/healthy-lifestyle/infant-and-toddler-health/expert-answers/air-travel-with-infant/faq-20058539"

# Run the validation
result = validator.rate_url_validity(user_prompt, url_to_check)

# Define the CSV file path
csv_file_path = 'url_validation_results.csv'

# Check if the file exists (for appending or creating)
file_exists = os.path.isfile(csv_file_path)

# Open the CSV file in append mode
with open(csv_file_path, mode='a', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    # If the file doesn't exist, write the header
    if not file_exists:
        writer.writerow([
            "User Prompt", "URL", "Star Rating", "Final Validity Score"
        ])

    # Write the current data row
    writer.writerow([
        user_prompt,  # User prompt
        url_to_check,  # URL to check
        result['stars']['score'],  # Star rating
        result['raw_score']['Final Validity Score']  # Final validity score
    ])

print(f"CSV file saved at {csv_file_path}")


Device set to use cpu
Device set to use cpu


CSV file saved at url_validation_results.csv


In [63]:
import csv
import os

# Instantiate the URLValidator class
validator = URLValidator()

# Define user prompt and URL
user_prompt = "what is AI"
url_to_check = "https://cloud.google.com/learn/what-is-artificial-intelligence"
# Run the validation
result = validator.rate_url_validity(user_prompt, url_to_check)
print(result)
# Define the CSV file path
csv_file_path = 'url_validation_results.csv'

# Check if the file exists (for appending or creating)
file_exists = os.path.isfile(csv_file_path)

# Open the CSV file in append mode
with open(csv_file_path, mode='a', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    # If the file doesn't exist, write the header
    if not file_exists:
        writer.writerow([
            "User Prompt", "URL", "Star Rating", "Final Validity Score"
        ])

    # Write the current data row
    writer.writerow([
        user_prompt,  # User prompt
        url_to_check,  # URL to check
        result['stars']['score'],  # Star rating
        result['raw_score']['Final Validity Score']  # Final validity score
    ])

print(f"CSV file saved at {csv_file_path}")


Device set to use cpu
Device set to use cpu


{'raw_score': {'Domain Trust': 50, 'Content Relevance': 54, 'Fact-Check Score': 50, 'Bias Score': 30, 'Citation Score': 0, 'Final Validity Score': 44.2}, 'stars': {'score': 2, 'icon': '⭐⭐'}, 'explanation': 'Potential bias detected in the content. Few citations found for this content.'}
CSV file saved at url_validation_results.csv


In [64]:
# Open the CSV file and read its contents
with open(csv_file_path, mode='r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file)

    # Iterate over the rows in the CSV file and print them
    for row in reader:
        print(row)


['User Prompt', 'URL', 'Star Rating', 'Final Validity Score']
['I have just been on an international flight, can I come back home to hold my 1-month-old newborn?', 'https://www.mayoclinic.org/healthy-lifestyle/infant-and-toddler-health/expert-answers/air-travel-with-infant/faq-20058539', '2', '42.2']
['What are the benefits of a vegetarian diet?', 'https://www.nhs.uk/live-well/eat-well/how-to-eat-a-balanced-diet/the-vegetarian-diet/', '2', '40.0']
['What are the benefits of a vegetarian diet?', 'https://www.nhs.uk/live-well/eat-well/how-to-eat-a-balanced-diet/the-vegetarian-diet/', '2', '40.0']
['What are the benefits of a vegetarian diet?', 'https://pubmed.ncbi.nlm.nih.gov/37226630/#:~:text=Plant%2Dbased%20diets%20have%20the,the%20risk%20of%20cardiovascular%20disease.', '2', '43.8']
['How to improve mental health during stressful times?', 'https://www.who.int/news-room/questions-and-answers/item/stress#:~:text=Stress%20is%20a%20natural%20human,experiences%20stress%20to%20some%20degree