In [10]:
import os
os.environ["GOOGLE_API_KEY"] = "AIzaSyC29gObkycJDBjVkEWjhJoJO-HVB0pC00E"

In [None]:
!pip install wikipedia langchain_google_genai

In [12]:
import re
from datetime import datetime
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline
import wikipedia
from langchain import PromptTemplate, LLMChain
from langchain_google_genai import GoogleGenerativeAI
import google.generativeai as genai
from concurrent.futures import ThreadPoolExecutor
import os

class EnhancedFactChecker:
    def __init__(self, google_api_key):
        # Initialize NLTK
        self._setup_nltk()

        # Initialize Gemini
        genai.configure(api_key=google_api_key)
        self.gemini = GoogleGenerativeAI(
            model="gemini-pro",
            google_api_key=google_api_key,
            temperature=0.1
        )

        # Initialize ML pipeline for zero-shot classification
        self.classifier = pipeline("zero-shot-classification")

        # Initialize TF-IDF vectorizer
        self.vectorizer = TfidfVectorizer(stop_words='english')

        # Setup LangChain prompt template
        self.fact_check_prompt = PromptTemplate(
            input_variables=["claim"],
            template="""
            Please analyze the following claim and provide a detailed fact-check:
            Claim: {claim}

            Provide your analysis in the following format:
            1. Verification status
            2. Supporting evidence
            3. Confidence score (0-100)
            4. Sources
            """
        )
        self.chain = LLMChain(llm=self.gemini, prompt=self.fact_check_prompt)

    def _setup_nltk(self):
        """Download required NLTK resources"""
        resources = ['punkt', 'stopwords', 'averaged_perceptron_tagger']
        for resource in resources:
            try:
                nltk.data.find(f'tokenizers/{resource}')
            except LookupError:
                nltk.download(resource)

    def _search_wikipedia(self, query, max_results=3):
        """Search Wikipedia for relevant information"""
        try:
            # Search for relevant Wikipedia pages
            search_results = wikipedia.search(query, results=max_results)
            wiki_data = []

            for title in search_results:
                try:
                    page = wikipedia.page(title, auto_suggest=False)
                    wiki_data.append({
                        'title': page.title,
                        'content': page.summary,
                        'url': page.url
                    })
                except wikipedia.exceptions.DisambiguationError as e:
                    continue
                except wikipedia.exceptions.PageError:
                    continue

            return wiki_data
        except Exception as e:
            print(f"Wikipedia search error: {str(e)}")
            return []

    def _analyze_with_gemini(self, claim, context=""):
        """Use Gemini for advanced analysis"""
        prompt = f"""
        Analyze the following claim for factual accuracy:
        Claim: {claim}

        Additional context: {context}

        Please provide:
        1. Factual accuracy assessment
        2. Key points of verification
        3. Potential misinformation indicators
        4. Confidence level (0-100)
        """

        try:
            response = self.gemini.generate_text(prompt)
            return response.text
        except Exception as e:
            print(f"Gemini analysis error: {str(e)}")
            return None

    def _check_claim_probability(self, claim):
        """Use zero-shot classification to assess claim probability"""
        try:
            result = self.classifier(
                claim,
                candidate_labels=["fact", "opinion", "misinformation"],
                hypothesis_template="This text is {}."
            )
            return {
                'labels': result['labels'],
                'scores': result['scores']
            }
        except Exception as e:
            print(f"Classification error: {str(e)}")
            return None

    def _analyze_temporal_consistency(self, claim):
        """Analyze temporal aspects and future claims"""
        date_pattern = r'\b\d{4}\b|\b\d{1,2}/\d{1,2}/\d{4}\b'
        dates = re.findall(date_pattern, claim)
        current_year = datetime.now().year

        temporal_analysis = {
            'dates_found': dates,
            'has_future_dates': False,
            'temporal_inconsistencies': []
        }

        for date in dates:
            try:
                year = int(date) if len(date) == 4 else int(date.split('/')[-1])
                if year > current_year:
                    temporal_analysis['has_future_dates'] = True
                    temporal_analysis['temporal_inconsistencies'].append(
                        f"Claims future date: {year}"
                    )
            except ValueError:
                continue

        return temporal_analysis

    def comprehensive_fact_check(self, claim):
        """Perform comprehensive fact-checking using multiple methods"""
        results = {
            'claim': claim,
            'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'analyses': {}
        }

        # Parallel execution of different analysis methods
        with ThreadPoolExecutor() as executor:
            # Submit all analysis tasks
            wiki_future = executor.submit(self._search_wikipedia, claim)
            probability_future = executor.submit(self._check_claim_probability, claim)
            temporal_future = executor.submit(self._analyze_temporal_consistency, claim)

            # Get Wikipedia data
            wiki_data = wiki_future.result()
            if wiki_data:
                context = "\n".join([d['content'] for d in wiki_data])
            else:
                context = ""

            # Submit Gemini analysis with Wikipedia context
            gemini_future = executor.submit(self._analyze_with_gemini, claim, context)

            # Collect all results
            results['analyses']['wikipedia'] = {
                'found_articles': len(wiki_data),
                'articles': wiki_data
            }

            results['analyses']['probability'] = probability_future.result()
            results['analyses']['temporal'] = temporal_future.result()
            results['analyses']['gemini'] = gemini_future.result()

        # Use LangChain for final analysis
        try:
            langchain_analysis = self.chain.run(claim=claim)
            results['analyses']['langchain'] = langchain_analysis
        except Exception as e:
            print(f"LangChain analysis error: {str(e)}")
            results['analyses']['langchain'] = None

        # Calculate final credibility score
        credibility_score = self._calculate_credibility_score(results['analyses'])
        results['credibility_score'] = credibility_score
        results['verdict'] = self._get_verdict(credibility_score)

        return results

    def _calculate_credibility_score(self, analyses):
        """Calculate overall credibility score based on all analyses"""
        score = 1.0

        # Adjust score based on probability analysis
        if analyses['probability']:
            fact_score = analyses['probability']['scores'][
                analyses['probability']['labels'].index('fact')
            ]
            misinfo_score = analyses['probability']['scores'][
                analyses['probability']['labels'].index('misinformation')
            ]
            score *= (fact_score / (fact_score + misinfo_score))

        # Adjust for temporal inconsistencies
        if analyses['temporal']['has_future_dates']:
            score *= 0.5

        # Adjust based on Wikipedia findings
        if analyses['wikipedia']['found_articles'] > 0:
            score *= 1.2

        # Cap score between 0 and 1
        return max(0.0, min(1.0, score))

    def _get_verdict(self, credibility_score):
        """Convert credibility score to verdict"""
        if credibility_score > 0.8:
            return "Highly Likely True"
        elif credibility_score > 0.6:
            return "Likely True"
        elif credibility_score > 0.4:
            return "Uncertain"
        elif credibility_score > 0.2:
            return "Likely False"
        else:
            return "Highly Likely False"

# Example usage
def main():
    # Get API key from environment variable
    google_api_key = os.getenv('GOOGLE_API_KEY')
    if not google_api_key:
        raise ValueError("Please set GOOGLE_API_KEY environment variable")

    fact_checker = EnhancedFactChecker(google_api_key)

    # Example claims
    claims = [
        "The Earth is flat and scientists are hiding the truth!",
        "Water boils at 100 degrees Celsius at sea level.",
        "A new species of dinosaur was discovered on Mars in 2025!"
    ]

    for claim in claims:
        print(f"\nAnalyzing claim: {claim}")
        results = fact_checker.comprehensive_fact_check(claim)

        print(f"\nVerdict: {results['verdict']}")
        print(f"Credibility Score: {results['credibility_score']:.2f}")

        print("\nDetailed Analysis:")
        print("- Wikipedia References:",
              results['analyses']['wikipedia']['found_articles'])

        if results['analyses']['probability']:
            print("- Classification Probabilities:")
            for label, score in zip(
                results['analyses']['probability']['labels'],
                results['analyses']['probability']['scores']
            ):
                print(f"  {label}: {score:.2f}")

        print("- Temporal Analysis:", results['analyses']['temporal'])
        print("- Gemini Analysis:", results['analyses']['gemini'])
        print("- LangChain Analysis:", results['analyses']['langchain'])

if __name__ == "__main__":
    main()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
No model was supplied, defaulted to facebook/bart-large-mnli and revision d7645e1 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

  self.chain = LLMChain(llm=self.gemini, prompt=self.fact_check_prompt)



Analyzing claim: The Earth is flat and scientists are hiding the truth!
Gemini analysis error: 'GoogleGenerativeAI' object has no attribute 'generate_text'


  langchain_analysis = self.chain.run(claim=claim)



Verdict: Highly Likely False
Credibility Score: 0.05

Detailed Analysis:
- Wikipedia References: 3
- Classification Probabilities:
  misinformation: 0.71
  opinion: 0.26
  fact: 0.03
- Temporal Analysis: {'dates_found': [], 'has_future_dates': False, 'temporal_inconsistencies': []}
- Gemini Analysis: None
- LangChain Analysis: **1. Verification status:** False

**2. Supporting evidence:**

* **Scientific evidence:** The Earth's curvature has been proven through numerous scientific observations and experiments, including:
    * The horizon appears curved when viewed from a high altitude.
    * Ships disappear over the horizon bottom-first as they sail away.
    * The Earth's shadow on the moon during a lunar eclipse is always round.
* **Historical evidence:** Explorers have circumnavigated the globe, proving that it is not flat.
* **Technological evidence:** Satellites and other spacecraft have captured images of the Earth from space, clearly showing its spherical shape.

**3. Confiden