#Custom news Analysis

In [None]:
pip install praw faiss-gpu textstat transformers sentence-transformers

In [None]:
import requests
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from sentence_transformers import SentenceTransformer, util
import re
import spacy
from textstat import flesch_reading_ease
import faiss
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

# API credentials for the two news sources queried by fetch_related_content
# (Guardian Content API and NewsAPI). Replace the placeholders before running.
guardian_api_key = "Enter_your_api"
news_api_key = "Enter_your_api"


# Models and Tools Setup
# tokenizer/model: t5-small seq2seq pair used by summarize_content.
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
# Sentence encoder used both for the FAISS index and for cosine similarity.
similarity_model = SentenceTransformer('all-MiniLM-L6-v2')
# Default Hugging Face sentiment pipeline (no explicit model is named here).
sentiment_analyzer = pipeline('sentiment-analysis')
# spaCy pipeline used for entity extraction and dependency parsing.
nlp = spacy.load("en_core_web_sm")

# FAISS Index Setup
embedding_dim = 384  # Dimension of 'all-MiniLM-L6-v2' embeddings
index = faiss.IndexFlatL2(embedding_dim)

# Global Variables to Store Articles
# Both lists are appended to in lock-step with index.add(), so position i in
# article_store corresponds to row i of the FAISS index.
article_store = []  # To store articles for retrieval
article_embeddings = []  # To store embeddings

# 1. Text Cleaning Function
def clean_text(text):
    """Normalise raw article text before it is embedded or summarised.

    Applies three regex substitutions in order: HTML tags are removed, every
    character that is neither a word character nor whitespace is removed, and
    whitespace runs are collapsed to a single space. The result is stripped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    substitutions = (
        (r'<.*?>', ''),      # HTML tags
        (r'[^\w\s]', ''),    # punctuation and other special characters
        (r'\s+', ' '),       # runs of whitespace
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def _get_json(url, params, timeout=10):
    """Return the decoded JSON body of a GET request, or {} if it fails."""
    try:
        response = requests.get(url, params=params, timeout=timeout)
        return response.json()
    except (requests.RequestException, ValueError) as exc:
        # One unreachable or malformed source must not stop the other source
        # from being indexed, so report the failure and carry on.
        print(f"Request to {url} failed: {exc}")
        return {}


def _index_article(title, snippet):
    """Store "title - snippet" and add its embedding to the FAISS index."""
    combined_text = f"{title} - {snippet}"
    article_store.append(combined_text)
    embedding = similarity_model.encode(clean_text(combined_text), convert_to_tensor=False)
    article_embeddings.append(embedding)
    index.add(np.array([embedding], dtype=np.float32))


def fetch_related_content(query):
    """Fetch articles matching ``query`` and index them for later retrieval.

    Queries the Guardian Content API and NewsAPI, and for every usable result
    appends "title - snippet" to ``article_store``, its embedding to
    ``article_embeddings`` and the same embedding to the FAISS ``index``.

    Guardian results are indexed when they have a non-empty ``standfirst``;
    NewsAPI results are indexed when both ``title`` and ``description`` are
    non-empty. A failed request is reported and treated as "no results".

    Args:
        query: Search string sent to both APIs.

    Returns:
        None. The function works through its side effects on the module-level
        store and index.
    """
    guardian_payload = _get_json(
        "https://content.guardianapis.com/search",
        {
            "q": query,
            "api-key": guardian_api_key,
            "show-fields": "headline,standfirst",
        },
    )
    articles_guardian = guardian_payload.get('response', {}).get('results', [])

    newsapi_payload = _get_json(
        "https://newsapi.org/v2/everything",
        {
            "q": query,
            "apiKey": news_api_key,
            "language": "en",
            "sortBy": "relevancy",
        },
    )
    articles_newsapi = newsapi_payload.get('articles', [])

    # Process Guardian API articles
    for article in articles_guardian:
        title = article.get("webTitle", "")
        # "fields" is only present when the API returns the requested fields;
        # fall back to an empty mapping instead of raising KeyError.
        snippet = (article.get("fields") or {}).get("standfirst", "")
        if snippet:
            _index_article(title, snippet)

    # Process NewsAPI articles
    for article in articles_newsapi:
        title = article.get("title", "")
        snippet = article.get("description", "")
        if title and snippet:
            _index_article(title, snippet)
def summarize_content(content):
    """Summarise ``content`` with the module-level ``tokenizer``/``model`` pair.

    The text is cleaned with ``clean_text``, prefixed with "summarize: ",
    tokenised with truncation enabled and passed to ``model.generate`` with a
    maximum output length of 100 tokens.

    Args:
        content: Raw text to summarise.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    prompt = "summarize: " + clean_text(content)
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True)
    generated = model.generate(encoded['input_ids'], max_length=100)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# 4. Knowledge Graph Construction
def create_knowledge_graph(summary):
    """Build, draw and save a dependency-based graph for ``summary``.

    The text is parsed with the module-level spaCy pipeline. Named entities
    labelled ORG, GPE, LOC or PERSON are added as nodes. For every noun or
    proper noun whose head is a verb and whose dependency label is in the
    relation table, a directed edge head -> noun is added. Nodes that take no
    part in any such edge are then removed.

    The graph is drawn with matplotlib, written to "knowledge_graph.png" in
    the working directory and shown.

    Args:
        summary: Text to analyse.

    Returns:
        list: The texts of all nodes that are an endpoint of at least one
        edge. The list is built from a set, so its order is not defined.
    """
    parsed = nlp(summary)
    kg = nx.DiGraph()

    # Human-readable names for the dependency labels that produce an edge.
    relation_names = {
        "nsubj": "subject of",  # Subject
        "dobj": "object of",  # Direct object
        "prep": "related to",  # Preposition
        "amod": "describes",  # Adjective modifier
        "pobj": "prepositional object",  # Prepositional object
        "advmod": "modifies",  # Adverbial modifier
        "ROOT": "main action",  # Main verb (root of the sentence)
        "attr": "attribute of",  # Attribute (usually a noun)
        "acomp": "complement",  # Adjective complement
    }

    # Seed the graph with the named entities that are allowed to be nodes.
    node_entity_labels = ('ORG', 'GPE', 'LOC', 'PERSON')
    for entity in parsed.ents:
        if entity.label_ in node_entity_labels:
            kg.add_node(entity.text, label=entity.label_)

    # Link each noun to its governing verb and remember both endpoints.
    linked = set()
    for tok in parsed:
        if tok.pos_ not in ('NOUN', 'PROPN'):
            continue
        if tok.dep_ not in relation_names or tok.head.pos_ != "VERB":
            continue
        kg.add_edge(tok.head.text, tok.text, label=relation_names[tok.dep_])
        linked.update((tok.head.text, tok.text))

    # Drop every node that never became an edge endpoint.
    isolated = [name for name in kg.nodes if name not in linked]
    kg.remove_nodes_from(isolated)

    # Draw, save to disk, then display.
    plt.figure(figsize=(12, 8))
    layout = nx.spring_layout(kg)
    nx.draw(
        kg,
        layout,
        with_labels=True,
        node_size=3000,
        node_color="lightblue",
        font_size=10,
        font_weight="bold",
    )
    plt.title("Knowledge Graph - Noun Nodes Only")
    plt.savefig("knowledge_graph.png", format="PNG")
    plt.show()

    return list(linked)

# 5. Scores Calculation
def sentiment_consistency(input_text, related_snippet):
    """Report whether two texts receive the same sentiment label.

    Both texts are run through the module-level ``sentiment_analyzer`` and the
    ``label`` of the first result of each is compared.

    Args:
        input_text: The news text under test.
        related_snippet: A retrieved article snippet to compare against.

    Returns:
        int: 1 when the two labels are equal, otherwise 0.
    """
    labels = [
        sentiment_analyzer(text)[0]['label']
        for text in (input_text, related_snippet)
    ]
    return int(labels[0] == labels[1])

def fact_density_score(text):
    """Return the number of named entities per whitespace-separated word.

    Entities are taken from the module-level spaCy pipeline ``nlp``.

    Args:
        text: Text to score.

    Returns:
        float: ``len(entities) / len(words)``, or 0.0 when ``text`` contains
        no words (empty or whitespace-only), which previously raised
        ZeroDivisionError.
    """
    word_count = len(text.split())
    if word_count == 0:
        # Nothing to measure; also avoids running the parser on empty input.
        return 0.0
    doc = nlp(text)
    return len(doc.ents) / word_count

def readability_score(text):
    """Return textstat's ``flesch_reading_ease`` value for ``text``."""
    score = flesch_reading_ease(text)
    return score

def lexical_diversity_score(text):
    """Return the ratio of distinct words to total words in ``text``.

    Words are whitespace-separated tokens and are compared case-sensitively,
    punctuation included.

    Args:
        text: Text to score.

    Returns:
        float: ``len(set(words)) / len(words)``, or 0.0 when ``text`` contains
        no words, which previously raised ZeroDivisionError.
    """
    words = text.split()
    if not words:
        return 0.0
    return len(set(words)) / len(words)

# 6. Aggregation
def final_verdict(scores, weights):
    """Return the weighted mean of ``scores``.

    Scores and weights are paired positionally with ``zip``, so any surplus
    items in the longer sequence are ignored.

    Args:
        scores: Numeric scores.
        weights: Numeric weights, one per score.

    Returns:
        The sum of ``score * weight`` divided by the sum of all ``weights``.
    """
    products = [value * factor for value, factor in zip(scores, weights)]
    weighted_total = sum(products)
    total_weight = sum(weights)
    return weighted_total / total_weight
def retrieve_similar_articles(news_text, top_k=3):
    """Return the stored articles nearest to ``news_text`` in the FAISS index.

    Args:
        news_text: Query text; it is cleaned with ``clean_text`` and embedded
            with the module-level ``similarity_model``.
        top_k: Maximum number of neighbours to request.

    Returns:
        list: Entries of ``article_store`` for the neighbours found, in the
        order returned by the index. May hold fewer than ``top_k`` items.
    """
    query_embedding = similarity_model.encode(clean_text(news_text), convert_to_tensor=False)
    query = np.array([query_embedding], dtype=np.float32)
    _, indices = index.search(query, top_k)
    # FAISS pads the result with -1 when fewer than top_k vectors are indexed.
    # A bare "idx < len(...)" check lets -1 through and silently returns the
    # last stored article, so the lower bound has to be checked as well.
    return [
        article_store[idx]
        for idx in indices[0]
        if 0 <= idx < len(article_store)
    ]
# 7. Main Function to Detect Fake News
def is_fake_news(news_text):
    """Classify ``news_text`` by comparing it with indexed reference articles.

    The two nearest stored articles are retrieved. Their concatenation is
    summarised and turned into a knowledge graph (drawn and saved by
    ``create_knowledge_graph``). Each article snippet is then summarised on
    its own and compared with ``news_text`` by cosine similarity; the highest
    similarity decides the verdict. Sentiment consistency, fact density,
    readability and lexical diversity are printed for information only.

    Args:
        news_text: The claim or headline to check.

    Returns:
        str: "Real News" (similarity > 0.4), "Likely Real News" (> 0.25),
        "Unverified" (> 0.2, or nothing indexed / retrieved), otherwise
        "Likely Fake News".
    """
    if not index.is_trained or len(article_store) == 0:
        print("No related content indexed. Cannot verify.")
        return "Unverified"

    similar_articles = retrieve_similar_articles(news_text, top_k=2)
    if not similar_articles:
        print("No similar articles found. Cannot verify.")
        return "Unverified"

    # Summarise the retrieved articles (top 2) together and draw their graph.
    concatenated_summary = summarize_content(" ".join(similar_articles))
    print("Generating Knowledge Graph for Concatenated Articles Summary...")
    create_knowledge_graph(concatenated_summary)

    # Everything below depends only on news_text, so compute it once instead
    # of once per retrieved article.
    news_embedding = similarity_model.encode(news_text, convert_to_tensor=True)
    fact_density = fact_density_score(news_text)
    readability = readability_score(news_text)
    lexical_diversity = lexical_diversity_score(news_text)

    max_similarity = 0  # Track maximum similarity score
    best_article_summary = ""  # Summary of the most similar article
    sentiment_scores = []

    for article_snippet in similar_articles:
        # Stored entries have the form "title - snippet"; only the snippet is used.
        _, snippet = article_snippet.split(" - ", 1)

        summary = summarize_content(snippet)
        similarity = util.pytorch_cos_sim(
            news_embedding,
            similarity_model.encode(summary, convert_to_tensor=True)
        ).item()

        if similarity > max_similarity:
            max_similarity = similarity
            best_article_summary = summary

        sentiment_scores.append(sentiment_consistency(news_text, snippet))

    avg_sentiment = sum(sentiment_scores) / len(sentiment_scores) if sentiment_scores else 0

    # The remaining three scores are properties of news_text alone, so their
    # "average" over the retrieved articles is the value itself.
    print(f"Average Scores:\n"
          f"Sentiment Consistency: {avg_sentiment}\n"
          f"Fact Density: {fact_density}\n"
          f"Readability: {readability}\n"
          f"Lexical Diversity: {lexical_diversity}\n")

    print(f"Maximum Similarity: {max_similarity}")
    print(f"Best Article Summary: {best_article_summary}")
    if max_similarity > 0.4:
        return "Real News"
    elif max_similarity > 0.25:
        return "Likely Real News"
    elif max_similarity > 0.2:
        return "Unverified"
    else:
        return "Likely Fake News"


# Example Usage
# First populate the article store / FAISS index with content related to the
# claim, then classify the claim against what was indexed.
fetch_related_content("Andrew garfield dates shraddha kapoor")
# The claim under test (the same wording as the search query above, wrapped in
# newlines by the triple-quoted literal).
news_text = """
Andrew garfield dates shraddha kapoor
"""

# is_fake_news also prints intermediate scores and draws the knowledge graph.
result = is_fake_news(news_text)
print(f"Verdict: {result}")


#Webscraping for custom news

In [None]:
from bs4 import BeautifulSoup
import urllib.request

def is_news_title(title):
    """Return True when ``title`` contains a news-related keyword.

    The check is a case-insensitive substring match against a fixed keyword
    list, so a keyword also matches inside a longer word.

    Args:
        title: Page title to inspect.

    Returns:
        bool: True if any keyword occurs in the lower-cased title.
    """
    lowered = title.lower()
    news_keywords = (
        "breaking", "live", "update", "report", "news",
        "analysis", "opinion", "interview", "world",
        "politics", "economy", "sports", "entertainment",
    )
    for keyword in news_keywords:
        if keyword in lowered:
            return True
    return False

def scrape_data(url):
    """Download ``url`` and extract its title and first paragraph.

    Args:
        url: Address of the page to fetch.

    Returns:
        tuple: ``(title, news_analysis, first_paragraph)`` on success, where
        ``news_analysis`` is the result of ``is_news_title(title)``. On any
        failure the tuple is ``(None, "An error occurred: ...", None)``, so
        callers can test ``title`` and print the second element as the error.
    """
    try:
        # The context manager closes the connection; the timeout keeps an
        # unresponsive host from blocking indefinitely.
        with urllib.request.urlopen(url, timeout=10) as response:
            html = response.read()
        soup = BeautifulSoup(html, 'html.parser')

        # soup.title.string is None when <title> is empty or has nested tags.
        title_text = soup.title.string if soup.title else None
        title = title_text if title_text else "No title found"

        # This name was previously returned without ever being assigned, which
        # made every call fail with NameError and fall into the except branch.
        news_analysis = is_news_title(title)

        paragraph = soup.find('p')
        first_paragraph = paragraph.get_text(strip=True) if paragraph else "No paragraph found."
        return title, news_analysis, first_paragraph
    except Exception as e:
        # Deliberate catch-all: the caller expects an error tuple, not a raise.
        return None, f"An error occurred: {e}", None

# Ask for a URL, scrape it, and report either the extracted fields or the
# error message that scrape_data places in the second slot on failure.
url = input("Enter a URL to scrape: ").strip()
title, news_analysis, first_paragraph = scrape_data(url)

if not title:
    print(news_analysis)
else:
    print("Title:", title)
    print("First Paragraph:", first_paragraph)

#Trending topics analysis


In [None]:
from pytrends.request import TrendReq

# Initialize Pytrends
# hl is the interface language; tz is passed through to pytrends unchanged.
pytrends = TrendReq(hl='en-US', tz=360)
# Fetch the current trending searches and show the first rows of the result.
trending_searches = pytrends.trending_searches()
print(trending_searches.head())


#Live Broadcast model

In [None]:
import re
from transformers import GPT2LMHeadModel, GPT2Tokenizer, BartForConditionalGeneration, BartTokenizer
import requests
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from sentence_transformers import SentenceTransformer, util
import re
import faiss
import numpy as np

import os

# Guardian API Key
# Read from the environment so that no credential is stored in the notebook.
# Falls back to the placeholder used in the first cell of this file.
guardian_api_key = os.environ.get("GUARDIAN_API_KEY", "Enter_your_api")

# Models and Tools Setup
similarity_model = SentenceTransformer('all-MiniLM-L6-v2')
sentiment_analyzer = pipeline('sentiment-analysis')

# FAISS Index Setup
embedding_dim = 384  # Dimension of 'all-MiniLM-L6-v2' embeddings
index = faiss.IndexFlatL2(embedding_dim)

# Global Variables to Store Articles
# Appended to in lock-step with index.add(), so list position == index row.
article_store = []  # To store articles for retrieval
article_embeddings = []  # To store embeddings

# Step 1: Load Pretrained Models
model_name = "gpt2"  # You can use GPT2 for text generation
summarizer_model_name = "facebook/bart-large-cnn"  # Pretrained model for summarization
# `tokenizer` / `model` are the GPT-2 pair used for text generation. A t5-small
# pair bound to the same two names would be overwritten right here before any
# use, so it is not loaded at all.
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# BART pair used for summarization.
summarizer_model = BartForConditionalGeneration.from_pretrained(summarizer_model_name)
summarizer_tokenizer = BartTokenizer.from_pretrained(summarizer_model_name)

# Step 2: Define Preprocessing Function to Clean Conversational Text
def clean_conversational_text(text):
    """Strip conversational filler and first-person pronouns from ``text``.

    A fixed list of phrases is removed by exact, case-sensitive substring
    replacement; then every standalone "I'm" or "I" is removed. Surrounding
    whitespace is trimmed; interior spacing left by a removal is kept.

    Args:
        text: Text to clean.

    Returns:
        The cleaned string.
    """
    conversational_phrases = [
        "I'm sorry", "I believe", "I think", "In my opinion",
        "you know", "I feel", "I just", "Actually", "Let me tell you"
    ]

    for phrase in conversational_phrases:
        text = text.replace(phrase, "")

    # Remove "I'm" and bare "I" in a single pass. Removing "I" first would
    # turn "I'm" into a dangling "'m" and leave nothing for the "I'm" pattern.
    text = re.sub(r"\bI(?:'m)?\b", '', text)
    return text.strip()

# Step 3: Define Postprocessing Function to Ensure Formal Tone in the Output
def postprocess_output(output):
    """Remove conversational phrases and first-person pronouns from ``output``.

    A fixed list of phrases is removed by exact, case-sensitive substring
    replacement; then standalone "I'm" and "I" are removed. Surrounding
    whitespace is trimmed; interior spacing left by a removal is kept.

    Args:
        output: Generated text to tidy up.

    Returns:
        The cleaned string.
    """
    conversational_phrases = [
        "I'm sorry", "I believe", "I think", "In my opinion",
        "you know", "I feel", "I just", "Actually", "Let me tell you"
    ]

    for phrase in conversational_phrases:
        output = output.replace(phrase, "")

    # "I'm" has to go before bare "I": the other order leaves a stray "'m"
    # behind and the "I'm" pattern can then never match.
    output = re.sub(r'\bI\'m\b', '', output)
    output = re.sub(r'\bI\b', '', output)

    return output.strip()

# Step 4: Define Text Generation Function
def generate_news_content(input_text):
    """Generate a continuation of ``input_text`` and tidy the result.

    The input is cleaned with ``clean_conversational_text``, tokenised with
    the module-level ``tokenizer`` and passed to ``model.generate``. The first
    generated sequence is decoded and run through ``postprocess_output``.

    Args:
        input_text: Seed text for generation.

    Returns:
        The post-processed generated text.
    """
    # Only the cleaned text is used as the prompt; no instruction is prepended.
    prompt = clean_conversational_text(input_text)
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True)

    # NOTE(review): temperature and top_k are sampling settings, but do_sample
    # is not passed here - confirm they have the intended effect with beams.
    generation_options = dict(
        max_new_tokens=150,
        num_beams=5,
        no_repeat_ngram_size=2,  # Prevents repetitive phrases
        temperature=0.7,
        top_k=50,
    )
    generated = model.generate(encoded['input_ids'], **generation_options)

    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    return postprocess_output(decoded)

# Step 5: Summarize the Whole Text First
def summarize_text(text):
    """Summarise ``text`` with the module-level summarizer model/tokenizer.

    The text is first cleaned with ``clean_conversational_text``, tokenised
    with truncation at 1024 tokens, and summarised with 4-beam search and a
    maximum output length of 150 tokens.

    Args:
        text: Text to summarise.

    Returns:
        The decoded first summary sequence, with special tokens removed.
    """
    prepared = clean_conversational_text(text)
    encoded = summarizer_tokenizer(
        prepared, return_tensors="pt", max_length=1024, truncation=True
    )
    summary_ids = summarizer_model.generate(
        encoded['input_ids'],
        num_beams=4,
        max_length=150,
        early_stopping=True,
    )
    return summarizer_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Step 6: Read Input from a Text File
def read_input_from_file(file_path):
    """Return the full contents of the text file at ``file_path``.

    The file is opened in text mode with the platform's default encoding.
    """
    with open(file_path, 'r') as handle:
        return handle.read()

# Step 7: Split Text into Chunks and Process Each Chunk
def split_text_into_chunks(text, max_chunk_length=150):
    """Tokenise ``text`` and split the token ids into fixed-size chunks.

    Args:
        text: Text to split.
        max_chunk_length: Maximum number of token ids per chunk; must be
            positive.

    Returns:
        list: Lists of token ids, each at most ``max_chunk_length`` long, that
        together cover the whole text in order.

    Raises:
        ValueError: If ``max_chunk_length`` is not positive.
    """
    if max_chunk_length <= 0:
        raise ValueError("max_chunk_length must be a positive integer")

    # Encode the whole text. Truncating to max_chunk_length at this point
    # would discard everything after the first chunk and leave the loop below
    # with nothing to split.
    input_ids = tokenizer.encode(text)

    return [
        input_ids[start:start + max_chunk_length]
        for start in range(0, len(input_ids), max_chunk_length)
    ]
# 1. Text Cleaning Function
def clean_text(text):
    """Return ``text`` without HTML tags, punctuation or surplus whitespace.

    Tags are removed first, then every character that is neither a word
    character nor whitespace, then whitespace runs are collapsed to a single
    space and the ends are trimmed.
    """
    without_tags = re.sub(r'<.*?>', '', text)
    words_only = re.sub(r'[^\w\s]', '', without_tags)
    single_spaced = re.sub(r'\s+', ' ', words_only)
    return single_spaced.strip()

# 2. Fetch and Store Related Content for List of Queries
def fetch_related_content(queries):
    """Fetch Guardian articles for each query and index them for retrieval.

    For every result with a non-empty ``standfirst``, "title - snippet" is
    appended to ``article_store``, its embedding to ``article_embeddings`` and
    the same embedding to the FAISS ``index``. A query whose request fails is
    reported and skipped.

    Args:
        queries: Iterable of search strings. A single string is accepted and
            treated as one query.

    Returns:
        None. The function works through its side effects on the module-level
        store and index.
    """
    if isinstance(queries, str):
        # Iterating a bare string would issue one request per character.
        queries = [queries]

    search_url = "https://content.guardianapis.com/search"
    for query in queries:
        params = {
            "q": query,
            "api-key": guardian_api_key,
            "show-fields": "headline,standfirst"
        }
        try:
            response = requests.get(search_url, params=params, timeout=10)
            articles = response.json().get('response', {}).get('results', [])
        except (requests.RequestException, ValueError) as exc:
            # Keep going: one failed query should not lose the others.
            print(f"Guardian request for {query!r} failed: {exc}")
            continue

        for article in articles:
            title = article.get("webTitle", "")
            # "fields" may be missing from a result; avoid KeyError.
            snippet = (article.get("fields") or {}).get("standfirst", "")
            if not snippet:
                continue
            combined_text = f"{title} - {snippet}"
            article_store.append(combined_text)  # Store title and snippet
            embedding = similarity_model.encode(clean_text(combined_text), convert_to_tensor=False)
            article_embeddings.append(embedding)
            index.add(np.array([embedding], dtype=np.float32))  # Add to FAISS index

# 3. Summarization
def summarize_content(content):
    """Summarise ``content`` with the BART summarizer loaded in this cell.

    The module-level ``tokenizer``/``model`` names are bound to GPT-2 by the
    time this function runs, and GPT-2 only continues a "summarize: ..."
    prompt instead of summarising it. The dedicated ``summarizer_model`` /
    ``summarizer_tokenizer`` pair is therefore used.

    Args:
        content: Raw text to summarise; it is cleaned with ``clean_text``.

    Returns:
        The decoded summary (at most 100 generated tokens), with special
        tokens removed.
    """
    content = clean_text(content)
    inputs = summarizer_tokenizer(
        content, return_tensors="pt", max_length=1024, truncation=True
    )
    summary_ids = summarizer_model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=100,
    )
    return summarizer_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# 4. RAG Similarity Search
def retrieve_similar_articles(news_text, top_k=3):
    """Return up to ``top_k`` stored articles closest to ``news_text``.

    Args:
        news_text: Query text; it is cleaned with ``clean_text`` and embedded
            with the module-level ``similarity_model``.
        top_k: Maximum number of neighbours to request from the FAISS index.

    Returns:
        list: Matching entries of ``article_store`` in the order returned by
        the index; shorter than ``top_k`` when fewer articles are indexed.
    """
    query_embedding = similarity_model.encode(clean_text(news_text), convert_to_tensor=False)
    _, indices = index.search(np.array([query_embedding], dtype=np.float32), top_k)

    results = []
    for idx in indices[0]:
        # FAISS fills missing neighbours with -1. Without the lower bound, -1
        # passes the length check and indexes the last stored article.
        if 0 <= idx < len(article_store):
            results.append(article_store[idx])
    return results

# 5. Aggregation
def final_verdict(scores, weights):
    """Combine ``scores`` into one value using ``weights``.

    Pairs are formed positionally with ``zip``; unmatched trailing items are
    ignored.

    Returns:
        The weighted sum of the scores divided by the sum of all weights.
    """
    numerator = sum(s * w for s, w in zip(scores, weights))
    denominator = sum(weights)
    return numerator / denominator

# 6. Main Function to Detect Fake News
def is_fake_news(news_text):
    """Classify ``news_text`` by its similarity to indexed reference articles.

    The three nearest stored articles are retrieved, each snippet is
    summarised, and the cosine similarity between ``news_text`` and each
    summary is computed. The highest similarity decides the verdict.

    Args:
        news_text: The claim or sentence to check.

    Returns:
        tuple: ``(verdict, max_similarity)`` where verdict is "Real News"
        (similarity > 0.4), "Likely Real News" (> 0.25), "Unverified" (> 0.2)
        or "Likely Fake News". When nothing is indexed or retrieved the result
        is ``("Unverified", 0)``.
    """
    if not index.is_trained or len(article_store) == 0:
        print("No related content indexed. Cannot verify.")
        return "Unverified", 0

    similar_articles = retrieve_similar_articles(news_text, top_k=3)
    if not similar_articles:
        print("No similar articles found. Cannot verify.")
        return "Unverified", 0

    # The query embedding does not depend on the article, so encode it once.
    news_embedding = similarity_model.encode(news_text, convert_to_tensor=True)

    max_similarity = 0  # Track maximum similarity score
    for article_snippet in similar_articles:
        # Stored entries have the form "title - snippet"; only the snippet is used.
        _, snippet = article_snippet.split(" - ", 1)

        summary = summarize_content(snippet)
        similarity = util.pytorch_cos_sim(
            news_embedding,
            similarity_model.encode(summary, convert_to_tensor=True)
        ).item()
        max_similarity = max(max_similarity, similarity)

    print(f"Maximum Similarity: {max_similarity}")
    if max_similarity > 0.4:
        return "Real News", max_similarity
    elif max_similarity > 0.25:
        return "Likely Real News", max_similarity
    elif max_similarity > 0.2:
        return "Unverified", max_similarity
    else:
        return "Likely Fake News", max_similarity


# 7. Function to Process Multiple News
def process_news_list(news_list):
    """Index related content for every item, then classify each one.

    Args:
        news_list: Sequence of news texts. The whole sequence is first passed
            to ``fetch_related_content``; each item is then passed to
            ``is_fake_news``.

    Returns:
        dict: Maps each news text to ``{"Verdict": ..., "Max Similarity": ...}``
        with the similarity formatted to two decimals. Repeated texts share
        one key, so the last result for a text wins.
    """
    fetch_related_content(news_list)

    results = {}
    for idx, news_text in enumerate(news_list):
        print(f"Processing News {idx + 1}...")
        verdict, max_similarity = is_fake_news(news_text)
        entry = {
            "Verdict": verdict,
            "Max Similarity": f"{max_similarity:.2f}"
        }
        results[news_text] = entry
    return results

# Step 8: Example Usage
# End-to-end run: read a transcript, summarise it, generate news-style text
# from the summary, then fact-check each generated line.
input_file_path = "output_audio.txt"  # Change to the path of your input text file
input_text = read_input_from_file(input_file_path)

# Summarize the entire text first
summary = summarize_text(input_text)

# Split the summarized text into chunks (if necessary)
chunks = split_text_into_chunks(summary, max_chunk_length=150)

# Generate news-like content from each chunk
# Each chunk is a list of token ids, so it is decoded back to text first.
generated_contents = []
for chunk in chunks:
    chunk_text = tokenizer.decode(chunk, skip_special_tokens=True)
    generated_content = generate_news_content(chunk_text)
    generated_contents.append(generated_content)


# Only the first generated text is used; it is split into lines and each line
# is treated as a separate claim.
# NOTE(review): this raises IndexError when no chunk was produced, and any
# generated text beyond the first is never checked - confirm this is intended.
generated_contents1=generated_contents[0].split('\n')


# Fetch reference articles for all lines, classify each, and print a report.
results = process_news_list(generated_contents1)
for news, result in results.items():
    print(f"{news} - Verdict: {result['Verdict']} | Max Similarity: {result['Max Similarity']}")


#Evaluation over the LIAR dataset


In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel, T5ForConditionalGeneration, T5Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import faiss

# Load a Small Chunk of the Dataset
df_fake = pd.read_csv('/content/Fake.csv').sample(n=50, random_state=42)  # Sample 50 fake articles
df_real = pd.read_csv('/content/True.csv').sample(n=50, random_state=42)  # Sample 50 real articles

# Add Labels
df_fake['label'] = 0  # Fake
df_real['label'] = 1  # Real

# Combine and Shuffle Data
# The shuffle is seeded as well: an unseeded shuffle changes the row order on
# every run, which makes the seeded train/test split below differ between runs.
data = (
    pd.concat([df_fake, df_real], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Split into Train and Test
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Load Embedding Model (Efficient Model)
embed_model_name = "sentence-transformers/all-mpnet-base-v2"
embed_tokenizer = AutoTokenizer.from_pretrained(embed_model_name)
embed_model = AutoModel.from_pretrained(embed_model_name)

# Enable GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
embed_model.to(device)

# Function to Encode Texts in Batches
def encode_texts(texts, batch_size=32):
    """Embed ``texts`` in batches with the module-level embedding model.

    Sentence embeddings are the attention-masked mean of the token
    embeddings, L2-normalised. This is the pooling documented for the
    sentence-transformers checkpoint loaded into ``embed_model``; the
    transformer's ``pooler_output`` head is not part of that recipe.

    Args:
        texts: List of strings to embed.
        batch_size: Number of texts per forward pass.

    Returns:
        numpy.ndarray: One row per input text. For an empty input the result
        has shape ``(0, hidden_size)`` instead of raising in ``np.vstack``.
    """
    if len(texts) == 0:
        return np.empty((0, embed_model.config.hidden_size), dtype=np.float32)

    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        inputs = embed_tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = embed_model(**inputs)
            token_embeddings = outputs.last_hidden_state
            # Zero out padding positions so they do not dilute the mean.
            mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
            summed = (token_embeddings * mask).sum(dim=1)
            counts = mask.sum(dim=1).clamp(min=1e-9)
            batch_embeddings = torch.nn.functional.normalize(summed / counts, p=2, dim=1)
        embeddings.append(batch_embeddings.cpu().numpy())
    return np.vstack(embeddings)

# Encode and Index Training Data
# Row i of the index corresponds to row i (by position) of train_data, which
# is what rag_retrieve relies on when it maps search results back with iloc.
train_embeddings = encode_texts(train_data['text'].tolist())
# NOTE: this rebinds the module-level name `index` to a new index sized to
# the embedding width returned by encode_texts.
index = faiss.IndexFlatL2(train_embeddings.shape[1])  # FAISS index
index.add(train_embeddings)

# Load Lightweight Generative Model
# The generator is moved to the same device as the embedding model.
generator_model_name = "google/flan-t5-small"
generator_tokenizer = T5Tokenizer.from_pretrained(generator_model_name)
generator_model = T5ForConditionalGeneration.from_pretrained(generator_model_name).to(device)

# Retrieve Relevant Context
def rag_retrieve(article, k=5):
    """Return the texts of the training articles nearest to ``article``.

    Args:
        article: Text to look up.
        k: Maximum number of neighbours to request from the FAISS index.

    Returns:
        list: Up to ``k`` training texts, nearest first.
    """
    article_embedding = encode_texts([article])
    _, retrieved_indices = index.search(article_embedding, k)
    # FAISS pads with -1 when the index holds fewer than k vectors; iloc[-1]
    # would silently pick the last training row, so drop those entries.
    valid_positions = [int(pos) for pos in retrieved_indices[0] if pos >= 0]
    return train_data.iloc[valid_positions]['text'].tolist()

# Predict Using RAG
def rag_predict(article):
    """Generate a free-text classification of ``article`` using retrieved context.

    The three nearest training texts are fetched with ``rag_retrieve`` and
    appended to the first 512 characters of the article in a single prompt.
    The prompt is tokenised with truncation and passed to the generator model
    with a maximum output length of 50 tokens.

    Args:
        article: Article text to classify.

    Returns:
        str: The decoded generator output, with special tokens removed.
    """
    retrieved_texts = rag_retrieve(article, k=3)
    input_text = f"Classify: {article[:512]} Context: {' '.join(retrieved_texts)}"
    encoded = generator_tokenizer(input_text, return_tensors="pt", truncation=True)
    encoded = encoded.to(device)
    generated = generator_model.generate(**encoded, max_length=50)
    return generator_tokenizer.decode(generated[0], skip_special_tokens=True)

# Evaluate the Model
def evaluate_model(test_data):
    """Run ``rag_predict`` over ``test_data`` and score the predictions.

    A prediction counts as real (1) only when the generated text contains
    "real" and does not contain "fake" (case-insensitive); every other output
    is counted as fake (0).

    Args:
        test_data: DataFrame with a ``text`` column and a ``label`` column.

    Returns:
        tuple: ``(accuracy, f1, precision, recall)``.
    """
    y_true = test_data['label'].tolist()

    y_pred = []
    for _, row in test_data.iterrows():
        answer = rag_predict(row['text']).lower()
        # "fake" takes precedence; anything unclear also defaults to fake.
        is_real = "fake" not in answer and "real" in answer
        y_pred.append(1 if is_real else 0)

    # Calculate Metrics
    return (
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
    )

# Evaluate on the held-out split and report the four metrics, one per line,
# each to four decimal places.
accuracy, f1, precision, recall = evaluate_model(test_data)

print(
    f"Accuracy: {accuracy:.4f}",
    f"F1 Score: {f1:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    sep="\n",
)
