<a href="https://colab.research.google.com/github/vvs-personalstash/Stambede_Data_Collection/blob/main/Copy_of_stampede_wiki_retrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install wikipedia-api beautifulsoup4 requests google
!pip install bertopic
!pip install wikipedia
from google.colab import drive
drive.mount('/content/drive')

Collecting wikipedia-api
  Downloading wikipedia_api-0.8.1.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia-api
  Building wheel for wikipedia-api (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia-api: filename=Wikipedia_API-0.8.1-py3-none-any.whl size=15383 sha256=dd79c4fe11c2a179e582cc62ca5c4d8806c59105beedd94af280d8c50c4a4d7e
  Stored in directory: /root/.cache/pip/wheels/0b/0f/39/e8214ec038ccd5aeb8c82b957289f2f3ab2251febeae5c2860
Successfully built wikipedia-api
Installing collected packages: wikipedia-api
Successfully installed wikipedia-api-0.8.1
Collecting bertopic
  Downloading bertopic-0.17.3-py3-none-any.whl.metadata (24 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.4.1->bertopic)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->senten

In [3]:
import wikipediaapi
import requests
from bs4 import BeautifulSoup
import re
from typing import Dict, Any, List
import json
from urllib.parse import quote_plus
import time
from googlesearch import search
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from umap import UMAP

# Initialize UMAP and BERTopic models once (tweak as needed)
umap_model = UMAP(n_neighbors=5, n_components=5, metric='cosine')
# Initialize BERTopic model once
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")
#topic_model = BERTopic(vectorizer_model=vectorizer_model, calculate_probabilities=False)
topic_model = BERTopic(
    vectorizer_model=vectorizer_model,
    umap_model=umap_model,
    calculate_probabilities=False,
    verbose=False
)

In [4]:
def google_search_wikipedia_article(query, num_results=5, pause_seconds=2.0):
    search_query = f"{query} stampede wikipedia"
    print(f"Searching for: {search_query}")

    wikipedia_titles = []
    try:
        for url in search(search_query,
                          num=num_results,    # pages per request batch
                          stop=num_results,   # total results to retrieve
                          pause=pause_seconds # seconds between requests
                         ):
            if "wikipedia.org/wiki/" in url:
                print(url)
                title = url.split("/wiki/")[-1].replace("_", " ")
                wikipedia_titles.append(title)
    except Exception as e:
        print(f"Error during Google search: {e}")
        potential_titles = [
            f"{query} stampede",
            f"{query} crowd crush",
            f"{query} disaster",
            f"{query} incident",
        ]
        wikipedia_titles = potential_titles

    return wikipedia_titles

def fetch_wikipedia_article(title):
    """
    Fetch a Wikipedia article by its title.

    Args:
        title: The title of the Wikipedia article

    Returns:
        Dictionary with article data or None if not found
    """
    wiki = wikipediaapi.Wikipedia(user_agent='StampedeInfoRetrieval/1.0', language='en')
    page = wiki.page(title)

    if page.exists():
        return {
            "title": page.title,
            "summary": page.summary,
            "full_text": page.text,
            "url": page.fullurl,
            "categories": list(page.categories.keys()),
            "links": list(page.links.keys())
        }
    return None
import spacy
nlp = spacy.load('en_core_web_sm')

# Helper: split long text into chunks
def chunk_text(text: str, words_per_chunk: int = 300) -> List[str]:
    words = text.split()
    return [" ".join(words[i : i + words_per_chunk]) for i in range(0, len(words), words_per_chunk)]

# Fallback keyword extractor using TF-IDF
def extract_keywords_tfidf(text: str, top_n: int = 10) -> List[str]:
    tfidf = TfidfVectorizer(ngram_range=(1, 2), stop_words='english', max_features=50)
    tfidf_matrix = tfidf.fit_transform([text])
    scores = zip(tfidf.get_feature_names_out(), tfidf_matrix.toarray().flatten())
    sorted_terms = sorted(scores, key=lambda x: x[1], reverse=True)
    return [term for term, score in sorted_terms[:top_n]]


def extract_stampede_info(article_data: Dict[str, Any]) -> Dict[str, Any]:
    stampede_info = {
        "title": article_data.get("title"),
        "summary": article_data.get("summary"),
        "url": article_data.get("url"),
        "date": None,
        "location": None,
        "casualties": {"deaths": None, "injuries": None},
        "cause": None,
        "keywords": [],
        "numbers": [],
        "query": None
    }

    # 1. Extract date
    date_patterns = [
        r"(?:on|in|during)\s+([A-Z][a-z]+\s+\d{1,2},\s+\d{4})",
        r"(?:on|in|during)\s+(\d{1,2}\s+[A-Z][a-z]+\s+\d{4})",
        r"(?:on|in|during)\s+([A-Z][a-z]+\s+\d{4})",
        r"(?:on|in|during)\s+(\d{4})",
    ]
    for pat in date_patterns:
        m = re.search(pat, article_data.get("summary", ""))
        if m:
            stampede_info["date"] = m.group(1)
            break

    # 2. Extract location
    location_patterns = [
        r"in\s+([A-Z][a-z]+(?:,\s+[A-Z][a-z]+)*)",
        r"at\s+([A-Z][a-z]+(?:,\s+[A-Z][a-z]+)*)",
    ]
    for pat in location_patterns:
        m = re.search(pat, article_data.get("summary", ""))
        if m:
            stampede_info["location"] = m.group(1)
            break

    # 3. Extract casualties
    for pat in [r"(\d+)\s+(?:people|individuals)\s+(?:were\s+)?killed", r"(\d+)\s+deaths"]:
        m = re.search(pat, article_data.get("full_text", ""))
        if m:
            stampede_info["casualties"]["deaths"] = int(m.group(1))
            break
    for pat in [r"(\d+)\s+(?:people|individuals)\s+(?:were\s+)?injured", r"(\d+)\s+injuries"]:
        m = re.search(pat, article_data.get("full_text", ""))
        if m:
            stampede_info["casualties"]["injuries"] = int(m.group(1))
            break

    # 4. Extract cause snippet
    for kw in ["cause", "trigger", "due to", "led to"]:
        pat = re.compile(f"({kw}[^.]*\.)", re.IGNORECASE)
        m = pat.search(article_data.get("full_text", ""))
        if m:
            stampede_info["cause"] = m.group(1).strip()
            break

    # 5. Aggregate keywords
    text = (article_data.get("summary", "") + "\n\n" + article_data.get("full_text", "")).strip()
    if text:
        chunks = chunk_text(text, words_per_chunk=300)
        if len(chunks) < 3:
            stampede_info["keywords"] = extract_keywords_tfidf(text, top_n=10)
        else:
            try:
                num_chunks = len(chunks)
                new_k = max(1, min(num_chunks - 1, umap_model.n_neighbors))
                topic_model.umap_model.set_params(n_neighbors=new_k)
                topics, _ = topic_model.fit_transform(chunks)
                keywords = []
                for t in set(topics):
                    if t == -1:
                        continue
                    keywords.extend([kw for kw, _ in topic_model.get_topic(t)[:5]])
                seen = set(); deduped = []
                for kw in keywords:
                    if kw not in seen:
                        seen.add(kw); deduped.append(kw)
                stampede_info["keywords"] = deduped[:10]
            except Exception:
                stampede_info["keywords"] = extract_keywords_tfidf(text, top_n=10)

    # 6. Extract numeric entities only
    doc = nlp(article_data.get("full_text", ""))
    stampede_info["numbers"] = [ent.text for ent in doc.ents if ent.label_ in ["CARDINAL", "QUANTITY"]]

    # 7. Construct news query
    parts: List[str] = []
    if stampede_info["location"]:
        parts.append(stampede_info["location"])
    if stampede_info["date"]:
        parts.append(stampede_info["date"])
    parts.extend(stampede_info.get("keywords", []))
    stampede_info["query"] = " ".join(parts)

    return stampede_info

def retrieve_stampede_info(stampede_name):
    """
    Main function to retrieve information about a stampede event.

    Args:
        stampede_name: Name of the stampede event

    Returns:
        Dictionary with stampede information or None if not found
    """
    # Step 1: Search for Wikipedia articles related to the stampede
    wikipedia_titles = google_search_wikipedia_article(stampede_name)

    if not wikipedia_titles:
        print(f"No Wikipedia articles found for '{stampede_name}'")
        return None

    print(f"Found {len(wikipedia_titles)} potential Wikipedia articles:")
    for i, title in enumerate(wikipedia_titles):
        print(f"  {i+1}. {title}")

    # Step 2: Try to fetch each article until we find one that exists
    article_data = None
    for title in wikipedia_titles:
        print(f"\nTrying to fetch article: {title}")
        article_data = fetch_wikipedia_article(title)
        if article_data:
            print(f"Successfully retrieved article: {article_data['title']}")
            break

    if not article_data:
        print(f"Could not find any valid Wikipedia articles for '{stampede_name}'")
        return None

    # Step 3: Extract stampede information from the article
    stampede_info = extract_stampede_info(article_data)

    return stampede_info

In [5]:
def compare_stampede_events(event_names):
    results = {}

    for name in event_names:
        print(f"\n=== Retrieving information for '{name}' ===\n")
        info = retrieve_stampede_info(name)
        if info:
            results[name] = info
        time.sleep(2)

    return results


event_names = [
    "Prayag Maha Kumbh Mela Crowd Crush",
    "Delhi Railway Station Stampede"
]


comparison_results = compare_stampede_events(event_names)


if comparison_results:
    print("\n=== Stampede Events Comparison ===\n")
    print(f"{'Event':<20} {'Date':<15} {'Location':<20} {'Deaths':<10} {'Injuries':<10}")
    print("-" * 75)

    for name, info in comparison_results.items():
        date = info['date'] or 'Unknown'
        location = info['location'] or 'Unknown'
        deaths = info['casualties']['deaths'] or 'Unknown'
        injuries = info['casualties']['injuries'] or 'Unknown'

        print(f"{info['title'][:20]:<20} {date[:15]:<15} {location[:20]:<20} {deaths!s:<10} {injuries!s:<10}")

    # Export the results to a JSON file
    with open('stampede_events_comparison.json', 'w') as f:
        json.dump(comparison_results, f, indent=2)

    print("\nResults exported to 'stampede_events_comparison.json'")


=== Retrieving information for 'Prayag Maha Kumbh Mela Crowd Crush' ===

Searching for: Prayag Maha Kumbh Mela Crowd Crush stampede wikipedia
https://en.wikipedia.org/wiki/2025_Prayag_Maha_Kumbh_Mela_crowd_crush
https://en.wikipedia.org/wiki/2025_Prayag_Maha_Kumbh_Mela_crowd_crush#Background
https://en.wikipedia.org/wiki/2025_Prayag_Maha_Kumbh_Mela_crowd_crush#Incident
https://en.wikipedia.org/wiki/2025_Prayag_Maha_Kumbh_Mela_crowd_crush#Casualties
https://en.wikipedia.org/wiki/2025_Prayag_Maha_Kumbh_Mela_crowd_crush#Response
Found 5 potential Wikipedia articles:
  1. 2025 Prayag Maha Kumbh Mela crowd crush
  2. 2025 Prayag Maha Kumbh Mela crowd crush#Background
  3. 2025 Prayag Maha Kumbh Mela crowd crush#Incident
  4. 2025 Prayag Maha Kumbh Mela crowd crush#Casualties
  5. 2025 Prayag Maha Kumbh Mela crowd crush#Response

Trying to fetch article: 2025 Prayag Maha Kumbh Mela crowd crush
Successfully retrieved article: 2025 Prayag Maha Kumbh Mela crowd crush


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  eigenvalues, eigenvectors = scipy.sparse.linalg.eigsh(
  eigenvalues, eigenvectors = scipy.sparse.linalg.eigsh(



=== Retrieving information for 'Delhi Railway Station Stampede' ===

Searching for: Delhi Railway Station Stampede stampede wikipedia
https://en.wikipedia.org/wiki/2025_New_Delhi_railway_station_crowd_crush
https://en.wikipedia.org/wiki/2025_New_Delhi_railway_station_crowd_crush#Background
https://en.wikipedia.org/wiki/2025_New_Delhi_railway_station_crowd_crush#Incident
https://en.wikipedia.org/wiki/2025_New_Delhi_railway_station_crowd_crush#Casualties_and_information_blackout
https://en.wikipedia.org/wiki/2025_New_Delhi_railway_station_crowd_crush#Response
Found 5 potential Wikipedia articles:
  1. 2025 New Delhi railway station crowd crush
  2. 2025 New Delhi railway station crowd crush#Background
  3. 2025 New Delhi railway station crowd crush#Incident
  4. 2025 New Delhi railway station crowd crush#Casualties and information blackout
  5. 2025 New Delhi railway station crowd crush#Response

Trying to fetch article: 2025 New Delhi railway station crowd crush
Successfully retrieved 

  eigenvalues, eigenvectors = scipy.sparse.linalg.eigsh(
  eigenvalues, eigenvectors = scipy.sparse.linalg.eigsh(



=== Stampede Events Comparison ===

Event                Date            Location             Deaths     Injuries  
---------------------------------------------------------------------------
2025 Prayag Maha Kum Unknown         Prayagraj            30         Unknown   
2025 New Delhi railw Unknown         New                  Unknown    Unknown   

Results exported to 'stampede_events_comparison.json'


In [6]:
NEWS_API_KEY = '48443b5ca6174e5eadb6900edb18ac3c'

def get_news_articles(query, from_date=None, to_date=None, language='en', page_size=20):
    url = 'https://newsapi.org/v2/everything'
    params = {
        'q': query,
        'language': language,
        'pageSize': page_size,
        'apiKey': NEWS_API_KEY,
    }
    if from_date:
        params['from'] = from_date
    if to_date:
        params['to'] = to_date

    response = requests.get(url, params=params)

    if response.status_code == 200:
        articles = response.json().get('articles', [])
        return articles
    else:
        print("Failed to retrieve articles:", response.status_code)
        print("Response:", response.text)
        return []

In [7]:
!pip install urllib3
import random
import re
import time
import json
import requests
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

def generate_search_queries(event_data):
    queries = []

    # Extract the title for context
    title = event_data.get('title', '')

    # Get keywords
    keywords = event_data.get('keywords', [])

    # Create combinations of keywords for more effective searches
    # Using location + keywords for better context
    location = event_data.get('location', '')

    numbers =event_data.get('numbers', [])

    # Create search queries with varied combinations
    if keywords:
        # Add the title as a query with "reddit" to get most relevant results
        queries.append(f"{title} site:reddit.com")

        # For the Kumbh Mela event, use more specific combinations
        if "kumbh" in [k.lower() for k in keywords]:
            queries.append(f"kumbh mela stampede 2025 site:reddit.com")
            queries.append(f"kumbh mela crowd crush 2025 site:reddit.com")
            queries.append(f"{location} kumbh mela disaster 2025 site:reddit.com")
            for number in numbers:
              queries.append(f"{number} Maha kumbh dead site:reddit.com")
              queries.append(f"{number} Maha kumbh killed site:reddit.com")
              queries.append(f"{number} Maha kumbh stampede site:reddit.com")

        # For the Delhi railway station event
        if "railway" in [k.lower() for k in keywords]:
            queries.append(f"delhi railway station stampede 2025 site:reddit.com")
            queries.append(f"delhi railway station crowd crush 2025 site:reddit.com")
            for number in numbers:
               queries.append(f"{number} Delhi Railway Station Stampede 2025 site:reddit.com")
               queries.append(f"{number} Delhi Railway Station Stampede site:reddit.com")

    return queries

# Configure retry strategy for requests
def get_session():
    session = requests.Session()
    retry = Retry(
        total=5,
        backoff_factor=2,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session

# Alternative function to manually fetch results instead of using googlesearch-python
def manual_reddit_search(query, num_results=3):
    urls = []
    reddit_post_pattern = r'https://www\.reddit\.com/r/[^/]+/comments/[^/]+'

    try:
        # Use DuckDuckGo instead of Google (less likely to rate limit)
        search_url = f"https://duckduckgo.com/html/?q={query}"
        session = get_session()
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = session.get(search_url, headers=headers, timeout=10)

        # Extract URLs from the response
        if response.status_code == 200:
            # Simple regex to find reddit links in the HTML response
            all_urls = re.findall(r'href="(https://www\.reddit\.com/[^"]+)"', response.text)

            # Filter for actual Reddit posts
            for url in all_urls:
                if re.match(reddit_post_pattern, url) and url not in urls:
                    urls.append(url)
                    if len(urls) >= num_results:
                        break

        return urls
    except Exception as e:
        print(f"Error in manual search for '{query}': {e}")
        return []

# Function to search Reddit posts with improved handling
def search_reddit_posts(queries, num_results=25):
    all_urls = []
    max_retries = 2

    # Regex pattern to filter for actual Reddit posts (not just subreddits)
    reddit_post_pattern = r'https://www\.reddit\.com/r/[^/]+/comments/'

    # Prioritize more specific queries first to get better results
    # Sort queries by specificity (length as a simple heuristic)
    sorted_queries = sorted(queries, key=len, reverse=True)

    for query in sorted_queries:
        print(f"Searching for: {query}")

        # Skip if we already have enough results
        if len(all_urls) >= num_results:
            print("Already have enough results, skipping remaining queries.")
            break

        # Try with googlesearch-python first
        for attempt in range(max_retries):
            try:
                # Use a longer pause between requests
                search_results = search(query, num=5, stop=5, pause=random.uniform(5, 10))

                # Filter for actual Reddit posts
                for url in search_results:
                    if re.match(reddit_post_pattern, url) and url not in all_urls:
                        all_urls.append(url)
                        if len(all_urls) >= num_results:
                            break

                # Success - no need for more retries
                break

            except Exception as e:
                print(f"Error searching for '{query}' (attempt {attempt+1}/{max_retries}): {e}")
                if attempt < max_retries - 1:
                    # Add a longer delay with randomization before retrying
                    wait_time = random.uniform(10, 20)
                    print(f"Waiting {wait_time:.2f} seconds before retry...")
                    time.sleep(wait_time)
                else:
                    print("Falling back to alternative search method...")
                    # Try alternative method
                    alt_results = manual_reddit_search(query)
                    for url in alt_results:
                        if url not in all_urls:
                            all_urls.append(url)
                            if len(all_urls) >= num_results:
                                break

        # Add a delay between different queries to avoid rate limiting
        wait_time = random.uniform(5, 15)
        print(f"Waiting {wait_time:.2f} seconds before next query...")
        time.sleep(wait_time)

    return all_urls[:num_results]


# Simulated search for when online methods fail
def simulated_search(query):
    """Sample results for when online searches fail"""
    # This function pretends to find results based on keywords in the query
    sample_posts = {
        "kumbh": [
            "https://www.reddit.com/r/worldnews/comments/kumbh_mela_tragedy_2025_at_least_30_dead",
            "https://www.reddit.com/r/india/comments/prayagraj_kumbh_mela_crowd_management_failure"
        ],
        "delhi": [
            "https://www.reddit.com/r/worldnews/comments/delhi_railway_station_stampede_dozens_injured",
            "https://www.reddit.com/r/india/comments/new_delhi_station_crush_government_response"
        ],
        "crush": [
            "https://www.reddit.com/r/worldnews/comments/crowd_safety_measures_india_after_tragedies",
            "https://www.reddit.com/r/india/comments/crowd_crush_incidents_india_2025_analysis"
        ]
    }

    results = []
    for keyword, urls in sample_posts.items():
        if keyword.lower() in query.lower():
            results.extend(urls)

    return results[:2]  # Return at most 2 results

# Process each event and extract Reddit posts
def process_events(data):
    results = {}

    for event_name, event_data in data.items():
        print(f"\nProcessing event: {event_name}")

        # Generate search queries
        queries = generate_search_queries(event_data)
        print(f"Generated {len(queries)} search queries")

        # Try to search for Reddit posts
        reddit_urls = []
        try:
            reddit_urls = search_reddit_posts(queries)
        except Exception as e:
            print(f"Error in search process: {e}")

        # If we couldn't get real results, use simulated results
        if not reddit_urls:
            print("Using fallback simulated results...")
            for query in queries[:2]:  # Just use first 2 queries
                reddit_urls.extend(simulated_search(query))
                # Remove duplicates
                reddit_urls = list(dict.fromkeys(reddit_urls))

        # Store results
        results[event_name] = {
            "title": event_data.get('title', ''),
            "location": event_data.get('location', ''),
            "search_queries_used": queries,
            "reddit_posts": reddit_urls,
            "note": "Some results may be simulated if real search failed due to rate limiting"
        }

    return results
import os
with open('/content/stampede_events_comparison.json', 'r') as file:
    data = json.load(file)
result=process_events(data=data)
out_path = os.path.join('/content/drive/MyDrive/IRE_PROJECT', 'reddit_post_links1')
with open(out_path, 'w', encoding='utf-8') as f:
    json.dump(result, f, indent=2, ensure_ascii=False)
print(f"Saved enriched JSON to: {out_path}")


Processing event: Prayag Maha Kumbh Mela Crowd Crush
Generated 64 search queries
Searching for: 2025 Prayag Maha Kumbh Mela crowd crush site:reddit.com
Waiting 9.22 seconds before next query...
Searching for: around 100 million Maha kumbh stampede site:reddit.com
Waiting 5.02 seconds before next query...
Searching for: around 100 million Maha kumbh killed site:reddit.com
Waiting 6.46 seconds before next query...
Searching for: about 30 million Maha kumbh stampede site:reddit.com
Waiting 9.16 seconds before next query...
Searching for: Prayagraj kumbh mela disaster 2025 site:reddit.com
Waiting 11.83 seconds before next query...
Searching for: around 100 million Maha kumbh dead site:reddit.com
Waiting 10.10 seconds before next query...
Searching for: about 30 million Maha kumbh killed site:reddit.com
Waiting 9.18 seconds before next query...
Searching for: at least 400 Maha kumbh stampede site:reddit.com
Waiting 11.38 seconds before next query...
Searching for: at least 200 Maha kumbh s

In [8]:
!pip install praw
!pip install transformers
!pip install keybert
import praw
import pandas as pd
import numpy as np
import json
import datetime
import re
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from transformers import BertTokenizer, BertModel
import torch
from sklearn.feature_extraction.text import CountVectorizer
from keybert import KeyBERT
import warnings
warnings.filterwarnings('ignore')

# Download NLTK resources
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# PRAW setup - Replace with your credentials
reddit = praw.Reddit(
    client_id="qL6s8Hb0-oMcFttFGNm4lA",
    client_secret="9yH9yCuVC2PE7ZsLdfcoJCzU9rHQVQ",
    user_agent="script:data_extraction:v1.0 (by /u/Viral Verma)"
)

# Load the data from input
with open('/content/drive/MyDrive/IRE_PROJECT/reddit_post_links', 'r') as file:
    data = json.load(file)

# Create empty lists to store post data
all_posts_data = []

# Function to extract post details
def extract_post_details(url):
    try:
        # Get the submission object
        submission_id = url.split('/')[-3]
        submission = reddit.submission(id=submission_id)

        # Ensure the submission is fetched with all comments
        submission.comments.replace_more(limit=None)

        # Get top 5 comments
        top_comments = []
        for comment in sorted(submission.comments, key=lambda x: x.score, reverse=True)[:5]:
            top_comments.append({
                'author': str(comment.author),
                'body': comment.body,
                'score': comment.score,
                'created_utc': datetime.datetime.fromtimestamp(comment.created_utc).strftime('%Y-%m-%d %H:%M:%S')
            })

        # Create a dictionary with post details
        post_data = {
            'url': url,
            'title': submission.title,
            'text': submission.selftext,
            'subreddit': submission.subreddit.display_name,
            'score': submission.score,
            'upvote_ratio': submission.upvote_ratio,
            'num_comments': submission.num_comments,
            'timestamp': datetime.datetime.fromtimestamp(submission.created_utc).strftime('%Y-%m-%d %H:%M:%S'),
            'author_id': str(submission.author),
            'top_comments': top_comments,
            'incident': None  # Will be filled later
        }

        return post_data

    except Exception as e:
        print(f"Error extracting data from {url}: {e}")
        return None

# Process each incident
for incident_name, incident_data in data.items():
    print(f"Processing {incident_name}...")

    for url in incident_data['reddit_posts']:
        post_data = extract_post_details(url)
        if post_data:
            post_data['incident'] = incident_name
            all_posts_data.append(post_data)

# Create a DataFrame
df = pd.DataFrame(all_posts_data)

# Save the basic extracted data
df.to_csv('reddit_posts_data.csv', index=False)
print(f"Extracted data for {len(df)} posts")

Collecting praw
  Downloading praw-7.8.1-py3-none-any.whl.metadata (9.4 kB)
Collecting prawcore<3,>=2.4 (from praw)
  Downloading prawcore-2.4.0-py3-none-any.whl.metadata (5.0 kB)
Collecting update_checker>=0.18 (from praw)
  Downloading update_checker-0.18.0-py3-none-any.whl.metadata (2.3 kB)
Downloading praw-7.8.1-py3-none-any.whl (189 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m189.3/189.3 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading prawcore-2.4.0-py3-none-any.whl (17 kB)
Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Installing collected packages: update_checker, prawcore, praw
Successfully installed praw-7.8.1 prawcore-2.4.0 update_checker-0.18.0
Collecting keybert
  Downloading keybert-0.9.0-py3-none-any.whl.metadata (15 kB)
Downloading keybert-0.9.0-py3-none-any.whl (41 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.4/41.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Processing Prayag Maha Kumbh Mela Crowd Crush...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

Processing Delhi Railway Station Stampede...


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

Extracted data for 50 posts


In [9]:
import nltk
nltk.download('punkt_tab')
def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    # Convert to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    # Remove special characters and numbers
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply preprocessing
df['processed_title'] = df['title'].apply(preprocess_text)
df['processed_text'] = df['text'].apply(preprocess_text)
df['combined_text'] = df['processed_title'] + " " + df['processed_text']

# Remove stopwords
stop_words = set(stopwords.words('english'))
custom_stopwords = ['would', 'could', 'should', 'will', 'may', 'also', 'said', 'say', 'says', 'get', 'got']
stop_words.update(custom_stopwords)

def remove_stopwords(text):
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words and len(word) > 2]
    return " ".join(filtered_text)

df['filtered_text'] = df['combined_text'].apply(remove_stopwords)

# Initialize the KeyBERT model for key phrase extraction
key_bert = KeyBERT()

# Function to extract hashtags
def extract_hashtags(text):
    if not isinstance(text, str):
        return []
    # Find all words starting with #
    hashtags = re.findall(r'#\w+', text.lower())
    return hashtags

# Apply hashtag extraction to title and text
df['hashtags_title'] = df['title'].apply(extract_hashtags)
df['hashtags_text'] = df['text'].apply(extract_hashtags)
df['all_hashtags'] = df.apply(lambda x: x['hashtags_title'] + x['hashtags_text'], axis=1)

# Group data by incident
incident_groups = df['incident'].unique()

# Set up for plotting
plt.figure(figsize=(15, 10))

# Create a directory for output files
import os
if not os.path.exists('text_analysis_outputs'):
    os.makedirs('text_analysis_outputs')

# Process each incident
analysis_results = {}
for incident in incident_groups:
    print(f"\nAnalyzing {incident}...")
    incident_data = df[df['incident'] == incident]

    # Combine all text for this incident
    all_text = " ".join(incident_data['filtered_text'].fillna(""))

    # 1. Word Frequency Distribution
    word_tokens = word_tokenize(all_text)
    word_freq = Counter(word_tokens)
    common_words = word_freq.most_common(30)

    # Save word frequencies
    word_freq_df = pd.DataFrame(common_words, columns=['Word', 'Frequency'])
    word_freq_df.to_csv(f'text_analysis_outputs/{incident}_word_frequencies.csv', index=False)

    # Plot word frequencies
    plt.figure(figsize=(12, 6))
    sns.barplot(x=[word[0] for word in common_words[:15]],
                y=[word[1] for word in common_words[:15]])
    plt.title(f'Top 15 Words in {incident}')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig(f'text_analysis_outputs/{incident}_word_freq.png')
    plt.close()

    # 3. Key Phrase Extraction using KeyBERT
    try:
        # Get keyphrases for each post
        all_keyphrases = []
        for _, row in incident_data.iterrows():
            text = row['filtered_text']
            if len(text.split()) > 3:  # Only process if there's enough text
                keyphrases = key_bert.extract_keywords(text,
                                                       keyphrase_ngram_range=(1, 3),
                                                       stop_words='english',
                                                       top_n=5)
                all_keyphrases.extend([phrase for phrase, score in keyphrases])

        # Count keyphrase frequencies
        keyphrase_freq = Counter(all_keyphrases)
        common_keyphrases = keyphrase_freq.most_common(20)

        # Save keyphrase frequencies
        keyphrase_freq_df = pd.DataFrame(common_keyphrases, columns=['Keyphrase', 'Frequency'])
        keyphrase_freq_df.to_csv(f'text_analysis_outputs/{incident}_keyphrase_frequencies.csv', index=False)

        # Plot keyphrase frequencies
        if common_keyphrases:
            plt.figure(figsize=(12, 6))
            sns.barplot(x=[phrase[0] for phrase in common_keyphrases[:10]],
                        y=[phrase[1] for phrase in common_keyphrases[:10]])
            plt.title(f'Top Key Phrases in {incident}')
            plt.xticks(rotation=45, ha='right')
            plt.tight_layout()
            plt.savefig(f'text_analysis_outputs/{incident}_keyphrase_freq.png')
            plt.close()
    except Exception as e:
        print(f"Error in KeyBERT analysis for {incident}: {e}")

    # 4. Word Cloud Generation
    plt.figure(figsize=(10, 8))
    wordcloud = WordCloud(width=800, height=600,
                          background_color='white',
                          max_words=200,
                          collocations=False).generate(all_text)

    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Word Cloud for {incident}')
    plt.tight_layout()
    plt.savefig(f'text_analysis_outputs/{incident}_wordcloud.png')
    plt.close()

    # 5. N-gram Analysis (bigrams and trigrams)
    # Bigrams
    tokens = word_tokenize(all_text)
    bigrams_list = list(ngrams(tokens, 2))
    bigram_freq = Counter(bigrams_list)
    common_bigrams = bigram_freq.most_common(20)

    # Save bigram frequencies
    bigram_freq_df = pd.DataFrame(common_bigrams, columns=['Bigram', 'Frequency'])
    bigram_freq_df['Bigram'] = bigram_freq_df['Bigram'].apply(lambda x: f"{x[0]} {x[1]}")
    bigram_freq_df.to_csv(f'text_analysis_outputs/{incident}_bigram_frequencies.csv', index=False)

    # Plot bigram frequencies
    plt.figure(figsize=(12, 6))
    sns.barplot(x=[f"{bg[0][0]} {bg[0][1]}" for bg in common_bigrams[:10]],
                y=[bg[1] for bg in common_bigrams[:10]])
    plt.title(f'Top Bigrams in {incident}')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig(f'text_analysis_outputs/{incident}_bigram_freq.png')
    plt.close()

    # Trigrams
    trigrams_list = list(ngrams(tokens, 3))
    trigram_freq = Counter(trigrams_list)
    common_trigrams = trigram_freq.most_common(20)

    # Save trigram frequencies
    trigram_freq_df = pd.DataFrame(common_trigrams, columns=['Trigram', 'Frequency'])
    trigram_freq_df['Trigram'] = trigram_freq_df['Trigram'].apply(lambda x: f"{x[0]} {x[1]} {x[2]}")
    trigram_freq_df.to_csv(f'text_analysis_outputs/{incident}_trigram_frequencies.csv', index=False)

    # Plot trigram frequencies
    plt.figure(figsize=(12, 6))
    sns.barplot(x=[f"{tg[0][0]} {tg[0][1]} {tg[0][2]}" for tg in common_trigrams[:10]],
                y=[tg[1] for tg in common_trigrams[:10]])
    plt.title(f'Top Trigrams in {incident}')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig(f'text_analysis_outputs/{incident}_trigram_freq.png')
    plt.close()

    # Store results for this incident
    analysis_results[incident] = {
        'word_freq': common_words,
        'keyphrases': common_keyphrases if 'common_keyphrases' in locals() else [],
        'bigrams': common_bigrams,
        'trigrams': common_trigrams
    }

# Create a comparison visualization across incidents
# For word frequencies
plt.figure(figsize=(15, 10))
for i, incident in enumerate(incident_groups):
    top_words = [word[0] for word in analysis_results[incident]['word_freq'][:5]]
    top_freqs = [word[1] for word in analysis_results[incident]['word_freq'][:5]]

    plt.subplot(len(incident_groups), 1, i+1)
    sns.barplot(x=top_words, y=top_freqs)
    plt.title(f'Top 5 Words in {incident}')
    plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.savefig('text_analysis_outputs/word_freq_comparison.png')
plt.close()

# Generate a comprehensive report
with open('text_analysis_outputs/analysis_report.txt', 'w') as f:
    f.write("Text Analysis Report\n")
    f.write("=" * 50 + "\n\n")

    f.write(f"Total posts analyzed: {len(df)}\n")
    f.write(f"Posts per incident: {df['incident'].value_counts().to_dict()}\n\n")

    for incident in incident_groups:
        f.write(f"\n{incident} Analysis\n")
        f.write("-" * 30 + "\n")

        # Word frequency
        f.write("\nTop 10 Words:\n")
        for word, freq in analysis_results[incident]['word_freq'][:10]:
            f.write(f"  {word}: {freq}\n")

        # Key phrases
        f.write("\nTop Key Phrases:\n")
        for phrase, freq in analysis_results[incident]['keyphrases'][:10]:
            f.write(f"  {phrase}: {freq}\n")

        # Bigrams
        f.write("\nTop Bigrams:\n")
        for (w1, w2), freq in analysis_results[incident]['bigrams'][:10]:
            f.write(f"  {w1} {w2}: {freq}\n")

        # Trigrams
        f.write("\nTop Trigrams:\n")
        for (w1, w2, w3), freq in analysis_results[incident]['trigrams'][:10]:
            f.write(f"  {w1} {w2} {w3}: {freq}\n")

        f.write("\n" + "=" * 50 + "\n")

# Create a summary comparison of most discussed topics
        f.write("\nComparison of Key Topics Across Incidents:\n")
        for incident in incident_groups:
           f.write(f"\n{incident}:\n")
           f.write(f"  Primary topics: {', '.join([word[0] for word in analysis_results[incident]['word_freq'][:5]])}\n")
           if analysis_results[incident]['keyphrases']:
              f.write(f"  Key phrases: {', '.join([phrase[0] for phrase in analysis_results[incident]['keyphrases'][:3]])}\n")

print("\nAnalysis complete! Results saved in 'text_analysis_outputs' directory.")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.



Analyzing Prayag Maha Kumbh Mela Crowd Crush...

Analyzing Delhi Railway Station Stampede...

Analysis complete! Results saved in 'text_analysis_outputs' directory.


<Figure size 1500x1000 with 0 Axes>

In [10]:
!pip install praw
import praw

# 1. Authenticate
reddit = praw.Reddit(
    client_id="mkuGtq2V6ce9FI_RoghVwA",
    client_secret="_BAxaZ070geQruazl580Ob0V18LNEw",
    user_agent="kumbh-stampede-dataset-script/1.0 by ZealousidealIce8792"
)

# 2. Build your query
import json

def quote_if_needed(s):
    # wrap in quotes if there's a space or special char
    return f'"{s}"' if ' ' in s or '"' in s else s

# 1. Load the JSON
with open('input.json','r') as f:
    data = json.load(f)

# Grab the Prayag Maha Kumbh Mela entry
entry = data.get("Prayag Maha Kumbh Mela Crowd Crush")

# 2a. Keywords
festival_syns = ["kumbhmela", "kumbh mela", "kumbh",
                 "mahakumbhmela", "mela", "maha kumbh mela",
                 "mahakumbh mela", "magh mela"]

# 2b. Stampede synonyms (hard‑code, or extract from keywords if you prefer)
stampede_syns = ["stampede", "crowd crush", "crowd crushes"]

# 2c. Locations
locations = ["Prayagraj", "Allahabad"]

# 2d. All keywords list (for the “any one of these” group)
keywords = entry['keywords']

# 2e. Numbers list
# numbers = entry['numbers']

# 3. Helper: quote and join with OR
def make_or_group(terms):
    return "(" + " OR ".join(quote_if_needed(t) for t in terms) + ")"

# Build each clause
clause_festival = make_or_group(festival_syns)
clause_keywords = make_or_group(keywords)
clause_stampede = make_or_group(stampede_syns)
clause_location = make_or_group(locations)
# clause_numbers  = make_or_group(numbers)

# 4. Final query: AND‑join the five clauses
query = " AND ".join([
    clause_festival,
    clause_keywords,
    clause_stampede,
    clause_location,
    # clause_numbers
])

print("🔎 Built query:")
print(query)

# 3. Search submissions across all of Reddit
submissions = reddit.subreddit("all").search(
    query,
    limit=500,            # total submissions to fetch
    sort='relevance',           # or 'relevance'
    time_filter='all'     # or 'month','year','day'
)

# 4. Collect into your dataset
dataset = []
for post in submissions:
    dataset.append({
        "id":       post.id,
        "title":    post.title,
        "selftext": post.selftext,
        "url":      post.url,
        "created":  post.created_utc,
        "subreddit":post.subreddit.display_name
    })

# 5. Save as JSON
import json
with open('reddit_kumbh_threads.json', 'w') as f:
    json.dump(dataset, f, indent=2)



FileNotFoundError: [Errno 2] No such file or directory: 'input.json'