# Misinformation Hashtag Tracker
## Information Access Specialist Toolkit

This notebook provides comprehensive tools for tracking, analyzing, and managing misinformation spread through social media networks. It focuses on hashtag analysis, narrative propagation detection, and "infodemic" management strategies.

**Key Capabilities:**
- Social media misinformation tracking
- Hashtag trend analysis and narrative mapping
- Network analysis of information flow
- Content verification and fact-checking workflows
- Digital literacy assessment tools
- Infodemic response strategies

## 1. Environment Setup and Dependencies

In [None]:
# Core data analysis libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import json
import re
import warnings
warnings.filterwarnings('ignore')

# Network analysis
import networkx as nx
from collections import Counter, defaultdict

# Text processing and NLP
import nltk
from textblob import TextBlob
from wordcloud import WordCloud

# Machine learning for classification
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

# Visualization
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Set up plotting parameters
# NOTE(review): the 'seaborn-v0_8' style name appears to require matplotlib >= 3.6
# (older releases ship it as 'seaborn') — confirm against the installed version.
plt.style.use('seaborn-v0_8')
# Use seaborn's "husl" colour palette as the default for subsequent plots.
sns.set_palette("husl")

print("✅ Environment setup complete")

## 2. Data Collection and Preprocessing Framework

In [None]:
class SocialMediaCollector:
    """Framework for collecting social media data from multiple platforms.

    Only a synthetic generator is implemented: ``simulate_data_collection``
    fabricates random posts for demonstration, and ``preprocess_data`` derives
    the analysis columns used by the rest of the notebook.
    """

    def __init__(self):
        self.platforms = ['twitter', 'facebook', 'instagram', 'tiktok', 'telegram']
        # Empty list until simulate_data_collection() replaces it with a DataFrame.
        self.collected_data = []

    def simulate_data_collection(self, num_posts=1000, seed=None):
        """Simulate social media data collection for demonstration.

        Args:
            num_posts: Number of synthetic posts to generate.
            seed: Optional seed for a private random generator, making the
                generated fields reproducible (timestamps are still relative
                to the current time). When ``None`` the module-level
                ``random`` state is used, so a prior ``random.seed(...)``
                call keeps working as before.

        Returns:
            A DataFrame with one row per synthetic post. It is also stored in
            ``self.collected_data``.
        """
        import random

        rng = random if seed is None else random.Random(seed)

        # Sample misinformation-related hashtags and keywords
        misinfo_hashtags = [
            '#fakenews', '#conspiracy', '#hoax', '#scam', '#lies',
            '#coverup', '#agenda', '#propaganda', '#manufactured',
            '#staged', '#crisis', '#actors', '#false', '#planted'
        ]

        # Sample legitimate hashtags that might get mixed in
        mixed_hashtags = [
            '#news', '#breaking', '#update', '#truth', '#facts',
            '#investigation', '#report', '#analysis', '#verified'
        ]

        all_hashtags = misinfo_hashtags + mixed_hashtags

        data = []
        base_date = datetime.now() - timedelta(days=30)

        for i in range(num_posts):
            # Each post carries between one and five distinct hashtags.
            num_hashtags = rng.randint(1, 5)
            post_hashtags = rng.sample(all_hashtags, num_hashtags)

            post = {
                'post_id': f'post_{i:06d}',
                'platform': rng.choice(self.platforms),
                # Day offset stops at 29: with 30, adding the hour/minute
                # offsets would place the post after "now".
                'timestamp': base_date + timedelta(
                    days=rng.randint(0, 29),
                    hours=rng.randint(0, 23),
                    minutes=rng.randint(0, 59)
                ),
                'hashtags': post_hashtags,
                'engagement_score': rng.randint(1, 10000),
                'shares': rng.randint(0, 5000),
                'likes': rng.randint(0, 20000),
                'comments': rng.randint(0, 1000),
                'user_followers': rng.randint(100, 100000),
                'verified_user': rng.choice([True, False]),
                'content_length': rng.randint(50, 280),
                'contains_media': rng.choice([True, False]),
                'language': rng.choice(['en', 'es', 'fr', 'de', 'pt']),
                'credibility_score': rng.uniform(0, 1),  # 0 = low credibility, 1 = high
                'fact_checked': rng.choice([True, False]),
                'misinformation_risk': rng.choice(['low', 'medium', 'high'])
            }
            data.append(post)

        self.collected_data = pd.DataFrame(data)
        return self.collected_data

    def preprocess_data(self, df):
        """Clean and preprocess collected social media data.

        Adds the columns ``hashtag_text``, ``viral_score``, ``hour``,
        ``day_of_week`` and ``date`` to ``df`` and normalises ``timestamp``
        to datetime.

        Args:
            df: DataFrame with at least the columns ``timestamp``,
                ``hashtags``, ``shares``, ``likes``, ``comments`` and
                ``user_followers``. It is modified in place; pass a copy to
                keep the original untouched.

        Returns:
            The same DataFrame object, with the derived columns added.
        """
        # Convert timestamp to datetime
        df['timestamp'] = pd.to_datetime(df['timestamp'])

        # Extract hashtag text without # symbol
        df['hashtag_text'] = df['hashtags'].apply(
            lambda tags: [tag.replace('#', '').lower() for tag in tags]
        )

        # Viral potential: weighted engagement per follower.
        weighted_engagement = (
            df['shares'] * 0.4 +
            df['likes'] * 0.3 +
            df['comments'] * 0.3
        )
        # Accounts with no followers have no meaningful per-follower rate;
        # mask them to NaN instead of letting the division produce inf.
        followers = df['user_followers'].where(df['user_followers'] > 0)
        df['viral_score'] = weighted_engagement / followers

        # Add time-based features
        df['hour'] = df['timestamp'].dt.hour
        df['day_of_week'] = df['timestamp'].dt.dayofweek
        df['date'] = df['timestamp'].dt.date

        return df

# Initialize collector and generate sample data
collector = SocialMediaCollector()
# 1500 synthetic posts; no real platform is contacted.
sample_data = collector.simulate_data_collection(1500)
# preprocess_data adds its derived columns to the frame it receives, so a copy
# is passed to keep sample_data in its raw form.
processed_data = collector.preprocess_data(sample_data.copy())

print(f"✅ Collected and processed {len(processed_data)} social media posts")
print(f"📊 Data shape: {processed_data.shape}")
# Last expression of the cell: previews the first rows of the processed frame.
processed_data.head()

## 3. Hashtag Analysis and Narrative Propagation Detection

In [None]:
class HashtagAnalyzer:
    """Comprehensive hashtag analysis for misinformation tracking.

    Operates on a preprocessed posts DataFrame that provides the columns
    ``hashtags``, ``hashtag_text``, ``timestamp``, ``platform``,
    ``engagement_score``, ``misinformation_risk``, ``credibility_score``,
    ``fact_checked`` and ``viral_score``.
    """

    def __init__(self, data):
        self.data = data
        # Populated by analyze_hashtag_frequency() / create_narrative_clusters().
        self.hashtag_trends = None
        self.narrative_clusters = None

    def analyze_hashtag_frequency(self):
        """Analyze hashtag frequency and trending patterns.

        Returns:
            DataFrame with columns ``hashtag``, ``frequency`` and
            ``percentage`` (share of all hashtag occurrences, in percent)
            for the 50 most common hashtags, most frequent first. The frame
            is empty, but still has these columns, when there are no
            hashtags. The result is also stored in ``self.hashtag_trends``.
        """
        # Flatten all hashtags
        all_hashtags = [
            tag for tags in self.data['hashtag_text'] for tag in tags
        ]
        total = len(all_hashtags)

        # Count frequencies
        hashtag_counts = Counter(all_hashtags)

        # Explicit columns keep the schema stable even with zero rows.
        trending_df = pd.DataFrame(
            [
                {'hashtag': tag, 'frequency': count, 'percentage': (count / total) * 100}
                for tag, count in hashtag_counts.most_common(50)
            ],
            columns=['hashtag', 'frequency', 'percentage'],
        )

        self.hashtag_trends = trending_df
        return trending_df

    def detect_coordinated_campaigns(self, time_window_hours=6, min_posts=10):
        """Detect potential coordinated misinformation campaigns.

        Posts are grouped by their exact (order-insensitive) hashtag
        combination. A group is reported when it has at least ``min_posts``
        posts and enough of them follow the previous post, in chronological
        order, within ``time_window_hours``.

        Args:
            time_window_hours: Maximum gap, in hours, between consecutive
                posts for the later one to count as clustered.
            min_posts: Minimum group size; the clustering threshold is 70%
                of this value.

        Returns:
            DataFrame sorted by ``risk_score`` (highest first) with columns
            ``hashtag_combo``, ``post_count``, ``time_span_hours``,
            ``avg_engagement``, ``platforms`` and ``risk_score``. Empty, with
            the same columns, when nothing is detected.
        """
        columns = [
            'hashtag_combo', 'post_count', 'time_span_hours',
            'avg_engagement', 'platforms', 'risk_score',
        ]
        campaigns = []

        # Group by hashtag combinations
        hashtag_combos = self.data['hashtags'].apply(lambda tags: tuple(sorted(tags)))

        for combo, group in self.data.groupby(hashtag_combos):
            if len(group) < min_posts:
                continue

            # Gaps are only meaningful between chronologically adjacent
            # posts; on unsorted data diff() yields negative gaps that would
            # all pass the "<= window" test.
            timestamps = group['timestamp'].sort_values()
            gaps_hours = timestamps.diff().dt.total_seconds() / 3600
            clustered_posts = (gaps_hours <= time_window_hours).sum()

            # Require clustered posts numbering at least 70% of min_posts.
            if clustered_posts < min_posts * 0.7:
                continue

            campaigns.append({
                'hashtag_combo': combo,
                'post_count': len(group),
                'time_span_hours': (timestamps.max() - timestamps.min()).total_seconds() / 3600,
                'avg_engagement': group['engagement_score'].mean(),
                'platforms': group['platform'].unique().tolist(),
                'risk_score': self._calculate_campaign_risk(group)
            })

        # Explicit columns so sorting works even when no campaign was found.
        result = pd.DataFrame(campaigns, columns=columns)
        return result.sort_values('risk_score', ascending=False)

    def _calculate_campaign_risk(self, group):
        """Calculate risk score for potential misinformation campaign.

        Args:
            group: Non-empty subset of ``self.data`` belonging to one
                candidate campaign.

        Returns:
            Mean of four fractions in [0, 1]: posts labelled high risk,
            posts with credibility below 0.3, posts not fact-checked, and
            posts whose viral score exceeds the dataset-wide 80th percentile.
        """
        # Measure "rapid spread" against the whole dataset: a threshold taken
        # from the group itself would flag roughly the same share of posts in
        # every group and carry no signal.
        viral_threshold = self.data['viral_score'].quantile(0.8)

        risk_factors = {
            'high_misinformation_risk': (group['misinformation_risk'] == 'high').mean(),
            'low_credibility': (group['credibility_score'] < 0.3).mean(),
            'not_fact_checked': (~group['fact_checked']).mean(),
            'rapid_spread': (group['viral_score'] > viral_threshold).mean()
        }

        return sum(risk_factors.values()) / len(risk_factors)

    def create_narrative_clusters(self, n_clusters=8):
        """Cluster hashtags to identify narrative themes.

        Builds a hashtag co-occurrence matrix (entry ``[i][j]`` is the number
        of posts containing both hashtags; the diagonal is the number of
        posts containing the hashtag) and runs KMeans on its rows.

        Args:
            n_clusters: Requested number of clusters; capped at the number of
                distinct hashtags, since KMeans cannot produce more clusters
                than samples.

        Returns:
            DataFrame with columns ``hashtag`` and ``cluster``, one row per
            distinct hashtag in alphabetical order. Also stored in
            ``self.narrative_clusters``.
        """
        # Sorted so the row order, and with it the fixed-seed clustering,
        # does not depend on set iteration order.
        unique_hashtags = sorted(
            {tag for tags in self.data['hashtag_text'] for tag in tags}
        )

        if not unique_hashtags:
            cluster_df = pd.DataFrame(columns=['hashtag', 'cluster'])
            self.narrative_clusters = cluster_df
            return cluster_df

        position = {tag: i for i, tag in enumerate(unique_hashtags)}
        size = len(unique_hashtags)
        cooccurrence_matrix = np.zeros((size, size))

        for tags in self.data['hashtag_text']:
            # De-duplicate so each post contributes at most 1 per pair.
            idx = [position[tag] for tag in set(tags)]
            if idx:
                cooccurrence_matrix[np.ix_(idx, idx)] += 1

        # Apply clustering
        kmeans = KMeans(n_clusters=min(n_clusters, size), random_state=42)
        clusters = kmeans.fit_predict(cooccurrence_matrix)

        # Create cluster DataFrame
        cluster_df = pd.DataFrame({
            'hashtag': unique_hashtags,
            'cluster': clusters
        })

        self.narrative_clusters = cluster_df
        return cluster_df

# Perform hashtag analysis
analyzer = HashtagAnalyzer(processed_data)
# Frequency table of the most common hashtags (stored on the analyzer as well).
trending_hashtags = analyzer.analyze_hashtag_frequency()
# Uses the method defaults: time_window_hours=6, min_posts=10.
potential_campaigns = analyzer.detect_coordinated_campaigns()
# Uses the method default of n_clusters=8.
narrative_clusters = analyzer.create_narrative_clusters()

print("🔍 Hashtag Analysis Complete")
print(f"📈 Top 10 trending hashtags:")
print(trending_hashtags.head(10))
print(f"\n⚠️  Detected {len(potential_campaigns)} potential coordinated campaigns")