In [None]:
import re

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report



# Article bodies that feed the classifier; missing entries are replaced by ''
headlines = merged_df.loc[:, 'article'].fillna('')

# Step 2: Define Keywords for Each Category
# Categories are tried in the order written here and the first one with a
# matching keyword wins. Keywords written entirely in upper case ('UN', 'AI',
# 'TV') are acronyms and are matched as whole, case-sensitive words; all other
# keywords are matched as case-insensitive substrings.
categories = {
    'Breaking News': ['breaking', 'alert', 'urgent'],
    'Politics': ['election', 'policy', 'congress', 'senate'],
    'World News': ['international', 'global', 'UN', 'war', 'diplomacy'],
    'Business/Finance': ['market', 'stocks', 'finance', 'economy'],
    'Technology': ['tech', 'software', 'hardware', 'AI', 'cyber'],
    'Science': ['research', 'study', 'scientific', 'discovery'],
    'Health': ['health', 'medicine', 'wellness', 'disease'],
    'Entertainment': ['celebrity', 'movie', 'TV', 'music'],
    'Sports': ['match', 'game', 'tournament', 'player'],
    'Environment': ['climate', 'ecology', 'environment'],
    'Crime': ['crime', 'arrest', 'police', 'court','kill','kidnapped'],
    'Education': ['school', 'university', 'education', 'study'],
    'Weather': ['weather', 'storm', 'hurricane', 'forecast'],
    'Other': []  # Default category for unmatched articles
}


# Step 3: Function to Tag Articles Based on Keywords
def _keyword_in_text(keyword, text, text_lower):
    """Return True when `keyword` occurs in the article.

    All-uppercase keywords are searched as whole words in the original-case
    `text`: comparing them against lowercased text could never succeed, and
    lowercasing them instead would make 'UN' hit "under" or 'AI' hit "said".
    Every other keyword is a substring test against `text_lower`.
    """
    if keyword.isupper():
        return re.search(r'\b' + re.escape(keyword) + r'\b', text) is not None
    return keyword.lower() in text_lower


def assign_category(article):
    """Return the first category in `categories` whose keywords match `article`.

    Args:
        article: Article text. Anything that is not a str (e.g. a float NaN
            from a missing cell) is treated as unmatched.

    Returns:
        The matching category name, or 'Other' when nothing matches.
    """
    if not isinstance(article, str):
        return 'Other'
    text_lower = article.lower()  # lowercased once, not once per keyword
    for category, keywords in categories.items():
        if any(_keyword_in_text(keyword, article, text_lower) for keyword in keywords):
            return category
    return 'Other'  # Default category

# Apply the function to assign categories.
# The NaN-filled series is used here: assign_category lowercases its input and
# a missing article would otherwise arrive as a float.
merged_df['predicted_category'] = headlines.apply(assign_category)

# Step 4: TF-IDF Vectorization and Apply LSA
tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
# Seeded so the randomized SVD solver produces the same components on every run
lsa = TruncatedSVD(n_components=100, random_state=42)

# Create a pipeline for SVM classification after LSA
classifier = SVC(kernel='linear')
model = make_pipeline(tfidf, lsa, classifier)

# Prepare data for training and testing.
# NOTE(review): the labels are produced by the keyword rule above from the same
# text the model sees, so the report below measures agreement with that rule,
# not accuracy against human-assigned categories.
labels = merged_df['predicted_category'].values
X_train, X_test, y_train, y_test = train_test_split(headlines, labels, test_size=0.2, random_state=42)

# Train the classifier
model.fit(X_train, y_train)

# Predict and evaluate the classifier
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

# Step 1: Extract Title and Article Texts and Use a Smaller Sample
# Use at most 10,000 rows to avoid memory issues (fewer if the frame is smaller,
# since sampling more rows than exist without replacement raises)
sample_size = min(10000, len(merged_df))
sampled_df = merged_df.sample(n=sample_size, random_state=42)
titles = sampled_df['title'].fillna('')
articles = sampled_df['article'].fillna('')

# Step 2: TF-IDF Vectorization with Reduced Number of Features
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=500)  # Reduce to 500 features

# Combine the titles and articles for joint fitting so both share one vocabulary
combined_texts = titles.tolist() + articles.tolist()

# Fit and transform the combined texts
tfidf_matrix = tfidf_vectorizer.fit_transform(combined_texts)

# Separate the transformed vectors back into titles and articles
tfidf_titles = tfidf_matrix[:len(titles)]
tfidf_articles = tfidf_matrix[len(titles):]

# Step 3: Compute Cosine Similarity
# Only the similarity of each title with its own article is needed. The
# vectorizer L2-normalises every row by default, so the row-wise dot product is
# the cosine similarity; this avoids building the dense n x n matrix of every
# title against every article. All-zero rows give 0.
similarity_scores_diagonal = np.asarray(
    tfidf_titles.multiply(tfidf_articles).sum(axis=1)
).ravel()

# Step 4: Add the Similarity Scores to the DataFrame
sampled_df['similarity_score'] = similarity_scores_diagonal

# Print the DataFrame with the new similarity scores
print(sampled_df[['title', 'article', 'similarity_score']].head())

# Step 5: Visualize the Similarity Scores
plt.figure(figsize=(10, 6))
plt.hist(sampled_df['similarity_score'], bins=20, color='skyblue', edgecolor='black')
plt.title('Distribution of Similarity Scores between Titles and Articles')
plt.xlabel('Similarity Score')
plt.ylabel('Frequency')
plt.show()

# Step 6: Analyze the Average and Median Similarity Scores
average_similarity = sampled_df['similarity_score'].mean()
median_similarity = sampled_df['similarity_score'].median()

print(f"Average Similarity Score: {average_similarity:.2f}")
print(f"Median Similarity Score: {median_similarity:.2f}")


# categorise the title/content into a known set of topic categories
##### The main goal is to take these titles and sort them into groups based on what they are about; these groups are called topic categories. We do this with topic modeling, a technique for discovering the underlying themes or topics in a collection of documents. It analyzes the text to identify patterns and groups similar documents by their content. The process involves several steps, and data preparation and preprocessing are crucial for effective topic modeling. We use a method called Latent Dirichlet Allocation (LDA) to help with this sorting.

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from gensim import corpora
from gensim.models import LdaModel
from gensim import matutils  
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt

# Download NLTK stopwords only when they are not already installed locally,
# so re-running the notebook does not need network access
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')


# step 1
###  Data Preparation and Preprocessing for Topic Modeling
##### Here we normalize the data to ensure uniformity, which helps identify words accurately regardless of their case, and we remove noise so that we focus on the words that actually contribute to the meaning of the text. We also remove stopwords. Finally, we merge the 'title' and 'article' columns to provide a more comprehensive context for each document, and preprocess the merged text so that it leads to better topic identification.

In [None]:
# Function to preprocess text
_ENGLISH_STOP_WORDS = None  # cache for NLTK's English stopword list, filled on first use


def _english_stop_words():
    """Return NLTK's English stopwords as a frozenset, reading the corpus only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text, stop_words=None):
    """Lowercase `text`, strip punctuation and drop stopwords.

    Args:
        text: Raw text. Non-string values (e.g. NaN from a missing cell)
            produce an empty string instead of raising.
        stop_words: Optional collection of lowercase words to remove.
            Defaults to NLTK's English stopword list, which is loaded once
            and reused across calls rather than rebuilt for every document.

    Returns:
        The remaining words joined by single spaces.
    """
    if not isinstance(text, str):
        return ''
    if stop_words is None:
        stop_words = _english_stop_words()

    # Convert text to lowercase (normalization)
    text = text.lower()

    # Remove punctuation and special characters using regex (noise removal)
    text = re.sub(r'[^\w\s]', '', text)

    # Remove stopwords (common words like "and", "the", "is")
    return ' '.join(word for word in text.split() if word not in stop_words)

# Combine 'title' and 'article' into a single column.
# Missing values are filled first: a string concatenated with NaN is NaN, which
# would then reach preprocess_text as a float instead of text.
merged_df['combined_text'] = merged_df['title'].fillna('') + " " + merged_df['article'].fillna('')

# Clean every combined document
merged_df['processed_text'] = merged_df['combined_text'].apply(preprocess_text)



print("processed_text:")
print(merged_df['processed_text'].head())


# Step-2
### We use tfidf_vectorizer to reduce dimensionality and focus on the most significant terms by limiting the number of unique words (features). We call fit_transform on the processed data to learn the vocabulary and IDF values and to transform the text into a sparse matrix.

In [None]:
### Step 2: TF-IDF Vectorization ###

# Vectorizer limited to 1000 vocabulary terms (tune max_features to the dataset size)
tfidf_vectorizer = TfidfVectorizer(max_features=1000)

# Learn the vocabulary and IDF weights from the cleaned text and encode it as a sparse matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(merged_df['processed_text'])

print("\nTF-IDF Matrix Shape:\n" + str(tfidf_matrix.shape))


In [None]:
### Step 3: LDA Topic Modeling ###

# Convert the TF-IDF matrix (one document per row) to a format compatible with Gensim
corpus = matutils.Sparse2Corpus(tfidf_matrix, documents_columns=False)

# The term ids in `corpus` are the TF-IDF column indices, so the id -> word
# mapping has to come from the vectorizer's vocabulary. A Dictionary built
# independently from the raw tokens assigns its own ids and would attach the
# wrong words to every topic.
vocabulary_by_id = {int(column): term for term, column in tfidf_vectorizer.vocabulary_.items()}
id2word = corpora.Dictionary.from_corpus(corpus, id2word=vocabulary_by_id)


# Build the LDA model
lda_model = LdaModel(corpus=corpus, id2word=id2word, num_topics=3, random_state=42, passes=10)


In [None]:
### Step 4: Interpret and Categorize Results ###

# Show every topic the model learned (-1 requests all of them)
for idx, topic in lda_model.print_topics(-1):
    print(f"\nTopic #{idx}:")
    print(topic)

# Label each document with the id of its highest-probability topic
document_topics = []
for doc in corpus:
    best_pair = max(lda_model.get_document_topics(doc), key=lambda pair: pair[1])
    document_topics.append(best_pair[0])
merged_df['topic_category'] = document_topics

print("\nCategorized Documents with Topics:")
print(merged_df[['title', 'topic_category']])

In [None]:
### Step 4: Interpret and Categorize Results ###

# List the learned topics together with their ids
all_topics = lda_model.print_topics(-1)
for idx, topic in all_topics:
    print(f"\nTopic #{idx}:")
    print(topic)


def _topic_probability(topic_pair):
    """Return the probability part of a (topic_id, probability) pair."""
    return topic_pair[1]


# Keep, for every document, the id of the topic with the largest probability
document_topics = [
    max(lda_model.get_document_topics(doc), key=_topic_probability)[0]
    for doc in corpus
]
merged_df['topic_category'] = document_topics

print("\nCategorized Documents with Topics:")
print(merged_df[['title', 'topic_category']])

In [None]:

### Step 4: Visualize Topics with pyLDAvis ###

# Prepare data for pyLDAvis (rendered at the end of this cell)
lda_display = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)

### Step 5: Plot Topic Distribution Across Documents ###

# Get topic distribution for each document
document_topics = [lda_model.get_document_topics(doc, minimum_probability=0.0) for doc in corpus]

# One row per document, one column per topic id. Building each row from a
# {topic_id: probability} dict keeps the columns aligned even when a topic is
# left out of a document's result; such gaps become 0.
topic_distribution = (
    pd.DataFrame([dict(doc) for doc in document_topics])
    .reindex(columns=range(lda_model.num_topics))
    .fillna(0.0)
)
# Labels use the model's own topic ids, matching the "Topic #<id>" printouts
topic_distribution.columns = [f'Topic {topic_id}' for topic_id in topic_distribution.columns]

# A stacked bar per document is unreadable (and very slow) for a whole corpus,
# so only the first documents are drawn
max_docs_to_plot = 50
topic_distribution.head(max_docs_to_plot).plot(kind='bar', stacked=True, figsize=(10, 6))
plt.xlabel('Document Index')
plt.ylabel('Topic Proportion')
plt.title('Topic Distribution Across Documents')
plt.show()

# Display the interactive visualization within the notebook. This must be the
# last expression of the cell: its return value is what the notebook renders.
pyLDAvis.display(lda_display)

# analyzing the diversity of topics reported by different websites. 
## step-1
So, to answer the question of which websites reported the most diverse topics, we first assign a topic to each article, using the topic modeling results above to determine each article's most dominant topic.


In [None]:
import numpy as np
from urllib.parse import urlparse
from gensim import corpora

def extract_main_domain(url):
    """Return the host name of `url` without a leading 'www.'.

    The host is taken from the parsed URL's `hostname`, so it is lowercased
    and carries no port or user-info. URLs written without a scheme
    ("www.site.com/page") are parsed as network locations instead of being
    reported as having no domain.

    Args:
        url: URL string. Non-string values (None, NaN) are accepted.

    Returns:
        The domain as a str, or None when no host can be determined.
    """
    if not isinstance(url, str):
        return None
    candidate = url.strip()
    try:
        host = urlparse(candidate).hostname
        if not host and '://' not in candidate:
            # No scheme given: the leading '//' makes urlparse read a netloc
            host = urlparse('//' + candidate).hostname
    except ValueError:
        # urlparse rejects malformed network locations
        return None
    if not host:
        return None
    # Remove the 'www.' prefix if it exists
    return host[4:] if host.startswith('www.') else host

# Apply the function to extract the main domain from the URL column
merged_df['Domain'] = merged_df['url'].apply(extract_main_domain)

# Reuse the corpus that lda_model was trained on. Building a fresh dictionary
# and bag-of-words here would produce term ids unrelated to the model's
# vocabulary, and no tokenised document list exists at this point in the notebook.
corpus_bow = corpus

# Get the most dominant topic for each document
def get_dominant_topic(topic_distribution):
    """Return the position of the largest value in one document's topic probabilities."""
    return np.argmax(topic_distribution)

# Calculate topic distribution for each document
document_topics = [lda_model.get_document_topics(doc, minimum_probability=0.0) for doc in corpus_bow]

# Dense (documents x topics) matrix indexed by topic id. Filling by id keeps
# the columns aligned when a topic is missing from a document's result.
topic_distribution = np.zeros((len(document_topics), lda_model.num_topics))
for row, doc in enumerate(document_topics):
    for topic_id, probability in doc:
        topic_distribution[row, topic_id] = probability
dominant_topics = [get_dominant_topic(doc_topics) for doc_topics in topic_distribution]

# Add the dominant topic to the DataFrame
merged_df['dominant_topic'] = dominant_topics


### step-2 
We aggregate the data by domain: we group the articles by their domains and calculate how many unique topics each domain reports.

In [None]:
import pandas as pd

# Define the function to calculate topic diversity
def calculate_topic_diversity(topic_counts):
    """Count the topics with a positive count in a {topic: count} dict.

    Anything that is not a non-empty dict yields 0.
    """
    if not isinstance(topic_counts, dict) or len(topic_counts) == 0:
        return 0
    return sum(1 for count in topic_counts.values() if count > 0)


# Group by 'Domain' (the column created by the domain-extraction cell above)
# and count the distinct dominant topics per domain.
# nunique() yields one number per domain directly. Returning a dict from
# groupby().apply() instead expands into one row per (domain, topic) holding
# counts, so a dict-based diversity function would receive integers and
# report 0 for every row.
domain_topic_summary = (
    merged_df.groupby('Domain')['dominant_topic']
    .nunique()
    .rename('Topic_Diversity')
    .reset_index()
)

# Sort by 'Topic_Diversity'
domain_topic_summary = domain_topic_summary.sort_values(by='Topic_Diversity', ascending=False)

# Print the final output
print(domain_topic_summary)



In [None]:
import pandas as pd
from urllib.parse import urlparse
from gensim import corpora
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from gensim.corpora import MmCorpus
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Sample merged_df DataFrame setup (Replace this with actual data loading)
# merged_df = pd.read_csv('path_to_your_data.csv')

# Define the function to extract the main domain from a URL
def extract_main_domain(url):
    """Return the lowercased host of `url` with any leading 'www.' removed.

    Uses the parsed `hostname`, so ports and user-info are not part of the
    result, and retries scheme-less input ("site.com/page") as a network
    location. Non-string input and URLs without a host return None.
    """
    if not isinstance(url, str):
        return None
    candidate = url.strip()
    try:
        host = urlparse(candidate).hostname
        if not host and '://' not in candidate:
            # No scheme given: the leading '//' makes urlparse read a netloc
            host = urlparse('//' + candidate).hostname
    except ValueError:
        # urlparse rejects malformed network locations
        return None
    if not host:
        return None
    return host[4:] if host.startswith('www.') else host

# Apply the domain extraction function to the URL column
merged_df['domain'] = merged_df['url'].apply(extract_main_domain)

# Combine 'title' and 'article' into one text column for topic modeling.
# Missing values are filled first so every row is a string for tokenize().
merged_df['text'] = merged_df['title'].fillna('') + ' ' + merged_df['article'].fillna('')

# Tokenize the text data
def tokenize(text):
    """Lowercase `text` and split it on whitespace."""
    return text.lower().split()

# Apply tokenization
documents = merged_df['text'].map(tokenize).tolist()

# Create a Gensim Dictionary
id2word = Dictionary(documents)

# Create a Gensim Corpus
corpus = [id2word.doc2bow(doc) for doc in documents]

# Step 2: Train LDA Model
# Seeded so topic ids and assignments are the same on every run
lda_model = LdaModel(corpus, num_topics=5, id2word=id2word, passes=15, random_state=42)

# Step 3: Assign Topics to Documents
def get_dominant_topic(ldamodel, corpus):
    """Return, per document in `corpus`, the id of its most probable topic.

    A document for which the model returns no topics is mapped to None.
    """
    dominant_topics = []
    for bow in corpus:
        topic_pairs = ldamodel.get_document_topics(bow)
        if topic_pairs:
            best_pair = max(topic_pairs, key=lambda pair: pair[1])
            dominant_topics.append(best_pair[0])
        else:
            dominant_topics.append(None)
    return dominant_topics

merged_df['dominant_topic'] = get_dominant_topic(lda_model, corpus)

# Step 4: Calculate Topic Diversity for Each Domain
def calculate_topic_diversity(topic_counts):
    """Return the number of entries in a topic-count dict, or 0 for non-dict input."""
    return len(topic_counts) if isinstance(topic_counts, dict) else 0

# Group by domain and count the distinct dominant topics of each domain.
# nunique() returns exactly one value per domain, so reset_index() always
# produces two columns. A dict-returning groupby().apply() would instead expand
# into one row per (domain, topic), giving three columns and no
# 'Topic_Diversity' column to sort on.
domain_topic_summary = (
    merged_df.groupby('domain')['dominant_topic']
    .nunique()
    .reset_index()
)
domain_topic_summary.columns = ['Domain', 'Topic_Diversity']

# Sort by topic diversity
domain_topic_summary = domain_topic_summary.sort_values(by='Topic_Diversity', ascending=False)

print(domain_topic_summary)

# Step 5: Visualize Topic Distribution Across Documents
# Prepare data for plotting: one row per document, one column per topic id.
# The trained model is `lda_model`; `ldamodel` exists only as a parameter name
# inside get_dominant_topic.
topic_distribution = pd.DataFrame(
    [dict(lda_model.get_document_topics(doc)) for doc in corpus]
).fillna(0)  # Replace NaN (topic absent from a document's result) with 0
# Columns appear in first-seen order; sort them so the legend runs by topic id
topic_distribution = topic_distribution.sort_index(axis=1)

# Plot the topic distribution. Only the first documents are drawn: a stacked
# bar for every document in the corpus is unreadable and very slow to render.
max_docs_to_plot = 50
topic_distribution.head(max_docs_to_plot).plot(kind='bar', stacked=True, figsize=(10, 6))
plt.xlabel('Document Index')
plt.ylabel('Topic Proportion')
plt.title('Topic Distribution Across Documents')
plt.show()


## step-3
Analyze Diversity and Determine which websites cover the most diverse range of topics and visualize this information.

# 	Model the events that the news articles are written about

In [None]:
import pandas as pd
import torch
import numpy as np
import random
from urllib.parse import urlparse
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AlbertForSequenceClassification, AlbertTokenizer, AdamW, get_linear_schedule_with_warmup
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder



In [None]:
def extract_main_domain(url):
    """Return the lowercased host of `url` with any leading 'www.' removed.

    Uses the parsed `hostname`, so ports and user-info are not part of the
    result, and retries scheme-less input ("site.com/page") as a network
    location. Non-string input and URLs without a host return None.
    """
    if not isinstance(url, str):
        return None
    candidate = url.strip()
    try:
        host = urlparse(candidate).hostname
        if not host and '://' not in candidate:
            # No scheme given: the leading '//' makes urlparse read a netloc
            host = urlparse('//' + candidate).hostname
    except ValueError:
        # urlparse rejects malformed network locations
        return None
    if not host:
        return None
    # Remove the 'www.' prefix if it exists
    return host[4:] if host.startswith('www.') else host

# Apply the function to extract the main domain from the URL column
da_rating['Domain'] =da_rating['url'].apply(extract_main_domain)


Even with the data split into batches of size 32, this is hard to run because I am using a machine with 8 GB of RAM. So I have just put the code here, and in the future I will make sure I have a laptop capable of running it. Because of that, I can't do any plotting.


In [None]:
import pandas as pd
from transformers import AlbertTokenizer
import torch
from sklearn.preprocessing import LabelEncoder
import random

# da_rating is expected to be the already-cleaned DataFrame.
# Draw a reproducible random sample of 100 rows.
sampled_data = da_rating.sample(n=100, random_state=42)

# Model inputs (article text) and targets (category) for the sample
texts = sampled_data['content'].tolist()
labels = sampled_data['category'].tolist()

# ALBERT tokenizer matching the checkpoint that is fine-tuned later
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

# Encode all texts at once: every sequence is padded or truncated to 512 tokens
# and returned as PyTorch tensors together with its attention mask
encoded_dict = tokenizer.batch_encode_plus(
    texts,
    add_special_tokens=True,
    max_length=512,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)
input_ids, attention_masks = encoded_dict['input_ids'], encoded_dict['attention_mask']

# Map category names to integer class ids and wrap them in a tensor
label_encoder = LabelEncoder()
encoded_labels = torch.tensor(label_encoder.fit_transform(labels))

# Sanity-check sizes before building datasets
print(f"Number of samples: {len(texts)}")
print(f"Input IDs shape: {input_ids.shape}")
print(f"Attention Masks shape: {attention_masks.shape}")
print(f"Encoded Labels shape: {encoded_labels.shape}")

# input_ids, attention_masks and encoded_labels are now ready for training or testing




In [None]:
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Split ids, masks and labels into training and validation sets in one call,
# so all three are partitioned by the same shuffled indices
(
    train_inputs, val_inputs,
    train_masks, val_masks,
    train_labels, val_labels,
) = train_test_split(
    input_ids, attention_masks, encoded_labels, test_size=0.1, random_state=42
)

# Bundle each split as (input_ids, attention_mask, label) triples
train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
val_dataset = TensorDataset(val_inputs, val_masks, val_labels)


In [None]:
# Training batches are drawn in random order (shuffle=True makes DataLoader use a RandomSampler)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)  # batch size is adjustable

# Validation batches keep the dataset order (DataLoader's default sequential sampling)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)  # batch size is adjustable


In [None]:
from transformers import AlbertForSequenceClassification

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ALBERT with a classification head sized to the number of encoded categories;
# attention maps and hidden states are not returned
model = AlbertForSequenceClassification.from_pretrained(
    'albert-base-v2',
    num_labels=len(label_encoder.classes_),
    output_attentions=False,
    output_hidden_states=False,
)

# Move the model's parameters to the selected device
model.to(device)


In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

# Number of passes over the training set
epochs = 3

# One optimizer/scheduler step is taken per training batch
total_steps = epochs * len(train_dataloader)

# Optimizer over all model parameters
# NOTE(review): AdamW imported from transformers is reportedly deprecated in
# favour of torch.optim.AdamW - confirm against the installed version
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Learning rate decays linearly to zero over total_steps, with no warm-up phase
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Standalone cross-entropy loss object
loss_fn = torch.nn.CrossEntropyLoss()

In [None]:
# Fine-tune for `epochs` passes; each pass trains on every batch and then
# reports loss and accuracy on the validation set.
# The loop header must be live code: with it commented out, the indented body
# below is a syntax error and `epoch` is never defined.
for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    print('-' * 10)

    # Training
    model.train()
    total_train_loss = 0

    for step, batch in enumerate(train_dataloader):
        # Each batch is an (input_ids, attention_mask, labels) triple
        batch_inputs = batch[0].to(device)
        batch_masks = batch[1].to(device)
        batch_labels = batch[2].to(device)

        # Clear gradients left over from the previous step
        model.zero_grad()

        # Passing labels makes the model compute the loss itself
        outputs = model(
            input_ids=batch_inputs,
            attention_mask=batch_masks,
            labels=batch_labels
        )

        loss = outputs.loss
        total_train_loss += loss.item()

        loss.backward()

        # Clip the norm of the gradients to 1.0 to avoid exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f'Average Training Loss: {avg_train_loss:.2f}')

    # Validation
    model.eval()
    total_val_loss = 0
    correct_predictions = 0

    # No gradients are needed while evaluating
    with torch.no_grad():
        for batch in val_dataloader:
            batch_inputs = batch[0].to(device)
            batch_masks = batch[1].to(device)
            batch_labels = batch[2].to(device)

            outputs = model(
                input_ids=batch_inputs,
                attention_mask=batch_masks,
                labels=batch_labels
            )

            loss = outputs.loss
            logits = outputs.logits

            total_val_loss += loss.item()

            # Predicted class = index of the largest logit
            preds = torch.argmax(logits, dim=1).flatten()
            correct_predictions += (preds == batch_labels).cpu().numpy().sum()

    avg_val_loss = total_val_loss / len(val_dataloader)
    val_accuracy = correct_predictions / len(val_labels)
    print(f'Validation Loss: {avg_val_loss:.2f}')
    print(f'Validation Accuracy: {val_accuracy:.2f}')


In [None]:
import pandas as pd
from urllib.parse import urlparse

# Sample up to 500 news articles from da_rating (fewer if the frame is smaller,
# since sampling more rows than exist without replacement raises)
# NOTE(review): this uses the same random_state as the 100-row training sample,
# so the two samples presumably overlap - verify before reading the
# predictions as an evaluation
sampled_articles = da_rating.sample(n=min(500, len(da_rating)), random_state=42)

# Extract the main domain from URLs
def extract_main_domain(url):
    """Return the first label of the URL's host, ignoring a leading 'www.'.

    For "https://www.bbc.com/news" this gives "bbc"; without stripping the
    prefix the first label of almost every URL would be "www". The host comes
    from the parsed `hostname` (lowercased, no port or user-info), and
    scheme-less input is retried as a network location.

    Returns "Unknown" for non-string input or when no host can be determined.
    """
    if not isinstance(url, str):
        return "Unknown"
    candidate = url.strip()
    try:
        host = urlparse(candidate).hostname
        if not host and '://' not in candidate:
            # No scheme given: the leading '//' makes urlparse read a netloc
            host = urlparse('//' + candidate).hostname
    except ValueError:
        # urlparse rejects malformed network locations
        return "Unknown"
    if not host:
        return "Unknown"
    if host.startswith('www.'):
        host = host[4:]
    return host.split('.')[0]

sampled_articles['domain'] = sampled_articles['url'].map(extract_main_domain)


In [None]:
# Extract texts for prediction
texts_to_predict = sampled_articles['content'].tolist()

# Tokenize the texts
encoded_dict = tokenizer.batch_encode_plus(
    texts_to_predict,
    add_special_tokens=True,
    max_length=512,
    padding='max_length',
    return_attention_mask=True,
    return_tensors='pt',
    truncation=True
)

# Keep the full tensors on the CPU; only one batch at a time is moved to `device`
input_ids = encoded_dict['input_ids']
attention_masks = encoded_dict['attention_mask']

# Make predictions in small batches. Sending every sampled article through the
# model in a single forward pass needs memory proportional to the whole sample.
prediction_batch_size = 16
model.eval()
batch_predictions = []
with torch.no_grad():
    for start in range(0, input_ids.size(0), prediction_batch_size):
        stop = start + prediction_batch_size
        outputs = model(
            input_ids=input_ids[start:stop].to(device),
            attention_mask=attention_masks[start:stop].to(device),
        )
        # Predicted class = index of the largest logit; collected on the CPU
        batch_predictions.append(torch.argmax(outputs.logits, dim=1).cpu())
predictions = torch.cat(batch_predictions)

# Decode labels
predicted_labels = label_encoder.inverse_transform(predictions.numpy())
sampled_articles['predicted_category'] = predicted_labels


In [None]:
import matplotlib.pyplot as plt

# Count how many sampled articles fall into each (domain, predicted category) pair
import seaborn as sns

domain_category_groups = sampled_articles.groupby(['domain', 'predicted_category'])
event_counts = domain_category_groups.size().reset_index(name='counts')

# Reshape to a domain x category grid; pairs that never occur become 0
pivot_table = event_counts.pivot(index='domain', columns='predicted_category', values='counts').fillna(0)

# Annotated heatmap of the grid
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(pivot_table, cmap="YlGnBu", annot=True, fmt="g", ax=ax)
ax.set_title('Event Counts by Media Domain')
ax.set_xlabel('Event Category')
ax.set_ylabel('Media Domain')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


●	How many events are covered in the data?

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np

# Turn the article text into TF-IDF vectors (English stopwords removed)
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(sampled_articles['content'])

# Fit k-means with a fixed seed
num_clusters = 5  # Example number of clusters
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(X)
clusters = kmeans.labels_  # cluster id assigned to each fitted row

# Store each article's cluster id alongside it
sampled_articles['cluster'] = clusters


In [None]:
# Parse the publication timestamps
sampled_articles['published_at'] = pd.to_datetime(sampled_articles['published_at'])

# Earliest publication time within each cluster
published_by_cluster = sampled_articles.groupby('cluster')['published_at']
earliest_reports = published_by_cluster.min()

# Keep only the articles published at their cluster's earliest time
earliest_reports_df = sampled_articles.merge(earliest_reports, on=['cluster', 'published_at'])

# One row per (cluster, domain, time): the sites that reported each event first
site_columns = ['cluster', 'domain', 'published_at']
earliest_reports_sites = earliest_reports_df[site_columns].drop_duplicates()
print(earliest_reports_sites)


In [None]:
# Number of sampled articles per cluster, largest first
cluster_labels = sampled_articles['cluster']
event_reporting_counts = cluster_labels.value_counts()
print("Events with the highest reporting:")
print(event_reporting_counts)


In [None]:
# Pivot table to create a matrix of event counts by domain
# (rows = news sites, columns = event clusters)
pivot_table = sampled_articles.pivot_table(index='domain', columns='cluster', values='article_id', aggfunc='count', fill_value=0)

# Compute the correlation matrix between news sites.
# DataFrame.corr() correlates columns, so the table is transposed to put the
# sites in the columns; without the transpose the result compares clusters.
# Sites whose counts are identical across all clusters have zero variance and
# come out as NaN.
correlation_matrix = pivot_table.T.corr()

# Visualization
import seaborn as sns
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, cmap="coolwarm", annot=True)
plt.title('Correlation Between News Sites Reporting Events')
plt.xlabel('News Sites')
plt.ylabel('News Sites')
plt.show()
