In [None]:
import pandas as pd
import re
from bs4 import BeautifulSoup
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import swifter

In [None]:
# Load the full dataset (this reads ~200K articles)
df = pd.read_parquet('https://storage.googleapis.com/msca-bdp-data-open/news_final_project/news_final_project.parquet', engine='pyarrow')

# Preview the shape and the first few rows
print("Dataset shape:", df.shape)
df.head()

In [None]:
# Basic info
df.info()

In [None]:
# Check for missing values
df.isnull().sum()

In [None]:
df.head()

In [None]:
# Check the range of article dates
df['date'] = pd.to_datetime(df['date'], errors='coerce')
print("Date Range:", df['date'].min(), "to", df['date'].max())

# Article length (number of characters in text)
df['text_length'] = df['text'].astype(str).apply(len)
df['text_length'].describe()

In [None]:
# Check language distribution
df['language'].value_counts()

In [None]:
# Find empty or very short articles.
# `sample(5)` raises ValueError when fewer than five rows match, so the sample
# size is capped at the number of matches; the seed keeps re-runs identical.
short_articles = df.loc[df['text_length'] < 50, ['title', 'text', 'text_length']]
short_articles.sample(n=min(5, len(short_articles)), random_state=42)

In [None]:
# Plot Article Count Over Time
import matplotlib.pyplot as plt

# Group by month and count
df['year_month'] = df['date'].dt.to_period('M')
article_counts = df.groupby('year_month').size()

# Plot
article_counts.plot(kind='line', figsize=(12, 6), title='Article Count Over Time')
plt.ylabel('Number of Articles')
plt.xlabel('Year-Month')
plt.grid(True)
plt.tight_layout()
plt.show()

### Data Cleaning

In [None]:
# Convert date column to datetime
df['date'] = pd.to_datetime(df['date'], errors='coerce')

In [None]:
# Drop duplicates
df = df.drop_duplicates(subset=['title', 'text'])

In [None]:
# Remove short articles (less than 50 characters)
df['text_length'] = df['text'].astype(str).apply(len)
df = df[df['text_length'] >= 50]

In [None]:
def clean_text(text):
    """Return `text` with markup, links and non-letter noise removed.

    Steps, in order: strip HTML with BeautifulSoup, collapse whitespace, drop
    URLs and e-mail addresses, delete every character that is not an ASCII
    letter, whitespace or one of . , ! ? ', then collapse whitespace again.

    Args:
        text: Raw article body or title. None and NaN are treated as empty;
            any other non-string value is converted with str().

    Returns:
        The cleaned string ('' for missing input).

    Note:
        Digits and hyphens are removed, so a token such as "GPT-4" comes out
        as "GPT".
    """
    # Missing values cannot be parsed by BeautifulSoup; treat them as empty text.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    # Remove repeated whitespace
    text = re.sub(r'\s+', ' ', text)
    # Remove URLs ("http\S+" already covers https, and there are no anchors
    # in the pattern, so no flags are needed)
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove emails
    text = re.sub(r'\S+@\S+', '', text)
    # Remove special characters & digits (except basic punctuation)
    text = re.sub(r"[^a-zA-Z.,!?'\s]", '', text)
    # Normalize whitespace again
    return re.sub(r'\s+', ' ', text).strip()

In [None]:
df['text'] = df['text'].swifter.apply(clean_text)
df['title'] = df['title'].swifter.apply(clean_text)

In [None]:
# Reset index
df = df.reset_index(drop=True)

In [None]:
# Drop helper column
df.drop(columns=['text_length'], inplace=True)

In [None]:
# Show cleaned shape and preview
print("Cleaned dataset shape:", df.shape)
df.head()

### Topic Modeling with BERTopic

In [None]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

In [None]:
import torch
print(torch.backends.mps.is_available())

In [None]:
# Load transformer model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# Create BERTopic model using the custom embedding model
# NOTE(review): no random seed is passed here, so topic ids may differ between
# runs — TODO confirm. The hand-written topic-id -> industry mapping further
# down (`topic_to_industry`) assumes the ids stay stable.
topic_model = BERTopic(embedding_model=embedding_model, verbose=True)

In [None]:
# Extract documents
docs = df['text'].tolist()

In [None]:
topics, probs = topic_model.fit_transform(docs)

In [None]:
# Add topic column to df
df['topic'] = topics

In [None]:
industry_keywords = {
    "Finance": [
        "stock", "market", "investment", "nasdaq", "bank", "currency", "crypto", "equity", "fintech", "trading",
        "shares", "bonds", "interest rate", "asset", "portfolio", "hedge fund", "exchange", "decentralized finance",
        "payment", "financial services", "robo-advisor", "wealth management", "finra", "securities", "credit",
        "valuation", "ipo", "debt", "derivatives", "risk management"],
    "Healthcare": [
        "patient", "hospital", "diagnosis", "therapy", "healthcare", "clinical", "treatment", "medical", "doctor",
        "nurse", "biotech", "covid", "vaccine", "fda", "pharma", "drug", "genomics", "public health", "telemedicine",
        "health tech", "clinical trial", "health data", "ehr", "radiology", "diagnostic", "health insurance",
        "mental health", "imaging", "medtech", "robotic surgery", "cancer", "prescription", "surgeon",
        "hospitalization", "outpatient"],
    "Education": [
        "student", "school", "university", "curriculum", "learning", "teaching", "classroom", "online course",
        "edtech", "professor", "exam", "degree", "certificate", "elearning", "tutorial", "lecture", "academic",
        "homework", "syllabus", "tutor", "k-12", "mooc", "higher education", "remote learning", "grade",
        "assignment", "lms", "grading", "student success", "exam proctoring", "enrollment", "scholarship",
        "textbook", "instructional design"],
    "Media / News": [
        "press", "newswire", "reporter", "headline", "journalist", "publication", "article", "newsletter", "media",
        "cnn", "bbc", "reuters", "bloomberg", "nytimes", "forbes", "guardian", "washington post", "news release",
        "media outlet", "broadcast", "coverage", "opinion", "editorial", "newsroom", "column", "interview", "breaking news",
        "media strategy", "fact checking", "news curation", "media bias", "breaking coverage", "subscription model",
        "content syndication", "media partnership"],
    "Retail / Consumer": [
        "ecommerce", "retail", "shopping", "consumer", "brand", "product", "shop", "cart", "checkout", "amazon",
        "fashion", "warehouse", "store", "grocery", "inventory", "supply chain", "logistics", "promotion", "sale",
        "payment", "delivery", "online store", "customer", "review", "discount", "return policy", "order", "receipt",
        "point of sale", "digital shelf", "retail analytics", "coupon", "in-store", "loyalty program", "merchandise"],
    "Transportation": [
        "vehicle", "car", "fleet", "autonomous", "driver", "road", "traffic", "logistics", "delivery", "rideshare",
        "rail", "bus", "highway", "driving", "mobility", "EV", "charging station", "tire", "fuel", "infrastructure",
        "navigation", "transport", "transit", "freight", "commute"],
    "Legal / Regulatory": [
        "law", "court", "regulation", "rights", "compliance", "privacy", "lawsuit", "litigation", "policy",
        "jurisdiction", "ethics", "intellectual property", "terms", "legal framework", "gdpr", "contract",
        "attorney", "subpoena", "data protection", "anti-trust", "legal challenge", "enforcement"],
    "Agriculture": [
        "farming", "crop", "soil", "harvest", "livestock", "irrigation", "pesticide", "fertilizer", "drought",
        "tractor", "yield", "planting", "agriculture", "greenhouse", "weather", "agribusiness", "seed",
        "barn", "farmer", "organic", "sustainability", "rural", "cultivation", "field"],
    "Real Estate / Housing": [
        "property", "rent", "mortgage", "house", "apartment", "zillow", "landlord", "realtor", "tenant",
        "residential", "housing", "lease", "broker", "condo", "foreclosure", "neighborhood", "zoning",
        "renovation", "realty", "mortgage rate", "listing", "property tax", "homeowner"],
    "Travel / Tourism": [
        "hotel", "flight", "vacation", "airline", "airport", "travel", "tourism", "destination", "booking",
        "resort", "itinerary", "passport", "trip", "luggage", "cruise", "visa", "guidebook", "tour",
        "excursion", "hospitality", "accommodation", "check-in"]
}


In [None]:
# Label topics using keyword matching
def label_topic(keywords, keyword_map=None):
    """Pick the industry whose keyword list overlaps most with a topic's terms.

    Returning the first industry with any hit would make the result depend on
    dictionary order (a topic containing "payment" would always be Finance even
    if every other term points to retail), so the industry with the largest
    number of matching keywords wins instead. Ties go to the industry listed
    first in the mapping.

    Args:
        keywords: Iterable of topic terms (compared case-insensitively).
        keyword_map: Optional {industry: [keyword, ...]} mapping. Defaults to
            the module-level `industry_keywords`.

    Returns:
        The best-matching industry name, or "Other" when nothing matches.
    """
    if keyword_map is None:
        keyword_map = industry_keywords

    topic_terms = {str(term).lower() for term in keywords}

    best_label, best_hits = "Other", 0
    for industry, kw_list in keyword_map.items():
        hits = sum(1 for kw in kw_list if kw.lower() in topic_terms)
        # Strict '>' keeps the earliest industry on ties.
        if hits > best_hits:
            best_label, best_hits = industry, hits
    return best_label

In [None]:
# Label every BERTopic topic from its ten highest-ranked terms
topics_keywords = topic_model.get_topics()
topic_to_label = {
    topic_id: label_topic([term.lower() for term, _ in ranked_terms[:10]])
    for topic_id, ranked_terms in topics_keywords.items()
}

# Attach the label of each article's topic
df['topic_label'] = df['topic'].map(topic_to_label)

In [None]:
# set category order for plotting
# NOTE: the category list holds only the industry names, so rows that
# label_topic() marked as "Other" are not a valid category and become NaN
# here. Later cells that filter on `topic_label.notnull()` therefore drop them.
df['topic_label'] = pd.Categorical(
    df['topic_label'],
    categories=list(industry_keywords.keys()),
    ordered=True
)

df[['title', 'topic', 'topic_label']].head()

In [None]:
df[df['topic_label'].notnull()].shape

In [None]:
topic_model.visualize_topics()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Count articles per label, largest first
topic_counts = df['topic_label'].value_counts().sort_values(ascending=False)

# `topic_label` is an ordered categorical; when handed a categorical axis
# seaborn lays the bars out in category order rather than in the order of the
# sorted counts. Plain string labels plus an explicit `order` keep the bars
# sorted by article count.
topic_names = topic_counts.index.astype(str)

# Plot
plt.figure(figsize=(10, 6))
sns.barplot(
    x=topic_counts.values,
    y=topic_names,
    order=topic_names.tolist(),
    palette="tab10",
)
plt.title("Article Count by Topic Label")
plt.xlabel("Number of Articles")
plt.ylabel("Topic Label")
plt.tight_layout()
plt.show()

In [None]:
# Ensure 'date' is datetime
df['date'] = pd.to_datetime(df['date'])
df['year_month'] = df['date'].dt.to_period('M').astype(str)

# Count articles per (month, topic label).
# `topic_label` is categorical, and the groupby default for `observed` differs
# between pandas versions; it is pinned so every label keeps a column (with
# zero counts) regardless of the installed version.
topic_timeline = (
    df.groupby(['year_month', 'topic_label'], observed=False)
    .size()
    .reset_index(name='count')
)
timeline_pivot = topic_timeline.pivot(index='year_month', columns='topic_label', values='count').fillna(0)

# Plot
timeline_pivot.plot(figsize=(14, 6), marker='o')
plt.title("Articles Over Time by Topic Label")
plt.xlabel("Year-Month")
plt.ylabel("Number of Articles")
plt.xticks(rotation=45)
plt.legend(title="Topic Label", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

### Topic Modeling with LDA

In [None]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora
from gensim.models import LdaModel
from gensim.matutils import Sparse2Corpus
from tqdm import tqdm

In [None]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

In [None]:
texts = df['text'].tolist()

In [None]:
# Basic text preprocessing for LDA.
# The tokenizer has its own name so this cell does not redefine the HTML-cleaning
# `clean_text` from the Data Cleaning section; with the same name, re-running
# the cleaning cell after this one would call the tokenizer instead and store
# token lists in df['text'].
def tokenize_for_lda(text):
    """Split `text` into lower-case alphabetic tokens for LDA.

    Non-letters are replaced by spaces, then tokens found in the module-level
    `stop_words` set or with three characters or fewer are dropped.

    Args:
        text: A cleaned article string.

    Returns:
        List of remaining tokens, in document order.
    """
    tokens = re.sub(r'[^a-zA-Z]', ' ', text).lower().split()
    return [token for token in tokens if token not in stop_words and len(token) > 3]

# Tokenize all texts
processed_texts = [tokenize_for_lda(doc) for doc in tqdm(texts)]

In [None]:
# Create dictionary and corpus
dictionary = corpora.Dictionary(processed_texts)
corpus = [dictionary.doc2bow(text) for text in processed_texts]

In [None]:
# Train LDA model
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,  # Set number of topics
    passes=10,
    random_state=42,
    per_word_topics=True
)

In [None]:
# Print top words for each topic
for idx, topic in lda_model.print_topics(num_topics=10, num_words=10):
    print(f"Topic {idx}:\n{topic}\n")

In [None]:
# Assign dominant topic to each document
def get_dominant_topic(bow):
    """Return the id of the most probable LDA topic for one bag-of-words doc.

    Queries the module-level `lda_model`; yields -1 when the model returns an
    empty topic list for the document.
    """
    distribution = lda_model.get_document_topics(bow)
    if not distribution:
        return -1
    best_topic_id, _ = max(distribution, key=lambda pair: pair[1])
    return best_topic_id

df['lda_topic'] = [get_dominant_topic(bow) for bow in corpus]

In [None]:
# Create a dictionary mapping topic number to industry
topic_industry_mapping = {
    0: "Generative AI & Tech News",
    1: "Healthcare & Medical Imaging",
    2: "Market Research & Regional Industry News",
    3: "AI, Data Science & Cybersecurity",
    4: "Indian Business & Tech Updates",
    5: "Media Releases & Tech Platforms",
    6: "Local News & Weather Reports",
    7: "Financial Markets & Services",
    8: "Entertainment & Celebrity News",
    9: "Consumer Tech & Product Reviews"
}

In [None]:
# Map topic to industry
df['lda_industry'] = df['lda_topic'].map(topic_industry_mapping)
df.head()

In [None]:
# Prepare BERTopic distribution (smallest count first)
bertopic_counts = df['topic_label'].value_counts().sort_values(ascending=True)
# `topic_label` is categorical; plain string labels plus an explicit `order`
# keep seaborn from re-arranging the bars into category order.
bertopic_names = bertopic_counts.index.astype(str)

# Prepare LDA distribution. The index holds the mapped industry names (not the
# numeric topic ids), so sort_index() orders them alphabetically.
lda_counts = df['lda_industry'].value_counts().sort_index()

# Plot side-by-side
fig, axs = plt.subplots(1, 2, figsize=(14, 6))

# BERTopic plot
sns.barplot(
    x=bertopic_counts.values,
    y=bertopic_names,
    order=bertopic_names.tolist(),
    ax=axs[0],
    palette="tab10",
)
axs[0].set_title("BERTopic: Articles per Topic Label")
axs[0].set_xlabel("Number of Articles")
axs[0].set_ylabel("Topic Label")

# LDA plot
sns.barplot(x=lda_counts.values, y=lda_counts.index, ax=axs[1], palette="tab10")
axs[1].set_title("LDA: Articles per Topic")
axs[1].set_xlabel("Number of Articles")
# The bars are labelled with industry names, not topic ids.
axs[1].set_ylabel("LDA Industry")

plt.tight_layout()
plt.show()

In [None]:
# Ensure datetime column exists
df['date'] = pd.to_datetime(df['date'])
df['year_month'] = df['date'].dt.to_period('M').astype(str)

# Group by BERTopic topic label
bertopic_time = df.groupby(['year_month', 'topic_label']).size().reset_index(name='count')
ber_pivot = bertopic_time.pivot(index='year_month', columns='topic_label', values='count').fillna(0)

# Group by LDA topic
lda_time = df.groupby(['year_month', 'lda_industry']).size().reset_index(name='count')
lda_time['lda_industry'] = lda_time['lda_industry'].apply(lambda x: f"{x}")
lda_pivot = lda_time.pivot(index='year_month', columns='lda_industry', values='count').fillna(0)

# Plot
fig, axs = plt.subplots(2, 1, figsize=(14, 10), sharex=True)

ber_pivot.plot(ax=axs[0], marker='o')
axs[0].set_title("BERTopic: Article Volume Over Time")
axs[0].set_ylabel("Count")
axs[0].legend(loc='upper left', bbox_to_anchor=(1, 1))

lda_pivot.plot(ax=axs[1], marker='o')
axs[1].set_title("LDA: Article Volume Over Time")
axs[1].set_ylabel("Count")
axs[1].legend(loc='upper left', bbox_to_anchor=(1, 1))

plt.xlabel("Year-Month")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

### Another Method with BERTopic

In [None]:
topic_info = topic_model.get_topic_info()

In [None]:
topic_info['Representation'].head(10)

In [None]:
for i in range(10):
    print(f"\nTopic {i} keywords:")
    print(topic_model.get_topic(i))

In [None]:
df= df[df['topic'] != -1]

In [None]:
# Get the top 10 topic IDs by count
top_10_topics = df['topic'].value_counts().head(10).index.tolist()
print("Top 10 topic IDs:", top_10_topics)

In [None]:
topic_to_industry = {
    0: "Transportation / Tech News",
    1: "Media / News",
    2: "AI Hardware",
    3: "Finance",
    4: "Education",
    5: "Media / Community Tools",
    6: "Cybersecurity",
    7: "Tech Platforms",
    8: "Finance",
    9: "Tech Platforms"
}

In [None]:
# Only keep rows with one of the top 10 topics
df_top10 = df[df['topic'].isin(top_10_topics)].copy()

# Map topic to industry
df_top10['top_industry'] = df_top10['topic'].map(topic_to_industry)

# Preview
df_top10.head()

In [None]:
# Count articles per industry, sorted ascending: barh draws the first row at
# the bottom, so the largest industry ends up on top.
industry_counts = df_top10['top_industry'].value_counts().sort_values(ascending=True)

# Horizontal bar plot (largest on top)
industry_counts.plot(kind='barh', title='Top Industries by Article Count', figsize=(10,6))
plt.xlabel('Number of Articles')
plt.tight_layout()
plt.show()

In [None]:
# Group by time + topic label
topic_timeline = df_top10.groupby(['year_month', 'top_industry']).size().reset_index(name='count')
timeline_pivot = topic_timeline.pivot(index='year_month', columns='top_industry', values='count').fillna(0)

# Plot
timeline_pivot.plot(figsize=(14, 6), marker='o')
plt.title("BERTopic: Articles Over Time by Topic 10 Label")
plt.xlabel("Year-Month")
plt.ylabel("Number of Articles")
plt.xticks(rotation=45)
plt.legend(title="Topic 10 Label", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
df_top10.shape

### NMF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [None]:
# Step 1: TF-IDF vectorization
# NOTE: `df` was filtered to drop BERTopic outliers (topic == -1) in an earlier
# cell, so NMF is fitted on that subset rather than on the full cleaned corpus.
# max_df=0.95 / min_df=10: ignore terms found in more than 95% of the documents
# or in fewer than 10 documents.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=10, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(df['text'])

In [None]:
# Step 2: Apply NMF
nmf_model = NMF(n_components=10, random_state=42)
W = nmf_model.fit_transform(tfidf)
H = nmf_model.components_

In [None]:
# Step 3: Display top words per topic
feature_names = tfidf_vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(H):
    top_keywords = [feature_names[i] for i in topic.argsort()[:-11:-1]]
    print(f"Topic {topic_idx}: {', '.join(top_keywords)}")

In [None]:
NMF_topics = {
    0: "Media / News (with AI flavor)",
    1: "Noise / Irrelevant (Image Metadata)",
    2: "Retail / Consumer & Transportation",
    3: "Finance",
    4: "Media / News Distribution",
    5: "Tech Platforms & Enterprise AI",
    6: "News & Utility Info",
    7: "Finance (Market Research & International Indices)",
    8: "Finance (Indian Market Emphasis)",
    9: "Tech Platforms & Consumer Devices"
}

### Entity Extraction (NER)

In [None]:
import spacy
from tqdm import tqdm

In [None]:
# Load spaCy's English model
nlp = spacy.load("en_core_web_lg")

In [None]:
# Keep only the articles that received an industry label.
# `.copy()` makes `df_final` an independent frame: later cells add columns to
# it, which on a bare slice of `df` triggers SettingWithCopyWarning.
df_final = df[df['topic_label'].notnull()].copy()
df_final.head()

In [None]:
# Function to extract named entities
def extract_entities(text):
    """Run the spaCy pipeline on `text` and bucket its named entities.

    Returns:
        pd.Series of three lists, in document order: ORG entity texts, PERSON
        entity texts, and GPE/LOC entity texts.
    """
    orgs, people, locations = [], [], []
    for ent in nlp(text).ents:
        label = ent.label_
        if label == "ORG":
            orgs.append(ent.text)
        elif label == "PERSON":
            people.append(ent.text)
        elif label in ("GPE", "LOC"):
            locations.append(ent.text)
    return pd.Series([orgs, people, locations])

In [None]:
# Enable progress bar
tqdm.pandas()

In [None]:
# Apply to the full article text
df_final[['organizations', 'people', 'locations']] = df_final['text'].progress_apply(extract_entities)

In [None]:
df_final[['title', 'organizations', 'people', 'locations']].head()

#### Organizations

In [None]:
from collections import Counter

def normalize_org(name):
    """Map known spelling variants of an organization to one canonical name.

    Matching is exact and case-sensitive; any name without an entry is
    returned unchanged.
    """
    canonical_names = {
        'Open AI': 'OpenAI',
        'Meta Platforms': 'Meta',
        'Amazon.com': 'Amazon',
        'Microsoft Corporation': 'Microsoft',
        'NVIDIA': 'Nvidia',
        'GPT': 'ChatGPT',
    }
    return canonical_names.get(name, name)

In [None]:
# Month key for every article
df_final['year_month'] = pd.to_datetime(df_final['date']).dt.to_period('M').astype(str)

# One (month, normalized organization) record per mention
month_org_pairs = zip(df_final['year_month'], df_final['organizations'])
records = [
    (month, normalize_org(org))
    for month, orgs in tqdm(month_org_pairs, total=len(df_final))
    if isinstance(orgs, list)
    for org in orgs
]

# Long table of mentions
df_org_mentions = pd.DataFrame(records, columns=['month', 'organization'])

# "AI" is tagged as an organization but is not one; leave it out
not_ai = df_org_mentions['organization'] != 'AI'
df_org_mentions = df_org_mentions[not_ai]

# Ten most-mentioned organizations overall
top_orgs = df_org_mentions['organization'].value_counts().head(10).index.tolist()

# Restrict the mentions to those organizations
in_top_orgs = df_org_mentions['organization'].isin(top_orgs)
df_top_orgs = df_org_mentions[in_top_orgs]

# Monthly mention counts, one column per organization
monthly_org_counts = (
    df_top_orgs.groupby(['month', 'organization']).size().reset_index(name='count')
)
org_trend = monthly_org_counts.pivot(
    index='month', columns='organization', values='count'
).fillna(0)

In [None]:
# Flatten and normalize organization mentions
all_orgs = [normalize_org(org) for sublist in df_final['organizations'] if isinstance(sublist, list) for org in sublist]

# Remove "AI" from the list before counting
filtered_orgs = [org for org in all_orgs if org != 'AI']

# Count top 10 without "AI"
top_org_counts = Counter(filtered_orgs).most_common(10)
org_names, org_counts = zip(*top_org_counts)

# Plot again
plt.figure(figsize=(10, 5))
plt.barh(org_names[::-1], org_counts[::-1], color='steelblue')
plt.xlabel("Mention Count")
plt.title("Top 10 Organizations in AI News (Excluding 'AI')")
plt.tight_layout()
plt.show()

In [None]:
# DataFrame.plot opens its own figure, so a preceding plt.figure() call only
# leaves an extra, empty figure in the output. Let pandas create the single
# figure and configure it through the returned Axes.
ax = org_trend.plot(marker='o', figsize=(16, 6))
ax.set_title("Top 10 Organizations Mentioned Over Time")
ax.set_xlabel("Year-Month")
ax.set_ylabel("Number of Mentions")
ax.tick_params(axis='x', rotation=45)
ax.grid(True)
ax.legend(title="Organization", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

#### People

In [None]:
# Normalization function
def normalize_person(name):
    """Collapse common variants of a person's name to one canonical form.

    The input is stripped, then checked case-insensitively against an ordered
    rule list. Most rules fire when the surname appears anywhere in the name;
    'biden' and 'claude' only fire when they are the entire name. Names that
    match no rule are returned stripped but otherwise unchanged.
    """
    name = name.strip()
    lowered = name.lower()

    # (needle, must_equal_whole_name, canonical form), in priority order
    rules = (
        ('altman', False, 'Sam Altman'),
        ('musk', False, 'Elon Musk'),
        ('biden', True, 'Joe Biden'),
        ('trump', False, 'Donald Trump'),
        ('mackintosh', False, 'Phil Mackintosh'),
        ('claude', True, 'Claude (Anthropic)'),
        ('huang', False, 'Jensen Huang'),
    )
    for needle, whole_name, canonical in rules:
        hit = (lowered == needle) if whole_name else (needle in lowered)
        if hit:
            return canonical
    return name

In [None]:
from collections import Counter

# Flatten and normalize people
all_people = [
    normalize_person(p)
    for sublist in df_final['people'] if isinstance(sublist, list)
    for p in sublist
]

# Get top 10 people
top_people_counts = Counter(all_people).most_common(10)
people_names, people_counts = zip(*top_people_counts)

In [None]:
plt.figure(figsize=(10, 5))
plt.barh(people_names[::-1], people_counts[::-1], color='salmon')
plt.xlabel("Mention Count")
plt.title("Top 10 People in AI News")
plt.tight_layout()
plt.show()

In [None]:
# Add normalized mentions to timeline records
df_final['year_month'] = pd.to_datetime(df_final['date']).dt.to_period('M').astype(str)

records = []
for i, row in df_final.iterrows():
    if isinstance(row['people'], list):
        for p in row['people']:
            person = normalize_person(p)
            records.append((row['year_month'], person))

# Convert to DataFrame
df_people_mentions = pd.DataFrame(records, columns=['month', 'person'])

# Filter to top 10
df_people_mentions = df_people_mentions[df_people_mentions['person'].isin(people_names)]

# Group and pivot
trend_people = (
    df_people_mentions.groupby(['month', 'person'])
    .size()
    .reset_index(name='count')
    .pivot(index='month', columns='person', values='count')
    .fillna(0)
)

In [None]:
trend_people.plot(marker='o', figsize=(14, 6))
plt.title("Mentions of Top People in AI News Over Time")
plt.xlabel("Year-Month")
plt.ylabel("Number of Mentions")
plt.xticks(rotation=45)
plt.grid(True)
plt.legend(title="Person", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

#### Locations

In [None]:
def normalize_location(loc):
    """Return a canonical, title-cased name for a location mention.

    The mention is stripped and lower-cased, then looked up in a small alias
    table (US / UK / China / EU variants). Anything not in the table is
    returned title-cased.
    """
    key = loc.strip().lower()
    aliases = {
        'us': 'United States',
        'u.s.': 'United States',
        'u.s': 'United States',
        'usa': 'United States',
        'united states of america': 'United States',
        'uk': 'United Kingdom',
        'u.k.': 'United Kingdom',
        'england': 'United Kingdom',
        'china mainland': 'China',
        'eu': 'Europe',
        'european union': 'Europe',
    }
    return aliases[key] if key in aliases else key.title()

In [None]:
# Flatten and Normalize All Locations
all_locations = [
    normalize_location(loc)
    for sublist in df_final['locations']
    if isinstance(sublist, list)
    for loc in sublist
]

In [None]:
# Count Top 10 Locations
top_location_counts = Counter(all_locations).most_common(10)
location_names, location_counts = zip(*top_location_counts)
top_locations = list(location_names)

In [None]:
# Bar Plot
plt.figure(figsize=(10, 5))
plt.barh(location_names[::-1], location_counts[::-1], color='mediumseagreen')
plt.xlabel("Mention Count")
plt.title("Top 10 Locations Mentioned in AI News")
plt.tight_layout()
plt.show()

In [None]:
# Prepare date column
df_final['year_month'] = pd.to_datetime(df_final['date']).dt.to_period('M').astype(str)

# Build time-based mention records
records = []
for i, row in df_final.iterrows():
    if isinstance(row['locations'], list):
        for loc in row['locations']:
            norm_loc = normalize_location(loc)
            if norm_loc in top_locations:
                records.append((row['year_month'], norm_loc))

In [None]:
# Convert to DataFrame
df_location_mentions = pd.DataFrame(records, columns=['month', 'location'])

# Group and pivot
trend_locations = (
    df_location_mentions.groupby(['month', 'location'])
    .size()
    .reset_index(name='count')
    .pivot(index='month', columns='location', values='count')
    .fillna(0)
)

In [None]:
# Plot time series
trend_locations.plot(marker='o', figsize=(14, 6))
plt.title("Mentions of Top 10 Locations in AI News Over Time")
plt.xlabel("Year-Month")
plt.ylabel("Number of Mentions")
plt.xticks(rotation=45)
plt.grid(True)
plt.legend(title="Location", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

#### Technologies

In [None]:
tech_aliases = {
    # General concepts
    "large language model": "LLM",
    "llm": "LLM",
    "natural language processing": "NLP",
    "nlp": "NLP",
    "machine learning": "Machine Learning",
    "deep learning": "Deep Learning",
    "neural network": "Neural Network",

    # Specific technologies
    "chatgpt": "ChatGPT",
    "gpt-3": "GPT-3",
    "gpt-4": "GPT-4",
    "gpt-4 turbo": "GPT-4",
    "claude": "Claude",
    "claude 2": "Claude",
    "llama": "LLaMA",
    "llama 2": "LLaMA",
    "bard": "Bard",
    "gemini": "Gemini",
    "transformer": "Transformer",
    "cuda": "CUDA",
    "stable diffusion": "Stable Diffusion",
    "dall-e": "DALL-E",
    "midjourney": "Midjourney",
    "autogpt": "AutoGPT",
    "langchain": "LangChain"
}

In [None]:
def extract_and_map_technologies(text, aliases=None):
    """Return the unified technology names mentioned in `text`.

    Each alias is matched case-insensitively as a whole word or phrase, with
    an optional trailing "s". A plain substring test would also fire inside
    unrelated words ("bard" in "bombarded", "cuda" in "barracuda").

    Args:
        text: Article text. Non-string or empty input yields an empty list.
        aliases: Optional {alias: unified name} mapping. Defaults to the
            module-level `tech_aliases`. Aliases are expected to start and
            end with a letter or digit so the word-boundary anchors apply.

    Returns:
        Alphabetically sorted list of the distinct unified names found.
    """
    if aliases is None:
        aliases = tech_aliases
    if not isinstance(text, str) or not text:
        return []

    text_lower = text.lower()
    found = set()
    for key, unified in aliases.items():
        # \b on both sides anchors the alias to word boundaries; "s?" keeps
        # simple plurals such as "LLMs" or "neural networks" counted.
        pattern = r'\b' + re.escape(key.lower()) + r's?\b'
        if re.search(pattern, text_lower):
            found.add(unified)
    # Sorted so the result does not depend on set iteration order.
    return sorted(found)

df_final['technologies'] = df_final['text'].apply(extract_and_map_technologies)

In [None]:
# Flatten the per-article technology lists into one list of mentions
all_techs = []
for tech_list in df_final['technologies']:
    if isinstance(tech_list, list):
        all_techs.extend(tech_list)

# Ten most frequent technologies: (name, count) pairs and the names alone
tech_counts = Counter(all_techs).most_common(10)
top_techs = [tech_name for tech_name, _ in tech_counts]

In [None]:
# bar plot
tech_names, tech_values = zip(*tech_counts)

plt.figure(figsize=(10, 5))
plt.barh(tech_names[::-1], tech_values[::-1], color='slateblue')
plt.title("Top 10 Mentioned AI Technologies")
plt.xlabel("Mention Count")
plt.tight_layout()
plt.show()

In [None]:
df_final['year_month'] = pd.to_datetime(df_final['date']).dt.to_period('M').astype(str)

tech_records = []
for _, row in df_final.iterrows():
    if isinstance(row['technologies'], list):
        for tech in row['technologies']:
            if tech in top_techs:
                tech_records.append((row['year_month'], tech))

df_tech_mentions = pd.DataFrame(tech_records, columns=['month', 'technology'])

In [None]:
# Pivot the count by month
tech_trend = (
    df_tech_mentions.groupby(['month', 'technology'])
    .size()
    .reset_index(name='count')
    .pivot(index='month', columns='technology', values='count')
    .fillna(0)
)

In [None]:
# Plot
tech_trend.plot(marker='o', figsize=(14, 6))
plt.title("Technology Mentions Over Time")
plt.xlabel("Year-Month")
plt.ylabel("Number of Mentions")
plt.xticks(rotation=45)
plt.grid(True)
plt.legend(title="Technology", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

### Topic-Level Sentiment Analysis

In [None]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer

# Initialize VADER
sia = SentimentIntensityAnalyzer()

In [None]:
def get_sentiment(text):
    """Return VADER's compound polarity score for `text`.

    Uses the module-level `sia` analyzer and returns the 'compound' entry of
    its `polarity_scores` result.
    """
    return sia.polarity_scores(text)['compound']

# Compute sentiment per article
df_final['sentiment'] = df_final['text'].apply(get_sentiment)

In [None]:
topic_sentiment = df_final.groupby('topic')['sentiment'].mean().reset_index()
topic_sentiment.columns = ['topic', 'avg_sentiment']
topic_sentiment = topic_sentiment.sort_values(by='avg_sentiment', ascending=False)

# Optional: Merge back to include topic keywords or industry
df_topic_keywords = topic_model.get_topic_info()[['Topic', 'Name']]
topic_sentiment = topic_sentiment.merge(df_topic_keywords, left_on='topic', right_on='Topic')
topic_sentiment[['topic', 'Name', 'avg_sentiment']]

In [None]:
# Group by time + topic label
df_final['year_month'] = df_final['date'].dt.to_period('M').astype(str)

# `observed` is pinned because `topic_label` is categorical and the groupby
# default for it differs between pandas versions.
topic_sentiment_time = (
    df_final.groupby(['year_month', 'topic_label'], observed=False)['sentiment']
    .mean()
    .reset_index()
)

# Pivot for line plot.
# Months without articles for a label stay NaN. Filling them with 0 would
# present "no data" as a genuinely neutral average sentiment.
pivot_topic_sent = topic_sentiment_time.pivot(index='year_month', columns='topic_label', values='sentiment')

In [None]:
# Define label function
def sentiment_label(score):
    """Bucket a compound sentiment score into one of five labels.

    score >= 0.5            -> 'Positive'
    0.05 <= score < 0.5     -> 'Slightly Positive'
    score <= -0.5           -> 'Negative'
    -0.5 < score <= -0.05   -> 'Slightly Negative'
    anything else (incl. NaN) -> 'Neutral'
    """
    # Guard clauses run from the most to the least extreme threshold on each
    # side, so each check only needs its own lower/upper bound.
    if score >= 0.5:
        return 'Positive'
    if score >= 0.05:
        return 'Slightly Positive'
    if score <= -0.5:
        return 'Negative'
    if score <= -0.05:
        return 'Slightly Negative'
    return 'Neutral'

# Apply to your DataFrame
df_final['sentiment_label'] = df_final['sentiment'].apply(sentiment_label)

In [None]:
sentiment_counts = df_final['sentiment_label'].value_counts()

# Plot overall sentiment distribution
plt.figure(figsize=(8, 4))
sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette='Set2')
plt.title("Sentiment Distribution Across All Articles")
plt.xlabel("Sentiment")
plt.ylabel("Number of Articles")
plt.tight_layout()
plt.show()

In [None]:
df_final['year_month'] = pd.to_datetime(df_final['date']).dt.to_period('M').astype(str)
# Calculate average sentiment per topic per month.
# `observed` is pinned because `topic_label` is categorical and the groupby
# default for it differs between pandas versions.
topic_sentiment_time = (
    df_final.groupby(['year_month', 'topic_label'], observed=False)['sentiment']
    .mean()
    .reset_index()
)
# Months without articles for a label stay NaN so the plot shows a gap.
# Filling them with 0 would present "no data" as a neutral average sentiment.
pivot_topic_sentiment = topic_sentiment_time.pivot(
    index='year_month',
    columns='topic_label',
    values='sentiment'
)

In [None]:
# DataFrame.plot opens its own figure, so a preceding plt.figure() call only
# leaves an extra, empty figure in the output. Let pandas create the single
# figure and configure it through the returned Axes.
ax = pivot_topic_sentiment.plot(marker='o', figsize=(16, 6))
ax.set_title("Topic-Level Sentiment Over Time")
ax.set_xlabel("Year-Month")
ax.set_ylabel("Average Sentiment Score")
ax.tick_params(axis='x', rotation=45)
ax.grid(True)
ax.legend(title="Topic", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

### Entity-Level Sentiment Analysis

In [None]:
# Build entity-level records
records = []
for i, row in df_final.iterrows():
    if isinstance(row['organizations'], list):
        for org in row['organizations']:
            records.append((row['year_month'], org.strip(), row['sentiment']))

df_entity_sent = pd.DataFrame(records, columns=['month', 'entity', 'sentiment'])

In [None]:
def normalize_entity(name):
    """Return a canonical display name for an organization mention.

    Known names are looked up case-insensitively in a fixed table. Unknown
    names keep their original casing unless they arrived entirely in lower
    case, in which case the first letter of each space-separated word is
    capitalized. (Blanket str.title() would turn "IBM" into "Ibm" and
    "Moody's" into "Moody'S".)

    Args:
        name: Entity text as extracted by NER.

    Returns:
        The canonical or tidied entity name.
    """
    name_map = {
        'nvidia': 'NVIDIA',
        'nvidias': 'NVIDIA',
        'openai': 'OpenAI',
        'microsoft': 'Microsoft',
        'meta': 'Meta',
        'google': 'Google',
        'chatgpt': 'ChatGPT',
        'amazon': 'Amazon',
        'apple': 'Apple',
        'gray media group': 'Gray Media Group',
        'gpt': 'ChatGPT'
    }
    key = name.lower()
    if key in name_map:
        return name_map[key]
    if name != key:
        # Already carries deliberate capitalization (acronym, CamelCase, ...).
        return name
    # All-lowercase input: capitalize word starts only, leaving the rest alone.
    return ' '.join(word[:1].upper() + word[1:] for word in name.split(' '))

df_entity_sent['entity'] = df_entity_sent['entity'].apply(normalize_entity)

In [None]:
# Ten most-mentioned entities. Fifteen candidates are taken first so that
# dropping the generic "AI" entry still leaves at least ten names.
top_entities = [
    ent for ent, _ in Counter(df_entity_sent['entity']).most_common(15)
    if ent.lower() != 'ai'
][:10]

# `.copy()` makes this an independent frame, so adding columns to it later
# does not trigger SettingWithCopyWarning.
df_entity_sent_top = df_entity_sent[df_entity_sent['entity'].isin(top_entities)].copy()

In [None]:
df_entity_sent_top['sentiment_label'] = df_entity_sent_top['sentiment'].apply(sentiment_label)

In [None]:
# Count sentiment labels
sentiment_counts = (
    df_entity_sent_top.groupby(['entity', 'sentiment_label'])
    .size()
    .reset_index(name='count')
)

# Plot bar chart
plt.figure(figsize=(14, 6))
sns.barplot(
    data=sentiment_counts,
    x='entity',
    y='count',
    hue='sentiment_label',
    palette='Set2'
)
plt.title("Entity-Level Sentiment Distribution (Excluding 'AI')")
plt.xlabel("Entity")
plt.ylabel("Number of Mentions")
plt.xticks(rotation=45, ha='right')
plt.legend(title="Sentiment")
plt.tight_layout()
plt.show()

In [None]:
# Average sentiment over time per entity.
# Months in which an entity was not mentioned stay NaN so the plot shows a
# gap. Filling them with 0 would present "no mentions" as neutral sentiment.
sentiment_time = (
    df_entity_sent_top.groupby(['month', 'entity'])['sentiment']
    .mean()
    .reset_index()
    .pivot(index='month', columns='entity', values='sentiment')
)

In [None]:
# Plot time series
sentiment_time.plot(marker='o', figsize=(16, 6))
plt.title("Entity-Level Sentiment Over Time")
plt.xlabel("Year-Month")
plt.ylabel("Average Sentiment Score")
plt.xticks(rotation=45)
plt.grid(True)
plt.legend(title="Entity", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

### Visualization of sentiment analysis over time

In [None]:
# Ensure date is datetime
df_final['date'] = pd.to_datetime(df_final['date'])

# Extract year-month
df_final['year_month'] = df_final['date'].dt.to_period('M').astype(str)

# Average sentiment per month
sentiment_over_time = (
    df_final.groupby('year_month')['sentiment']
    .mean()
    .reset_index()
)

In [None]:
plt.figure(figsize=(14, 5))
plt.plot(sentiment_over_time['year_month'], sentiment_over_time['sentiment'], marker='o')
plt.title('Average Sentiment Over Time')
plt.xlabel('Year-Month')
plt.ylabel('Average Sentiment Score')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()