## Categorise the title/content into a known set of topic categories

In [None]:
import sys
sys.path.append('../')  # Add the parent directory to the module search path
import pandas as pd
import nltk
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Download NLTK resources (if not already downloaded).
# 'punkt_tab' is the tokenizer data used by word_tokenize in recent NLTK
# releases; 'punkt' is kept for older installations.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load the data
mini_data_file_path = '../data/mini_data.csv'
data = pd.read_csv(mini_data_file_path)

# Combine 'title' and 'content' columns into a single text column.
# Missing values are replaced with '' first: string + NaN yields NaN, which
# would make the whole row's text a float and break preprocess_text later.
data['text'] = (
    data['title'].fillna('').astype(str)
    + ' '
    + data['content'].fillna('').astype(str)
)

# Preprocess the text data using NLTK.
# A frozenset gives O(1) membership checks for the per-token stopword test.
stopwords = frozenset(nltk.corpus.stopwords.words('english'))
stemmer = nltk.stem.PorterStemmer()

def preprocess_text(text):
    """Normalise a raw document into a space-separated string of stems.

    The text is lower-cased and tokenised with ``nltk.word_tokenize``; only
    purely alphabetic tokens that are not in the module-level ``stopwords``
    collection are kept (so punctuation and numbers are dropped), and each
    survivor is stemmed with the module-level ``stemmer``.

    Args:
        text: The raw text. Non-string values (e.g. NaN from a missing CSV
            cell) are treated as an empty document.

    Returns:
        The stemmed tokens joined by single spaces; ``''`` when nothing
        survives filtering or the input is not a string.
    """
    # Missing cells arrive as float NaN, which has no .lower()
    if not isinstance(text, str):
        return ''

    # Tokenization
    tokens = nltk.word_tokenize(text.lower())

    # Drop stopwords / non-alphabetic tokens and stem in a single pass
    stems = [
        stemmer.stem(token)
        for token in tokens
        if token.isalpha() and token not in stopwords
    ]

    # Join the tokens back into a single string
    return ' '.join(stems)

# Apply the preprocess_text function to the 'text' column
data['preprocessed_text'] = data['text'].apply(preprocess_text)

# Get the preprocessed text data
preprocessed_text_data = data['preprocessed_text']

# Create a CountVectorizer instance
vectorizer = CountVectorizer()

# Fit and transform the preprocessed text data
X = vectorizer.fit_transform(preprocessed_text_data)

# Get the feature names from the vectorizer.
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name only on versions that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Apply LDA model to the vectorized data
n_topics = 5  # Number of topics/categories
n_top_words = 5  # Number of highest-weighted words reported per topic
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda.fit(X)

# Get the most important words for each topic (highest weight first)
top_words_per_topic = []
for topic in lda.components_:
    top_indices = topic.argsort()[::-1][:n_top_words]
    top_words_per_topic.append([feature_names[i] for i in top_indices])

# Assign categories to the topics.
# NOTE(review): labels are attached purely by topic index; LDA is
# unsupervised, so confirm each label against the printed top words.
categories = ['Sports', 'Politics', 'Technology', 'Entertainment', 'Health']

# Print the topics and their associated categories
for topic_idx, top_words in enumerate(top_words_per_topic):
    # Guard against n_topics being raised above the number of labels
    if topic_idx < len(categories):
        category = categories[topic_idx]
    else:
        category = 'Uncategorised'
    print(f'Topic {topic_idx + 1}: {", ".join(top_words)} (Category: {category})')

## Topics and Trends Analysis

In [None]:
import sys
sys.path.append('../')  # Add the parent directory to the module search path
import pandas as pd
import nltk
import matplotlib.pyplot as plt
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Download NLTK resources (if not already downloaded).
# 'punkt_tab' is the tokenizer data used by word_tokenize in recent NLTK
# releases; 'punkt' is kept for older installations.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load the data
mini_data_file_path = '../data/mini_data.csv'
data = pd.read_csv(mini_data_file_path)

# Combine 'title' and 'content' columns into a single text column.
# Missing values are replaced with '' first: string + NaN yields NaN, which
# would make the whole row's text a float and break preprocess_text later.
data['text'] = (
    data['title'].fillna('').astype(str)
    + ' '
    + data['content'].fillna('').astype(str)
)

# Preprocess the text data using NLTK.
# A frozenset gives O(1) membership checks for the per-token stopword test.
stopwords = frozenset(nltk.corpus.stopwords.words('english'))
stemmer = nltk.stem.PorterStemmer()

def preprocess_text(text):
    """Normalise a raw document into a space-separated string of stems.

    The text is lower-cased and tokenised with ``nltk.word_tokenize``; only
    purely alphabetic tokens that are not in the module-level ``stopwords``
    collection are kept (so punctuation and numbers are dropped), and each
    survivor is stemmed with the module-level ``stemmer``.

    Args:
        text: The raw text. Non-string values (e.g. NaN from a missing CSV
            cell) are treated as an empty document.

    Returns:
        The stemmed tokens joined by single spaces; ``''`` when nothing
        survives filtering or the input is not a string.
    """
    # Missing cells arrive as float NaN, which has no .lower()
    if not isinstance(text, str):
        return ''

    # Tokenization
    tokens = nltk.word_tokenize(text.lower())

    # Drop stopwords / non-alphabetic tokens and stem in a single pass
    stems = [
        stemmer.stem(token)
        for token in tokens
        if token.isalpha() and token not in stopwords
    ]

    # Join the tokens back into a single string
    return ' '.join(stems)

# Apply the preprocess_text function to the 'text' column
data['preprocessed_text'] = data['text'].apply(preprocess_text)

# Get the preprocessed text data
preprocessed_text_data = data['preprocessed_text']

# Create a CountVectorizer instance
vectorizer = CountVectorizer()

# Fit and transform the preprocessed text data
X = vectorizer.fit_transform(preprocessed_text_data)

# Get the feature names from the vectorizer.
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name only on versions that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Apply LDA model to the vectorized data
n_topics = 5  # Number of topics/categories
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)

# One row per document, one column per topic
topic_weights = lda.fit_transform(X)

# Calculate the diversity of topics covered by different websites.
# The score is the Gini-Simpson index, 1 - sum(p_i ** 2), of each document's
# topic weights: near 0 when one topic dominates, larger when the weight is
# spread over many topics. The standard deviation used previously behaves the
# opposite way (it is highest for single-topic documents), so ranking it in
# descending order listed the *least* diverse websites.
# NOTE(review): assumes each row of topic_weights sums to 1 (a normalised
# document-topic distribution) - confirm for the scikit-learn version in use.
data['topic_diversity'] = 1.0 - (topic_weights ** 2).sum(axis=1)

# Get the unique websites
websites = data['website'].unique()

# Sort the websites based on mean topic diversity (most diverse first).
# A single groupby replaces re-filtering the whole frame once per website.
mean_diversity_by_website = data.groupby('website')['topic_diversity'].mean()
sorted_websites = mean_diversity_by_website.sort_values(ascending=False).index.tolist()

# Print the websites with the most diverse topics
print("Websites with the most diverse topics:")
for website in sorted_websites[:5]:
    print(website)

# Plot the 2D scatter plot of topic trends over time
data['date'] = pd.to_datetime(data['date'])

# Build the per-document topic-weight table explicitly: `data` itself has no
# "Topic N" columns, so looking them up on a groupby of `data` raised KeyError.
topic_labels = [f"Topic {topic_idx + 1}" for topic_idx in range(n_topics)]
topic_frame = pd.DataFrame(topic_weights, columns=topic_labels, index=data.index)

# Sum the topic weights per date to get a (soft) count of each topic over time
topic_counts_by_date = topic_frame.groupby(data['date']).sum()

# Initialize the color map and a colour scale shared by every topic row, so
# that a single colorbar describes all the points
cmap = plt.get_cmap('viridis')
count_min = topic_counts_by_date.min().min()
count_max = topic_counts_by_date.max().max()

# Plot the scatter plot: x = date, y = topic row, colour = summed topic weight
plt.figure(figsize=(10, 6))
scatter = None
for topic_idx, topic in enumerate(topic_labels):
    topic_counts = topic_counts_by_date[topic]
    # Pass the numeric values via `c` so matplotlib applies the colormap and
    # keeps a mappable for the colorbar (pre-computed RGBA colours do not)
    scatter = plt.scatter(
        topic_counts.index,
        [topic_idx] * len(topic_counts),
        c=topic_counts.to_numpy(),
        cmap=cmap,
        vmin=count_min,
        vmax=count_max,
        alpha=0.7,
    )

# Set the x-axis and y-axis labels
plt.xlabel('Date')
plt.ylabel('Topics')
plt.yticks(range(n_topics), topic_labels)

# Add a colorbar tied to the scatter's colour scale
cbar = plt.colorbar(scatter)
cbar.set_label('Topic Count')

# Show the plot
plt.show()