# Parliamentary Text Analysis

This notebook analyzes parliamentary text data using advanced NLP techniques including topic modeling, entity recognition, and network analysis.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
import spacy
from wordcloud import WordCloud
import string
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.sklearn

# Set visualization styles
# NOTE: all three calls adjust global plotting defaults for the rest of the
# notebook, so later calls may override settings made by earlier ones —
# keep this order unless the change is intentional.
plt.style.use('ggplot')        # apply matplotlib's "ggplot" style sheet
sns.set_palette("viridis")     # use the "viridis" palette as the default color cycle
sns.set_context("notebook")    # seaborn's "notebook" scaling preset for plot elements

## 0. Setup 

### Get data




In [None]:
%%bash
#

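### Install the spaCy language model

Note: the commented-out command below downloads `de_core_news_lg`, whereas the loading cell in section 1 uses `de_core_news_sm`; make sure the model that is actually loaded is installed.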

In [None]:
# %%bash
# python -m spacy download de_core_news_lg

Collecting de-core-news-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_lg-3.8.0/de_core_news_lg-3.8.0-py3-none-any.whl (567.8 MB)
[2K     [91m━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━�━[0m [32m0.0/567.8 MB[0m [31m?[0m eta [36m-:--:--[0m��━━━━━━━━━━━[0m [32m0.3/567.8 MB[0m [31m?[0m eta [36m-:--:--[0m��━━━━━━━━━━━[0m [32m0.5/567.8 MB[0m [31m1.5 MB/s[0m eta [36m0:06:21[0m��━━━━━━━━━━━[0m [32m0.8/567.8 MB[0m [31m1.3 MB/s[0m eta [36m0:07:02[0m��━━━━━━━━━━━[0m [32m1.0/567.8 MB[0m [31m1.4 MB/s[0m eta [36m0:06:39[0m��━━━━━━━━━━━[0m [32m1.6/567.8 MB[0m [31m1.7 MB/s[0m eta [36m0:05:37[0m��━━━━━━━━━━━[0m [32m1.8/567.8 MB[0m [31m1.5 MB/s[0m eta [36m0:06:11[0m��━━━━━━━━━━━[0m [32m2.1/567.8 MB[0m [31m1.5 MB/s[0m eta [36m0:06:19[0m��━━━━━━━━━━━[0m [32m2.1/567.8 MB[0m [31m1.5 MB/s[0m eta [36m0:06:19[0m��━━━━━━━━━━━[0m [32m2.6/567.8 MB[0m [31m1.5 MB/s[0m eta [36m0:06:24[0m��━━━━━━━━━

TypeError: %d format: a real number is required, not NoneType


[31mAborted.[0m


## 1. Data Loading and Basic Statistics

In [1]:
# --- Language pipeline ---------------------------------------------------
# Load the German spaCy pipeline and append a sentencizer component so that
# sentence boundaries are available to the cells below.
print("Loading spaCy model...")
nlp = spacy.load("de_core_news_sm")
nlp.add_pipe('sentencizer')

# --- Raw text --------------------------------------------------------------
# The path is relative to the notebook's working directory.
file_path = "../data/02_text-analysis/parlament-241024.txt"
print(f"Loading text from: {file_path}")

with open(file_path, encoding='utf-8') as source_file:
    text = source_file.read()

# --- Quick size overview ---------------------------------------------------
# These numbers come from plain string operations (whitespace / line splits),
# not from the spaCy tokenizer.
char_total = len(text)
word_total = len(text.split())
line_total = len(text.splitlines())

print(f"\n📊 Document Statistics:")
print(f"Total characters: {char_total}")
print(f"Total words: {word_total}")
print(f"Total lines: {line_total}")

Loading spaCy model...


NameError: name 'spacy' is not defined

In [None]:
# Run the spaCy pipeline over the complete raw text; `doc` is reused by
# every later analysis cell.
print("Processing text with spaCy...")
doc = nlp(text)

token_total = len(doc)
sentence_total = sum(1 for _ in doc.sents)
# Distinct lower-cased surface forms, alphabetic tokens only
distinct_words = {tok.text.lower() for tok in doc if tok.is_alpha}

print(f"Total tokens: {token_total}")
print(f"Total sentences: {sentence_total}")
print(f"Unique words: {len(distinct_words)}")

# Show only the beginning of the text; an ellipsis marks truncation
preview_length = 300
if len(text) > preview_length:
    text_preview = text[:preview_length] + "..."
else:
    text_preview = text
print(f"\nText preview: {text_preview}")

## 2. Word Frequency Analysis

In [None]:
# Count lower-cased alphabetic tokens, leaving out stop words
print("\nAnalyzing word frequencies...")
word_freq = Counter(
    tok.text.lower()
    for tok in doc
    if tok.is_alpha and not tok.is_stop
)
common_words = pd.DataFrame(word_freq.most_common(20), columns=['Word', 'Frequency'])

# Horizontal bar chart of the 20 most frequent words
fig_bar, ax_bar = plt.subplots(figsize=(12, 6))
sns.barplot(x='Frequency', y='Word', data=common_words, ax=ax_bar)
ax_bar.set_title('Most Common Words', fontsize=16)
ax_bar.set_xlabel('Frequency', fontsize=14)
ax_bar.set_ylabel('Word', fontsize=14)
fig_bar.tight_layout()
plt.show()

# Word cloud built from the same frequency table (up to 200 words)
print("\nGenerating word cloud...")
cloud_options = dict(
    width=800,
    height=400,
    background_color='white',
    colormap='viridis',
    collocations=False,
    max_words=200,
)
wordcloud = WordCloud(**cloud_options).generate_from_frequencies(word_freq)

fig_cloud, ax_cloud = plt.subplots(figsize=(12, 8))
ax_cloud.imshow(wordcloud, interpolation='bilinear')
ax_cloud.axis('off')
fig_cloud.tight_layout()
plt.show()

## 3. Topic Analysis

Using Latent Dirichlet Allocation (LDA) to identify main topics in the text.

In [None]:
# Topic modeling: every sentence of the parsed document is treated as one
# "document" for the vectorizer and the LDA model.
sentences = [sent.text for sent in doc.sents]

# Create document-term matrix using TF-IDF
print("Creating TF-IDF vectors...")
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95, min_df=2,
    # spaCy exposes its stop words as a set, but scikit-learn validates this
    # parameter and only accepts 'english', a list, or None. Sorting makes
    # the resulting list deterministic.
    stop_words=sorted(nlp.Defaults.stop_words)
)
tfidf = tfidf_vectorizer.fit_transform(sentences)
feature_names = tfidf_vectorizer.get_feature_names_out()

# LDA Topic Modeling
n_topics = 5  # Can be adjusted based on expected content
print(f"Running LDA with {n_topics} topics...")
lda = LatentDirichletAllocation(
    n_components=n_topics,
    random_state=42,  # fixed seed so topics are reproducible across runs
    learning_method='online'
)
lda.fit(tfidf)

# Display topics and their top words
n_top_words = 10
print(f"\nTop {n_top_words} words per topic:")
for topic_idx, topic in enumerate(lda.components_):
    # argsort is ascending; the reversed slice yields the n_top_words
    # highest-weighted feature indices, largest first.
    top_words_idx = topic.argsort()[:-n_top_words - 1:-1]
    top_words = [feature_names[i] for i in top_words_idx]
    print(f"Topic #{topic_idx + 1}: {', '.join(top_words)}")

# Interactive topic visualization (best effort: any failure is reported
# instead of aborting the notebook)
try:
    print("\nGenerating interactive topic visualization...")
    try:
        # pyLDAvis >= 3.4 ships the scikit-learn adapter as `lda_model`
        import pyLDAvis.lda_model as lda_vis
    except ImportError:
        # Older pyLDAvis releases call the same adapter `sklearn`
        lda_vis = pyLDAvis.sklearn
    pyLDAvis_data = lda_vis.prepare(lda, tfidf, tfidf_vectorizer)
    # Inside a `try` block the value is not the cell's last expression, so it
    # must be displayed explicitly (`display` is provided by IPython/Jupyter).
    display(pyLDAvis.display(pyLDAvis_data))
except Exception as e:
    print(f"Could not generate interactive visualization: {e}")

## 4. Document Structure Analysis

In [None]:
# Document structure: paragraph statistics, heuristic section headers and
# discourse-marker frequencies, all computed on the raw `text` string.
print("\nAnalyzing document structure...")

# Split into paragraphs (blank-line separated), dropping empty chunks
paragraphs = [p for p in text.split('\n\n') if p.strip()]
print(f"Total paragraphs: {len(paragraphs)}")

# Potential section headers: lines starting with a number and a dot
# ("12. ...") or with an all-caps label followed by a colon ("ANTRAG: ...").
# - The group is non-capturing so that findall() returns the whole header
#   line rather than only its prefix.
# - Only spaces/tabs are allowed inside the prefix so a match can never
#   spill over a line break.
# - German umlauts count as upper-case letters.
section_pattern = re.compile(
    r'^(?:[0-9]+\.[ \t]+|[A-ZÄÖÜ][A-ZÄÖÜ \t]*:).*$',
    re.MULTILINE
)
potential_headers = section_pattern.findall(text)

print(f"Potential section headers identified: {len(potential_headers)}")
if potential_headers:
    print("Sample headers:")
    for header in potential_headers[:5]:
        print(f"  - {header.strip()}")

# Paragraph length distribution (whitespace-separated words per paragraph)
para_lengths = [len(p.split()) for p in paragraphs]
mean_para_length = np.mean(para_lengths)
plt.figure(figsize=(12, 6))
sns.histplot(para_lengths, kde=True, bins=30)
plt.axvline(mean_para_length, color='r', linestyle='--',
            label=f'Mean: {mean_para_length:.2f} words')
plt.title('Paragraph Length Distribution', fontsize=16)
plt.xlabel('Words per Paragraph', fontsize=14)
plt.ylabel('Frequency', fontsize=14)
plt.legend()
plt.tight_layout()
plt.show()

# Discourse markers analysis
discourse_markers = [
    'jedoch', 'allerdings', 'trotzdem', 'dennoch', 'deshalb',
    'daher', 'somit', 'folglich', 'außerdem', 'zudem',
    'einerseits', 'andererseits', 'zunächst', 'schließlich'
]

# Whole-word, case-insensitive occurrence count per marker. re.escape keeps
# the pattern correct should a marker ever contain regex metacharacters.
marker_counts = {
    marker: len(re.findall(r'\b' + re.escape(marker) + r'\b', text, re.IGNORECASE))
    for marker in discourse_markers
}

# Display discourse marker frequencies
marker_df = pd.DataFrame(list(marker_counts.items()), columns=['Discourse Marker', 'Count'])
marker_df = marker_df.sort_values('Count', ascending=False)
marker_df = marker_df[marker_df['Count'] > 0]  # Only show markers that appear

if not marker_df.empty:
    plt.figure(figsize=(12, 6))
    sns.barplot(x='Count', y='Discourse Marker', data=marker_df)
    plt.title('Discourse Marker Frequencies', fontsize=16)
    plt.xlabel('Count', fontsize=14)
    plt.ylabel('Discourse Marker', fontsize=14)
    plt.tight_layout()
    plt.show()
else:
    print("No common discourse markers found in the text.")

## 5. Named Entity Recognition and Analysis

In [None]:
# Named Entity Recognition analysis
print("\nAnalyzing named entities...")
# Keep only entities whose text is longer than three characters; very short
# spans are treated as likely false positives.
filtered_entities = [
    ent for ent in doc.ents
    if len(ent.text.strip()) > 3
]

# Number of entity mentions per label
entity_counts = Counter(ent.label_ for ent in filtered_entities)
entity_df = pd.DataFrame(list(entity_counts.items()), columns=['Entity Type', 'Count'])
entity_df = entity_df.sort_values('Count', ascending=False)

# Human-readable descriptions for entity labels; labels missing from this
# mapping are shown unchanged.
entity_descriptions = {
    'LOC': 'Location',
    'ORG': 'Organization',
    'PER': 'Person',
    'MISC': 'Miscellaneous',
    'DATE': 'Date',
    'CARDINAL': 'Cardinal Number',
    'MONEY': 'Monetary Value',
    'GPE': 'Geopolitical Entity',
    'ORDINAL': 'Ordinal Number',
    'QUANTITY': 'Quantity'
}

# Create a new column with descriptions
entity_df['Description'] = entity_df['Entity Type'].map(lambda x: entity_descriptions.get(x, x))

# Visualization of named entities
plt.figure(figsize=(12, 6))
sns.barplot(x='Count', y='Entity Type', data=entity_df)
plt.title('Distribution of Named Entities', fontsize=16)
plt.xlabel('Count', fontsize=14)
plt.ylabel('Entity Type', fontsize=14)
plt.tight_layout()
plt.show()

# Display sample named entities (the 15 longest mentions)
print("\n🏷️ Sample Named Entities:")
for ent in sorted(filtered_entities, key=lambda x: len(x.text), reverse=True)[:15]:
    desc = entity_descriptions.get(ent.label_, ent.label_)
    print(f"{ent.text} - {ent.label_} ({desc})")

# Count how often each entity text is mentioned, grouped by label. The counts
# are needed for a meaningful "top entities" ranking: ranking a list that has
# already been de-duplicated would give every entity a count of 1.
entity_text_counts = {}
for ent in filtered_entities:
    entity_text_counts.setdefault(ent.label_, Counter())[ent.text] += 1

# Unique entity texts per label, in order of first appearance
entities_by_type = {
    label: list(text_counts)
    for label, text_counts in entity_text_counts.items()
}

# Display top entities by type (five most frequently mentioned per label)
print("\n📋 Top entities by type:")
for ent_type, text_counts in entity_text_counts.items():
    if ent_type in entity_descriptions:
        type_desc = entity_descriptions[ent_type]
        print(f"\n{ent_type} ({type_desc}):")
        for ent_text, _ in text_counts.most_common(5):
            print(f"  - {ent_text}")

## 6. Entity Network Analysis

Analyzing relationships between named entities in the text.

In [None]:
print("\nBuilding entity co-occurrence network...")
# Undirected graph: nodes are entity texts, an edge links two entities that
# appear in the same sentence, and the edge weight is the number of sentences
# in which the pair co-occurs.
G = nx.Graph()

# Track entities by sentence
entities_by_sentence = []
for sent in doc.sents:
    sent_ents = []
    # Reuse the entities already recognised on the full document instead of
    # re-running the pipeline per sentence: this is much faster and keeps the
    # network consistent with the entity statistics computed from `doc.ents`.
    for ent in sent.ents:
        if len(ent.text.strip()) > 3:  # Filter short entities
            # Add node to graph (label and description of the first mention)
            if not G.has_node(ent.text):
                G.add_node(ent.text, type=ent.label_,
                          description=entity_descriptions.get(ent.label_, ent.label_))
            # Record each entity once per sentence; repeated mentions would
            # otherwise create self-loops and inflate edge weights.
            if ent.text not in sent_ents:
                sent_ents.append(ent.text)

    # Add edges between co-occurring entities
    if len(sent_ents) > 1:
        entities_by_sentence.append(sent_ents)
        for i, ent1 in enumerate(sent_ents):
            for ent2 in sent_ents[i+1:]:
                if G.has_edge(ent1, ent2):
                    G[ent1][ent2]['weight'] += 1
                else:
                    G.add_edge(ent1, ent2, weight=1)

print(f"Created network with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")

# Visualize the entity network (if not too large)
if 5 <= G.number_of_nodes() <= 100:
    plt.figure(figsize=(14, 10))

    # Calculate node sizes based on degree centrality
    centrality = nx.degree_centrality(G)
    node_size = [centrality[node] * 3000 + 100 for node in G.nodes()]

    # Calculate edge widths based on weight
    edge_width = [G[u][v]['weight'] * 0.5 for u, v in G.edges()]

    # Node colors by entity type (unknown types fall back to gray)
    color_map = {'PER': 'crimson', 'ORG': 'skyblue', 'LOC': 'green',
                'GPE': 'orange', 'DATE': 'purple', 'MISC': 'gray'}
    node_colors = [color_map.get(G.nodes[node]['type'], 'gray') for node in G.nodes()]

    # Create layout (fixed seed for a reproducible picture)
    pos = nx.spring_layout(G, seed=42)

    # Draw network components
    nx.draw_networkx_nodes(G, pos, node_size=node_size, node_color=node_colors, alpha=0.7)
    nx.draw_networkx_edges(G, pos, width=edge_width, alpha=0.5)
    nx.draw_networkx_labels(G, pos, font_size=10, font_family="sans-serif")

    # Create legend
    legend_elements = [plt.Line2D([0], [0], marker='o', color='w',
                                 markerfacecolor=color, markersize=10, label=entity_type)
                      for entity_type, color in color_map.items()]
    plt.legend(handles=legend_elements, title="Entity Types")

    plt.title("Entity Co-occurrence Network", fontsize=16)
    plt.axis('off')
    plt.tight_layout()
    plt.show()

    # Calculate and display centrality measures
    print("\n🔍 Entity Centrality Measures:")
    betweenness = nx.betweenness_centrality(G)
    # Look values up by node so the columns stay aligned regardless of the
    # iteration order of the individual result dictionaries.
    centrality_df = pd.DataFrame({
        'Entity': list(centrality),
        'Degree Centrality': [centrality[node] for node in centrality],
        'Betweenness Centrality': [betweenness[node] for node in centrality],
        'Type': [G.nodes[node]['type'] for node in centrality]
    })
    centrality_df = centrality_df.sort_values('Degree Centrality', ascending=False).head(10)
    print(centrality_df)
else:
    if G.number_of_nodes() < 5:
        print("Too few entities to create a meaningful network.")
    else:
        print("Network is too large to visualize effectively in this notebook.")
        # Still calculate and display top entities by centrality
        centrality = nx.degree_centrality(G)
        top_entities = sorted(centrality.items(), key=lambda x: x[1], reverse=True)[:10]
        print("\n🔍 Top 10 entities by degree centrality:")
        for entity, score in top_entities:
            print(f"  - {entity}: {score:.4f}")

## 7. Summary and Key Findings

In [None]:
# Recap of the key figures computed in the previous sections
print("\n📝 Summary of Analysis:")
print(f"- Document contains {len(doc)} tokens in {len(list(doc.sents))} sentences")
print(f"- Found {len(filtered_entities)} named entities across {len(entity_counts)} different types")
if entity_df.empty:
    # No entity survived the filter, so there is no first row to report
    print("- Most common entity type: n/a (no named entities found)")
else:
    # entity_df is sorted by count in descending order, so row 0 is the top type
    top_entity_row = entity_df.iloc[0]
    print(f"- Most common entity type: {top_entity_row['Entity Type']} ({top_entity_row['Count']} instances)")
print(f"- Identified {n_topics} main topics in the document")
if G.number_of_nodes() > 0:
    print(f"- Entity network has {G.number_of_nodes()} nodes and {G.number_of_edges()} connections")
print("\nAnalysis complete!")