# This code does data wrangling of the corpus
The code takes the unprocessed corpus that was created by reading the PDFs and cleans it with the usual preprocessing steps (lemmatization, removal of stopwords). Importantly, it does not tokenize at the word level; it keeps the sentence as the unit of analysis so that the context in which each word appears is preserved.

In [None]:
import pandas as pd
import pyreadr
import stanza
import re
from typing import List, Dict

In [None]:
# Download the Swedish Stanza model (uncomment and run once, then comment out again)
# stanza.download('sv')

In [None]:
# Build the stopword list: NLTK's Swedish list plus corpus-specific additions
import nltk
from nltk.corpus import stopwords

# nltk.download is called on every run; quiet=True suppresses its output
nltk.download('stopwords', quiet=True)
swedish_stopwords = set(stopwords.words('swedish'))

# Corpus-specific entries: extraction artifacts (box-drawing characters,
# underscore rules, bullets) and words the standard list misses
custom_stopwords = {
    "samt",
    "│",
    "_____________________________________________________________________________",
    "▪",
    "underbilaga",
    "rsa",
    "2023-2026",
}

# Single combined set used for stopword filtering
all_stopwords = swedish_stopwords | custom_stopwords

print(f"Using {len(all_stopwords)} stopwords ({len(custom_stopwords)} custom)\n")

In [None]:
# Load the corpus that was exported from R as an RDS file
print("Loading RDS file...")
rds_data = pyreadr.read_r('/Users/theodorselimovic/Library/CloudStorage/OneDrive-Personal/Sciences Po/Master Thesis/Text analysis code/R project/readtext_success.rds')
# read_r returns a dict-like mapping of dataframes; the first entry is used
df = rds_data[list(rds_data.keys())[0]]

print(f"Loaded dataframe with {len(df)} rows and {len(df.columns)} columns")
print(f"Columns: {df.columns.tolist()}\n")

# Fail early if a required column is missing: 'file' identifies each document
# and 'text' holds the document text that the processing loop reads. Checking
# both here avoids a KeyError after the (slow) Stanza model has been loaded.
for required_col in ('file', 'text'):
    if required_col not in df.columns:
        raise ValueError(f"Column '{required_col}' not found. Available: " + str(df.columns.tolist()))

# Use 'file' column for document identification
doc_id_col = 'file'
print(f"Using '{doc_id_col}' column for document identification\n")

def parse_document_name(doc_name: str) -> Dict[str, object]:
    """Parse a document file name into municipality, year and maskad status.

    The name is split on whitespace after a trailing ``.pdf`` extension (in
    any letter case) has been removed. The second token is taken as the
    municipality and the third as the year.
    NOTE(review): presumably names look like "<prefix> <municipality> <year> ...";
    a municipality name containing a space would be split across tokens -
    verify against the actual file names.

    Args:
        doc_name: File name of the document, with or without extension.

    Returns:
        A dict with keys:
            'municipality': second whitespace-separated token, or None if absent.
            'year': third whitespace-separated token (kept as a string), or
                None if absent.
            'maskad': True if the (case-sensitive) substring "Maskad" occurs
                in the name, else False.
    """
    # Strip only a trailing extension, case-insensitively, so that names
    # ending in ".PDF" do not leak the extension into the last token
    if doc_name[-4:].lower() == '.pdf':
        doc_name = doc_name[:-4]

    parts = doc_name.split()

    return {
        'municipality': parts[1] if len(parts) > 1 else None,
        'year': parts[2] if len(parts) > 2 else None,
        'maskad': "Maskad" in doc_name,
    }

def lemmatize_sentence(sentence, remove_stopwords: bool = True):
    """Lemmatize a single sentence, dropping punctuation and optionally stopwords.

    Args:
        sentence: Object with a ``words`` iterable whose items expose ``text``
            and ``lemma`` attributes (here, a sentence produced by the Stanza
            pipeline).
        remove_stopwords: If True, skip words whose lowercased lemma or
            lowercased surface form is in the module-level ``all_stopwords``.

    Returns:
        A tuple ``(lemmatized_text, word_count)``: the kept lemmas, lowercased
        and joined by single spaces, and how many lemmas were kept.
    """
    lemmatized_words = []

    for word in sentence.words:
        original = word.text

        # Skip punctuation-only tokens (checked first so the lemma of a
        # discarded token is never needed)
        if re.match(r'^[^\w\s]+$', original):
            continue

        # The lemma may be missing (None or empty) for some tokens; fall back
        # to the surface form instead of failing on None.lower()
        lemma = (word.lemma or original).lower()

        # Drop the word if either its lemma or its surface form is a stopword
        if remove_stopwords and (lemma in all_stopwords or original.lower() in all_stopwords):
            continue

        lemmatized_words.append(lemma)

    return ' '.join(lemmatized_words), len(lemmatized_words)

def process_document_to_sentences(doc_name, text, remove_stopwords=True):
    """Split one document into lemmatized sentence records.

    Args:
        doc_name: Document identifier; also parsed for municipality, year
            and maskad status via ``parse_document_name``.
        text: Raw document text. Missing (NaN/None) or empty text yields
            an empty list.
        remove_stopwords: Passed through to ``lemmatize_sentence``.

    Returns:
        A list of dicts, one per sentence that is non-empty after
        lemmatization, with keys 'doc_id', 'municipality', 'year', 'maskad',
        'sentence_id' (1-based position among all sentences of the document),
        'sentence_text' and 'word_count'.
    """
    # Nothing to process for missing or empty text
    if pd.isna(text) or text == '':
        return []

    meta = parse_document_name(doc_name)
    records = []

    # Sentence numbering starts at 1 and counts every sentence, including
    # those that are later dropped for being empty
    for position, sent in enumerate(nlp(text).sentences, start=1):
        lemmas, n_words = lemmatize_sentence(sent, remove_stopwords)

        # Sentences reduced to nothing by filtering are not recorded
        if not lemmas.strip():
            continue

        records.append({
            'doc_id': doc_name,
            'municipality': meta['municipality'],
            'year': meta['year'],
            'maskad': meta['maskad'],
            'sentence_id': position,
            'sentence_text': lemmas,
            'word_count': n_words,
        })

    return records

# Initialize Swedish pipeline with lemmatization (WITHOUT mwt):
# only the tokenize, pos and lemma processors are loaded, on CPU
print("Loading Stanza Swedish model...")
nlp = stanza.Pipeline('sv', processors='tokenize,pos,lemma', use_gpu=False)
print("Model loaded successfully!\n")

# Process all documents into sentence-level data
print("Processing documents to sentence-level data...")
print("(This may take a while for large corpora)\n")

all_sentences = []
total_docs = len(df)

# Count documents with enumerate rather than deriving the counter from the
# index label: iterrows yields index labels, which need not be 0-based
# integers (e.g. after filtering, or when the index holds row names)
for doc_number, (idx, row) in enumerate(df.iterrows(), start=1):
    # '\r' keeps the progress message on a single, overwritten line
    print(f"Processing document {doc_number}/{total_docs}...", end='\r')

    doc_id = row[doc_id_col]
    text = row['text']

    sentences = process_document_to_sentences(doc_id, text, remove_stopwords=True)
    all_sentences.extend(sentences)

# Trailing spaces overwrite what is left of the progress line
print(f"\nProcessing complete!                                ")

# Create sentence-level dataframe (one row per kept sentence)
df_sentences = pd.DataFrame(all_sentences)

In [None]:
# Display summary statistics and a sample of lemmatized sentences
import random

print(f"\n{'='*80}")
print("STATISTICS")
print(f"{'='*80}")
print(f"Original documents: {len(df)}")
print(f"Total sentences extracted: {len(df_sentences)}")

if df_sentences.empty:
    # With no sentences there is no 'word_count' column to summarise and no
    # document to sample; an empty corpus would also divide by zero below
    print("No sentences extracted; skipping statistics and examples.")
else:
    word_counts = df_sentences['word_count']

    print(f"Average sentences per document: {len(df_sentences) / len(df):.1f}")
    print(f"Average words per sentence: {word_counts.mean():.1f}")
    print(f"Median words per sentence: {word_counts.median():.0f}")
    print(f"Min words per sentence: {word_counts.min()}")
    print(f"Max words per sentence: {word_counts.max()}")

    # Show word count distribution
    print(f"\nWord count distribution:")
    print(word_counts.describe())

    # Show sentence length quartiles (plus the 95th percentile)
    print(f"\nSentence length quartiles:")
    quartiles = word_counts.quantile([0.25, 0.5, 0.75, 0.95])
    print(f"25th percentile: {quartiles[0.25]:.0f} words")
    print(f"50th percentile: {quartiles[0.50]:.0f} words")
    print(f"75th percentile: {quartiles[0.75]:.0f} words")
    print(f"95th percentile: {quartiles[0.95]:.0f} words")

    # Show example sentences: the first 15 kept sentences of one randomly
    # chosen document
    print(f"\n{'='*80}")
    print("EXAMPLE SENTENCES")
    print(f"{'='*80}")

    random_id = random.choice(df_sentences['doc_id'].unique())
    subset = df_sentences[df_sentences['doc_id'] == random_id]

    for _, row in subset.head(15).iterrows():
        print(f"\nDocument: {row['doc_id']}")
        print(f"Municipality: {row['municipality']}, Year: {row['year']}, Maskad: {row['maskad']}")
        print(f"Sentence {row['sentence_id']} ({row['word_count']} words):")
        # Truncate very long sentences to keep the output readable
        print(f"Lemmatized: {row['sentence_text'][:500]}")
        print("-" * 80)

In [None]:
# Save the sentence-level dataframe to Parquet (written to the current
# working directory, without the dataframe index)
output_path = 'sentences_lemmatized.parquet'

print(f"\n{'='*80}")
df_sentences.to_parquet(output_path, index=False)
# The message is built from the same variable so it always names the file
# that was actually written
print(f"Saved to: {output_path}")
print("\nYour sentence-level data is ready for word vector analysis!")