In [1]:
import os
from typing import List, Dict, Tuple, Set

import fasttext
import fasttext.util
import numpy as np
import pandas as pd

In [2]:
# ============================================================================
# CONFIGURATION
# ============================================================================

# Method for combining the word vectors of a sentence into one sentence vector.
# Options: 'mean' (element-wise average) or 'sum' (element-wise addition).
# The value is forwarded as the `method` argument of vectorize_sentence(),
# which accepts only these two strings.
AGGREGATION_METHOD = 'mean'  # Mean is standard for sentence embeddings

In [3]:
# ============================================================================
# PART 1: LOAD DATA
# ============================================================================

print("=" * 80, "LOADING DATA", "=" * 80, sep="\n")

# Expanded term list; the 'category' and 'lemma' columns are the ones used here
df_expanded = pd.read_csv('expanded_terms_lemmatized.csv')

# category -> set of lemmas, categories kept in order of first appearance
category_lemmas = {
    category: set(
        df_expanded.loc[df_expanded['category'] == category, 'lemma'].tolist()
    )
    for category in df_expanded['category'].unique()
}
for category, lemmas in category_lemmas.items():
    print(f"{category}: {len(lemmas)} terms")

# Union of all categories' lemmas, used for membership tests during filtering
all_target_lemmas = set().union(*category_lemmas.values())

print(f"\nTotal unique target terms across all categories: {len(all_target_lemmas)}")

# Sentence-level table (lemmatized text plus document metadata)
print("\nLoading sentence data...")
df_sentences = pd.read_parquet('sentences_lemmatized.parquet')
print(f"Loaded {len(df_sentences)} sentences")
print(f"Columns: {df_sentences.columns.tolist()}")

LOADING DATA
resilience: 109 terms
risk: 114 terms

Total unique target terms across all categories: 219

Loading sentence data...
Loaded 260393 sentences
Columns: ['doc_id', 'municipality', 'year', 'maskad', 'sentence_id', 'sentence_text', 'word_count']


In [4]:
# ============================================================================
# PART 2: FILTER SENTENCES CONTAINING EXPANDED TERMS
# ============================================================================

# Section banner: blank line, rule, title, rule
print("\n" + "=" * 80, "FILTERING SENTENCES WITH TARGET TERMS", "=" * 80, sep="\n")

def find_target_terms_in_sentence(sentence_text: str, all_target_lemmas: Set[str]) -> List[str]:
    """Return the tokens of a sentence that are target lemmas.

    The sentence is split on whitespace and each token is compared verbatim
    against ``all_target_lemmas``. Matches are returned in sentence order and
    a token occurring several times is returned several times.
    """
    matches = []
    for token in sentence_text.split():
        if token in all_target_lemmas:
            matches.append(token)
    return matches

def get_categories_for_terms(terms: List[str], category_lemmas: Dict) -> List[str]:
    """Return the categories that contain at least one of the given terms.

    Parameters
    ----------
    terms : List[str]
        Target terms found in a sentence (duplicates allowed).
    category_lemmas : Dict
        Mapping of category name -> collection of lemmas in that category.

    Returns
    -------
    List[str]
        Each matching category exactly once, in the iteration order of
        ``category_lemmas``. The order is therefore the same on every run;
        building the result from a ``set`` made it depend on string hash
        randomisation, which changed downstream row order between runs.
    """
    term_set = set(terms)
    return [
        category
        for category, lemmas in category_lemmas.items()
        # A category matches as soon as it shares one lemma with the sentence
        if not term_set.isdisjoint(lemmas)
    ]

# Tag every sentence with the target lemmas it contains
# (this adds a 'target_terms' column to df_sentences itself)
print("Identifying sentences with target terms...")
df_sentences['target_terms'] = df_sentences['sentence_text'].map(
    lambda text: find_target_terms_in_sentence(text, all_target_lemmas)
)

# Keep an independent copy of the rows that matched at least one target lemma
df_with_targets = df_sentences.loc[
    df_sentences['target_terms'].map(len) > 0
].copy()
print(
    f"Sentences containing target terms: {len(df_with_targets)} "
    f"({len(df_with_targets)/len(df_sentences)*100:.2f}%)"
)

# Derive, per sentence, the categories its matched lemmas belong to
df_with_targets['categories'] = df_with_targets['target_terms'].map(
    lambda found: get_categories_for_terms(found, category_lemmas)
)

# Distribution of "how many categories does a sentence touch"
print("\nSentences by number of categories:")
print(df_with_targets['categories'].map(len).value_counts().sort_index())


FILTERING SENTENCES WITH TARGET TERMS
Identifying sentences with target terms...
Sentences containing target terms: 41311 (15.86%)

Sentences by number of categories:
categories
1    35052
2     6259
Name: count, dtype: int64


In [5]:
# ============================================================================
# PART 3: LOAD FASTTEXT MODEL AND VECTORIZE SENTENCES
# ============================================================================

print("\n" + "="*80)
print("LOADING FASTTEXT MODEL")
print("="*80)

# Location of the cc.sv.300 fastText binary. Set the FASTTEXT_MODEL_PATH
# environment variable to run the notebook on another machine; when it is
# unset the original local path is used, so existing runs are unaffected.
FASTTEXT_MODEL_PATH = os.environ.get(
    'FASTTEXT_MODEL_PATH',
    '/Users/theodorselimovic/Sciences Po/Material/word vectors/cc.sv.300.bin',
)

print("Loading FastText model (this may take a few minutes)...")
model = fasttext.load_model(FASTTEXT_MODEL_PATH)
print(f"Model loaded with {model.get_dimension()} dimensions")
print("Model has subword information - can handle OOV words!\n")

def vectorize_sentence(sentence_text: str, model, method: str = 'mean',
                       min_norm: float = 0.01) -> Dict:
    """
    Convert a sentence to a single vector by aggregating word vectors.

    Parameters:
    -----------
    sentence_text : str
        The lemmatized sentence; it is tokenized by whitespace splitting.
    model : fasttext.FastText._FastText
        FastText model (loaded from .bin). Only ``get_word_vector(word)`` and
        ``get_dimension()`` are called on it.
    method : str
        'mean' (average) or 'sum' (addition)
    min_norm : float
        A word vector is used only if its Euclidean norm is strictly greater
        than this threshold. Defaults to 0.01.

    Returns:
    --------
    Dict with
        'vector'      : aggregated sentence vector (all zeros when no word
                        vector passed the norm threshold),
        'words_found' : number of words whose vector passed the threshold,
        'words_total' : number of whitespace-separated tokens,
        'coverage'    : words_found / words_total (0.0 for an empty sentence).

    Raises:
    -------
    ValueError
        If ``method`` is neither 'mean' nor 'sum'.

    References:
    -----------
    Arora et al. (2017) "A Simple but Tough-to-Beat Baseline for Sentence Embeddings"

    Note:
    -----
    FastText can generate vectors for most words using subword info, but very
    unusual artifacts (like "▪" or "│") may still fail to produce meaningful vectors.
    We check vector norms to identify such cases.
    """
    # Validate up front: otherwise a misspelled method goes unnoticed for
    # every sentence that happens to yield no usable word vectors.
    if method not in ('mean', 'sum'):
        raise ValueError(f"Unknown aggregation method: {method}")

    words = sentence_text.split()

    # FastText returns a vector for any token (subword info); keep only the
    # ones that are not (near-)zero, which is how artifacts show up.
    vectors = []
    for word in words:
        vector = model.get_word_vector(word)
        if np.linalg.norm(vector) > min_norm:
            vectors.append(vector)
    words_with_vectors = len(vectors)

    if words_with_vectors == 0:
        # No meaningful vectors - return a zero vector. fastText word vectors
        # are float32, so the fallback uses the same dtype to keep the
        # resulting column homogeneous when it is stacked or saved.
        return {
            'vector': np.zeros(model.get_dimension(), dtype=np.float32),
            'words_found': 0,
            'words_total': len(words),
            'coverage': 0.0
        }

    vectors_array = np.array(vectors)

    # Aggregate over the word axis
    if method == 'mean':
        sentence_vector = np.mean(vectors_array, axis=0)
    else:  # method == 'sum' (validated above)
        sentence_vector = np.sum(vectors_array, axis=0)

    return {
        'vector': sentence_vector,
        'words_found': words_with_vectors,
        'words_total': len(words),
        # words is non-empty here because at least one vector was kept
        'coverage': words_with_vectors / len(words)
    }

print(f"Vectorizing sentences using '{AGGREGATION_METHOD}' aggregation...")
print("This may take a few minutes...\n")

# One result dict per sentence (keys: vector, words_found, words_total, coverage)
vectorization_results = df_with_targets['sentence_text'].apply(
    lambda text: vectorize_sentence(text, model, method=AGGREGATION_METHOD)
)

# Spread each dict entry into its own column: (column name, result key)
for column_name, result_key in [
    ('sentence_vector', 'vector'),
    ('words_found', 'words_found'),
    ('words_total', 'words_total'),
    ('coverage', 'coverage'),
]:
    df_with_targets[column_name] = vectorization_results.apply(
        lambda result, key=result_key: result[key]
    )

# Snapshot used by the remaining cells
df_vectorized = df_with_targets.copy()
print(f"Successfully vectorized {len(df_vectorized)} sentences")


LOADING FASTTEXT MODEL
Loading FastText model (this may take a few minutes)...
Model loaded with 300 dimensions
Model has subword information - can handle OOV words!

Vectorizing sentences using 'mean' aggregation...
This may take a few minutes...

Successfully vectorized 41311 sentences


In [6]:
# ============================================================================
# PART 4: SUMMARY STATISTICS
# ============================================================================

print("\n" + "=" * 80, "SUMMARY STATISTICS", "=" * 80, sep="\n")

# Headline numbers on how many tokens per sentence received a usable vector
print(f"\nTotal vectorized sentences: {len(df_vectorized)}")
print(f"Average coverage: {df_vectorized['coverage'].mean():.2%}")
print(f"Median coverage: {df_vectorized['coverage'].median():.2%}")

print("\nCoverage distribution:")
print(df_vectorized['coverage'].describe())

# Sentences where fewer than 80% of the tokens contributed a vector
print("\nSentences with coverage < 80%:")
low_coverage = df_vectorized.loc[df_vectorized['coverage'] < 0.8]
print(f"  Count: {len(low_coverage)} ({len(low_coverage)/len(df_vectorized)*100:.2f}%)")
if not low_coverage.empty:
    print("\n  Example low coverage sentences:")
    # Show at most the first three offenders, text truncated to 100 characters
    for _, example in low_coverage.head(3).iterrows():
        print(f"    Coverage: {example['coverage']:.2%} | Words: {example['words_found']}/{example['words_total']}")
        print(f"    Text: {example['sentence_text'][:100]}...")
        print()

print("\nSentences by year:")
year_counts = df_vectorized.groupby('year').size().sort_index()
print(year_counts)

print("\nSentences by municipality (top 10):")
print(
    df_vectorized
    .groupby('municipality')
    .size()
    .sort_values(ascending=False)
    .head(10)
)

# Produce one output row per (sentence, category) pair so that a sentence
# touching several categories is counted in each of them
print("\n" + "=" * 80, "EXPANDING TO CATEGORY-SENTENCE PAIRS", "=" * 80, sep="\n")

rows_expanded = [
    {
        'doc_id': row['doc_id'],
        'municipality': row['municipality'],
        'year': row['year'],
        'maskad': row['maskad'],
        'sentence_id': row['sentence_id'],
        'category': category,
        # The list of matched lemmas is flattened to a comma-separated string
        'target_terms': ', '.join(row['target_terms']),
        'sentence_text': row['sentence_text'],
        'sentence_vector': row['sentence_vector'],
        'word_count': row['word_count'],
        'words_found': row['words_found'],
        'coverage': row['coverage']
    }
    for _, row in df_vectorized.iterrows()
    for category in row['categories']
]

df_final = pd.DataFrame(rows_expanded)
print(f"Total category-sentence pairs: {len(df_final)}")
print("\nPairs by category:")
print(df_final.groupby('category').size())


SUMMARY STATISTICS

Total vectorized sentences: 41311
Average coverage: 99.93%
Median coverage: 100.00%

Coverage distribution:
count    41311.000000
mean         0.999326
std          0.009487
min          0.666667
25%          1.000000
50%          1.000000
75%          1.000000
max          1.000000
Name: coverage, dtype: float64

Sentences with coverage < 80%:
  Count: 10 (0.02%)

  Example low coverage sentences:
    Coverage: 79.17% | Words: 38/48
    Text: h spridning nukleär ämne stor brand avse bebyggd område skogsmark j) störning bränsle drivmedelsförs...

    Coverage: 75.76% | Words: 25/33
    Text: l) störning elektronisk kommunikation m) störning elförsörjning/ffjrvärmeförmeförmeförmeförmeförmefö...

    Coverage: 73.17% | Words: 30/41
    Text: avfallshantering 6) beredningskemikalium 1 dricksvatten 8) process teknisk vatten 3) elocent 12 fjär...


Sentences by year:
year
2011           130
2012            49
2015          7759
2016           650
2017           120
2019

In [7]:
# ============================================================================
# PART 5: SAVE RESULTS
# ============================================================================

print("\n" + "="*80)
print("SAVING RESULTS")
print("="*80)

# Save metadata (without vectors for easy inspection)
df_metadata = df_final.drop(columns=['sentence_vector'])
df_metadata.to_csv('sentence_vectors_metadata.csv', index=False, encoding='utf-8')
print("Saved metadata to: sentence_vectors_metadata.csv")

# Save full data with vectors to parquet (efficient for large data)
df_final.to_parquet('sentence_vectors_with_metadata.parquet', index=False)
print("Saved full data with vectors to: sentence_vectors_with_metadata.parquet")

# Save just the vectors as numpy array (for quick loading in analysis).
# Row i of the array is the vector of row i of df_final.
sentence_vectors = np.stack(df_final['sentence_vector'].values)
np.save('sentence_vectors.npy', sentence_vectors)
print(f"Saved sentence vectors to: sentence_vectors.npy")
print(f"Shape: {sentence_vectors.shape} (sentences × vector_dim)")

# Save index mapping (to link numpy array back to metadata).
# The written index is the row number in sentence_vectors.npy. 'sentence_id'
# is appended after the original columns so that a vector row can be tied to
# its sentence by key and not only by position; doc_id/municipality/year/
# category alone are shared by many rows.
df_final.reset_index(drop=True).to_csv(
    'sentence_vectors_index.csv',
    columns=['doc_id', 'municipality', 'year', 'category', 'sentence_id'],
    index=True,
    encoding='utf-8'
)
print("Saved index mapping to: sentence_vectors_index.csv")


SAVING RESULTS
Saved metadata to: sentence_vectors_metadata.csv
Saved full data with vectors to: sentence_vectors_with_metadata.parquet
Saved sentence vectors to: sentence_vectors.npy
Shape: (47570, 300) (sentences × vector_dim)
Saved index mapping to: sentence_vectors_index.csv


In [8]:
# ============================================================================
# PART 6: EXAMPLE ANALYSES
# ============================================================================

print("\n" + "=" * 80, "EXAMPLE: AVERAGE VECTORS BY YEAR AND CATEGORY", "=" * 80, sep="\n")

# For every category, summarise each year by the mean of its sentence vectors
for category in sorted(df_final['category'].unique()):
    print(f"\n{category.upper()}:")
    cat_data = df_final.loc[df_final['category'] == category]

    for year in sorted(cat_data['year'].unique()):
        year_data = cat_data.loc[cat_data['year'] == year]
        # Stack the per-sentence vectors into a 2-D array and average over rows
        year_vectors = np.stack(year_data['sentence_vector'].values)
        centroid = year_vectors.mean(axis=0)

        centroid_norm = np.linalg.norm(centroid)
        mean_coverage = year_data['coverage'].mean()
        print(f"  {year}: {len(year_data):4d} sentences, "
              f"centroid norm: {centroid_norm:6.3f}, "
              f"avg coverage: {mean_coverage:.2%}")

print("\n" + "=" * 80, "EXAMPLE: COMPARING YEARS WITHIN CATEGORY", "=" * 80, sep="\n")

# Example: Calculate cosine similarity between year centroids
from sklearn.metrics.pairwise import cosine_similarity

for category in sorted(df_final['category'].unique()):
    cat_data = df_final.loc[df_final['category'] == category]
    years = sorted(cat_data['year'].unique())

    # A category observed in a single year has nothing to compare against
    if len(years) >= 2:
        print(f"\n{category.upper()} - Year-to-Year Similarity:")

        # Mean sentence vector of each year
        year_centroids = {}
        for year in years:
            year_data = cat_data.loc[cat_data['year'] == year]
            year_vectors = np.stack(year_data['sentence_vector'].values)
            year_centroids[year] = year_vectors.mean(axis=0)

        # Compare each year with the next one in sorted order
        for year1, year2 in zip(years, years[1:]):
            # cosine_similarity expects 2-D inputs, hence the added leading axis
            sim = cosine_similarity(
                year_centroids[year1][np.newaxis, :],
                year_centroids[year2][np.newaxis, :],
            )[0, 0]
            print(f"  {year1} → {year2}: {sim:.4f}")

# Closing summary: the files written by the save step and suggested follow-ups
for line in [
    "\n" + "=" * 80,
    "DATA READY FOR ANALYSIS!",
    "=" * 80,
    "\nYou now have:",
    "1. sentence_vectors_with_metadata.parquet - Full dataset with vectors",
    "2. sentence_vectors_metadata.csv - Metadata for inspection",
    "3. sentence_vectors.npy - Just the vectors for efficient loading",
    "4. sentence_vectors_index.csv - Index to map vectors to metadata",
    "\nNext steps:",
    "- Temporal analysis: Track how contexts change over time",
    "- Municipality comparison: Compare semantic spaces across regions",
    "- Clustering: Group similar contexts",
    "- Dimensionality reduction: Visualize with PCA/t-SNE",
]:
    print(line)


EXAMPLE: AVERAGE VECTORS BY YEAR AND CATEGORY

RESILIENCE:
  2011:   42 sentences, centroid norm:  0.398, avg coverage: 99.69%
  2012:   14 sentences, centroid norm:  0.426, avg coverage: 100.00%
  2015: 1966 sentences, centroid norm:  0.410, avg coverage: 99.96%
  2016:  200 sentences, centroid norm:  0.416, avg coverage: 100.00%
  2017:   20 sentences, centroid norm:  0.404, avg coverage: 100.00%
  2019: 2802 sentences, centroid norm:  0.408, avg coverage: 99.93%
  2019.docx:   18 sentences, centroid norm:  0.431, avg coverage: 100.00%
  2020:  205 sentences, centroid norm:  0.423, avg coverage: 100.00%
  2021:   52 sentences, centroid norm:  0.389, avg coverage: 100.00%
  2023: 5346 sentences, centroid norm:  0.403, avg coverage: 99.97%
  2024:  251 sentences, centroid norm:  0.401, avg coverage: 99.95%
  Sala:   59 sentences, centroid norm:  0.426, avg coverage: 100.00%

RISK:
  2011:  111 sentences, centroid norm:  0.445, avg coverage: 99.84%
  2012:   46 sentences, centroid norm