In [1]:
import gc
import os
from typing import List, Set, Dict

import fasttext
import fasttext.util
import numpy as np
import pandas as pd
import stanza

In [2]:
# ============================================================================
# PART 1: LOAD FASTTEXT MODEL
# ============================================================================

print("="*80)
print("LOADING FASTTEXT MODEL")
print("="*80)

# Location of the FastText .bin file. Set the FASTTEXT_MODEL_PATH environment
# variable to point at a different copy; the fallback is the original
# machine-specific location, so existing setups keep working unchanged.
bin_file_path = os.environ.get(
    'FASTTEXT_MODEL_PATH',
    '/Users/theodorselimovic/Sciences Po/Material/word vectors/cc.sv.300.bin',
)

# Fail early with an actionable message instead of a low-level loader error
if not os.path.isfile(bin_file_path):
    raise FileNotFoundError(
        f"FastText model not found at '{bin_file_path}'. "
        "Set FASTTEXT_MODEL_PATH to the location of cc.sv.300.bin."
    )

print("Loading FastText model with subword information...")
model = fasttext.load_model(bin_file_path)
print("Model loaded successfully!")
print(f"Vector dimension: {model.get_dimension()}")
print("Model has subword information - can handle any word!\n")

LOADING FASTTEXT MODEL
This may take 10 minutes for a 7GB file...

Loading FastText model with subword information...
Model loaded successfully!
Vector dimension: 300
Model has subword information - can handle any word!



In [5]:
# ============================================================================
# PART 2: DEFINE SEED TERMS AND FIND SIMILAR WORDS
# ============================================================================

print("="*80)
print("FINDING SIMILAR WORDS TO SEED TERMS")
print("="*80)

# Seed terms grouped by analytic category.
# Each key is the category label that is carried through to the expanded term
# table; each value is the list of Swedish seed words for that category.
# NOTE: the third key was previously misspelled 'komplexity'; it now reads
# 'complexity' to match the other English labels.
seed_terms = {
    'resilience': ['resiliens', 'motståndskraft', 'återhämtning'],
    'risk': ['risk', 'riskanalys', 'riskbedömning', 'sårbarhet'],
    'complexity': ['komplex', 'svår', 'komplicerad', 'utmaning', 'otydlig'],
    'dependencies': ['beroende', 'ömsesidighet'],
    'actors': ['kommun', 'stat', 'länsstyrelse', 'region', 'näringsliv', 'civilsamhälle', 'förening'],
    # Add more categories as needed
}

def find_similar_words(model, seed_words: List[str], top_n: int = 50) -> Dict[str, float]:
    """
    Collect the nearest FastText neighbours of every seed word.

    Parameters:
    -----------
    model : fasttext.FastText._FastText
        Loaded FastText model; only ``get_nearest_neighbors`` is called on it
    seed_words : List[str]
        Words whose neighbourhoods are merged into a single result
    top_n : int
        How many neighbours to request for each seed word

    Returns:
    --------
    Dict mapping each neighbour word to the highest similarity score it
    received from any seed word. A seed that raises an error is reported
    on stdout and skipped; the remaining seeds are still processed.
    """
    best_scores = {}

    for seed in seed_words:
        print(f"Processing '{seed}'...")

        try:
            # Each neighbour is unpacked below as a (similarity, word) pair
            hits = model.get_nearest_neighbors(seed, k=top_n)

            for score, candidate in hits:
                # A word reachable from several seeds keeps its best score
                if candidate not in best_scores or score > best_scores[candidate]:
                    best_scores[candidate] = score

            print(f"  Found {len(hits)} similar words")

        except Exception as err:
            print(f"  Error processing '{seed}': {err}")

    return best_scores

# Expand every category's seed list and keep the results keyed by category
all_similar_words = {}

for category, seeds in seed_terms.items():
    divider = '=' * 60
    print(f"\n{divider}")
    print(f"Processing category: {category.upper()}")
    print(f"Seed words: {', '.join(seeds)}")
    print(divider)

    similar = find_similar_words(model, seeds, top_n=50)
    all_similar_words[category] = similar

    # Preview the ten highest-scoring neighbours for this category
    print(f"\nTop 10 similar words for {category}:")
    for word in sorted(similar, key=similar.get, reverse=True)[:10]:
        print(f"  {word:25s} {similar[word]:.4f}")

FINDING SIMILAR WORDS TO SEED TERMS

Processing category: RESILIENCE
Seed words: resiliens, motståndskraft, återhämtning
Processing 'resiliens'...
  Found 50 similar words
Processing 'motståndskraft'...
  Found 50 similar words
Processing 'återhämtning'...
  Found 50 similar words

Top 10 similar words for resilience:
  Resiliens                 0.7995
  återhämtningen            0.7994
  motståndskraften          0.7752
  återhämtningsprocess      0.7690
  resiliensen               0.7678
  återhämtnings             0.7638
  återhämtningstid          0.7624
  återhämtningsperiod       0.7614
  återhämtningar            0.7378
  tålighet                  0.7364

Processing category: RISK
Seed words: risk, riskbedömning, sårbarhet
Processing 'risk'...
  Found 50 similar words
Processing 'riskbedömning'...
  Found 50 similar words
Processing 'sårbarhet'...
  Found 50 similar words

Top 10 similar words for risk:
  riskbedömningen           0.8459
  riskbedömningar           0.8196
  risk

In [None]:
# ============================================================================
# PART 3: CREATE EXPANDED TERM LIST
# ============================================================================

print(f"\n{'=' * 80}")
print("CREATING EXPANDED TERM LISTS")
print('=' * 80)

# Flatten the per-category {word: score} mappings into one record per
# (category, word) pair
expanded_terms = [
    {'category': category, 'word': word, 'similarity_score': similarity}
    for category, similar_words in all_similar_words.items()
    for word, similarity in similar_words.items()
]

# Order rows alphabetically by category, best similarity first within each
df_expanded = pd.DataFrame(expanded_terms).sort_values(
    by=['category', 'similarity_score'],
    ascending=[True, False],
)

print(f"\nTotal expanded terms: {len(df_expanded)}")
print("Terms by category:")
print(df_expanded.groupby('category').size())

In [None]:
# ============================================================================
# PART 4: LEMMATIZE EXPANDED TERMS
# ============================================================================

print(f"\n{'=' * 80}")
print("LEMMATIZING EXPANDED TERMS")
print('=' * 80)

# Build the Swedish Stanza pipeline on CPU, restricted to the tokenize, pos
# and lemma processors
print("Loading Stanza Swedish model...")
nlp = stanza.Pipeline(
    lang='sv',
    processors='tokenize,pos,lemma',
    use_gpu=False,
)
print("Model loaded!\n")

def lemmatize_word(word: str, nlp) -> str:
    """
    Lemmatize a single word using Stanza.

    Lemmatization is best-effort: whenever no usable lemma can be obtained
    the lowercased input word is returned instead.

    Parameters:
    -----------
    word : str
        Word to lemmatize
    nlp : stanza.Pipeline
        Stanza pipeline (any callable returning an object with
        ``.sentences[i].words[j].lemma`` works)

    Returns:
    --------
    str : Lowercased lemma of the first word of the first sentence, or the
        lowercased input when the pipeline yields no sentences/words, yields
        a missing or empty lemma, or raises an ordinary exception.
    """
    fallback = word.lower()

    try:
        doc = nlp(word)
        if doc.sentences and doc.sentences[0].words:
            lemma = doc.sentences[0].words[0].lemma
            # The lemma can be missing/empty; treat that as "no lemma"
            # explicitly instead of relying on an AttributeError.
            if lemma:
                return lemma.lower()
    except Exception:
        # Only ordinary errors are swallowed. KeyboardInterrupt and
        # SystemExit still propagate, so a long run can be interrupted.
        return fallback

    return fallback

print("Lemmatizing expanded terms...")
# One pipeline call per row, in row order, stored as a new 'lemma' column
lemmas = [lemmatize_word(term, nlp) for term in df_expanded['word']]
df_expanded['lemma'] = lemmas

# Collapse rows that share a lemma within the same category (first row wins)
n_before = len(df_expanded)
print(f"Terms before deduplication: {n_before}")
df_expanded = df_expanded.drop_duplicates(subset=['category', 'lemma'])
n_after = len(df_expanded)
print(f"Terms after deduplication: {n_after}")

In [None]:
# ============================================================================
# PART 5: SAVE RESULTS
# ============================================================================

print(f"\n{'=' * 80}")
print("RESULTS SUMMARY")
print('=' * 80)

# Categories in order of first appearance in the table
categories = df_expanded['category'].unique()

for category in categories:
    cat_data = df_expanded.loc[df_expanded['category'] == category]
    print(f"\n{category.upper()}: {len(cat_data)} unique lemmas")
    print("Top 15 terms by similarity:")
    top_terms = cat_data.nlargest(15, 'similarity_score')
    for lemma, word, score in zip(
        top_terms['lemma'], top_terms['word'], top_terms['similarity_score']
    ):
        print(f"  {lemma:25s} (original: {word:25s}) {score:.4f}")

# Write the full table to CSV
output_file = 'expanded_terms_lemmatized.csv'
df_expanded.to_csv(output_file, index=False, encoding='utf-8')
print(f"\n\nSaved expanded terms to: {output_file}")

# Per-category sets of lemmas, convenient for membership tests when filtering
category_lemmas = {
    category: set(df_expanded.loc[df_expanded['category'] == category, 'lemma'])
    for category in categories
}

print(f"\n{'=' * 80}")
print("READY FOR ANALYSIS")
print('=' * 80)
print("You can now use the lemmatized terms to filter your sentence dataframe.")
print("\nExample usage:")
for example_line in (
    "  resilience_lemmas = category_lemmas['resilience']",
    "  df_resilience = df_sentences[",
    "      df_sentences['sentence_text'].apply(",
    "          lambda x: any(lemma in x.split() for lemma in resilience_lemmas)",
    "      )",
    "  ]",
):
    print(example_line)

In [None]:
# ============================================================================
# OPTIONAL: MEMORY CLEANUP
# ============================================================================

# If you're done with the model and need to free memory.
# Safe to run more than once: a missing `model` is reported instead of
# raising NameError.
try:
    del model
except NameError:
    print("\nModel already deleted - nothing to free")
else:
    gc.collect()
    print("\nFreed memory by deleting model")