### Imports

In [None]:
import os
import re
from collections import defaultdict
import math

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Part 1

In [None]:
def preprocess_text(text):
    # Add spaces around special characters
    text = re.sub(r'([;,!?<>()\[\]&])', r' \1 ', text)
    # Convert to lowercase and split into tokens
    return re.findall(r'\b\w+\b', text.lower())

def get_ngrams(tokens, n):
    return [' '.join(tokens[i:i+n]) for i in range(len(tokens)-n+1)]

**Four Cohesion Metrics:**

- SCP (Symmetrical Conditional Probability) - measures how likely the components co-occur compared to their independent occurrences

- Dice - Measures the overlap between components

- φ² (Phi-squared) - Chi-square based association measure

- MI (Mutual Information) - Information-theoretic measure

In [None]:

def calculate_scp(n_gram, n_gram_freq, sub_gram_freqs):
    numerator = n_gram_freq ** 2
    denominator = sum(f1 * f2 for f1, f2 in sub_gram_freqs) / len(sub_gram_freqs)
    return numerator / denominator if denominator else 0

# ------------------------------------------------------------------------

def calculate_dice(n_gram, n_gram_freq, sub_gram_freqs):
    if len(sub_gram_freqs) == 0:
        return 0
    
    sum_dice = 0
    for f1, f2 in sub_gram_freqs:
        sum_dice += (2 * n_gram_freq) / (f1 + f2) if (f1 + f2) else 0
    
    return sum_dice / len(sub_gram_freqs)

# ------------------------------------------------------------------------

def calculate_phi_squared(n_gram, n_gram_freq, sub_gram_freqs, total_ngrams):
    if len(sub_gram_freqs) == 0:
        return 0
    
    sum_phi = 0
    for f1, f2 in sub_gram_freqs:
        N = total_ngrams
        numerator = (N * n_gram_freq - f1 * f2) ** 2
        denominator = f1 * f2 * (N - f1) * (N - f2) if (f1 * f2 * (N - f1) * (N - f2)) else 1
        sum_phi += numerator / denominator
    
    return sum_phi / len(sub_gram_freqs)

# ------------------------------------------------------------------------

def calculate_mi(n_gram, n_gram_freq, sub_gram_freqs, total_ngrams):
    if len(sub_gram_freqs) == 0:
        return 0
    
    sum_mi = 0
    for f1, f2 in sub_gram_freqs:
        if f1 == 0 or f2 == 0:
            continue
        mi = math.log2((n_gram_freq * total_ngrams) / (f1 * f2))
        sum_mi += mi
    
    return sum_mi / len(sub_gram_freqs) if len(sub_gram_freqs) > 0 else 0

In [None]:
def extract_relevant_expressions(corpus_path, max_n=7, min_freq=2):
    # Count n-grams and their frequencies
    ngram_counts = defaultdict(int)
    for root, _, files in os.walk(corpus_path):
        for file in files:
            with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                text = f.read()
                tokens = preprocess_text(text)
                for n in range(2, max_n+1):
                    for ngram in get_ngrams(tokens, n):
                        ngram_counts[ngram] += 1

    # Filter by minimum frequency
    ngram_counts = {k: v for k, v in ngram_counts.items() if v >= min_freq}

    # Calculate cohesion scores
    cohesion_scores = {}
    for ngram, freq in ngram_counts.items():
        words = ngram.split()
        n = len(words)
        
        if n == 2:
            f_xy = freq
            f_x = ngram_counts.get(words[0], 0)
            f_y = ngram_counts.get(words[1], 0)
            cohesion_scores[ngram] = (f_xy ** 2) / (f_x * f_y) if (f_x * f_y) else 0
        else:
            sub_pairs = []
            for i in range(1, n):
                left = ' '.join(words[:i])
                right = ' '.join(words[i:])
                f_left = ngram_counts.get(left, 0)
                f_right = ngram_counts.get(right, 0)
                sub_pairs.append((f_left, f_right))
            
            cohesion_scores[ngram] = calculate_scp(ngram, freq, sub_pairs)   # change here the cohesion	metric 

    # LocalMaxs algorithm
    relevant_expr = []
    for ngram in cohesion_scores:
        words = ngram.split()
        n = len(words)
        current_score = cohesion_scores[ngram]
        
        # Check if current ngram is a local maximum
        is_local_max = True
        
        # Check (n-1)-grams
        if n > 2:
            for i in range(n-1):
                sub_ngram = ' '.join(words[:i] + words[i+1:])
                if cohesion_scores.get(sub_ngram, 0) >= current_score:
                    is_local_max = False
                    break
        
        # Check (n+1)-grams (simplified - we dont have all possible (n+1)-grams)
        if is_local_max and n < max_n:
            # In a complete implementation, we check all possible (n+1)-grams containing this ngram
            pass
        
        if is_local_max:
            relevant_expr.append((ngram, cohesion_scores[ngram], ngram_counts[ngram]))

    # Sort by cohesion score descending
    relevant_expr.sort(key=lambda x: x[1], reverse=True)
    
    return relevant_expr

In [4]:
corpus_files = ['corpus2mw', 'corpus4mw']
    
# Process both corpora files
for corpus in corpus_files:
    print(f"\nProcessing {corpus}...")
    rexpr = extract_relevant_expressions(corpus)
    
    # Print top 20 relevant expressions
    print(f"\nTop 20 Relevant Expressions from {corpus}:")
    for i, (ngram, score, freq) in enumerate(rexpr[:20], 1):
        print(f"{i}. {ngram} (score: {score:.2f}, freq: {freq})")


Processing corpus2mw...

Top 20 Relevant Expressions from corpus2mw:
1. credited her second grade (score: 3.00, freq: 2)
2. males had a median (score: 3.00, freq: 37)
3. the bonneville salt flats (score: 3.00, freq: 2)
4. kennet and avon canal (score: 3.00, freq: 3)
5. someone living alone who (score: 3.00, freq: 49)
6. beta theta pi fraternity (score: 3.00, freq: 2)
7. http nl newsbank com (score: 3.00, freq: 2)
8. nl newsbank com nl (score: 3.00, freq: 2)
9. newsbank com nl search (score: 3.00, freq: 2)
10. com nl search we (score: 3.00, freq: 2)
11. nl search we archives (score: 3.00, freq: 2)
12. search we archives p_product (score: 3.00, freq: 2)
13. p_action search p_maxdocs 200 (score: 3.00, freq: 2)
14. search p_maxdocs 200 p_topdoc (score: 3.00, freq: 2)
15. p_maxdocs 200 p_topdoc 1 (score: 3.00, freq: 2)
16. 200 p_topdoc 1 p_text_direct (score: 3.00, freq: 2)
17. p_topdoc 1 p_text_direct 0 (score: 3.00, freq: 2)
18. p_field_direct 0 document_id p_perpage (score: 3.00, freq: 

In [None]:
# ...

# Part 2

In [1]:
# ...