### Imports

In [16]:
import os
import re
from collections import defaultdict
import math

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Part 1

In [17]:
# a)
def preprocess_text(text):
    # Add spaces around special characters
    text = re.sub(r'([;,!?<>()\[\]&])', r' \1 ', text)
    # Convert to lowercase and split into tokens
    return re.findall(r'\b\w+\b', text.lower())

# d)
def get_ngrams(tokens, n):
    return [' '.join(tokens[i:i+n]) for i in range(len(tokens)-n+1)]

**Four Cohesion Metrics:**

- SCP (Symmetrical Conditional Probability) - measures how likely the components co-occur compared to their independent occurrences

- Dice - Measures the overlap between components

- φ² (Phi-squared) - Chi-square based association measure

- MI (Mutual Information) - Information-theoretic measure

In [18]:
# c)
def calculate_scp(n_gram, n_gram_freq, sub_gram_freqs):
    numerator = n_gram_freq ** 2
    denominator = sum(f1 * f2 for f1, f2 in sub_gram_freqs) / len(sub_gram_freqs)
    return numerator / denominator if denominator!=0 else 0

# ------------------------------------------------------------------------

def calculate_dice(n_gram, n_gram_freq, sub_gram_freqs):
    if len(sub_gram_freqs) == 0:
        return 0
    
    sum_dice = 0
    for f1, f2 in sub_gram_freqs:
        sum_dice += (2 * n_gram_freq) / (f1 + f2) if (f1 + f2) else 0
    
    return sum_dice / len(sub_gram_freqs)

# ------------------------------------------------------------------------

def calculate_phi_squared(n_gram, n_gram_freq, sub_gram_freqs, total_ngrams):
    if len(sub_gram_freqs) == 0:
        return 0
    
    sum_phi = 0
    for f1, f2 in sub_gram_freqs:
        N = total_ngrams
        numerator = (N * n_gram_freq - f1 * f2) ** 2
        denominator = f1 * f2 * (N - f1) * (N - f2) if (f1 * f2 * (N - f1) * (N - f2)) else 1
        sum_phi += numerator / denominator
    
    return sum_phi / len(sub_gram_freqs)

# ------------------------------------------------------------------------

def calculate_mi(n_gram, n_gram_freq, sub_gram_freqs, total_ngrams):
    if len(sub_gram_freqs) == 0:
        return 0
    
    sum_mi = 0
    for f1, f2 in sub_gram_freqs:
        if f1 == 0 or f2 == 0:
            continue
        mi = math.log2((n_gram_freq * total_ngrams) / (f1 * f2))
        sum_mi += mi
    
    return sum_mi / len(sub_gram_freqs) if len(sub_gram_freqs) > 0 else 0

# ------------------------------------------------------------------------

def calculate_cohesion_score(ngram, freq, sub_pairs, total_ngrams, metric):
    if metric == 'scp':
        return calculate_scp(ngram, freq, sub_pairs)
    elif metric == 'dice':
        return calculate_dice(ngram, freq, sub_pairs)
    elif metric == 'phi_squared':
        return calculate_phi_squared(ngram, freq, sub_pairs, total_ngrams)
    elif metric == 'mi':
        return calculate_mi(ngram, freq, sub_pairs, total_ngrams)
    else:
        raise ValueError("Invalid metric. Please choose from 'scp', 'dice', 'phi_squared'")

- filtrar freq apenas no fim
- procurar ngramas desde 7 (max) para baixo

In [19]:
def extract_relevant_expressions(corpus_path, max_n=7, min_freq=2, metric='scp'):
    # Count all ngrams first
    ngram_counts = defaultdict(int)
    for root, _, files in os.walk(corpus_path):
        for file in files:
            with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                text = f.read()
                tokens = preprocess_text(text)
                for n in range(2, max_n+1):
                    for ngram in get_ngrams(tokens, n):
                        ngram_counts[ngram] += 1

    ngram_counts = {k: v for k, v in ngram_counts.items() if v >= min_freq}

    # Build n+1 lookup structure
    nplus1_lookup = defaultdict(list)
    for ngram in ngram_counts:
        words = ngram.split()
        if len(words) < max_n:
            # Create both possible extensions
            nplus1_lookup['* ' + ngram].append(ngram)
            nplus1_lookup[ngram + ' *'].append(ngram)

    # Calculate cohesion scores
    cohesion_scores = {}
    for ngram, freq in ngram_counts.items():
        words = ngram.split()
        if len(words) == 2:
            f_x = ngram_counts.get(words[0], 0)
            f_y = ngram_counts.get(words[1], 0)
            cohesion_scores[ngram] = (freq ** 2) / (f_x * f_y) if (f_x * f_y) else 0
        else:
            sub_pairs = []
            for i in range(1, len(words)):
                left = ' '.join(words[:i])
                right = ' '.join(words[i:])
                sub_pairs.append((ngram_counts.get(left, 0), ngram_counts.get(right, 0)))
            cohesion_scores[ngram] = calculate_cohesion_score(ngram, freq, sub_pairs, len(ngram_counts), metric)

    # Optimized LocalMaxs check
    relevant_expr = []
    for ngram in cohesion_scores:
        words = ngram.split()
        current_score = cohesion_scores[ngram]
        is_local_max = True

        # Check (n-1)-grams
        if len(words) > 2:
            for i in range(len(words)):
                sub_ngram = ' '.join(words[:i] + words[i+1:])
                if cohesion_scores.get(sub_ngram, 0) >= current_score:
                    is_local_max = False
                    break

        # Optimized (n+1)-gram check using lookup
        if is_local_max and len(words) < max_n:
            for pattern in ['* ' + ngram, ngram + ' *']:
                for parent in nplus1_lookup.get(pattern, []):
                    if cohesion_scores.get(parent, 0) >= current_score:
                        is_local_max = False
                        break
                if not is_local_max:
                    break

        if is_local_max:
            relevant_expr.append((ngram, cohesion_scores[ngram], ngram_counts[ngram]))

    return sorted(relevant_expr, key=lambda x: x[1], reverse=True)

In [20]:
corpus_files = ['corpus2mw', 'corpus4mw']
    
# Process both corpora files
for corpus in corpus_files:
    print(f"\nProcessing {corpus}...")
    rexpr = extract_relevant_expressions(corpus)
    
    # Print top 10 relevant expressions
    print(f"\nTop 10 Relevant Expressions from {corpus}:")
    for i, (ngram, score, freq) in enumerate(rexpr[:10], 1):
        print(f"{i}. {ngram} (score: {score:.2f}, frequency: {freq})")


Processing corpus2mw...

Top 10 Relevant Expressions from corpus2mw:
1. is assisted in his her duties by (score: 1.50, frequency: 2)
2. running british science fiction television series doctor (score: 1.50, frequency: 4)
3. county greater poland voivodeship in west central (score: 1.50, frequency: 6)
4. population belonged to no church are agnostic (score: 1.50, frequency: 3)
5. see http en wikipedia org wiki wikipedia (score: 1.50, frequency: 2)
6. fte basis for a student teacher ratio (score: 1.50, frequency: 2)
7. starlings are small to medium sized passerine (score: 1.50, frequency: 3)
8. gregarious their preferred habitat is fairly open (score: 1.50, frequency: 3)
9. viewers votes in the sing off they (score: 1.50, frequency: 2)
10. leaf string is in the language 0 (score: 1.50, frequency: 2)

Processing corpus4mw...

Top 10 Relevant Expressions from corpus4mw:
1. represented northern illinois university the huskies competed (score: 1.50, frequency: 2)
2. four doubles titles on t

In [21]:
# Now with other cohesion scores
for corpus in corpus_files:
    print(f"\nProcessing {corpus}...")
    rexpr = extract_relevant_expressions(corpus, metric='dice')   # CHANGE METRIC HERE -> 'scp' / 'dice' / 'phi_squared' / 'mi'
    
    # Print top 10 relevant expressions
    print(f"\nTop 10 Relevant Expressions from {corpus}:")
    for i, (ngram, score, freq) in enumerate(rexpr[:10], 1):
        print(f"{i}. {ngram} (score: {score:.2f}, frequency: {freq})")


Processing corpus2mw...

Top 10 Relevant Expressions from corpus2mw:
1. is assisted in his her duties by (score: 1.33, frequency: 2)
2. see http en wikipedia org wiki wikipedia (score: 1.33, frequency: 2)
3. fte basis for a student teacher ratio (score: 1.33, frequency: 2)
4. starlings are small to medium sized passerine (score: 1.33, frequency: 3)
5. gregarious their preferred habitat is fairly open (score: 1.33, frequency: 3)
6. viewers votes in the sing off they (score: 1.33, frequency: 2)
7. 21 occupation of goldsboro march 24 advance (score: 1.33, frequency: 2)
8. 24 advance on raleigh april 10 14 (score: 1.33, frequency: 2)
9. 14 occupation of raleigh april 14 bennett (score: 1.33, frequency: 2)
10. army march to washington d c via (score: 1.33, frequency: 2)

Processing corpus4mw...

Top 10 Relevant Expressions from corpus4mw:
1. represented northern illinois university the huskies competed (score: 1.33, frequency: 2)
2. four doubles titles on the itf tour (score: 1.33, frequen

In [22]:
# f) Evaluate the results of the extractor through the Precision, Recall and F metric, for at least two corpora. Consider one or more languages.

# ...

# Part 2

In [23]:
def get_explicit_keywords(relevant_expr, top_n=15):
    return [ngram for ngram, _, _ in relevant_expr[:top_n]]

def get_implicit_keywords(doc_text, explicit_keywords, top_n=15):
    if not explicit_keywords:
        return []
    
    vectorizer = TfidfVectorizer(vocabulary=explicit_keywords)
    try:
        doc_vector = vectorizer.fit_transform([doc_text])
        keyword_vectors = vectorizer.transform(explicit_keywords)
        sims = cosine_similarity(doc_vector, keyword_vectors)[0]
        return [explicit_kw for _, explicit_kw in sorted(zip(sims, explicit_keywords), reverse=True)[:top_n]]
    except ValueError:
        return []

In [24]:
def process_corpus(corpus_path):
    results = {}
    
    for root, _, files in os.walk(corpus_path):
        for file in files:
            with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                text = f.read()
                relevant_expr = extract_relevant_expressions(text)
                explicit = get_explicit_keywords(relevant_expr)
                implicit = get_implicit_keywords(text, explicit)
                
                results[file] = {
                    'explicit': explicit,
                    'implicit': implicit
                }
    
    return results

In [25]:
results = process_corpus("corpus2mw")
for doc, keys in list(results.items())[:3]:
    print(f"\n{doc}\nExplicit: {keys['explicit'][:3]}\nImplicit: {keys['implicit'][:3]}")

ValueError: scandir: path too long for Windows