### Imports

In [1]:
import os
import re
from collections import defaultdict
import math
import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Part 1

In [2]:
# a)
def preprocess_text(text):
    """Turn raw text into a list of lowercase word tokens.

    Selected punctuation and bracket characters are padded with spaces, the
    text is lowercased, and every run of word characters is returned in the
    order it appears.
    """
    # Pad punctuation/brackets so they are separated from adjacent words
    padded = re.sub(r'([;,!?<>()\[\]&])', r' \1 ', text)
    lowered = padded.lower()
    # Keep only the word tokens
    word_pattern = re.compile(r'\b\w+\b')
    return word_pattern.findall(lowered)

# d)
def get_ngrams(tokens, n):
    """Return every contiguous window of ``n`` tokens as a space-joined string."""
    window_count = len(tokens) - n + 1
    ngrams = []
    for start in range(window_count):
        ngrams.append(' '.join(tokens[start:start + n]))
    return ngrams

**Four Cohesion Metrics:**

- SCP (Symmetrical Conditional Probability) - measures how likely the components are to co-occur compared with their independent occurrences

- Dice - measures the overlap between the components

- φ² (Phi-squared) - chi-square-based association measure

- MI (Mutual Information) - information-theoretic association measure

In [3]:
# c)
def calculate_scp(n_gram, n_gram_freq, sub_gram_freqs):
    """SCP-style cohesion score of an n-gram.

    Args:
        n_gram: The n-gram text (not used in the computation; kept so all
            metric functions share the same leading parameters).
        n_gram_freq: Frequency of the whole n-gram.
        sub_gram_freqs: Sequence of ``(left_freq, right_freq)`` pairs, one per
            split point of the n-gram.

    Returns:
        ``n_gram_freq ** 2`` divided by the mean of ``left_freq * right_freq``
        over all split points, or 0 when there are no split points or that
        mean is zero.
    """
    # No split points: return 0 instead of dividing by len() == 0
    if len(sub_gram_freqs) == 0:
        return 0

    numerator = n_gram_freq ** 2
    denominator = sum(f1 * f2 for f1, f2 in sub_gram_freqs) / len(sub_gram_freqs)
    return numerator / denominator if denominator != 0 else 0

# ------------------------------------------------------------------------

def calculate_dice(n_gram, n_gram_freq, sub_gram_freqs):
    """Dice-style cohesion score averaged over all split points.

    Each ``(left_freq, right_freq)`` pair contributes
    ``2 * n_gram_freq / (left_freq + right_freq)``; a pair whose frequencies
    sum to zero contributes nothing. Returns 0 when there are no pairs.
    """
    split_count = len(sub_gram_freqs)
    if not split_count:
        return 0

    doubled_freq = 2 * n_gram_freq
    total = 0
    for left_freq, right_freq in sub_gram_freqs:
        combined = left_freq + right_freq
        if combined:
            total += doubled_freq / combined

    return total / split_count

# ------------------------------------------------------------------------

def calculate_phi_squared(n_gram, n_gram_freq, sub_gram_freqs, total_ngrams):
    """Phi-squared cohesion score averaged over all split points.

    Args:
        n_gram: The n-gram text (not used in the computation).
        n_gram_freq: Frequency of the whole n-gram.
        sub_gram_freqs: Sequence of ``(left_freq, right_freq)`` pairs, one per
            split point of the n-gram.
        total_ngrams: Corpus size ``N`` used to normalise the frequencies.

    Returns:
        The mean over all pairs of
        ``(N * freq - f1 * f2) ** 2 / (f1 * f2 * (N - f1) * (N - f2))``.
        A pair whose denominator is zero is undefined and contributes 0.
        Returns 0 when there are no pairs.
    """
    if len(sub_gram_freqs) == 0:
        return 0

    N = total_ngrams
    sum_phi = 0
    for f1, f2 in sub_gram_freqs:
        denominator = f1 * f2 * (N - f1) * (N - f2)
        if denominator == 0:
            # Undefined term: skip it rather than dividing the numerator by 1,
            # which would inject an arbitrarily large value into the average.
            continue
        sum_phi += (N * n_gram_freq - f1 * f2) ** 2 / denominator

    return sum_phi / len(sub_gram_freqs)

# ------------------------------------------------------------------------

def calculate_mi(n_gram, n_gram_freq, sub_gram_freqs, total_ngrams):
    """Mutual-information cohesion score averaged over all split points.

    Args:
        n_gram: The n-gram text (not used in the computation).
        n_gram_freq: Frequency of the whole n-gram.
        sub_gram_freqs: Sequence of ``(left_freq, right_freq)`` pairs, one per
            split point of the n-gram.
        total_ngrams: Corpus size ``N`` used to normalise the frequencies.

    Returns:
        The sum of ``log2(freq * N / (f1 * f2))`` over the pairs with non-zero
        frequencies, divided by the total number of pairs. Returns 0 when
        there are no pairs or when ``n_gram_freq`` / ``total_ngrams`` is not
        positive (the logarithm is undefined there).
    """
    if len(sub_gram_freqs) == 0:
        return 0

    # log2 of a non-positive value raises ValueError; treat it as "no cohesion"
    if n_gram_freq <= 0 or total_ngrams <= 0:
        return 0

    joint = n_gram_freq * total_ngrams
    sum_mi = 0
    for f1, f2 in sub_gram_freqs:
        if f1 == 0 or f2 == 0:
            continue
        sum_mi += math.log2(joint / (f1 * f2))

    return sum_mi / len(sub_gram_freqs)

# ------------------------------------------------------------------------
# choose between metrics
def calculate_cohesion_score(ngram, freq, sub_pairs, total_ngrams, metric):
    """Dispatch to the cohesion metric selected by ``metric``.

    Args:
        ngram: The n-gram text.
        freq: Frequency of the n-gram.
        sub_pairs: Sequence of ``(left_freq, right_freq)`` pairs, one per
            split point of the n-gram.
        total_ngrams: Corpus size; only used by 'phi_squared' and 'mi'.
        metric: One of 'scp', 'dice', 'phi_squared' or 'mi'.

    Returns:
        The score computed by the chosen metric function.

    Raises:
        ValueError: If ``metric`` is not one of the supported names.
    """
    if metric == 'scp':
        return calculate_scp(ngram, freq, sub_pairs)
    if metric == 'dice':
        return calculate_dice(ngram, freq, sub_pairs)
    if metric == 'phi_squared':
        return calculate_phi_squared(ngram, freq, sub_pairs, total_ngrams)
    if metric == 'mi':
        return calculate_mi(ngram, freq, sub_pairs, total_ngrams)
    # List every supported metric (the message previously omitted 'mi')
    raise ValueError(
        f"Invalid metric {metric!r}. Please choose from 'scp', 'dice', 'phi_squared', 'mi'"
    )

In [5]:
def extract_relevant_expressions(corpus_path, max_n=7, min_freq=2, metric='scp'):
    """Extract multi-word relevant expressions from a corpus directory.

    Every file under ``corpus_path`` is tokenised with ``preprocess_text`` and
    all n-grams of 1..``max_n`` tokens are counted. Each n-gram of two or more
    tokens gets a cohesion score, and a LocalMaxs-style filter keeps those
    whose score exceeds the quadratic mean of (a) the best score among its two
    contiguous (n-1)-grams and (b) the best score among the (n+1)-grams that
    contain it, on either side.

    Args:
        corpus_path: Directory that is walked recursively; files are read as
            UTF-8 text.
        max_n: Largest n-gram size counted. N-grams of exactly this size have
            no counted (n+1)-grams, so their parent score is taken as 0.
        min_freq: Minimum corpus frequency for an n-gram to be returned.
        metric: Cohesion metric name passed to ``calculate_cohesion_score``.

    Returns:
        A list of ``(ngram, score, freq)`` tuples sorted by descending number
        of words, then descending score, then descending frequency.

    Raises:
        ValueError: Propagated from ``calculate_cohesion_score`` for an
            unknown ``metric``.
    """
    start_time = time.time()

    ngram_counts = defaultdict(int)
    total_ngrams = 0  # total number of tokens read; used as corpus size by the metrics

    # Count every n-gram (1..max_n tokens) starting at every token position
    for root, _, files in os.walk(corpus_path):
        for file in files:
            with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                tokens = preprocess_text(f.read())
            total_ngrams += len(tokens)

            for i in range(len(tokens)):
                for n in range(min(max_n, len(tokens) - i), 0, -1):  # descending
                    ngram_counts[' '.join(tokens[i:i+n])] += 1

    print("All ngrams counted -", round(time.time() - start_time, 2), "seconds")

    # Group n-grams by length once, instead of rescanning (and re-splitting)
    # the whole table for every n. Tokens never contain spaces, so the word
    # count is the number of separators plus one.
    ngrams_by_len = defaultdict(list)
    for ngram in ngram_counts:
        ngrams_by_len[ngram.count(' ') + 1].append(ngram)

    # Cohesion score, longest n-grams first
    cohesion_scores = {}
    best_parent = {}  # (n-1)-gram -> best score among the n-grams containing it
    for n in range(max_n, 1, -1):
        for ngram in ngrams_by_len.get(n, ()):
            words = ngram.split()
            sub_pairs = [(ngram_counts.get(' '.join(words[:i]), 0),
                          ngram_counts.get(' '.join(words[i:]), 0))
                         for i in range(1, n)]

            score = calculate_cohesion_score(ngram, ngram_counts[ngram], sub_pairs, total_ngrams, metric)
            cohesion_scores[ngram] = score

            # This n-gram is a parent of both of its contiguous (n-1)-grams
            # (left and right extension). Unigrams are never candidates, so
            # bigrams do not need to register as parents.
            if n > 2:
                for child in (' '.join(words[:-1]), ' '.join(words[1:])):
                    known = best_parent.get(child)
                    if known is None or score > known:
                        best_parent[child] = score

    print("All cohesion scores calculated -", round(time.time() - start_time, 2), "seconds")

    # LocalMaxs filter
    results = []
    for ngram, score in cohesion_scores.items():
        freq = ngram_counts[ngram]

        # Skip if below the minimum frequency
        if freq < min_freq:
            continue

        # Best score among the two contiguous (n-1)-grams; for bigrams these
        # are unigrams, which have no cohesion score and count as 0.
        words = ngram.split()
        max_sub = max(cohesion_scores.get(' '.join(words[:-1]), 0),
                      cohesion_scores.get(' '.join(words[1:]), 0))

        # Best score among the (n+1)-grams containing this n-gram (0 if none)
        max_parent = best_parent.get(ngram, 0)

        threshold = math.sqrt(0.5 * (max_sub**2 + max_parent**2))
        if score > threshold:
            results.append((ngram, score, freq))

    print(f"Finished '{corpus_path}' - {round(time.time() - start_time, 2)} seconds")

    return sorted(results, key=lambda x: (-len(x[0].split()), -x[1], -x[2]))

In [6]:
# Test with the first, smaller corpus: 'corpus2mw'
# Runs the extractor with its default arguments (max_n=7, min_freq=2, metric='scp').

print("\nProcessing 'corpus2mw'...")
rexpr = extract_relevant_expressions('corpus2mw')   # default metric: SCP

# Print the top 10 relevant expressions.
# rexpr is sorted by number of words first, so these are the longest expressions found.
print(f"\nTop 10 Relevant Expressions from corpus2mw:")
for i, (ngram, score, freq) in enumerate(rexpr[:10], 1):
    print(f"{i}. {ngram} (score: {score:.2f}, frequency: {freq})")


Processing 'corpus2mw'...
All ngrams counted - 43.19 seconds
All cohesion scores calculated - 124.36 seconds
Finished 'corpus2mw' - 133.84 seconds

Top 10 Relevant Expressions from corpus2mw:
1. newsbank com nl search we archives p_product (score: 1.00, frequency: 2)
2. p_action search p_maxdocs 200 p_topdoc 1 p_text_direct (score: 1.00, frequency: 2)
3. p_field_direct 0 document_id p_perpage 10 p_sort ymd_date (score: 1.00, frequency: 2)
4. document_id p_perpage 10 p_sort ymd_date d s_trackval (score: 1.00, frequency: 2)
5. p_perpage 10 p_sort ymd_date d s_trackval googlepm (score: 1.00, frequency: 2)
6. prokaryotic names with standing in nomenclature lpsn (score: 1.00, frequency: 2)
7. url http en wikipedia org wiki curid (score: 1.00, frequency: 4788)
8. starlings are small to medium sized passerine (score: 0.95, frequency: 3)
9. nemili is a panchayat town in vellore (score: 0.67, frequency: 2)
10. synaptic potential that makes a postsynaptic neuron (score: 0.60, frequency: 2)


In [None]:
# Corpus directories to process; this list is reused by the metric-comparison cell.
corpus_files = ['corpus2mw', 'corpus4mw']

# Process both corpora with the default SCP metric.
# NOTE(review): 'corpus2mw' was already processed in the previous cell, so it is run twice.
for corpus in corpus_files:
    print(f"\nProcessing {corpus}...")
    rexpr = extract_relevant_expressions(corpus)   # default metric: SCP

    # Print the top 10 relevant expressions for this corpus
    print(f"\nTop 10 Relevant Expressions from {corpus}:")
    for i, (ngram, score, freq) in enumerate(rexpr[:10], 1):
        print(f"{i}. {ngram} (score: {score:.2f}, frequency: {freq})")


Processing corpus2mw...
All ngrams counted - 156.18
All cohesion scores calculated - 247.52

Top 10 Relevant Expressions from corpus2mw:
1. newsbank com nl search we archives p_product (score: 1.00, frequency: 2)
2. p_action search p_maxdocs 200 p_topdoc 1 p_text_direct (score: 1.00, frequency: 2)
3. p_field_direct 0 document_id p_perpage 10 p_sort ymd_date (score: 1.00, frequency: 2)
4. document_id p_perpage 10 p_sort ymd_date d s_trackval (score: 1.00, frequency: 2)
5. p_perpage 10 p_sort ymd_date d s_trackval googlepm (score: 1.00, frequency: 2)
6. prokaryotic names with standing in nomenclature lpsn (score: 1.00, frequency: 2)
7. url http en wikipedia org wiki curid (score: 1.00, frequency: 4788)
8. starlings are small to medium sized passerine (score: 0.95, frequency: 3)
9. nemili is a panchayat town in vellore (score: 0.67, frequency: 2)
10. synaptic potential that makes a postsynaptic neuron (score: 0.60, frequency: 2)

Processing corpus4mw...


In [None]:
# Repeat the extraction with a different cohesion metric.
# Relies on `corpus_files` defined in an earlier cell.

metric = 'dice'   # CHANGE METRIC HERE -> 'scp' / 'dice' / 'phi_squared' / 'mi'

for corpus in corpus_files:
    print(f"\nProcessing {corpus}...")
    rexpr = extract_relevant_expressions(corpus, metric=metric)

    # Print the top 5 relevant expressions for this corpus and metric
    print(f"\nTop 5 Relevant Expressions from {corpus} (with metric '{metric}'):")
    for i, (ngram, score, freq) in enumerate(rexpr[:5], 1):
        print(f"{i}. {ngram} (score: {score:.2f}, frequency: {freq})")

In [None]:
# f) Evaluate the extractor's results using Precision, Recall and the F-measure, for at least two corpora. Consider one or more languages.

# ...

# Part 2

In [None]:
def get_explicit_keywords(relevant_expr, top_n=15):
    """Return the n-gram text of the first ``top_n`` ``(ngram, score, freq)`` entries."""
    keywords = []
    for expression, _score, _freq in relevant_expr[:top_n]:
        keywords.append(expression)
    return keywords

def get_implicit_keywords(doc_text, explicit_keywords, top_n=15):
    """Rank keywords by TF-IDF cosine similarity to a document.

    Args:
        doc_text: Raw text of the document.
        explicit_keywords: Candidate keywords. They are expected to be
            lowercase, single-space-joined token sequences (the form produced
            by ``extract_relevant_expressions``), since they are matched
            against n-grams generated by the vectorizer.
        top_n: Maximum number of keywords returned.

    Returns:
        Up to ``top_n`` keywords ordered by descending similarity (ties are
        broken by descending keyword string), or ``[]`` when there are no
        usable keywords or the vectorizer rejects the input.
    """
    if not explicit_keywords:
        return []

    # Drop duplicates (a repeated vocabulary term makes the vectorizer raise)
    # and blank entries, keeping the original order.
    keywords = [kw for kw in dict.fromkeys(explicit_keywords) if kw.split()]
    if not keywords:
        return []

    # Multi-word keywords only match if the analyzer emits n-grams of the same
    # lengths; the default ngram_range=(1, 1) would leave every vector at zero.
    lengths = [len(kw.split()) for kw in keywords]
    vectorizer = TfidfVectorizer(
        vocabulary=keywords,
        ngram_range=(min(lengths), max(lengths)),
        # Keep one-character tokens, matching preprocess_text's r'\b\w+\b'
        token_pattern=r'(?u)\b\w+\b',
    )
    try:
        doc_vector = vectorizer.fit_transform([doc_text])
        keyword_vectors = vectorizer.transform(keywords)
        sims = cosine_similarity(doc_vector, keyword_vectors)[0]
        ranked = sorted(zip(sims, keywords), reverse=True)[:top_n]
        return [keyword for _, keyword in ranked]
    except ValueError:
        # Best-effort: a vectorizer failure yields no implicit keywords
        return []

In [None]:
def process_corpus(corpus_path):
    """Compute explicit and implicit keywords for every document in a corpus.

    Relevant expressions are extracted once for the whole corpus directory
    (``extract_relevant_expressions`` walks a directory, so it cannot be given
    a document's text). For each document, the expressions that occur in it
    are selected in corpus ranking order and passed to
    ``get_explicit_keywords``; those keywords are then re-ranked by
    ``get_implicit_keywords``.

    Args:
        corpus_path: Directory that is walked recursively; files are read as
            UTF-8 text.

    Returns:
        A dict mapping each file name to
        ``{'explicit': [...], 'implicit': [...]}``.
    """
    relevant_expr = extract_relevant_expressions(corpus_path)

    # Position of each expression in the corpus ranking, and the n-gram sizes
    # that need to be generated per document.
    rank_of = {ngram: position for position, (ngram, _, _) in enumerate(relevant_expr)}
    sizes = sorted({len(ngram.split()) for ngram in rank_of})

    results = {}

    for root, _, files in os.walk(corpus_path):
        for file in files:
            with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                text = f.read()

            # Find which corpus-level expressions occur in this document
            tokens = preprocess_text(text)
            hits = set()
            for n in sizes:
                for ngram in get_ngrams(tokens, n):
                    position = rank_of.get(ngram)
                    if position is not None:
                        hits.add(position)
            doc_expr = [relevant_expr[position] for position in sorted(hits)]

            explicit = get_explicit_keywords(doc_expr)
            implicit = get_implicit_keywords(text, explicit)

            results[file] = {
                'explicit': explicit,
                'implicit': implicit
            }

    return results

In [None]:
# Run keyword extraction over 'corpus2mw' and preview the first three documents,
# showing up to three explicit and three implicit keywords for each.
results = process_corpus("corpus2mw")
for doc, keys in list(results.items())[:3]:
    print(f"\n{doc}\nExplicit: {keys['explicit'][:3]}\nImplicit: {keys['implicit'][:3]}")