In [1]:
!pip install conllu
!pip install matplotlib seaborn pandas

Collecting conllu
  Downloading conllu-6.0.0-py3-none-any.whl.metadata (21 kB)
Downloading conllu-6.0.0-py3-none-any.whl (16 kB)
Installing collected packages: conllu
Successfully installed conllu-6.0.0


In [3]:
import conllu
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from collections import defaultdict
from pathlib import Path
import itertools

In [4]:
def calculate_pos_edit_distance(sentence1, sentence2):
    """
    Calculate the edit distance between two sentences based on POS tags.

    Args:
        sentence1: A list of tokens from first treebank sentence
        sentence2: A list of tokens from second treebank sentence

    Returns:
        Edit distance value and normalized edit distance (0-1 scale)
    """
    # Extract POS tags from each sentence
    pos_seq1 = [token["upos"] for token in sentence1 if token["upos"]]
    pos_seq2 = [token["upos"] for token in sentence2 if token["upos"]]

    # Implement Levenshtein distance
    m, n = len(pos_seq1), len(pos_seq2)

    # Create matrix
    dp = [[0 for _ in range(n + 1)] for _ in range(m + 1)]

    # Initialize first row and column
    for i in range(m + 1):
        dp[i][0] = i
    for j in range(n + 1):
        dp[0][j] = j

    # Fill the matrix
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if pos_seq1[i-1] == pos_seq2[j-1] else 1
            dp[i][j] = min(
                dp[i-1][j] + 1,      # deletion
                dp[i][j-1] + 1,      # insertion
                dp[i-1][j-1] + cost  # substitution
            )

    edit_distance = dp[m][n]

    # Normalize by the length of the longer sequence
    max_length = max(m, n)
    normalized_distance = edit_distance / max_length if max_length > 0 else 0

    return edit_distance, normalized_distance

class MultilingualTreebankAnalyzer:
    """Analyzer for multilingual parallel treebanks to compare POS ordering across languages."""

    def __init__(self, treebank_files):
        """
        Initialize with paths to multiple parallel CoNLL-U files.

        Args:
            treebank_files: Dictionary mapping language names to file paths
        """
        self.languages = list(treebank_files.keys())
        self.treebank_files = treebank_files

        # Load all treebanks
        self.treebanks = {}
        self.sentence_counts = {}

        for lang, file_path in treebank_files.items():
            try:
                self.treebanks[lang] = self._load_data(file_path)
                self.sentence_counts[lang] = len(self.treebanks[lang])
                print(f"Loaded {lang} treebank: {self.sentence_counts[lang]} sentences")
            except Exception as e:
                print(f"Error loading {lang} treebank: {e}")
                self.treebanks[lang] = []
                self.sentence_counts[lang] = 0

        # Get the minimum number of sentences across all treebanks
        self.min_sentences = min(self.sentence_counts.values()) if self.sentence_counts else 0

        # Store results
        self.edit_distance_results = {}

    def _load_data(self, file_path):
        """Load the CoNLL-U file into a list of sentences."""
        with open(file_path, "r", encoding="utf-8") as f:
            return list(conllu.parse(f.read()))

    def calculate_all_edit_distances(self):
        """
        Calculate edit distances between all language pairs.

        Returns:
            Dictionary with language pair tuples as keys and distance stats as values
        """
        # Generate all possible language pairs
        language_pairs = list(itertools.combinations(self.languages, 2))

        for lang1, lang2 in language_pairs:
            # Get treebanks
            treebank1 = self.treebanks[lang1]
            treebank2 = self.treebanks[lang2]

            # Calculate min sentences between these two languages
            min_sent = min(len(treebank1), len(treebank2))

            # Calculate edit distances
            edit_distances = []
            normalized_distances = []

            for i in range(min_sent):
                sent1 = treebank1[i]
                sent2 = treebank2[i]

                ed, norm_ed = calculate_pos_edit_distance(sent1, sent2)
                edit_distances.append(ed)
                normalized_distances.append(norm_ed)

            # Calculate statistics
            if edit_distances:
                avg_ed = sum(edit_distances) / len(edit_distances)
                avg_norm_ed = sum(normalized_distances) / len(normalized_distances)
                max_ed = max(edit_distances)
                min_ed = min(edit_distances)

                # Find examples of min and max edit distance
                min_idx = edit_distances.index(min_ed)
                max_idx = edit_distances.index(max_ed)

                # Calculate percentiles
                sorted_norm = sorted(normalized_distances)
                median_norm = sorted_norm[len(sorted_norm) // 2]
                q1_norm = sorted_norm[len(sorted_norm) // 4]
                q3_norm = sorted_norm[3 * len(sorted_norm) // 4]

                stats = {
                    "avg_edit_distance": avg_ed,
                    "avg_normalized_distance": avg_norm_ed,
                    "max_edit_distance": max_ed,
                    "min_edit_distance": min_ed,
                    "median_normalized": median_norm,
                    "q1_normalized": q1_norm,
                    "q3_normalized": q3_norm,
                    "min_example_idx": min_idx,
                    "max_example_idx": max_idx,
                    "all_distances": edit_distances,
                    "all_normalized": normalized_distances
                }

                # Store results
                self.edit_distance_results[(lang1, lang2)] = stats

        return self.edit_distance_results

    def generate_edit_distance_matrix(self):
        """
        Generate a matrix of average edit distances between all language pairs.

        Returns:
            Pandas DataFrame containing the distance matrix
        """
        # Make sure we have calculated distances
        if not self.edit_distance_results:
            self.calculate_all_edit_distances()

        # Create an empty DataFrame
        dist_matrix = pd.DataFrame(index=self.languages, columns=self.languages)

        # Fill diagonal with zeros (distance to self is zero)
        for lang in self.languages:
            dist_matrix.loc[lang, lang] = 0.0

        # Fill in the distances from our calculations
        for (lang1, lang2), stats in self.edit_distance_results.items():
            dist_matrix.loc[lang1, lang2] = stats["avg_normalized_distance"]
            dist_matrix.loc[lang2, lang1] = stats["avg_normalized_distance"]  # Matrix is symmetric

        return dist_matrix

    def visualize_edit_distance_matrix(self):
        """Create and save a heatmap of the edit distance matrix."""
        dist_matrix = self.generate_edit_distance_matrix()

        dist_matrix = dist_matrix.astype(float)

        dist_matrix = dist_matrix.fillna(0.0)

        plt.figure(figsize=(10, 8))
        sns.heatmap(dist_matrix, annot=True, cmap="YlGnBu", fmt=".3f")
        plt.title("Normalized Edit Distance Between Languages (POS Sequences)")
        plt.tight_layout()
        plt.savefig("multilingual_edit_distances.png")
        plt.close()
        print("\nGenerated 'multilingual_edit_distances.png'")

        return dist_matrix

    def print_edit_distance_stats(self):
        """Print detailed statistics about edit distances between all language pairs."""
        # Make sure we have calculated distances
        if not self.edit_distance_results:
            self.calculate_all_edit_distances()

        print("\nDetailed Edit Distance Statistics")
        print("=" * 60)

        for (lang1, lang2), stats in self.edit_distance_results.items():
            print(f"\n{lang1} vs {lang2}:")
            print("-" * 40)
            print(f"Number of sentence pairs analyzed: {len(stats['all_distances'])}")
            print(f"Average edit distance: {stats['avg_edit_distance']:.2f}")
            print(f"Average normalized edit distance: {stats['avg_normalized_distance']:.2f} (0-1 scale)")
            print(f"Minimum edit distance: {stats['min_edit_distance']}")
            print(f"Maximum edit distance: {stats['max_edit_distance']}")
            print(f"Normalized distance quartiles (Q1/Median/Q3): "
                  f"{stats['q1_normalized']:.2f} / {stats['median_normalized']:.2f} / {stats['q3_normalized']:.2f}")

            # Print examples of min and max edit distance
            print("\n  Example with minimum edit distance:")
            self._print_parallel_example(lang1, lang2, stats['min_example_idx'])

            print("\n  Example with maximum edit distance:")
            self._print_parallel_example(lang1, lang2, stats['max_example_idx'])

            # Create histogram for this language pair
            plt.figure(figsize=(8, 5))
            plt.hist(stats['all_normalized'], bins=20, alpha=0.7, color='green')
            plt.title(f'Distribution of Normalized Edit Distances: {lang1} vs {lang2}')
            plt.xlabel('Normalized Edit Distance (0-1)')
            plt.ylabel('Frequency')
            plt.grid(alpha=0.3)
            plt.savefig(f'edit_distance_{lang1}_{lang2}.png')
            plt.close()
            print(f"  Generated 'edit_distance_{lang1}_{lang2}.png'")

    def _print_parallel_example(self, lang1, lang2, idx):
        """Print a pair of parallel sentences with their POS sequences."""
        try:
            sent1 = self.treebanks[lang1][idx]
            sent2 = self.treebanks[lang2][idx]

            # Extract text and POS tags
            text1 = ' '.join([token["form"] for token in sent1])
            text2 = ' '.join([token["form"] for token in sent2])

            pos1 = [token["upos"] for token in sent1 if token["upos"]]
            pos2 = [token["upos"] for token in sent2 if token["upos"]]

            print(f"  {lang1}: {text1}")
            print(f"  {lang1} POS: {' '.join(pos1)}")
            print(f"  {lang2}: {text2}")
            print(f"  {lang2} POS: {' '.join(pos2)}")

            # Calculate and print edit distance for this example
            ed, norm_ed = calculate_pos_edit_distance(sent1, sent2)
            print(f"  Edit distance: {ed}, Normalized: {norm_ed:.2f}")

        except Exception as e:
            print(f"  Error printing example {idx}: {e}")


    def generate_dendrogram(self):
        """
        Generate a dendrogram showing language clustering based on edit distances.
        This visualizes which languages have the most similar POS ordering patterns.
        """
        # Get distance matrix
        dist_matrix = self.generate_edit_distance_matrix()

        # Create linkage matrix for hierarchical clustering
        from scipy.cluster.hierarchy import linkage, dendrogram

        # Convert DataFrame to condensed distance matrix (flattened upper triangular)
        distances = []
        for i in range(len(self.languages)):
            for j in range(i+1, len(self.languages)):
                distances.append(dist_matrix.iloc[i, j])

        # Calculate linkage
        Z = linkage(distances, method='average')

        # Plot dendrogram
        plt.figure(figsize=(10, 6))
        dendrogram(Z, labels=self.languages, leaf_rotation=90)
        plt.title('Language Clustering by POS Order Similarity')
        plt.xlabel('Languages')
        plt.ylabel('Distance')
        plt.tight_layout()
        plt.savefig('language_dendrogram.png')
        plt.close()
        print("\nGenerated 'language_dendrogram.png'")

    def analyze_all(self):
        """Run all analyses and generate all visualizations."""
        self.calculate_all_edit_distances()
        self.visualize_edit_distance_matrix()
        self.print_edit_distance_stats()
        self.generate_dendrogram()

def analyze_multilingual_treebanks(treebank_files):
    """
    Analyze multiple parallel treebanks.

    Args:
        treebank_files: Dictionary mapping language names to file paths
    """
    print("\n" + "=" * 60)
    print("MULTILINGUAL TREEBANK ANALYSIS")
    print("=" * 60)

    try:
        analyzer = MultilingualTreebankAnalyzer(treebank_files)
        analyzer.analyze_all()
    except Exception as e:
        print(f"Error in multilingual analysis: {e}")

# Example usage:
if __name__ == "__main__":
    # Get the treebank paths
    num_langs = int(input("How many languages do you want to compare? "))

    treebank_files = {}
    for i in range(num_langs):
        lang = input(f"Enter language name #{i+1}: ")
        path = input(f"Enter path to {lang} treebank file: ")
        treebank_files[lang] = path

    # Run multilingual analysis
    if len(treebank_files) > 1:
        analyze_multilingual_treebanks(treebank_files)
    else:
        print("At least 2 languages are needed for comparison.")

How many languages do you want to compare? 4
Enter language name #1: Azeri
Enter path to Azeri treebank file: /content/az_tuecl-ud-test.conllu
Enter language name #2: Turkish
Enter path to Turkish treebank file: /content/tr_tuecl-ud-test.fa.conllu
Enter language name #3: Kyrgyz
Enter path to Kyrgyz treebank file: /content/ky_tuecl-ud-test.conllu
Enter language name #4: Uzbek
Enter path to Uzbek treebank file: /content/uz_tuecl-ud-test.conllu

MULTILINGUAL TREEBANK ANALYSIS
Loaded Azeri treebank: 109 sentences
Loaded Turkish treebank: 148 sentences
Loaded Kyrgyz treebank: 145 sentences
Loaded Uzbek treebank: 148 sentences

Generated 'multilingual_edit_distances.png'

Detailed Edit Distance Statistics

Azeri vs Turkish:
----------------------------------------
Number of sentence pairs analyzed: 109
Average edit distance: 4.74
Average normalized edit distance: 0.59 (0-1 scale)
Minimum edit distance: 0
Maximum edit distance: 15
Normalized distance quartiles (Q1/Median/Q3): 0.50 / 0.60 / 0.