In [1]:
!pip install conllu
!pip install matplotlib
!pip install numpy
!pip install pandas
!pip install seaborn

Collecting conllu
  Downloading conllu-6.0.0-py3-none-any.whl.metadata (21 kB)
Downloading conllu-6.0.0-py3-none-any.whl (16 kB)
Installing collected packages: conllu
Successfully installed conllu-6.0.0


In [3]:
import conllu
import collections
import matplotlib.pyplot as plt
from pathlib import Path
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple, Counter, Optional
import seaborn as sns
from collections import defaultdict

class UzbekUDAnalyzer:
    """Analyzer for Universal Dependencies treebank."""

    def __init__(self, file_path: str):
        """Initialize with path to a CoNLL-U file."""
        self.file_path = file_path
        self.sentences = self._load_data()
        self.pos_counts = self._count_pos_tags()
        self.dependency_counts = self._count_dependency_relations()
        self.pos_bigrams = self._count_pos_bigrams()
        self.pos_trigrams = self._count_pos_trigrams()

    def _load_data(self) -> List:
        """Load the CoNLL-U file into a list of sentences."""
        with open(self.file_path, "r", encoding="utf-8") as f:
            return list(conllu.parse(f.read()))

    def _count_pos_tags(self) -> Counter:
        """Count the occurrences of each POS tag."""
        pos_counter = Counter()
        for sentence in self.sentences:
            for token in sentence:
                if token["upos"]:  # Ensure there's a POS tag
                    pos_counter[token["upos"]] += 1
        return pos_counter

    def _count_dependency_relations(self) -> Counter:
        """Count the occurrences of each dependency relation."""
        deprel_counter = Counter()
        for sentence in self.sentences:
            for token in sentence:
                if token["deprel"]:  # Ensure there's a dependency relation
                    deprel_counter[token["deprel"]] += 1
        return deprel_counter

    def _count_pos_bigrams(self) -> Counter:
        """Count POS tag bigrams."""
        bigram_counter = Counter()
        for sentence in self.sentences:
            pos_tags = [token["upos"] for token in sentence if token["upos"]]
            for i in range(len(pos_tags) - 1):
                bigram = (pos_tags[i], pos_tags[i + 1])
                bigram_counter[bigram] += 1
        return bigram_counter

    def _count_pos_trigrams(self) -> Counter:
        """Count POS tag trigrams."""
        trigram_counter = Counter()
        for sentence in self.sentences:
            pos_tags = [token["upos"] for token in sentence if token["upos"]]
            for i in range(len(pos_tags) - 2):
                trigram = (pos_tags[i], pos_tags[i + 1], pos_tags[i + 2])
                trigram_counter[trigram] += 1
        return trigram_counter

    def get_token_count(self) -> int:
        """Return the total number of tokens in the treebank."""
        return sum(len(sentence) for sentence in self.sentences)

    def get_sentence_count(self) -> int:
        """Return the total number of sentences in the treebank."""
        return len(self.sentences)

    def get_avg_sentence_length(self) -> float:
        """Calculate the average sentence length in tokens."""
        return self.get_token_count() / self.get_sentence_count()

    def print_basic_stats(self):
        """Print basic statistics about the treebank."""
        print(f"Total sentences: {self.get_sentence_count()}")
        print(f"Total tokens: {self.get_token_count()}")
        print(f"Average sentence length: {self.get_avg_sentence_length():.2f} tokens")
        print(f"Number of unique POS tags: {len(self.pos_counts)}")
        print(f"Number of unique dependency relations: {len(self.dependency_counts)}")

    def print_pos_distribution(self, top_n: int = 10):
        """Print the distribution of POS tags."""
        total = sum(self.pos_counts.values())
        print(f"\nPOS Tag Distribution (top {top_n}):")
        print("=" * 40)
        print(f"{'POS Tag':<10} {'Count':<10} {'Percentage':<10}")
        print("-" * 40)

        for pos, count in self.pos_counts.most_common(top_n):
            percentage = (count / total) * 100
            print(f"{pos:<10} {count:<10} {percentage:.2f}%")

    def print_dependency_distribution(self, top_n: int = 10):
        """Print the distribution of dependency relations."""
        total = sum(self.dependency_counts.values())
        print(f"\nDependency Relation Distribution (top {top_n}):")
        print("=" * 50)
        print(f"{'Dependency':<15} {'Count':<10} {'Percentage':<10}")
        print("-" * 50)

        for dep, count in self.dependency_counts.most_common(top_n):
            percentage = (count / total) * 100
            print(f"{dep:<15} {count:<10} {percentage:.2f}%")

    def print_common_pos_bigrams(self, top_n: int = 10):
        """Print the most common POS bigrams."""
        total = sum(self.pos_bigrams.values())
        print(f"\nCommon POS Bigrams (top {top_n}):")
        print("=" * 60)
        print(f"{'Bigram':<25} {'Count':<10} {'Percentage':<10}")
        print("-" * 60)

        for bigram, count in self.pos_bigrams.most_common(top_n):
            percentage = (count / total) * 100
            print(f"{' → '.join(bigram):<25} {count:<10} {percentage:.2f}%")

    def print_common_pos_trigrams(self, top_n: int = 10):
        """Print the most common POS trigrams."""
        total = sum(self.pos_trigrams.values())
        print(f"\nCommon POS Trigrams (top {top_n}):")
        print("=" * 70)
        print(f"{'Trigram':<35} {'Count':<10} {'Percentage':<10}")
        print("-" * 70)

        for trigram, count in self.pos_trigrams.most_common(top_n):
            percentage = (count / total) * 100
            print(f"{' → '.join(trigram):<35} {count:<10} {percentage:.2f}%")

    def analyze_verb_position(self):
        """Analyze the position of verbs in sentences."""
        positions = []
        for sentence in self.sentences:
            sentence_length = len(sentence)
            for token_idx, token in enumerate(sentence):
                if token["upos"] == "VERB":
                    # Calculate relative position (0-1 scale)
                    relative_pos = token_idx / max(1, sentence_length - 1)
                    positions.append(relative_pos)

        # Calculate statistics
        if positions:
            avg_pos = sum(positions) / len(positions)
            final_pos_count = sum(1 for p in positions if p > 0.9)
            final_pos_pct = (final_pos_count / len(positions)) * 100

            print("\nVerb Position Analysis:")
            print("=" * 40)
            print(f"Total verbs analyzed: {len(positions)}")
            print(f"Average relative position: {avg_pos:.2f} (0=start, 1=end)")
            print(f"Verbs in final position (>90%): {final_pos_count} ({final_pos_pct:.2f}%)")

            # Create histogram
            plt.figure(figsize=(10, 6))
            plt.hist(positions, bins=10, alpha=0.7, color='blue')
            plt.title('Distribution of Verb Positions in Sentences')
            plt.xlabel('Relative Position (0=start, 1=end)')
            plt.ylabel('Frequency')
            plt.grid(alpha=0.3)
            plt.savefig('verb_positions.png')
            plt.close()
            print("Generated 'verb_positions.png' - histogram of verb positions")
        else:
            print("No verbs found in the treebank")

    def analyze_det_adj_noun_order(self):
        """Analyze determiner-adjective-noun order in the treebank."""
        det_adj_noun = 0
        adj_det_noun = 0

        for sentence in self.sentences:
            pos_tags = [token["upos"] for token in sentence]

            for i in range(len(pos_tags) - 1):
                if pos_tags[i] == "DET" and pos_tags[i+1] == "ADJ" and pos_tags[i+2] == "NOUN":
                    det_adj_noun += 1
                elif pos_tags[i] == "ADJ" and pos_tags[i+1] == "DET" and pos_tags[i+2] == "NOUN":
                    adj_det_noun += 1

        total = det_adj_noun + adj_det_noun
        if total > 0:
            print("\nAdjective-Noun Order Analysis:")
            print("=" * 40)
            print(f"Determiner before Adjective: {det_adj_noun} ({det_adj_noun/total*100:.2f}%)")
            print(f"Adjective before Determiner: {adj_det_noun} ({adj_det_noun/total*100:.2f}%)")
        else:
            print("\nNo determiner-adjective-noun pairs found in the treebank")

    def analyze_adposition_usage(self):
        """Analyze whether the language uses prepositions or postpositions."""
        prepositions = 0
        postpositions = 0

        for sentence in self.sentences:
            for token_idx, token in enumerate(sentence):
                if token["upos"] == "ADP":
                    head_idx = token["head"] - 1  # CoNLL-U uses 1-based indexing

                    # Skip if head is out of bounds
                    if head_idx < 0 or head_idx >= len(sentence):
                        continue

                    if token_idx < head_idx:  # ADP comes before its head
                        prepositions += 1
                    else:  # ADP comes after its head
                        postpositions += 1

        total = prepositions + postpositions
        if total > 0:
            print("\nAdposition Usage Analysis:")
            print("=" * 40)
            print(f"Prepositions: {prepositions} ({prepositions/total*100:.2f}%)")
            print(f"Postpositions: {postpositions} ({postpositions/total*100:.2f}%)")
        else:
            print("\nNo adpositions found in the treebank")

    def analyze_subject_verb_object_order(self):
        """Analyze the order of subject, verb, and object."""
        # Dictionary to store counts of each order
        orders = {
            "SOV": 0, "SVO": 0, "VSO": 0,
            "VOS": 0, "OVS": 0, "OSV": 0
        }

        # Initialize counts of valid clauses with clear SVO elements
        valid_clauses = 0

        for sentence in self.sentences:
            # Find subject, verb, and object in the sentence
            s_pos, v_pos, o_pos = None, None, None

            for token in sentence:
                if token["deprel"] == "nsubj" and token["head"] > 0:
                    # The subject's position
                    s_pos = token["id"] - 1

                    # The verb position (head of the subject)
                    head_idx = token["head"] - 1
                    if head_idx < len(sentence) and sentence[head_idx]["upos"] in ["VERB", "AUX"]:
                        v_pos = head_idx

            # Find direct object connected to the verb
            if v_pos is not None:
                verb_id = v_pos + 1  # Convert to 1-based index
                for token in sentence:
                    if token["deprel"] == "obj" and token["head"] == verb_id:
                        o_pos = token["id"] - 1

            # If we have all three components, determine order
            if s_pos is not None and v_pos is not None and o_pos is not None:
                valid_clauses += 1
                positions = {s_pos: "S", v_pos: "V", o_pos: "O"}
                order = "".join(positions[pos] for pos in sorted([s_pos, v_pos, o_pos]))
                orders[order] += 1

        if valid_clauses > 0:
            print("\nSubject-Verb-Object Order Analysis:")
            print("=" * 40)
            print(f"Valid clauses analyzed: {valid_clauses}")

            # Sort orders by frequency (descending)
            sorted_orders = sorted(orders.items(), key=lambda x: x[1], reverse=True)

            for order, count in sorted_orders:
                if count > 0:
                    print(f"{order}: {count} ({count/valid_clauses*100:.2f}%)")

            # Create bar chart
            orders_list = [order for order, count in sorted_orders if count > 0]
            counts = [count for order, count in sorted_orders if count > 0]

            plt.figure(figsize=(10, 6))
            bars = plt.bar(orders_list, counts, color='skyblue')
            plt.title('Subject-Verb-Object Order Distribution')
            plt.xlabel('Word Order')
            plt.ylabel('Frequency')

            # Add percentage labels on top of bars
            for bar, count in zip(bars, counts):
                height = bar.get_height()
                plt.text(bar.get_x() + bar.get_width()/2., height + 0.1,
                        f'{count/valid_clauses*100:.1f}%',
                        ha='center', va='bottom')

            plt.savefig('svo_order.png')
            plt.close()
            print("Generated 'svo_order.png' - bar chart of word order distribution")
        else:
            print("\nNo valid subject-verb-object clauses found in the treebank")

    def visualize_pos_distribution(self):
        """Create a bar chart of POS tag distribution."""
        top_pos = dict(self.pos_counts.most_common(10))

        plt.figure(figsize=(12, 6))
        bars = plt.bar(top_pos.keys(), top_pos.values(), color='teal')
        plt.title('Distribution of POS Tags')
        plt.xlabel('POS Tag')
        plt.ylabel('Frequency')
        plt.xticks(rotation=45)

        # Add count labels on top of bars
        for bar in bars:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2., height + 0.1,
                    f'{height}', ha='center', va='bottom')

        plt.tight_layout()
        plt.savefig('pos_distribution.png')
        plt.close()
        print("\nGenerated 'pos_distribution.png' - bar chart of POS tag distribution")

    def analyze_pos_transitions(self):
        """Analyze and visualize common transitions between POS tags."""
        # Create a transition matrix
        unique_pos = list(self.pos_counts.keys())
        num_pos = len(unique_pos)
        pos_to_idx = {pos: idx for idx, pos in enumerate(unique_pos)}

        # Initialize transition matrix
        transition_matrix = np.zeros((num_pos, num_pos))

        # Fill the transition matrix
        for bigram, count in self.pos_bigrams.items():
            if len(bigram) == 2:  # Ensure it's a valid bigram
                i, j = pos_to_idx[bigram[0]], pos_to_idx[bigram[1]]
                transition_matrix[i, j] = count

        # Convert to probability matrix
        row_sums = transition_matrix.sum(axis=1, keepdims=True)
        prob_matrix = np.zeros_like(transition_matrix)
        np.divide(transition_matrix, row_sums, out=prob_matrix, where=row_sums!=0)

        # Plot the heatmap
        plt.figure(figsize=(12, 10))
        sns.heatmap(prob_matrix, annot=True, fmt='.2f', cmap='viridis',
                   xticklabels=unique_pos, yticklabels=unique_pos)
        plt.title('POS Tag Transition Probabilities')
        plt.xlabel('To POS')
        plt.ylabel('From POS')
        plt.tight_layout()
        plt.savefig('pos_transitions.png')
        plt.close()
        print("\nGenerated 'pos_transitions.png' - heatmap of POS tag transitions")

        # Return the most common transition for each POS tag
        most_common_next = {}
        for i, pos in enumerate(unique_pos):
            if row_sums[i][0] > 0:  # Check if the POS tag has any transitions
                next_pos_idx = np.argmax(prob_matrix[i])
                prob = prob_matrix[i][next_pos_idx]
                most_common_next[pos] = (unique_pos[next_pos_idx], prob)

        print("\nMost Common POS Tag Transitions:")
        print("=" * 50)
        print(f"{'From POS':<10} {'To POS':<10} {'Probability':<10}")
        print("-" * 50)

        for pos, (next_pos, prob) in sorted(most_common_next.items()):
            print(f"{pos:<10} {next_pos:<10} {prob:.4f}")

# Example usage
def analyze_treebank(file_path):
    analyzer = UzbekUDAnalyzer(file_path)

    print("=" * 50)
    print("UD TREEBANK ANALYSIS")
    print("=" * 50)

    # Basic statistics
    analyzer.print_basic_stats()

    # POS distribution
    analyzer.print_pos_distribution()
    analyzer.visualize_pos_distribution()

    # Dependency distribution
    analyzer.print_dependency_distribution()

    # Common POS sequences
    analyzer.print_common_pos_bigrams(15)
    analyzer.print_common_pos_trigrams(10)

    # Word order analysis
    analyzer.analyze_verb_position()
    analyzer.analyze_det_adj_noun_order()
    analyzer.analyze_adposition_usage()
    analyzer.analyze_subject_verb_object_order()

    # POS transitions
    analyzer.analyze_pos_transitions()

# To use the script, call the function with the path to your CoNLL-U file:
# analyze_uzbek_treebank("path/to/your/uzbek_treebank.conllu")

if __name__ == "__main__":
    # Replace with your actual file path
    conllu_file = input("Enter the path to your UD treebank file (CoNLL-U format): ")
    analyze_treebank(conllu_file)

Enter the path to your UD treebank file (CoNLL-U format): /content/tr_tuecl-ud-test.fa.conllu
UZBEK UD TREEBANK ANALYSIS
Total sentences: 148
Total tokens: 917
Average sentence length: 6.20 tokens
Number of unique POS tags: 15
Number of unique dependency relations: 39

POS Tag Distribution (top 10):
POS Tag    Count      Percentage
----------------------------------------
NOUN       220        23.99%
VERB       175        19.08%
PUNCT      167        18.21%
PROPN      92         10.03%
AUX        76         8.29%
ADJ        44         4.80%
ADV        42         4.58%
PRON       40         4.36%
_          19         2.07%
CCONJ      14         1.53%

Generated 'pos_distribution.png' - bar chart of POS tag distribution

Dependency Relation Distribution (top 10):
Dependency      Count      Percentage
--------------------------------------------------
punct           167        18.21%
root            148        16.14%
nsubj           122        13.30%
obl             71         7.74%
obj