In [None]:
pip install pyconll

Collecting pyconll
  Downloading pyconll-3.2.0-py3-none-any.whl.metadata (8.0 kB)
Downloading pyconll-3.2.0-py3-none-any.whl (27 kB)
Installing collected packages: pyconll
Successfully installed pyconll-3.2.0


In [7]:
import pyconll
from collections import defaultdict

def find_redup_relations(conll_file_path):
    """
    Find every token that governs a 'compound:redup' dependent and tally the
    relation that governing token bears to its own head.

    Args:
        conll_file_path: Path to the CoNLL-U file to analyze.

    Returns:
        A tuple ``(results, relation_counts)``:
        - ``results`` maps each governing token object to a dict with the keys
          ``'head'`` (the governing token's own head token, or None when it is
          the root, has no head recorded, or the head ID cannot be found),
          ``'head_relation'`` (the token's deprel, or "root" when its head is
          "0") and ``'redup_dependents'`` (its 'compound:redup' dependents in
          sentence order).
        - ``relation_counts`` is a ``defaultdict(int)`` counting how many
          governing tokens carry each ``'head_relation'`` value.
    """
    # Load the CoNLL-U file
    corpus = pyconll.load_from_file(conll_file_path)

    results = {}
    relation_counts = defaultdict(int)

    for sentence in corpus:
        # Single pass over the sentence: bucket the reduplication dependents
        # by the ID of their head instead of rescanning the whole sentence
        # once per candidate token.
        redup_by_head = defaultdict(list)
        for dep in sentence:
            if dep.deprel == "compound:redup" and dep.head is not None:
                redup_by_head[dep.head].append(dep)

        if not redup_by_head:
            continue

        for token in sentence:
            # Skip multiword tokens (they don't participate in dependencies)
            if '-' in token.id:
                continue

            # .get() so that probing does not create empty buckets
            redup_dependents = redup_by_head.get(token.id)
            if not redup_dependents:
                continue

            is_root = token.head == "0"  # '0' means no head

            # Only look the head up when there is a head ID to look up. A
            # missing head (None) is handled here explicitly rather than being
            # handed to the sentence lookup, which is only guarded for KeyError.
            head_token = None
            if not is_root and token.head is not None:
                try:
                    head_token = sentence[token.head]
                except KeyError:
                    # Head ID is not present in this sentence
                    head_token = None

            relation = "root" if is_root else token.deprel

            # Store the information
            results[token] = {
                'head': head_token,
                'head_relation': relation,
                'redup_dependents': redup_dependents
            }

            # Count the relation type
            relation_counts[relation] += 1

    return results, relation_counts

def print_results(results, relation_counts):
    """
    Write a two-part report to stdout: the relation tallies ordered from most
    to least frequent, followed by one entry per governing token listing its
    relation, its head (when there is one) and its reduplication dependents.
    """
    def _describe(tok):
        # Shared "form (ID, Lemma)" rendering used on every token line
        return f"{tok.form} (ID: {tok.id}, Lemma: {tok.lemma})"

    # Part 1: tallies, highest count first (ties keep their insertion order)
    print("\nRELATION TYPE COUNTS:")
    print("---------------------")
    ranked = sorted(relation_counts.items(), key=lambda pair: pair[1], reverse=True)
    for name, total in ranked:
        print(f"{name}: {total}")

    # Part 2: one entry per governing token
    print("\nDETAILED INFORMATION:")
    print("---------------------")
    for governor, entry in results.items():
        print(f"Token: {_describe(governor)}")
        print(f"  - Relation to head: {entry['head_relation']}")
        parent = entry['head']
        if parent:
            print(f"  - Head token: {_describe(parent)}")
        print("  - Reduplication dependents:")
        for dependent in entry['redup_dependents']:
            print(f"    - {_describe(dependent)}")
        print("-" * 40)

# Example usage: analyze one CoNLL-U file, print the report, then print the
# number of governing tokens found. The names bound here (conll_file,
# relations, counts) are module-level, so they remain available afterwards.
if __name__ == "__main__":
    # Replace with your actual CoNLL-U file path
    conll_file = "/content/nutukakadar-copy.conllu"

    try:
        relations, counts = find_redup_relations(conll_file)
        print_results(relations, counts)
        print(f"\nFound {len(relations)} tokens with compound:redup dependents.")
    except Exception as e:
        # Best-effort reporting: any failure in loading, analysis or printing
        # is reduced to this one-line message and no traceback is shown.
        # NOTE(review): consider narrowing the exception type once the
        # expected failure modes (missing file, malformed CoNLL-U) are confirmed.
        print(f"Error processing file: {e}")


RELATION TYPE COUNTS:
---------------------
advmod: 18
advcl: 13
amod: 6
compound:lvc: 3
ccomp: 2

DETAILED INFORMATION:
---------------------
Token: edib (ID: 7, Lemma: et)
  - Relation to head: compound:lvc
  - Head token: celb-i (ID: 5, Lemma: celb-i)
  - Reduplication dependents:
    - etmediğini (ID: 8, Lemma: et)
----------------------------------------
Token: yavaş (ID: 10, Lemma: yavaş)
  - Relation to head: advmod
  - Head token: uyuşturuyordu (ID: 16, Lemma: uyuş)
  - Reduplication dependents:
    - yavaş (ID: 11, Lemma: yavaş)
----------------------------------------
Token: korka (ID: 28, Lemma: kork)
  - Relation to head: advcl
  - Head token: fırlayan (ID: 30, Lemma: fırla)
  - Reduplication dependents:
    - korka (ID: 29, Lemma: kork)
----------------------------------------
Token: doğar (ID: 15, Lemma: doğ)
  - Relation to head: advcl
  - Head token: acıtıyor (ID: 29, Lemma: acı)
  - Reduplication dependents:
    - doğmaz (ID: 16, Lemma: doğ)
--------------------------

In [8]:
import pyconll
from collections import defaultdict

def find_redup_relations(conll_file_path):
    """
    Find every token that governs a 'compound:redup' dependent and tally the
    relation that governing token bears to its own head.

    Args:
        conll_file_path: Path to the CoNLL-U file to analyze.

    Returns:
        A tuple ``(results, relation_counts)``:
        - ``results`` maps each governing token object to a dict with the keys
          ``'head'`` (the governing token's own head token, or None when it is
          the root, has no head recorded, or the head ID cannot be found),
          ``'head_relation'`` (the token's deprel, or "root" when its head is
          "0") and ``'redup_dependents'`` (its 'compound:redup' dependents in
          sentence order).
        - ``relation_counts`` is a ``defaultdict(int)`` counting how many
          governing tokens carry each ``'head_relation'`` value.
    """
    # Load the CoNLL-U file
    corpus = pyconll.load_from_file(conll_file_path)

    results = {}
    relation_counts = defaultdict(int)

    for sentence in corpus:
        # Single pass over the sentence: bucket the reduplication dependents
        # by the ID of their head instead of rescanning the whole sentence
        # once per candidate token.
        redup_by_head = defaultdict(list)
        for dep in sentence:
            if dep.deprel == "compound:redup" and dep.head is not None:
                redup_by_head[dep.head].append(dep)

        if not redup_by_head:
            continue

        for token in sentence:
            # Skip multiword tokens (they don't participate in dependencies)
            if '-' in token.id:
                continue

            # .get() so that probing does not create empty buckets
            redup_dependents = redup_by_head.get(token.id)
            if not redup_dependents:
                continue

            is_root = token.head == "0"  # '0' means no head

            # Only look the head up when there is a head ID to look up. A
            # missing head (None) is handled here explicitly rather than being
            # handed to the sentence lookup, which is only guarded for KeyError.
            head_token = None
            if not is_root and token.head is not None:
                try:
                    head_token = sentence[token.head]
                except KeyError:
                    # Head ID is not present in this sentence
                    head_token = None

            relation = "root" if is_root else token.deprel

            # Store the information
            results[token] = {
                'head': head_token,
                'head_relation': relation,
                'redup_dependents': redup_dependents
            }

            # Count the relation type
            relation_counts[relation] += 1

    return results, relation_counts

def print_results(results, relation_counts):
    """
    Write a two-part report to stdout: the relation tallies ordered from most
    to least frequent, followed by one entry per governing token listing its
    relation, its head (when there is one) and its reduplication dependents.
    """
    def _describe(tok):
        # Shared "form (ID, Lemma)" rendering used on every token line
        return f"{tok.form} (ID: {tok.id}, Lemma: {tok.lemma})"

    # Part 1: tallies, highest count first (ties keep their insertion order)
    print("\nRELATION TYPE COUNTS:")
    print("---------------------")
    ranked = sorted(relation_counts.items(), key=lambda pair: pair[1], reverse=True)
    for name, total in ranked:
        print(f"{name}: {total}")

    # Part 2: one entry per governing token
    print("\nDETAILED INFORMATION:")
    print("---------------------")
    for governor, entry in results.items():
        print(f"Token: {_describe(governor)}")
        print(f"  - Relation to head: {entry['head_relation']}")
        parent = entry['head']
        if parent:
            print(f"  - Head token: {_describe(parent)}")
        print("  - Reduplication dependents:")
        for dependent in entry['redup_dependents']:
            print(f"    - {_describe(dependent)}")
        print("-" * 40)

# Example usage: analyze one CoNLL-U file, print the report, then print the
# number of governing tokens found. The names bound here (conll_file,
# relations, counts) are module-level, so running this cell overwrites any
# values an earlier cell stored under the same names.
if __name__ == "__main__":
    # Replace with your actual CoNLL-U file path
    conll_file = "/content/nutukakadar.conllu"

    try:
        relations, counts = find_redup_relations(conll_file)
        print_results(relations, counts)
        print(f"\nFound {len(relations)} tokens with compound:redup dependents.")
    except Exception as e:
        # Best-effort reporting: any failure in loading, analysis or printing
        # is reduced to this one-line message and no traceback is shown.
        # NOTE(review): consider narrowing the exception type once the
        # expected failure modes (missing file, malformed CoNLL-U) are confirmed.
        print(f"Error processing file: {e}")


RELATION TYPE COUNTS:
---------------------
advmod: 18
advcl: 14
amod: 5
compound:lvc: 3
obj: 1
acl: 1

DETAILED INFORMATION:
---------------------
Token: edib (ID: 7, Lemma: et)
  - Relation to head: compound:lvc
  - Head token: celb-i (ID: 5, Lemma: celb-i)
  - Reduplication dependents:
    - etmediğini (ID: 8, Lemma: et)
----------------------------------------
Token: yavaş (ID: 10, Lemma: yavaş)
  - Relation to head: advmod
  - Head token: uyuşturuyordu (ID: 16, Lemma: uyuş)
  - Reduplication dependents:
    - yavaş (ID: 11, Lemma: yavaş)
----------------------------------------
Token: korka (ID: 28, Lemma: kork)
  - Relation to head: advcl
  - Head token: fırlayan (ID: 30, Lemma: fırla)
  - Reduplication dependents:
    - korka (ID: 29, Lemma: kork)
----------------------------------------
Token: doğar (ID: 15, Lemma: doğ)
  - Relation to head: advcl
  - Head token: acıtıyor (ID: 29, Lemma: acı)
  - Reduplication dependents:
    - doğmaz (ID: 16, Lemma: doğ)
---------------------